In [1]:
# Setup and Configuration
import pandas as pd
import duckdb

# --- Configuration ---
# Relative path to the DuckDB database file that holds both tables named below.
DUCKDB_PATH = '../../dataset/unsupervised_dataset.duckdb' 
# Table the raw per-user feature rows are read from.
SOURCE_TABLE_NAME = 'user_features'
# Table the preprocessed rows are written to; it is replaced each time the save cell runs.
TARGET_TABLE_NAME = 'training_data'

In [2]:
# Columns to Drop (Based on EDA)
#
# Four groups of columns are removed before training. 'user_id' appears in
# none of them, so it survives as the row key. The order inside every list
# is significant only in that it fixes the order of `all_cols_to_drop`.

# Group 1: identifier columns (everything except the 'user_id' key).
id_cols_to_drop = ['psychometric_employee_name', 'ldap_employee_name']

# Group 2: columns with >73% missing data.
missing_data_cols_to_drop = [
    'total_device_events', 'connect_count', 'disconnect_count',
    'device_unique_pcs', 'after_hours_connects', 'weekend_connects',
    'connect_ratio', 'device_after_hours_ratio', 'device_weekend_ratio',
    'total_file_events', 'file_unique_pcs', 'unique_files',
    'unique_file_types', 'exe_files_accessed', 'zip_files_accessed',
    'pdf_files_accessed', 'docx_files_accessed', 'xlsx_files_accessed',
    'after_hours_file_events', 'weekend_file_events',
    'file_after_hours_ratio', 'file_weekend_ratio',
    'exe_ratio', 'zip_ratio', 'pdf_ratio', 'docx_ratio', 'xlsx_ratio',
]

# Group 3: zero-variance columns (every value is 0).
zero_variance_cols_to_drop = ['role_changed', 'department_changed', 'team_changed']

# Group 4: redundant columns (highly correlated with features that are kept).
redundant_cols_to_drop = ['logon_count', 'logoff_count', 'emails_with_attachments']

# Flatten the four groups, in the order above, into the single drop list.
all_cols_to_drop = [
    *id_cols_to_drop,
    *missing_data_cols_to_drop,
    *zero_variance_cols_to_drop,
    *redundant_cols_to_drop,
]

print(f"Defined {len(all_cols_to_drop)} total columns to drop.")

Defined 35 total columns to drop.


In [3]:
# Load, Process, and Save Data
#
# Reads every row of SOURCE_TABLE_NAME into pandas, removes the columns listed
# in `all_cols_to_drop`, and writes the result to TARGET_TABLE_NAME in the same
# DuckDB file. Any failure is reported and then re-raised so that a
# "Run All" stops here instead of letting later cells read a stale table.

# Bind the name up front: `'con' in locals()` is always true once any earlier
# cell has created `con`, so it cannot tell whether *this* run opened a connection.
con = None
try:
    # --- Load Source Data ---
    con = duckdb.connect(database=DUCKDB_PATH, read_only=False) # Read-write access
    print(f"Loading raw data from '{SOURCE_TABLE_NAME}'...")
    df = con.query(f"SELECT * FROM {SOURCE_TABLE_NAME}").to_df()

    original_cols = df.columns.tolist()
    original_count = len(original_cols)
    print(f"Loaded {len(df)} users with {original_count} features.")

    # --- Identify columns that actually exist in the DF to avoid errors ---
    cols_to_drop_existing = [col for col in all_cols_to_drop if col in original_cols]
    print(f"Found {len(cols_to_drop_existing)} columns to drop that exist in the table.")

    # Surface drop-list entries that are absent from the source table; a typo
    # in the list would otherwise leave an unwanted feature in place unnoticed.
    cols_to_drop_missing = [col for col in all_cols_to_drop if col not in original_cols]
    if cols_to_drop_missing:
        print(f"Warning: {len(cols_to_drop_missing)} drop-list columns not found in the table: {cols_to_drop_missing}")

    # --- Preprocessing: Drop the columns ---
    df_preprocessed = df.drop(columns=cols_to_drop_existing)
    final_count = len(df_preprocessed.columns)

    # The summary below states that 'user_id' is retained; make that a checked fact.
    if 'user_id' not in df_preprocessed.columns:
        raise ValueError("Key column 'user_id' is missing from the preprocessed data.")

    print(f"\n--- Preprocessing Summary ---")
    print(f"Original feature count: {original_count}")
    print(f"Dropped feature count:  {len(cols_to_drop_existing)}")
    print(f"Final feature count:    {final_count} (including 'user_id')")

    # --- Save to New Table ---
    print(f"\nSaving preprocessed data to '{TARGET_TABLE_NAME}'...")

    # Register the DataFrame to make it available to DuckDB
    con.register('df_preprocessed_view', df_preprocessed)

    # Replace the target table in a single statement. A separate DROP followed
    # by CREATE would leave no table at all if the CREATE step failed.
    con.execute(f"CREATE OR REPLACE TABLE {TARGET_TABLE_NAME} AS SELECT * FROM df_preprocessed_view")

    print(f"Successfully created table '{TARGET_TABLE_NAME}'.")

except Exception as e:
    print(f"\n--- AN ERROR OCCURRED ---")
    print(f"Error: {e}")
    # Re-raise: swallowing the error would let the following cells run anyway.
    raise
finally:
    # Close only a connection that this run actually opened.
    if con is not None:
        con.close()
        print("Database connection closed.")


Loading raw data from 'user_features'...
Loaded 1000 users with 71 features.
Found 35 columns to drop that exist in the table.

--- Preprocessing Summary ---
Original feature count: 71
Dropped feature count:  35
Final feature count:    36 (including 'user_id')

Saving preprocessed data to 'training_data'...
Successfully created table 'training_data'.
Database connection closed.


In [6]:
# Verification
#
# Re-opens the database read-only, previews the new table, and checks that its
# columns are exactly the source table's columns minus `all_cols_to_drop`.
# The expected set is derived from the database itself rather than from a
# hard-coded count, so it cannot drift from the drop list.
print("\n--- Verification Step ---")

# Bind the name up front so `finally` never closes a handle left over from
# an earlier cell when connect() itself fails.
con = None
try:
    con = duckdb.connect(database=DUCKDB_PATH, read_only=True)
    verify_df = con.query(f"SELECT * FROM {TARGET_TABLE_NAME} LIMIT 5").to_df()

    print(f"Successfully loaded data from new table '{TARGET_TABLE_NAME}'.")
    print("First 5 rows:")
    print(verify_df)

    print("\nVerifying final table structure:")
    # Get full info
    verify_all_df = con.query(f"SELECT * FROM {TARGET_TABLE_NAME}").to_df()
    verify_all_df.info()

    # LIMIT 0 returns no rows but still carries the source table's column names.
    source_cols = con.query(f"SELECT * FROM {SOURCE_TABLE_NAME} LIMIT 0").to_df().columns
    expected_cols = set(source_cols) - set(all_cols_to_drop)
    actual_cols = set(verify_all_df.columns)

    missing_cols = sorted(expected_cols - actual_cols)
    unexpected_cols = sorted(actual_cols - expected_cols)
    if missing_cols or unexpected_cols:
        raise ValueError(
            f"Column mismatch in '{TARGET_TABLE_NAME}': "
            f"missing={missing_cols}, unexpected={unexpected_cols}"
        )
    if 'user_id' not in actual_cols:
        raise ValueError(f"Key column 'user_id' is missing from '{TARGET_TABLE_NAME}'.")

    # Reached only when the checks above passed, so "as expected" is now true.
    print(f"\nFinal table has {len(verify_all_df.columns)} columns, as expected.")

except Exception as e:
    print(f"Error during verification: {e}")
    # Re-raise so a failed verification is not mistaken for a passing one.
    raise
finally:
    if con is not None:
        con.close()
        print("Verification connection closed.")


--- Verification Step ---
Successfully loaded data from new table 'training_data'.
First 5 rows:
   user_id  total_logon_events  logon_unique_pcs  after_hours_logons  \
0  RAW0915                1203                28               414.0   
1  JTM0223                1378               361               879.0   
2  CCA0046                1608               447              1178.0   
3  CIM0271                 997                 3               354.0   
4  DFH0188                1133                 3               404.0   

   weekend_logons  logon_ratio  logon_after_hours_ratio  logon_weekend_ratio  \
0           147.0     0.633416                 0.344140             0.122195   
1             0.0     0.542816                 0.637881             0.000000   
2             0.0     0.500000                 0.732587             0.000000   
3             0.0     0.635908                 0.355065             0.000000   
4             0.0     0.619594                 0.356575             0