In [1]:
# Setup and Configuration
import pickle
from pathlib import Path

import duckdb
import pandas as pd
from sklearn.ensemble import IsolationForest

# --- Input: DuckDB file and table the feature data is read from ---
INPUT_TABLE_NAME = 'training_data'
INPUT_DB_PATH = '../../dataset/unsupervised_dataset.duckdb'

# --- Output: DuckDB file/table for the score-enriched data, plus the
# --- location of the pickled model artifact ---
OUTPUT_TABLE_NAME = 'training_table_new_approach'
OUTPUT_DB_PATH = '../../dataset/supervised_dataset.duckdb'
MODEL_SAVE_PATH = '../../models/unsupervised/iforest_model.pkl'

# Echo the resolved source and destination so the run log records them.
print("Loading data from: {} (Table: {})".format(INPUT_DB_PATH, INPUT_TABLE_NAME))
print("Saving enhanced data to: {} (Table: {})".format(OUTPUT_DB_PATH, OUTPUT_TABLE_NAME))

Loading data from: ../../dataset/unsupervised_dataset.duckdb (Table: training_data)
Saving enhanced data to: ../../dataset/supervised_dataset.duckdb (Table: training_table_new_approach)


In [2]:
# Load Preprocessed Data
# Reads the whole input table into the DataFrame `df` over a read-only
# connection, and always releases that connection afterwards.
#
# Bind the handle before the `try` so the cleanup in `finally` inspects this
# run's connection. Checking `'con_input' in locals()` instead would pick up
# a stale handle left in the kernel by an earlier run of this cell.
con_input = None
try:
    con_input = duckdb.connect(database=INPUT_DB_PATH, read_only=True)
    print(f"\nLoading preprocessed data from '{INPUT_TABLE_NAME}'...")
    df = con_input.query(f"SELECT * FROM {INPUT_TABLE_NAME}").to_df()

    # An empty table leaves nothing to train on, so fail fast with a hint.
    if df.empty:
        raise ValueError(f"Table '{INPUT_TABLE_NAME}' is empty. Did 02_preprocessing run?")

    print(f"Successfully loaded {len(df)} users with {len(df.columns)} columns.")

except Exception as e:
    # Print a visible banner, then re-raise so execution stops at this cell.
    print("--- CRITICAL ERROR: Could not load data ---")
    print(f"Error: {e}")
    raise
finally:
    # Only close (and report closing) a connection that was actually opened.
    if con_input is not None:
        con_input.close()
        print("Input database connection closed.")


Loading preprocessed data from 'training_data'...
Successfully loaded 1000 users with 36 columns.
Input database connection closed.


In [3]:
# Prepare Data for Modeling
# 'user_id' is kept aside and excluded from the model inputs; every other
# column of `df` becomes a training feature.
X_train = df.drop('user_id', axis=1)
user_ids = df['user_id']

# Keep the feature names (in column order) for reporting.
features_used = X_train.columns.to_list()
print(f"\nData prepared. Training model on {len(features_used)} features.")
print(f"First 5 features: {features_used[:5]}")


Data prepared. Training model on 35 features.
First 5 features: ['total_logon_events', 'logon_unique_pcs', 'after_hours_logons', 'weekend_logons', 'logon_ratio']


In [4]:
# Train Isolation Forest Model
print("\n--- Training Isolation Forest ---")

# Build the estimator and fit it in one step; fit() returns the fitted
# estimator itself, so `model` is the trained forest.
# random_state is pinned so repeated runs build the same forest.
model = IsolationForest(
    n_estimators=100,
    contamination='auto',
    random_state=42,
    n_jobs=-1,
).fit(X_train)

print("Model training complete.")


--- Training Isolation Forest ---
Model training complete.


In [5]:
# Generate Anomaly Scores

print("Generating anomaly scores for all users...")

# decision_function() returns raw scores where negative means anomaly.
# Negating them flips the scale so that a higher anomaly_score means a
# more anomalous row.
raw_scores = model.decision_function(X_train)
anomaly_scores = -raw_scores

print("Scores generated.")

Generating anomaly scores for all users...
Scores generated.


In [6]:
# 6. Inspect Results
print("\n--- Model Evaluation (Top 10 Anomalous Users) ---")

# Pair every user id with its score, then rank from most to least anomalous.
df_results = pd.DataFrame({'user_id': user_ids, 'anomaly_score': anomaly_scores})
df_results_sorted = df_results.sort_values('anomaly_score', ascending=False)

# The ten highest-scoring users.
print(df_results_sorted.iloc[:10])

# Summary statistics of the score distribution.
print("\n--- Anomaly Score Statistics ---")
score_column = df_results_sorted['anomaly_score']
print(score_column.describe())


--- Model Evaluation (Top 10 Anomalous Users) ---
     user_id  anomaly_score
60   ATE0869       0.156784
64   DLM0051       0.142656
914  RZC0746       0.141692
454  TVS0006       0.128523
921  KBP0008       0.127713
915  WPR0368       0.127697
162  HCS0003       0.124693
65   LBF0214       0.115645
66   NAF0326       0.115515
836  HTH0007       0.115124

--- Anomaly Score Statistics ---
count    1000.000000
mean       -0.064192
std         0.051015
min        -0.135961
25%        -0.097849
50%        -0.078691
75%        -0.051263
max         0.156784
Name: anomaly_score, dtype: float64


In [7]:
# 7. Save Model Artifact
# Pickles the fitted model to MODEL_SAVE_PATH.

print(f"\nSaving trained model to {MODEL_SAVE_PATH}...")

# open(..., 'wb') does not create missing directories. Make sure the target
# folder exists first; otherwise a fresh checkout fails here with
# FileNotFoundError after the model has already been trained.
Path(MODEL_SAVE_PATH).parent.mkdir(parents=True, exist_ok=True)

with open(MODEL_SAVE_PATH, 'wb') as f:
    pickle.dump(model, f)
print("Model saved successfully.")


Saving trained model to ../../models/unsupervised/iforest_model.pkl...
Model saved successfully.


In [8]:
# Save Enhanced Data to output db
# Adds the anomaly score as a new column next to the loaded data and writes
# the result to OUTPUT_TABLE_NAME in the output DuckDB file, replacing any
# existing table of that name.
print(f"\nSaving enhanced data to new database: '{OUTPUT_DB_PATH}'")

# Work on a copy so the loaded frame `df` is left untouched, then append
# the score column.
df_enhanced = df.copy()
df_enhanced['anomaly_score'] = anomaly_scores

print(f"Enhanced data has {len(df_enhanced.columns)} columns.")

# --- Save to NEW DuckDB ---
# Bind the handle before the `try` so `finally` checks this run's connection
# rather than a stale `con_output` left in the kernel by an earlier run.
con_output = None
try:
    con_output = duckdb.connect(database=OUTPUT_DB_PATH, read_only=False)

    # Register the enhanced DataFrame so SQL can select from it by name
    con_output.register('df_enhanced_view', df_enhanced)

    # Drop the target table if it already exists, then recreate it
    con_output.execute(f"DROP TABLE IF EXISTS {OUTPUT_TABLE_NAME}")
    con_output.execute(f"CREATE TABLE {OUTPUT_TABLE_NAME} AS SELECT * FROM df_enhanced_view")

    print(f"Successfully created table '{OUTPUT_TABLE_NAME}' in '{OUTPUT_DB_PATH}'.")

    # Verify by reading the new table back
    print("\nVerifying new table (first 5 rows):")
    final_table = con_output.query(f"SELECT * FROM {OUTPUT_TABLE_NAME} LIMIT 5").to_df()
    print(final_table)
    print("\nNew table info:")
    con_output.table(OUTPUT_TABLE_NAME).to_df().info()

except Exception as e:
    # Report the failure, then re-raise: writing this table is the purpose of
    # the cell, so a failed save must not look like a finished run.
    print(f"--- ERROR while saving enhanced data: {e} ---")
    raise
finally:
    # Only close (and report closing) a connection that was actually opened.
    if con_output is not None:
        con_output.close()
        print(f"\nOutput database connection to '{OUTPUT_DB_PATH}' closed. Script finished.")


Saving enhanced data to new database: '../../dataset/supervised_dataset.duckdb'
Enhanced data has 37 columns.
Successfully created table 'training_table_new_approach' in '../../dataset/supervised_dataset.duckdb'.

Verifying new table (first 5 rows):
   user_id  total_logon_events  logon_unique_pcs  after_hours_logons  \
0  RAW0915                1203                28               414.0   
1  JTM0223                1378               361               879.0   
2  CCA0046                1608               447              1178.0   
3  CIM0271                 997                 3               354.0   
4  DFH0188                1133                 3               404.0   

   weekend_logons  logon_ratio  logon_after_hours_ratio  logon_weekend_ratio  \
0           147.0     0.633416                 0.344140             0.122195   
1             0.0     0.542816                 0.637881             0.000000   
2             0.0     0.500000                 0.732587             0.000000