In [4]:
import os
# Create the working directories up front; exist_ok=True makes re-running
# this cell harmless when they are already present.
for directory in ('model', 'data'):
    os.makedirs(directory, exist_ok=True)

In [8]:
import pandas as pd
from sklearn.ensemble import IsolationForest
import joblib
import os

# --- Configuration ---
# Input dataset location and the directory that receives the saved artifacts.
DATA_PATH = 'data/complex_transactions.csv'
MODEL_DIR = 'model'

# Make sure the artifact directory exists (no-op when it is already there).
os.makedirs(MODEL_DIR, exist_ok=True)

# Both artifact files live directly inside MODEL_DIR.
MODEL_PATH, FEATURES_PATH = (
    os.path.join(MODEL_DIR, artifact_name)
    for artifact_name in ('isolation_forest_model_complex.pkl', 'feature_list.pkl')
)

# --- Load Data ---
# Read the transaction CSV into memory. The record count in the progress
# message is a fixed literal; it is not derived from the file being read.
print("Loading 100,000 records...")
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# --- Feature Engineering ---
# Builds the training matrix X from `df`: the numerical columns are used as-is
# and the categorical columns are one-hot encoded.
# Note: Full country names are now used
categorical_features = ['Currency', 'Transaction_Type', 'Device_Type', 'Sender_Country', 'Recipient_Country']
numerical_features = ['Amount', 'Transactions_Last_24H', 'Avg_Transaction_Value_Last_30D', 'Is_New_Recipient']

# Fail fast on a schema mismatch. Selecting features with
# reindex(..., fill_value=0) would silently replace a missing numerical column
# with an all-zero feature and train on it.
missing_cols = [col for col in categorical_features + numerical_features if col not in df.columns]
if missing_cols:
    raise KeyError(f"Required columns missing from the input data: {missing_cols}")

# Encode only the categorical sub-frame so the resulting column list is exactly
# the set of dummy columns. Picking them afterwards with a substring test
# (`cat_feat in col`) would also sweep in any untouched raw column whose name
# merely contains a categorical feature name.
# drop_first=True drops the first level of every encoded column.
dummies = pd.get_dummies(df[categorical_features], columns=categorical_features, drop_first=True)
encoded_cols = dummies.columns.tolist()

# Same layout pd.get_dummies(df, columns=categorical_features, drop_first=True)
# produces: untouched columns first, then the dummy columns.
df_encoded = pd.concat([df.drop(columns=categorical_features), dummies], axis=1)

# Column order matters: this list is the feature order the model is fitted on.
final_features = numerical_features + encoded_cols

# Every name in final_features is guaranteed to exist at this point, so a plain
# selection is enough.
X = df_encoded[final_features]

# --- Train the Model ---
# Fits an IsolationForest on X. The expected outlier share (contamination) is
# estimated from the labelled 'Is_Anomaly' column of `df`; the labels are not
# passed to fit() itself.
print("Training Isolation Forest model... this may take a few moments.")

# Share of rows flagged as anomalous. Guard the empty-frame case so the
# division cannot produce NaN/inf.
n_rows = len(df)
contamination_rate = float(df['Is_Anomaly'].sum()) / n_rows if n_rows else 0.0

# IsolationForest accepts only 'auto' or a float in (0, 0.5] for contamination.
# A dataset with no labelled anomalies (rate 0) or a majority of anomalies
# (rate > 0.5) would otherwise abort with a parameter-validation error, so fall
# back to the library default in that case. The comparison is also False for
# NaN, which routes a non-finite rate to the fallback as well.
if 0.0 < contamination_rate <= 0.5:
    contamination = contamination_rate
else:
    print(
        f"WARNING: observed anomaly rate {contamination_rate!r} is outside (0, 0.5]; "
        "falling back to contamination='auto'."
    )
    contamination = 'auto'

model = IsolationForest(
    n_estimators=100,
    contamination=contamination,
    random_state=42,  # fixed seed so repeated runs build the same forest
    n_jobs=-1,  # Use all CPU cores
)
model.fit(X)

# --- Save the Model and Feature List ---
# Write both artifacts first, then report where each one was stored.
for artifact, destination in ((model, MODEL_PATH), (final_features, FEATURES_PATH)):
    joblib.dump(artifact, destination)

print(
    f"✅ Model saved to '{MODEL_PATH}'",
    f"✅ Feature list saved to '{FEATURES_PATH}'",
    sep="\n",
)

Loading 100,000 records...
Training Isolation Forest model... this may take a few moments.
✅ Model saved to 'model/isolation_forest_model_complex.pkl'
✅ Feature list saved to 'model/feature_list.pkl'
