In [4]:
import os
# Create the output folders used by the notebook up front.
# exist_ok=True makes re-running this cell a no-op when they already exist.
for folder in ('model', 'data'):
    os.makedirs(folder, exist_ok=True)

In [5]:
import pandas as pd
from sklearn.ensemble import IsolationForest
import joblib
import os

# Train an IsolationForest on the transaction CSV and persist both the fitted
# model and the exact ordered list of feature columns it was trained on.

# --- Configuration ---
DATA_PATH = 'data/complex_transactions.csv'
MODEL_DIR = 'model'
MODEL_PATH = os.path.join(MODEL_DIR, 'isolation_forest_model_complex.pkl')
FEATURES_PATH = os.path.join(MODEL_DIR, 'feature_list.pkl')

# --- Ensure model directory exists ---
os.makedirs(MODEL_DIR, exist_ok=True)

# --- Load Data ---
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    print(f"Error: '{DATA_PATH}' not found. Please run generate_data.py first.")
    # The builtin `exit()` is only injected by the `site` module for
    # interactive use and reports success (status 0). Raise SystemExit with a
    # non-zero status so the failure is visible to callers and shells.
    raise SystemExit(1) from None

# --- Feature Engineering ---
categorical_features = ['Currency', 'Transaction_Type', 'Device_Type', 'Sender_Country_Code']
numerical_features = ['Amount', 'Transactions_Last_24H', 'Avg_Transaction_Value_Last_30D', 'Is_New_Recipient']

# One-hot encode the categoricals; drop_first removes one level per feature.
df_encoded = pd.get_dummies(df, columns=categorical_features, drop_first=True)

# The dummy columns are exactly those get_dummies added. Selecting them by
# "not present in the raw frame" avoids picking up unrelated raw columns whose
# names merely contain a categorical feature name as a substring.
raw_columns = set(df.columns)
encoded_cols = [col for col in df_encoded.columns if col not in raw_columns]
final_features = numerical_features + encoded_cols

# Select the training columns in a fixed order.
# NOTE: any column in final_features that is absent from df_encoded (e.g. a
# numerical feature missing from the CSV) is created and filled with 0 here.
X = df_encoded.reindex(columns=final_features, fill_value=0)

# --- Train the Model ---
# Contamination is the expected proportion of anomalies in the data.
# IsolationForest accepts only 'auto' or a float in (0, 0.5], so the observed
# label rate is used only when it falls inside that range.
contamination_rate = 'auto'
if 'Is_Anomaly' in df.columns and len(df) > 0:
    observed_rate = float(df['Is_Anomaly'].sum() / len(df))
    if 0.0 < observed_rate <= 0.5:
        contamination_rate = observed_rate
    else:
        print(
            f"Warning: observed anomaly rate {observed_rate} is outside (0, 0.5]; "
            "using contamination='auto' instead."
        )

model = IsolationForest(n_estimators=100, contamination=contamination_rate, random_state=42)
model.fit(X)

# --- Save the Model and Feature List ---
# The feature list is stored next to the model so that inference code can
# rebuild the same column layout.
joblib.dump(model, MODEL_PATH)
joblib.dump(final_features, FEATURES_PATH)

print(f"Model saved to '{MODEL_PATH}'")
print(f"Feature list saved to '{FEATURES_PATH}'")

Model saved to 'model/isolation_forest_model_complex.pkl'
Feature list saved to 'model/feature_list.pkl'
