In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report
import pickle
import os

print("--- Step 1: Libraries imported. ---")

# --- Step 2: Load Data ---
# The whole pipeline lives in the `else` branch so that a missing CSV only
# prints a hint instead of raising further down.
csv_file_name = 'realistic_statement.csv'
if not os.path.exists(csv_file_name):
    print(f"ERROR: '{csv_file_name}' not found. Please upload it first.")
else:
    df = pd.read_csv(csv_file_name)
    print(f"--- Step 2: Successfully loaded '{csv_file_name}'. ---")

    # --- Step 3: Create Ground Truth Labels ---
    # A row is labelled anomalous (1) when its Description exactly matches one
    # of the known anomaly descriptions; every other row (including missing
    # descriptions) is labelled normal (0).
    known_anomalies = ['Casino Withdrawal', 'Unusual Wire Transfer']
    df['is_anomaly_true'] = df['Description'].isin(known_anomalies).astype(int)
    print("--- Step 3: Ground truth labels created. ---")

    # --- Step 4: Feature Engineering ---
    # Derive calendar features from the transaction date.
    df['Date'] = pd.to_datetime(df['Date'])
    df['day_of_week'] = df['Date'].dt.dayofweek
    df['day_of_month'] = df['Date'].dt.day
    # dayofweek is 0 (Monday) .. 6 (Sunday), so >= 5 means Saturday/Sunday.
    df['is_weekend'] = (df['Date'].dt.dayofweek >= 5).astype(int)

    # One-hot encode Category; the generated columns are named 'cat_<value>'.
    df_encoded = pd.get_dummies(df, columns=['Category'], prefix='cat')

    # Final feature columns: the numeric/calendar features followed by every
    # one-hot category column, in the order get_dummies produced them.
    feature_columns = ['Amount', 'day_of_week', 'day_of_month', 'is_weekend'] + [
        col for col in df_encoded if col.startswith('cat_')
    ]

    # X is the feature matrix in that fixed column order; y is the ground
    # truth, used only for the stratified split and the evaluation report.
    X = df_encoded[feature_columns]
    y = df_encoded['is_anomaly_true']
    print("--- Step 4: Feature engineering complete. ---")

    # --- Step 5: Train/Test Split ---
    # stratify=y keeps the anomaly ratio similar in both splits.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )
    print("--- Step 5: Data split complete. ---")

    # --- Step 6: Train the Model ---
    # IsolationForest is fitted on the features only; the labels are not used.
    model = IsolationForest(n_estimators=100, contamination='auto', random_state=42)
    model.fit(X_train)
    print("--- Step 6: Model training complete. ---")

    # --- Step 7: Evaluate Accuracy ---
    # IsolationForest.predict returns -1 for outliers and 1 for inliers;
    # remap to the 1 = anomaly / 0 = normal convention of the labels.
    predictions_test = model.predict(X_test)
    predictions_test_mapped = np.where(predictions_test == -1, 1, 0)
    print("\n================== ACCURACY REPORT ==================\n")
    try:
        # labels=[0, 1] pins both classes explicitly. Without it the classes
        # are inferred from the data, and the call raises ValueError whenever
        # the test split and the predictions contain only one class, because
        # the inferred class count no longer matches target_names.
        report = classification_report(
            y_test,
            predictions_test_mapped,
            labels=[0, 1],
            target_names=['Normal (0)', 'Anomaly (1)'],
        )
        print(report)
    except ValueError as e:
        # Best effort: a failed report must not prevent saving the model.
        print(f"Could not generate classification report: {e}")
    print("=====================================================\n")

    # --- Step 8: Save the Model ---
    model_filename = 'personal_finance_model.pkl'
    with open(model_filename, 'wb') as f:
        pickle.dump(model, f)
    print(f"--- Step 8: Model successfully saved to '{model_filename}' ---")

    # --- NEW & CRITICAL Step 9: Save the Feature Column List ---
    # Persist the exact training column names and order alongside the model.
    column_filename = 'model_columns.pkl'
    with open(column_filename, 'wb') as f:
        pickle.dump(list(X.columns), f)
    print(f"--- Step 9: Model columns successfully saved to '{column_filename}' ---")

    print("\nPROCESS COMPLETE. Please download BOTH 'personal_finance_model.pkl' and 'model_columns.pkl'.")



--- Step 1: Libraries imported. ---
--- Step 2: Successfully loaded 'realistic_statement.csv'. ---
--- Step 3: Ground truth labels created. ---
--- Step 4: Feature engineering complete. ---
--- Step 5: Data split complete. ---
--- Step 6: Model training complete. ---


              precision    recall  f1-score   support

  Normal (0)       0.83      0.71      0.77         7
 Anomaly (1)       0.00      0.00      0.00         1

    accuracy                           0.62         8
   macro avg       0.42      0.36      0.38         8
weighted avg       0.73      0.62      0.67         8


--- Step 8: Model successfully saved to 'personal_finance_model.pkl' ---
--- Step 9: Model columns successfully saved to 'model_columns.pkl' ---

PROCESS COMPLETE. Please download BOTH 'personal_finance_model.pkl' and 'model_columns.pkl'.
