In [3]:
# Import the required modules
#!pip install fastapi uvicorn streamlit

import pickle
import pandas as pd
import numpy as np
from collections import defaultdict
from ast import literal_eval
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.model_selection import TimeSeriesSplit
import matplotlib.pyplot as plt

# Load and preprocess data
df = pd.read_csv("synthetic_fraud_data.csv")

# Drop unnecessary columns
columns_to_drop = [
    'card_number', 'currency', 'device', 'device_fingerprint', 
    'ip_address', 'high_risk_merchant', 'is_fraud'
]
df_cleaned = df.drop(columns=columns_to_drop)

# Process timestamp and extract features
df_cleaned["timestamp"] = pd.to_datetime(df_cleaned["timestamp"], format='mixed')
df_cleaned['transaction_hour'] = df_cleaned['timestamp'].dt.hour
df_cleaned['transaction_day'] = df_cleaned['timestamp'].dt.day
df_cleaned['transaction_weekday'] = df_cleaned['timestamp'].dt.weekday
df_cleaned['is_weekend'] = df_cleaned['transaction_weekday'].apply(lambda x: 1 if x >= 5 else 0)
df_cleaned['transaction_month'] = df_cleaned['timestamp'].dt.month

# Process velocity data
df_cleaned['velocity_last_hour'] = df_cleaned['velocity_last_hour'].apply(literal_eval)
df_cleaned['num_transactions_last_hour'] = df_cleaned['velocity_last_hour'].apply(lambda x: x.get('num_transactions', 0))
df_cleaned['total_spent_last_hour'] = df_cleaned['velocity_last_hour'].apply(lambda x: x.get('total_amount', 0))
df_cleaned = df_cleaned.drop(columns=['velocity_last_hour'])

# Add amount ratio feature
df_cleaned['amount_to_average_ratio'] = df_cleaned['amount'] / df_cleaned.groupby('merchant_category')['amount'].transform('mean')

# Encode categorical features
categorical_columns = ['merchant_category', 'merchant_type', 'merchant', 'city', 'city_size', 'card_type', 'channel']
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df_cleaned[col] = le.fit_transform(df_cleaned[col])
    label_encoders[col] = le

# Scale numerical features
numerical_columns = ['amount', 'distance_from_home', 'num_transactions_last_hour', 'total_spent_last_hour', 'amount_to_average_ratio']
scaler = MinMaxScaler()
df_cleaned[numerical_columns] = scaler.fit_transform(df_cleaned[numerical_columns])

# Define features for model
features_for_model = [
    'amount',
    'merchant_category',
    'merchant_type',
    'num_transactions_last_hour',
    'total_spent_last_hour',
    'amount_to_average_ratio',
    'transaction_hour',
    'is_weekend',
    'distance_from_home'
]

# Train improved Isolation Forest model
iso_forest = IsolationForest(
    n_estimators=200,
    max_samples='auto',
    contamination=0.05,
    max_features=len(features_for_model),
    bootstrap=True,
    n_jobs=-1,
    random_state=42
)

# Add business rules
def apply_business_rules(row):
    high_risk_conditions = [
        row['amount_to_average_ratio'] > 3,  # Transaction 3x above average
        row['distance_from_home'] > 0.8,     # Unusual location
        row['num_transactions_last_hour'] > 5 # High frequency
    ]
    return 1 if any(high_risk_conditions) else 0

# Fit model and make predictions
iso_forest.fit(df_cleaned[features_for_model])
df_cleaned['model_flag'] = iso_forest.predict(df_cleaned[features_for_model])
df_cleaned['business_rules_flag'] = df_cleaned.apply(apply_business_rules, axis=1)
df_cleaned['final_overspending_flag'] = ((df_cleaned['model_flag'] == -1) | 
                                        (df_cleaned['business_rules_flag'] == 1)).astype(int)

# Save processed data and model
df_cleaned.to_csv("processed_data.csv")
with open("improved_isolation_forest_model.pkl", "wb") as model_file:
    pickle.dump({
        'model': iso_forest,
        'features': features_for_model,
        'label_encoders': label_encoders,
        'scaler': scaler
    }, model_file)

# Calculate and display overspending statistics
total_transactions = len(df_cleaned)
overspending_count = df_cleaned['final_overspending_flag'].sum()
print(f"\nOverspending Statistics:")
print(f"Total Transactions: {total_transactions}")
print(f"Flagged Transactions: {overspending_count}")
print(f"Overspending Rate: {(overspending_count/total_transactions)*100:.2f}%")


Overspending Statistics:
Total Transactions: 7483766
Flagged Transactions: 2582419
Overspending Rate: 34.51%
