In [1]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
import mlflow
import mlflow.sklearn

In [2]:
# Load datasets
fraud_data = pd.read_csv('/home/ayalk94/Documents/GitHub/Enhanced-Fraud-Detection--for-E-Commerce-and-Banking-using-ML-and-Geolocation/data/Data/Fraud_Data.csv')
creditcard_data = pd.read_csv('/home/ayalk94/Documents/GitHub/Enhanced-Fraud-Detection--for-E-Commerce-and-Banking-using-ML-and-Geolocation/data/Data/creditcard.csv')

### 1. Data Preparation

In [3]:
# Feature and target separation for fraud data
X_fraud = fraud_data.drop(columns=['class'])
y_fraud = fraud_data['class']

In [4]:
# Feature and target separation for credit card data
X_creditcard = creditcard_data.drop(columns=['Class'])
y_creditcard = creditcard_data['Class']

# Train-test split for both datasets (80% train, 20% test)
X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test = train_test_split(X_fraud, y_fraud, test_size=0.2, random_state=42)
X_creditcard_train, X_creditcard_test, y_creditcard_train, y_creditcard_test = train_test_split(X_creditcard, y_creditcard, test_size=0.2, random_state=42)

### 2. Model Selection and Training

In [11]:


# Check for non-numeric columns in the dataset
print(fraud_data.dtypes)

# Use Label Encoding for 'device_id' to reduce memory usage
label_encoder = LabelEncoder()
fraud_data['device_id'] = label_encoder.fit_transform(fraud_data['device_id'])

# Check if other categorical columns need encoding
# (already done for 'source', 'browser', and 'sex' in previous steps)

# Recreate the training/test splits after transformation
X_fraud = fraud_data.drop(columns=['class'])  # Exclude target
y_fraud = fraud_data['class']

# Train-test split for fraud_data (80% train, 20% test)
X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test = train_test_split(X_fraud, y_fraud, test_size=0.2, random_state=42)

# Define a function to train and evaluate models
def train_evaluate_model(model, X_train, X_test, y_train, y_test):
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on test data
    y_pred = model.predict(X_test)
    
    # Evaluate performance
    print(f"Model: {model.__class__.__name__}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(classification_report(y_test, y_pred))

# Define models to compare
models = [
    LogisticRegression(max_iter=1000),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    MLPClassifier(max_iter=1000)
]

# Train and evaluate models on fraud_data
print("Fraud Data Model Performance")
for model in models:
    train_evaluate_model(model, X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test)


user_id                 int64
purchase_value          int64
device_id              object
age                     int64
ip_address            float64
class                   int64
signup_hour             int32
purchase_hour           int32
purchase_dayofweek      int32
source_Direct            bool
source_SEO               bool
browser_FireFox          bool
browser_IE               bool
browser_Opera            bool
browser_Safari           bool
sex_M                    bool
dtype: object
Fraud Data Model Performance
Model: LogisticRegression
Accuracy: 0.9057
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     27373
           1       0.00      0.00      0.00      2850

    accuracy                           0.91     30223
   macro avg       0.45      0.50      0.48     30223
weighted avg       0.82      0.91      0.86     30223



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: DecisionTreeClassifier
Accuracy: 0.8970
              precision    recall  f1-score   support

           0       0.95      0.93      0.94     27373
           1       0.46      0.56      0.51      2850

    accuracy                           0.90     30223
   macro avg       0.71      0.75      0.73     30223
weighted avg       0.91      0.90      0.90     30223

Model: RandomForestClassifier
Accuracy: 0.9558
              precision    recall  f1-score   support

           0       0.95      1.00      0.98     27373
           1       0.99      0.53      0.70      2850

    accuracy                           0.96     30223
   macro avg       0.97      0.77      0.84     30223
weighted avg       0.96      0.96      0.95     30223

Model: GradientBoostingClassifier
Accuracy: 0.9060
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     27373
           1       0.83      0.00      0.01      2850

    accuracy                         