In [None]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from imblearn.over_sampling import SMOTE
import warnings
import time
warnings.filterwarnings('ignore')

In [None]:
# Load datasets
train_df = pd.read_csv('fraudTrain.csv')
test_df = pd.read_csv('fraudTest.csv')

In [None]:
# Explore data
print("Training Data Shape:", train_df.shape)
print("Test Data Shape:", test_df.shape)
print("Class Distribution in Training Data:")
print(train_df['is_fraud'].value_counts(normalize=True))

Training Data Shape: (1296675, 23)
Test Data Shape: (555719, 23)
Class Distribution in Training Data:
is_fraud
0    0.994211
1    0.005789
Name: proportion, dtype: float64


In [None]:
# Data Preprocessing
# Combine train and test for consistent preprocessing
combined_df = pd.concat([train_df, test_df], axis=0)

In [None]:
# Drop irrelevant columns (e.g., names, IDs, or dates that are too specific)
drop_columns = ['trans_num', 'first', 'last', 'street', 'dob', 'unix_time', 'trans_date_trans_time']
combined_df = combined_df.drop(columns=[col for col in drop_columns if col in combined_df.columns])

In [None]:
# Handle categorical variables
categorical_cols = ['merchant', 'category', 'gender', 'city', 'state', 'job']
for col in categorical_cols:
    if col in combined_df.columns:
        le = LabelEncoder()
        combined_df[col] = le.fit_transform(combined_df[col])

In [None]:
# Split back into train and test
train_df = combined_df.iloc[:len(train_df)]
test_df = combined_df.iloc[len(train_df):]

In [None]:
# Separate features and target
X_train = train_df.drop('is_fraud', axis=1)
y_train = train_df['is_fraud']
X_test = test_df.drop('is_fraud', axis=1)
y_test = test_df['is_fraud']

In [None]:
# Scale numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

In [None]:
# Initialize models
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100)
}

In [None]:
# Train and evaluate models
for name, model in models.items():
    print(f"\nTraining {name}...")
    start_time = time.time()

    try:
        # Train model
        model.fit(X_train_sub, y_train_sub)
        print(f"Training completed in {time.time() - start_time:.2f} seconds")

        # Predict
        print("Predicting...")
        start_pred = time.time()
        y_pred = model.predict(X_test)
        print(f"Prediction completed in {time.time() - start_pred:.2f} seconds")

        # Evaluation
        print(f"\nResults for {name}:")
        print("Classification Report:")
        print(classification_report(y_test, y_pred))
        print("ROC-AUC Score:", roc_auc_score(y_test, y_pred))
        print("Confusion Matrix:")
        print(confusion_matrix(y_test, y_pred))

    except Exception as e:
        print(f"Error in {name}: {str(e)}")

    print(f"Total time for {name}: {time.time() - start_time:.2f} seconds")


Training Logistic Regression...
Error in Logistic Regression: name 'X_train_sub' is not defined
Total time for Logistic Regression: 0.00 seconds

Training Decision Tree...
Error in Decision Tree: name 'X_train_sub' is not defined
Total time for Decision Tree: 0.00 seconds

Training Random Forest...
Error in Random Forest: name 'X_train_sub' is not defined
Total time for Random Forest: 0.00 seconds


In [None]:
# Train and evaluate models
for name, model in models.items():
    print(f"\nTraining {name}...")
    start_time = time.time()

    try:
        # Train model
        model.fit(X_train_sub, y_train_sub)
        print(f"Training completed in {time.time() - start_time:.2f} seconds")

        # Predict
        print("Predicting...")
        start_pred = time.time()
        y_pred = model.predict(X_test)
        print(f"Prediction completed in {time.time() - start_pred:.2f} seconds")

        # Evaluation
        print(f"\nResults for {name}:")
        print("Classification Report:")
        print(classification_report(y_test, y_pred))
        print("ROC-AUC Score:", roc_auc_score(y_test, y_pred))
        print("Confusion Matrix:")
        print(confusion_matrix(y_test, y_pred))

    except Exception as e:
        print(f"Error in {name}: {str(e)}")

    print(f"Total time for {name}: {time.time() - start_time:.2f} seconds")


Training Logistic Regression...
Error in Logistic Regression: name 'X_train_sub' is not defined
Total time for Logistic Regression: 0.00 seconds

Training Decision Tree...
Error in Decision Tree: name 'X_train_sub' is not defined
Total time for Decision Tree: 0.00 seconds

Training Random Forest...
Error in Random Forest: name 'X_train_sub' is not defined
Total time for Random Forest: 0.00 seconds


In [None]:
# Optional: Hyperparameter tuning for Random Forest (best model candidate)
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5]
}
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf, param_grid, cv=3, scoring='f1', n_jobs=-1)
grid_search.fit(X_train_balanced, y_train_balanced)

print("\nBest Random Forest Parameters:", grid_search.best_params_)
print("Best Random Forest Score:", grid_search.best_score_)



Best Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}
Best Random Forest Score: 0.6826194458170605


In [None]:
# Evaluate tuned Random Forest
best_rf = grid_search.best_estimator_
y_pred_rf = best_rf.predict(X_test)
print("\nTuned Random Forest Results:")
print(classification_report(y_test, y_pred_rf))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_rf))


Tuned Random Forest Results:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.64      0.52      0.57      2145

    accuracy                           1.00    555719
   macro avg       0.82      0.76      0.78    555719
weighted avg       1.00      1.00      1.00    555719

ROC-AUC Score: 0.7572434410804709
