In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    average_precision_score,
    f1_score,
    fbeta_score,
    make_scorer,
    precision_recall_curve,
    precision_score,
    recall_score,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier


In [None]:
# Location of creditcard.csv. Set the CREDITCARD_CSV environment variable to run the
# notebook on another machine; when it is unset, the original local path is used.
DATA_PATH = os.environ.get(
    'CREDITCARD_CSV',
    r'C:\Users\jaraneses\OneDrive - 2X LLC\Codes\Portfolio Projects_storage\Day 20 - Credit Card Fraud Detection\creditcard.csv',
)
df = pd.read_csv(DATA_PATH)

In [None]:
# Peek at the first rows to check that the file loaded with the expected columns.
df.head()

In [None]:
# Column dtypes and non-null counts: a quick check for missing values.
df.info()

In [None]:
# Summary statistics (count, mean, spread, quartiles) for the numeric columns.
df.describe()

In [None]:
# Annotated heatmap of the pairwise correlations between every column of df.
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(df.corr(), cmap='coolwarm', annot=True, fmt='.2f', ax=ax)
fig.tight_layout()
plt.show()

In [None]:
# The V columns show no correlation with each other (V is an anonymized PCA), so
# narrow the view to how each V column correlates with Time, Amount and Class.
v_features = [f'V{idx}' for idx in range(1, 28 + 1)]  # 'V1' .. 'V28'
focus_column = ['Time', 'Amount', 'Class']

corr = df.corr()
# Rows: the 28 V features; columns: the three reference columns.
v_corr = corr.loc[v_features, focus_column]

v_corr

In [None]:

# Heatmap of the V-feature correlations against Time, Amount and Class.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(v_corr, cmap='RdYlBu', annot=True, fmt='.2f', ax=ax)
fig.tight_layout()
plt.show()

Strongest correlations with Fraud (Class)

V17: -0.326 (Strong negative) → Lower V17 = Higher fraud risk

V14: -0.303 (Strong negative) → Lower V14 = Higher fraud risk

V12: -0.261 (Strong negative) → Lower V12 = Higher fraud risk

V10: -0.217 (Moderate negative) → Lower V10 = Higher fraud risk

V16: -0.197 (Moderate negative) → Lower V16 = Higher fraud risk

V3:  -0.193 (Moderate negative) → Lower V3 = Higher fraud risk

V7:  -0.187 (Moderate negative) → Lower V7 = Higher fraud risk

Strongest correlations with Time

V3:  -0.420 (Strong negative) → Later times = Lower V3

V25: -0.233 (Moderate negative) → Later times = Lower V25

V11: -0.248 (Moderate negative) → Later times = Lower V11

Strongest correlations with Amount

V2:  -0.531 (Very strong negative) → Higher amounts = Lower V2

V20: +0.339 (Strong positive) → Higher amounts = Higher V20

V7:  +0.397 (Strong positive) → Higher amounts = Higher V7

V5:  -0.386 (Strong negative) → Higher amounts = Lower V5

In [None]:
# Massive class imbalance: evaluate with AUPRC instead of accuracy_score.
df['Class'].value_counts()

In [None]:
# Separate the label column ('Class') from the remaining columns used as features.
y = df['Class']
X = df.drop(columns=['Class'])

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio the
# same in both splits; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)

In [None]:
# Fit the scaler on the training split only, then apply the same transformation to
# both splits so no test-set statistics leak into training.
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Candidate estimators, each paired with the hyper-parameter grid that GridSearchCV
# will search. Every entry has the shape {'model': estimator, 'params': grid}.
models = {
    'Logistic Regression': dict(
        model=LogisticRegression(max_iter=2000, random_state=2),
        params=dict(
            C=[0.001, 0.0001],
            # Weight the positive (fraud) class 100x / 500x / 1000x the negative class.
            class_weight=[{0: 1, 1: weight} for weight in (100, 500, 1000)],
            solver=['liblinear'],
        ),
    ),
    'Random Forest': dict(
        model=RandomForestClassifier(random_state=2),
        params=dict(
            n_estimators=[100],
            max_depth=[10, None],
            min_samples_split=[2, 5, 10],
            min_samples_leaf=[2, 4],
            class_weight=['balanced'],
        ),
    ),
    'XGBoost': dict(
        model=XGBClassifier(random_state=2),
        params=dict(
            max_depth=[3, 4],
            learning_rate=[0.01, 0.05],
            n_estimators=[200, 300],
            # NOTE(review): 578 is presumably the negative/positive class ratio - confirm.
            scale_pos_weight=[578, 100, 200],
            subsample=[0.8, 0.9],
            colsample_bytree=[0.8, 0.9],
        ),
    ),
}

In [None]:
# Tune every candidate model with a grid search, then evaluate the tuned model on the
# train and test splits. Outputs:
#   results[name]    -> dict of the tuned estimator, its parameters and its metrics
#   all_models[name] -> the tuned estimator
#   best_model_name / best_model -> the entry with the highest 'Test F2'
results = {}
all_models = {}

# F-beta with beta=2 gives more weight to recall than to precision.
f2_scorer = make_scorer(fbeta_score, beta=2, average='binary')

for name, config in models.items():
    grid_search = GridSearchCV(
        config['model'],
        config['params'],
        cv=3,
        scoring=f2_scorer,
        n_jobs=1,
        verbose=1
    )
    grid_search.fit(X_train_scaled, y_train)

    tuned_model = grid_search.best_estimator_
    all_models[name] = tuned_model

    # Hard 0/1 labels feed the threshold-dependent metrics (F2, precision, recall).
    train_pred = tuned_model.predict(X_train_scaled)
    test_pred = tuned_model.predict(X_test_scaled)

    # Average precision (AUPRC) summarises the whole precision-recall curve, so it
    # needs continuous scores. Passing hard labels collapses the curve to a single
    # operating point. Column 1 of predict_proba is the positive (fraud) class.
    train_score = tuned_model.predict_proba(X_train_scaled)[:, 1]
    test_score = tuned_model.predict_proba(X_test_scaled)[:, 1]

    results[name] = {
        'best_model': tuned_model,
        'best_params': grid_search.best_params_,
        # Mean cross-validated F2 of the winning parameter set (training data only).
        'CV F2': grid_search.best_score_,
        # The '... Accuracy' keys actually hold average precision (AUPRC); the names
        # are kept because the summary cell reads them.
        'Train Accuracy': average_precision_score(y_train, train_score),
        'Test Accuracy': average_precision_score(y_test, test_score),
        'Train F2': fbeta_score(y_train, train_pred, beta=2),
        'Test F2': fbeta_score(y_test, test_pred, beta=2),
        'Precision': precision_score(y_test, test_pred),
        'Recall': recall_score(y_test, test_pred)
    }

# Pick the overall winner once, after every model has been evaluated.
# NOTE(review): selecting on 'Test F2' uses the test set for model selection, so the
# winner's reported test score is optimistic; 'CV F2' is the leakage-free alternative.
best_model_name = max(results, key=lambda model_name: results[model_name]['Test F2'])
best_model = results[best_model_name]['best_model']

In [None]:
# Flatten the per-model metrics into one row per model.
summary_data = [
    {
        'Model': model_name,
        'Train Accuracy': metrics['Train Accuracy'],
        'Test Accuracy': metrics['Test Accuracy'],
        'Train F2': metrics['Train F2'],
        'Test F2': metrics['Test F2'],
        'Precision': metrics['Precision'],
        'Recall': metrics['Recall'],
    }
    for model_name, metrics in results.items()
]

# Round to 4 decimals and rank the models from best to worst test F2.
summary_df = (
    pd.DataFrame(summary_data)
    .round(4)
    .sort_values('Test F2', ascending=False)
    .reset_index(drop=True)
)
print(summary_df.to_string())