# Credit Card Fraud Detection – Model Training

# Step 1: Import Required Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.utils.class_weight import compute_class_weight

from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline

import joblib
import warnings
warnings.filterwarnings("ignore")


# Step 2: Load Feature-Engineered Data

In [2]:
X_train = pd.read_csv("../data/preprocessed/X_train_fe.csv")
X_test  = pd.read_csv("../data/preprocessed/X_test_fe.csv")
y_train = pd.read_csv("../data/preprocessed/y_train.csv").values.ravel()
y_test  = pd.read_csv("../data/preprocessed/y_test.csv").values.ravel()

print("Train shape:", X_train.shape)
print("Test shape :", X_test.shape)


Train shape: (226980, 35)
Test shape : (56746, 35)


# Step 3: Handle Class Imbalance Using Class Weights

In [3]:
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train),
    y=y_train
)

class_weight_dict = {
    0: class_weights[0],
    1: class_weights[1]
}

class_weight_dict


{0: 0.5008340614822464, 1: 300.23809523809524}

# Step 4: Train Baseline Model – Logistic Regression

In [4]:
lr_model = LogisticRegression(
    max_iter=1000,
    class_weight=class_weight_dict,
    n_jobs=-1
)

lr_model.fit(X_train, y_train)


# Step 5: Evaluate Baseline Model (Quick Check)

In [5]:
lr_train_auc = roc_auc_score(y_train, lr_model.predict_proba(X_train)[:,1])
lr_test_auc  = roc_auc_score(y_test, lr_model.predict_proba(X_test)[:,1])

print("Logistic Regression Train AUC:", lr_train_auc)
print("Logistic Regression Test AUC :", lr_test_auc)

Logistic Regression Train AUC: 0.9905787664258463
Logistic Regression Test AUC : 0.962011912271721


# Step 6: Train Advanced Model – Random Forest

In [6]:
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    class_weight=class_weight_dict,
    n_jobs=-1,
    random_state=42
)

rf_model.fit(X_train, y_train)


# Step 7: Evaluate Random Forest (Quick Check)

In [7]:
rf_train_auc = roc_auc_score(y_train, rf_model.predict_proba(X_train)[:,1])
rf_test_auc  = roc_auc_score(y_test, rf_model.predict_proba(X_test)[:,1])

print("Random Forest Train AUC:", rf_train_auc)
print("Random Forest Test AUC :", rf_test_auc)


Random Forest Train AUC: 0.9999498222859006
Random Forest Test AUC : 0.976326891614307


# Step 8: Train Final Model – XGBoost

In [9]:
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

xgb_model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    eval_metric="auc",
    random_state=42,
    n_jobs=-1
)

xgb_model.fit(X_train, y_train)


# Step 9: XGBoost Sanity Check

In [10]:
xgb_train_auc = roc_auc_score(
    y_train, xgb_model.predict_proba(X_train)[:, 1]
)
xgb_test_auc = roc_auc_score(
    y_test, xgb_model.predict_proba(X_test)[:, 1]
)

print("XGBoost Train AUC:", xgb_train_auc)
print("XGBoost Test AUC :", xgb_test_auc)


XGBoost Train AUC: 1.0
XGBoost Test AUC : 0.9752192788904176


# Step 10: Select Best Model

In [12]:
model_scores = {
    "Logistic Regression": lr_test_auc,
    "Random Forest": rf_test_auc,
    "XGBoost": xgb_test_auc
}

best_model_name = max(model_scores, key=model_scores.get)
best_model_name


'Random Forest'

# Step 11: Save Final Model

In [14]:
best_model = {
    "Logistic Regression": lr_model,
    "Random Forest": rf_model,
    "XGBoost": xgb_model
}[best_model_name]

joblib.dump(best_model, "../models/fraud_model.pkl")


['../models/fraud_model.pkl']

In [15]:
best_model

# Step 12: Training Summary

In [16]:
print("""
Model Training Completed Successfully

Models Trained:
- Logistic Regression (Baseline)
- Random Forest (Non-linear)
- XGBoost (Final Production Model)

Techniques Used:
- Class imbalance handling
- ROC-AUC based model selection
- Production-ready model persistence
""")



Model Training Completed Successfully

Models Trained:
- Logistic Regression (Baseline)
- Random Forest (Non-linear)
- XGBoost (Final Production Model)

Techniques Used:
- Class imbalance handling
- ROC-AUC based model selection
- Production-ready model persistence



# Model Inference Pipeline

# Step 1:Load Trained Model and Preprocessing Objects

In [17]:
# Load the saved scaler and trained model
scaler = joblib.load('../models/scaler.pkl')
rf_model = joblib.load('../models/fraud_model.pkl')

In [18]:
scaler

In [19]:
rf_model

# Step 2.Create the pipeline

In [20]:
# Combine scaler and model into one pipeline
pipeline = Pipeline(steps=[
    ("model", rf_model)
])

In [21]:
pipeline

# Step 3:Save the pipeline

In [22]:
# Save pipeline to models folder

joblib.dump(pipeline, '../models/pipeline.pkl')
print("pipeline.pkl saved successfully ")

pipeline.pkl saved successfully 


# Step 4:Test pipeline

In [23]:
# Load pipeline
pipeline = joblib.load('../models/pipeline.pkl')

# Make sure sample_input has same columns as training
sample_input = X_test.iloc[:5]  # raw, unscaled

# Predict
predictions = pipeline.predict(sample_input)
print("Sample predictions:", predictions)


Sample predictions: [0 0 0 0 0]


# Final Summary