# Step 4: Model Building and Training

In this step, we will train models to detect fraudulent transactions.
We will use stratified train-test splits, handle class imbalance, and evaluate models using appropriate metrics.


In [1]:
import pandas as pd
import numpy as np

# Machine Learning
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_recall_curve, auc

# Imbalanced data
from imblearn.over_sampling import SMOTE

# Ensemble
import xgboost as xgb
import lightgbm as lgb

In [2]:
fraud_df = pd.read_csv("../data/processed/Fraud_Data_cleaned.csv")
fraud_df.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758368,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311387,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621473820,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542443,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583117,0


In [10]:
# Separate target
X = fraud_df.drop(columns=['class','signup_time','purchase_time','ip_address','user_id','device_id'], errors='ignore')
y = fraud_df['class']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

Train shape: (120889, 5) Test shape: (30223, 5)


In [11]:
# Identify categorical columns
cat_cols = X_train.select_dtypes(include=['object','category']).columns.tolist()

# One-hot encoding
X_train_encoded = pd.get_dummies(X_train, columns=cat_cols, drop_first=True)
X_test_encoded  = pd.get_dummies(X_test, columns=cat_cols, drop_first=True)

# Align test set with train set
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

In [12]:
num_cols = X_train_encoded.select_dtypes(include=['float64','int64']).columns

scaler = StandardScaler()
X_train_encoded[num_cols] = scaler.fit_transform(X_train_encoded[num_cols])
X_test_encoded[num_cols]  = scaler.transform(X_test_encoded[num_cols])

In [13]:
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train_encoded, y_train)

print("Before SMOTE:\n", y_train.value_counts())
print("After SMOTE:\n", pd.Series(y_train_res).value_counts())

Before SMOTE:
 class
0    109568
1     11321
Name: count, dtype: int64
After SMOTE:
 class
0    109568
1    109568
Name: count, dtype: int64


In [14]:
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train_res, y_train_res)

y_pred = lr.predict(X_test_encoded)
y_probs = lr.predict_proba(X_test_encoded)[:,1]

print("F1-Score:", f1_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

precision, recall, _ = precision_recall_curve(y_test, y_probs)
pr_auc = auc(recall, precision)
print("AUC-PR:", pr_auc)


F1-Score: 0.1690174232547114

Confusion Matrix:
 [[14775 12618]
 [ 1404  1426]]
AUC-PR: 0.10261824883756795


In [15]:
xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    scale_pos_weight=(y_train==0).sum() / (y_train==1).sum(),
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

xgb_model.fit(X_train_res, y_train_res)

y_pred_xgb = xgb_model.predict(X_test_encoded)
y_probs_xgb = xgb_model.predict_proba(X_test_encoded)[:,1]

print("F1-Score (XGB):", f1_score(y_test, y_pred_xgb))
print("\nConfusion Matrix (XGB):\n", confusion_matrix(y_test, y_pred_xgb))

precision, recall, _ = precision_recall_curve(y_test, y_probs_xgb)
pr_auc = auc(recall, precision)
print("AUC-PR (XGB):", pr_auc)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


F1-Score (XGB): 0.1725105621858328

Confusion Matrix (XGB):
 [[  593 26800]
 [   33  2797]]
AUC-PR (XGB): 0.2126844688786687
