In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE  # For imbalance handling



In [None]:
# Load the raw transaction table; the path is relative to the working directory.
csv_path = 'creditcard.csv'
df = pd.read_csv(csv_path)



In [None]:
# Show the column labels to confirm the feature names and the 'Class' target.
column_labels = df.columns
print(column_labels)




Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')


In [None]:
# Standardise the 'Amount' column in place (StandardScaler: zero mean, unit variance).
# NOTE(review): the scaler is fitted on every row of df, before the train/test
# split performed later in this notebook, so held-out rows contribute to the
# scaling statistics — consider fitting on the training split only.
scaler = StandardScaler()
amount_frame = df[['Amount']]
df['Amount'] = scaler.fit(amount_frame).transform(amount_frame)


In [None]:
# Separate the predictors from the 'Class' target.
# Both names are rebuilt further down, after rows with a missing label are dropped.
X, y = df.drop(columns=['Class']), df['Class']


In [None]:
# Count the rows whose 'Class' label is missing.
missing_labels = df['Class'].isna().sum()
print(missing_labels)



1


In [None]:
# Keep only the rows that carry a 'Class' label; unlabeled rows cannot be used
# for supervised training or scoring.
has_label = df['Class'].notna()
df = df.loc[has_label].copy()


In [None]:
# Rebuild the feature matrix (every column except the label) and the target
# vector from the current state of df.
label_col = 'Class'
X = df.loc[:, df.columns != label_col]
y = df[label_col]


In [None]:
# Hold out 30% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
# (train_test_split is imported in the notebook's first cell.)
split_parts = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts


In [None]:
from imblearn.combine import SMOTETomek

# Rebalance the training split only (SMOTE over-sampling followed by Tomek-link
# cleaning); X_test / y_test are not passed in and keep their original class mix.
smt = SMOTETomek(random_state=42)
resampled_pair = smt.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled_pair


In [None]:
from xgboost import XGBClassifier

# Weight of the positive (fraud) class, using the ratio the XGBoost docs
# recommend: negatives / positives. It is computed on the *resampled* labels,
# which SMOTETomek has already rebalanced, so the value stays close to 1 and
# the imbalance is not compensated for a second time. (The previous
# total / positives formula evaluated to roughly 2 on balanced data.)
n_pos = int((y_train_resampled == 1).sum())
n_neg = int((y_train_resampled == 0).sum())

# `use_label_encoder` is intentionally not passed: the installed XGBoost
# reports it as unused, so it only produced a warning.
model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    scale_pos_weight=n_neg / n_pos,
    eval_metric='logloss',
    random_state=42,
)

model.fit(X_train_resampled, y_train_resampled)


Parameters: { "use_label_encoder" } are not used.



In [None]:
# Mount Google Drive when running on Google Colab.
# The google.colab package only exists inside the Colab runtime, so the import
# is guarded: elsewhere the mount is skipped instead of aborting "Run All".
# No cell visible in this notebook reads from or writes to /content/drive.
try:
    from google.colab import drive
except ImportError:
    print("google.colab is not available; skipping Google Drive mount.")
else:
    drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Predicted class probabilities for the held-out rows. Keep column index 1,
# which predict_proba associates with the second entry of model.classes_
# (presumably the fraud label 1 — confirm).
test_proba = model.predict_proba(X_test)
y_probs = test_proba[:, 1]


In [None]:
# Probability cut-off above which a transaction is flagged as fraud.
# Lower values (e.g. 0.4, 0.3, 0.2) flag more transactions: recall can only go
# up, usually at the cost of precision.
# NOTE(review): the cut-off is being tuned against X_test / y_test; a separate
# validation split would give an unbiased final estimate.
threshold = 0.9

# 1 where the fraud probability reaches the cut-off, 0 otherwise
# (np comes from the notebook's first cell).
y_pred_thresh = np.where(y_probs >= threshold, 1, 0)


In [None]:
# The confusion matrix and classification report use the thresholded 0/1
# predictions; ROC AUC is computed from the raw probabilities and therefore
# does not depend on the chosen threshold.
# (The three metric functions are imported in the notebook's first cell.)
conf_mat = confusion_matrix(y_test, y_pred_thresh)
print("Confusion Matrix:\n", conf_mat)

cls_report = classification_report(y_test, y_pred_thresh)
print("\nClassification Report:\n", cls_report)

auc_roc = roc_auc_score(y_test, y_probs)
print("AUC-ROC Score:", auc_roc)


Confusion Matrix:
 [[49242    11]
 [   20    87]]

Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     49253
         1.0       0.89      0.81      0.85       107

    accuracy                           1.00     49360
   macro avg       0.94      0.91      0.92     49360
weighted avg       1.00      1.00      1.00     49360

AUC-ROC Score: 0.9785936470305618


In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, mean_squared_error

# Inputs:
#   y_test         -> actual labels (0 or 1)
#   y_pred_thresh  -> predicted labels after thresholding (0 or 1)


def _show(label, value):
    """Print one labelled metric and return the value so it can be stored."""
    print(label, value)
    return value


# Scores for the positive (fraud) class, each printed as soon as it is computed.
f1 = _show("F1 Score (fraud class):", f1_score(y_test, y_pred_thresh))
precision = _show("Precision (fraud class):", precision_score(y_test, y_pred_thresh))
recall = _show("Recall (fraud class):", recall_score(y_test, y_pred_thresh))

# On 0/1 labels the MSE is the misclassification rate, so this is its square root.
rmse = _show(
    "Root Mean Squared Error (RMSE):",
    np.sqrt(mean_squared_error(y_test, y_pred_thresh)),
)


F1 Score (fraud class): 0.848780487804878
Precision (fraud class): 0.8877551020408163
Recall (fraud class): 0.8130841121495327
Root Mean Squared Error (RMSE): 0.025060704257722503


In [None]:
import joblib

# Persist the fitted model as a joblib pickle, then read it straight back into
# `model` (a save/load round trip).
# NOTE: joblib files are pickles — only load files from a trusted source.
pickle_path = 'fraud_detection_model.pkl'
joblib.dump(model, pickle_path)
model = joblib.load(pickle_path)


In [None]:
# Also export the model through XGBoost's own save_model, here to a .json file.
json_path = "fraud_detection_model.json"
model.save_model(json_path)

