In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import compute_sample_weight, compute_class_weight
import numpy as np


Split the data

In [42]:
dataframe = pd.read_csv('../../datasets/credit_card_fraud/creditcard.csv')
X = dataframe.iloc[:,:-1].values
y = dataframe.iloc[:, -1].values

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# (PART 2)
# After we realized the class imbalance in the data (PART 2)
# class_weights = compute_class_weight('balanced', np.unique(y_train), y_train)
# class_weights_dict = dict(zip(np.unique(y_train), class_weights))

Train & Predict

In [81]:
#  --- Attempt 1 -----------------------------------------
'''
# Before realizing the fake class was imbalanced
# model = GradientBoostingClassifier()
'''
# ---- Accuracy and report ---
# 0.9989466661985184
#               precision    recall  f1-score   support

#            0       1.00      1.00      1.00     56864
#            1       0.74      0.60      0.66        98

#     accuracy                           1.00     56962
#    macro avg       0.87      0.80      0.83     56962
# weighted avg       1.00      1.00      1.00     56962

# -------------------------------------------------------



# --- Attempt 2 -----------------------------------------
'''
# After realizing the fake class was imbalanced - using `sample weights`
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)
model = GradientBoostingClassifier()
model.fit(X_train, y_train, sample_weight=sample_weights)

# Predict
y_pred = model.predict(X_test)
'''
# ---- Accuracy and report ---
# 0.9927846634598504
#               precision    recall  f1-score   support

#            0       1.00      0.99      1.00     56864
#            1       0.18      0.93      0.31        98

#     accuracy                           0.99     56962
#    macro avg       0.59      0.96      0.65     56962
# weighted avg       1.00      0.99      1.00     56962

# -------------------------------------------------------



# --- Attempt 3 -----------------------------------------


# Initialize
model = GradientBoostingClassifier()

# Train
model.fit(X_train, y_train)

# Array of probabilities from the positive class
y_proba = model.predict_proba(X_test)[:, 1]  

# Adjusting the threshold to optimize precision-recall trade-off
threshold = 0.24

# Cut out probabilities that do not meet threshold
y_pred = (y_proba >= threshold).astype(int)

# Report
# ---- Accuracy and report ---
# 0.9990695551420246
#               precision    recall  f1-score   support

#            0       1.00      1.00      1.00     56864
#            1       0.75      0.69      0.72        98

#     accuracy                           1.00     56962
#    macro avg       0.87      0.85      0.86     56962
# weighted avg       1.00      1.00      1.00     56962

# -------------------------------------------------------


NotFittedError: This GradientBoostingClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [75]:
# y_proba = model.predict_proba(X_test)[:, 1]  # Probability of positive class

# # Adjusting the threshold to optimize precision-recall trade-off
# threshold = 0.24  # Example threshold, adjust as needed
# y_pred = (y_proba >= threshold).astype(int)
# print(y_pred)

[1 0 0 ... 0 0 0]


Predict and evaluate

In [78]:
# Evaluate
print('---- Accuracy and report ---')
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(accuracy)
print(report)


print('---- Evaluate if overfitted ---')
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f"Training Score: {train_score}")
print(f"Testing Score: {test_score}")

---- Accuracy and report ---
0.9927846634598504
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     56864
           1       0.18      0.93      0.31        98

    accuracy                           0.99     56962
   macro avg       0.59      0.96      0.65     56962
weighted avg       1.00      0.99      1.00     56962

---- Evaluate if overfitted ---
Training Score: 0.9930874059119138
Testing Score: 0.9927846634598504


Cross Validation Testing

In [None]:
# # Cross validation
# # Perform cross-validation
# cv_scores = cross_val_score(model, X_train, y_train, cv=5)  # cv=5 means 5-fold cross-validation

# # Print the cross-validation scores
# print("Cross-validation scores:", cv_scores)
# print("Mean CV accuracy:", cv_scores.mean())

Export Model

In [None]:
# Export report
# with open('report.txt', 'w') as file:
#     file.write(report)

# Export head
# with open('head.txt', 'w') as file:
#     file.write(str(dataframe.head()))

In [None]:
# Deploy model
# import pickle
# with open('credit_card_fraud_detector.pkl','wb') as f:
#     pickle.dump(model,f)