In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt

In [2]:
# Dataset: Credit Card Fraud Detection - anonymized credit card transactions
# labeled as fraudulent (Class = 1) or genuine (Class = 0).
# Source: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud
DATA_PATH = "creditcard.csv"
N_ROWS = 80_000  # only the first 80,000 transactions are used in this notebook

# nrows stops parsing after N_ROWS rows instead of loading the whole file and
# slicing it afterwards; the resulting frame holds the same first N_ROWS rows.
df = pd.read_csv(DATA_PATH, nrows=N_ROWS)
df.head(10)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0
5,2.0,-0.425966,0.960523,1.141109,-0.168252,0.420987,-0.029728,0.476201,0.260314,-0.568671,...,-0.208254,-0.559825,-0.026398,-0.371427,-0.232794,0.105915,0.253844,0.08108,3.67,0
6,4.0,1.229658,0.141004,0.045371,1.202613,0.191881,0.272708,-0.005159,0.081213,0.46496,...,-0.167716,-0.27071,-0.154104,-0.780055,0.750137,-0.257237,0.034507,0.005168,4.99,0
7,7.0,-0.644269,1.417964,1.07438,-0.492199,0.948934,0.428118,1.120631,-3.807864,0.615375,...,1.943465,-1.015455,0.057504,-0.649709,-0.415267,-0.051634,-1.206921,-1.085339,40.8,0
8,7.0,-0.894286,0.286157,-0.113192,-0.271526,2.669599,3.721818,0.370145,0.851084,-0.392048,...,-0.073425,-0.268092,-0.204233,1.011592,0.373205,-0.384157,0.011747,0.142404,93.2,0
9,9.0,-0.338262,1.119593,1.044367,-0.222187,0.499361,-0.246761,0.651583,0.069539,-0.736727,...,-0.246914,-0.633753,-0.120794,-0.38505,-0.069733,0.094199,0.246219,0.083076,3.68,0


In [3]:
# Feature matrix: every column except Time, Amount and the Class label.
non_feature_cols = ["Time", "Amount", "Class"]
features = df.drop(columns=non_feature_cols)
X = features.values

# Target vector: the Class column (1 marks a fraud case, so its sum counts them).
y = df["Class"].values
n_fraud = y.sum()

print("Shapes of X =", X.shape, "y =", y.shape, "and Number of fraud cases =", n_fraud)

Shapes of X = (80000, 28) y = (80000,) and Number of fraud cases = 196


In [4]:
from sklearn.linear_model import LogisticRegression

# class_weight maps label -> weight: 0 ("No fraud") gets weight 1, 1 ("Fraud") gets weight 2.
fraud_weighting = {0: 1, 1: 2}
mod = LogisticRegression(class_weight=fraud_weighting, max_iter=1000)

# Fit on the full data, then count how many rows the fitted model flags as fraud.
mod.fit(X, y)
flagged = mod.predict(X)
flagged.sum()

171

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score , recall_score, make_scorer

def min_recall_precision(est, X, y_true, sample_weight=None):
    """Score an estimator by the smaller of its recall and its precision.

    The function has the ``scorer(estimator, X, y)`` call shape, so it can be
    passed directly as a scoring entry (no ``make_scorer`` wrapper needed).

    Parameters
    ----------
    est : fitted estimator
        Object exposing ``predict(X)``.
    X : array-like
        Samples to predict on.
    y_true : array-like
        Ground-truth labels for ``X``.
    sample_weight : array-like or None, default None
        Optional per-sample weights, forwarded to both metrics.

    Returns
    -------
    float
        ``min(recall, precision)`` of the predictions on ``X``.
    """
    y_pred = est.predict(X)
    # Forward the weights so a caller who supplies them is not silently ignored;
    # with the default None both metrics behave as unweighted.
    recall = recall_score(y_true, y_pred, sample_weight=sample_weight)
    precision = precision_score(y_true, y_pred, sample_weight=sample_weight)
    return min(recall, precision)



# Candidate class weights: class 0 stays at 1, class 1 (fraud) is tried at 1..9.
class_weight_options = [{0: 1, 1: v} for v in range(1, 10)]

# Three metrics are recorded for every candidate; the dictionary keys become
# part of the cv_results_ column names (e.g. mean_test_min_both).
scorers = {
    "precision": make_scorer(precision_score),
    "recall_score": make_scorer(recall_score),
    "min_both": min_recall_precision,
}

grid = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid={"class_weight": class_weight_options},
    scoring=scorers,
    refit="min_both",  # the candidate ranked best on "min_both" is refit at the end
    return_train_score=True,
    cv=5,
    n_jobs=-1,
)

grid.fit(X, y)

GridSearchCV(cv=5, estimator=LogisticRegression(max_iter=1000), n_jobs=-1,
             param_grid={'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 2},
                                          {0: 1, 1: 3}, {0: 1, 1: 4},
                                          {0: 1, 1: 5}, {0: 1, 1: 6},
                                          {0: 1, 1: 7}, {0: 1, 1: 8},
                                          {0: 1, 1: 9}]},
             refit='min_both', return_train_score=True,
             scoring={'min_both': <function min_recall_precision at 0x000001C6A0FFFE50>,
                      'precision': make_scorer(precision_score),
                      'recall_score': make_scorer(recall_score)})

In [7]:
# Tabulate the grid-search results: one row per class_weight candidate.
cv_results = grid.cv_results_
df_res = pd.DataFrame(cv_results)
df_res

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,params,split0_test_precision,split1_test_precision,split2_test_precision,split3_test_precision,...,mean_test_min_both,std_test_min_both,rank_test_min_both,split0_train_min_both,split1_train_min_both,split2_train_min_both,split3_train_min_both,split4_train_min_both,mean_train_min_both,std_train_min_both
0,1.622126,0.216584,0.031252,2e-06,"{0: 1, 1: 1}","{'class_weight': {0: 1, 1: 1}}",0.008059,0.709677,0.958333,0.928571,...,0.390714,0.26939,9,0.738854,0.592357,0.566879,0.585987,0.660256,0.628867,0.063398
1,1.733189,0.378292,0.028125,0.006254,"{0: 1, 1: 2}","{'class_weight': {0: 1, 1: 2}}",0.00533,0.769231,0.961538,0.933333,...,0.491707,0.288047,8,0.847134,0.66242,0.675159,0.675159,0.75,0.721975,0.069829
2,1.720633,0.405172,0.028124,0.006252,"{0: 1, 1: 3}","{'class_weight': {0: 1, 1: 3}}",0.004929,0.767442,0.962963,0.933333,...,0.511397,0.283284,7,0.859873,0.751592,0.732484,0.719745,0.77707,0.768153,0.049763
3,1.681247,0.283082,0.024999,0.007657,"{0: 1, 1: 4}","{'class_weight': {0: 1, 1: 4}}",0.00475,0.733333,0.964286,0.933333,...,0.514668,0.278837,6,0.872611,0.802548,0.764331,0.757962,0.782609,0.796012,0.041331
4,1.948779,0.339659,0.031248,0.009881,"{0: 1, 1: 5}","{'class_weight': {0: 1, 1: 5}}",0.004672,0.73913,0.964286,0.935484,...,0.53094,0.280959,5,0.878981,0.821656,0.78481,0.770701,0.783951,0.80802,0.039325
5,1.512499,0.170993,0.028124,0.00625,"{0: 1, 1: 6}","{'class_weight': {0: 1, 1: 6}}",0.004613,0.73913,0.966667,0.9375,...,0.561313,0.289177,4,0.878981,0.834395,0.793939,0.7875,0.785276,0.816018,0.036188
6,1.521877,0.075004,0.031245,0.009883,"{0: 1, 1: 7}","{'class_weight': {0: 1, 1: 7}}",0.004577,0.73913,0.935484,0.941176,...,0.576562,0.296988,3,0.878981,0.83125,0.795181,0.790123,0.787879,0.816683,0.034902
7,1.660175,0.382493,0.028121,0.006251,"{0: 1, 1: 8}","{'class_weight': {0: 1, 1: 8}}",0.004554,0.73913,0.935484,0.941176,...,0.581557,0.297138,2,0.878981,0.828221,0.796407,0.786585,0.789474,0.815934,0.034839
8,1.560781,0.251589,0.028123,0.006249,"{0: 1, 1: 9}","{'class_weight': {0: 1, 1: 9}}",0.004536,0.73913,0.9375,0.942857,...,0.60681,0.306124,1,0.878981,0.828221,0.796407,0.779762,0.791908,0.815056,0.035746


In [8]:
from sklearn.metrics import precision_score , recall_score

# Predict once with the grid-search model and reuse the predictions for both metrics.
# NOTE: X is the same data the grid search was fitted on, so these are
# training-data scores, not an estimate of performance on unseen transactions.
y_pred = grid.predict(X)

# Precision: of the transactions predicted as fraud, what fraction really are fraud?
P_S = precision_score(y, y_pred)
print("Precision_score is = " , P_S)

# Recall: of the real fraud cases, what fraction did the model catch?
R_S = recall_score(y, y_pred)
print("Recall_score is = " , R_S)

Precision_score is =  0.8186274509803921
Recall_score is =  0.8520408163265306


Test Metrics

In [None]:
# Mean cross-validated scores on the held-out folds, as a function of the
# weight given to class 1 (fraud).
df_results = pd.DataFrame(grid.cv_results_)
# Each param_class_weight entry is a dict {0: 1, 1: v}; key 1 is the fraud weight.
fraud_weights = [cw[1] for cw in df_results['param_class_weight']]

plt.figure(figsize=(12, 4))
# Column names follow "mean_test_<scorer key>"; the recall scorer is registered
# under the key "recall_score", hence "mean_test_recall_score".
for score in ['mean_test_recall_score', 'mean_test_precision', 'mean_test_min_both']:
    # plt.plot takes x and y positionally (it has no x=/y= keyword arguments).
    plt.plot(fraud_weights, df_results[score], label=score)
plt.xlabel("class_weight of class 1 (fraud)")
plt.ylabel("mean score over CV folds")
plt.title("Test metrics")
plt.legend();

Train Metrics

In [None]:
# Mean cross-validated scores on the training folds, as a function of the
# weight given to class 1 (fraud).
df_results = pd.DataFrame(grid.cv_results_)
# Each param_class_weight entry is a dict {0: 1, 1: v}; key 1 is the fraud weight.
fraud_weights = [cw[1] for cw in df_results['param_class_weight']]

plt.figure(figsize=(12, 4))
# Column names follow "mean_train_<scorer key>"; the recall scorer is registered
# under the key "recall_score", hence "mean_train_recall_score". Using the real
# column names as labels keeps the legend consistent with the plotted data.
for score in ['mean_train_recall_score', 'mean_train_precision', 'mean_train_min_both']:
    plt.scatter(x=fraud_weights, y=df_results[score], label=score)
plt.xlabel("class_weight of class 1 (fraud)")
plt.ylabel("mean score over CV folds")
plt.title("Train metrics")
plt.legend();

In [8]:
lr = LogisticRegression()
# Show the documentation of the estimator's score method. help() is plain
# Python, so this cell also runs outside IPython (the `??` prefix does not).
help(lr.score)