In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df=pd.read_csv("Data/creditcard.csv")

In [3]:
df.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')

In [7]:
df.shape

(284807, 31)

In [3]:
Y=df.Class
X=df.drop(columns="Class")

In [16]:
Y.sum()/Y.shape[0]

0.001727485630620034

In [21]:
from sklearn.model_selection import train_test_split

In [23]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y)

In [11]:
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer

In [12]:
cat_features=X.dtypes.loc[X.dtypes=="object"].index
num_features=X.dtypes.loc[X.dtypes!="object"].index

In [15]:
CT=ColumnTransformer([("cat",OneHotEncoder(),cat_features),
                     ("num",StandardScaler(),num_features)])

In [25]:
X_train.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'],
      dtype='object')

In [27]:
X_train=CT.fit_transform(X_train)
X_test=CT.transform(X_test)
import pickle
with open('pipeline.pkl','wb') as f:
    pickle.dump(CT,f)

In [32]:
from sklearn.linear_model import LogisticRegression

In [34]:
LR=LogisticRegression()

In [36]:
LR.fit(X_train,Y_train)

In [38]:
y_pred=LR.predict(X_test)

In [40]:
from sklearn.metrics import confusion_matrix, classification_report

In [42]:
CM=confusion_matrix(Y_test,y_pred)
CM

array([[71069,    12],
       [   51,    70]], dtype=int64)

In [44]:
print(classification_report(Y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     71081
           1       0.85      0.58      0.69       121

    accuracy                           1.00     71202
   macro avg       0.93      0.79      0.84     71202
weighted avg       1.00      1.00      1.00     71202



In [74]:
from sklearn.svm import SVC
svm=SVC(kernel="rbf")
svm.fit(X_train,Y_train)

In [75]:
y_pred=svm.predict(X_test)
CM=confusion_matrix(Y_test,y_pred)
CM

array([[71076,     5],
       [   48,    73]], dtype=int64)

In [76]:
print(classification_report(Y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     71081
           1       0.94      0.60      0.73       121

    accuracy                           1.00     71202
   macro avg       0.97      0.80      0.87     71202
weighted avg       1.00      1.00      1.00     71202



In [80]:
from imblearn.over_sampling import SMOTE,ADASYN
from sklearn.preprocessing import PolynomialFeatures
from imblearn.under_sampling import EditedNearestNeighbours, RandomUnderSampler
S=SMOTE()
#S=ADASYN()
#S=EditedNearestNeighbours()
S=RandomUnderSampler()

In [82]:
X_train_resampled,Y_train_resampled=S.fit_resample(X_train,Y_train)


In [84]:
X_train_resampled.shape

(742, 30)

In [86]:
#PF=PolynomialFeatures()
#X_train_resampled=PF.fit_transform(X_train_resampled)


In [88]:
#X_test=PF.transform(X_test)

In [90]:
Y_train_resampled.sum()/len(Y_train_resampled)

0.5

In [92]:
LR=LogisticRegression(max_iter=1000)
LR.fit(X_train_resampled,Y_train_resampled)

In [94]:
y_pred=LR.predict(X_test)

In [96]:
CM=confusion_matrix(Y_test,y_pred)
CM

array([[68330,  2751],
       [   10,   111]], dtype=int64)

In [98]:
print(classification_report(Y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      0.96      0.98     71081
           1       0.04      0.92      0.07       121

    accuracy                           0.96     71202
   macro avg       0.52      0.94      0.53     71202
weighted avg       1.00      0.96      0.98     71202



In [60]:
from sklearn.ensemble import RandomForestClassifier
RFC=RandomForestClassifier(max_depth=5)

In [62]:
RFC.fit(X_train_resampled,Y_train_resampled)

In [63]:
y_pred=RFC.predict(X_test)
CM=confusion_matrix(Y_test,y_pred)
CM

array([[196935,  50372],
       [   656,   2037]], dtype=int64)

In [64]:
print(classification_report(Y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      0.80      0.89    247307
           1       0.04      0.76      0.07      2693

    accuracy                           0.80    250000
   macro avg       0.52      0.78      0.48    250000
weighted avg       0.99      0.80      0.88    250000



In [102]:
import xgboost as xgb

In [159]:
model = xgb.XGBClassifier(
    objective='binary:logistic',)
"""
    max_depth=max_depth,       
    learning_rate=learning_rate,
    subsample=subsample,        
    colsample_bytree=colsample, 
    n_estimators=num_estimators
)"""

'\n    max_depth=max_depth,       \n    learning_rate=learning_rate,\n    subsample=subsample,        \n    colsample_bytree=colsample, \n    n_estimators=num_estimators\n)'

In [161]:
model.fit(X_train,Y_train)

In [163]:
from sklearn.model_selection import GridSearchCV
param_grid={"max_depth":[5,10,15],       
    "learning_rate":[0.01,0.1,1], 
    "n_estimators":[100,500,1000]}
GCV=GridSearchCV(model,param_grid,scoring="f1_weighted")
GCV.fit(X_train,Y_train)

In [164]:
print(GCV.best_params_)
model=GCV.best_estimator_

{'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 500}


In [175]:
import pickle
with open('model.pkl','wb') as f:
    pickle.dump(model,f)

In [169]:
y_pred=model.predict(X_test)
CM=confusion_matrix(Y_test,y_pred)
CM

array([[71075,     6],
       [   30,    91]], dtype=int64)

In [171]:
print(classification_report(Y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     71081
           1       0.94      0.75      0.83       121

    accuracy                           1.00     71202
   macro avg       0.97      0.88      0.92     71202
weighted avg       1.00      1.00      1.00     71202



In [183]:
from sklearn.metrics import roc_auc_score
roc_auc_score(Y_test,1-model.predict_proba(X_test).T[0])

0.9809677610259788

In [115]:
y_pred.shape

(71202,)