In [30]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [5]:
data = pd.read_csv('/content/creditcard.csv')
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11959 entries, 0 to 11958
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    11959 non-null  int64  
 1   V1      11959 non-null  float64
 2   V2      11959 non-null  float64
 3   V3      11959 non-null  float64
 4   V4      11959 non-null  float64
 5   V5      11959 non-null  float64
 6   V6      11959 non-null  float64
 7   V7      11959 non-null  float64
 8   V8      11959 non-null  float64
 9   V9      11959 non-null  float64
 10  V10     11959 non-null  float64
 11  V11     11959 non-null  float64
 12  V12     11959 non-null  float64
 13  V13     11959 non-null  float64
 14  V14     11959 non-null  float64
 15  V15     11959 non-null  float64
 16  V16     11959 non-null  float64
 17  V17     11959 non-null  float64
 18  V18     11959 non-null  float64
 19  V19     11959 non-null  float64
 20  V20     11958 non-null  float64
 21  V21     11958 non-null  float64
 22

In [7]:
data.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       1
V21       1
V22       1
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

In [8]:
data.dropna(inplace= True)

In [9]:
data.duplicated().sum()

44

In [10]:
 data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Time,11958.0,8008.940458,6203.516081,0.0,2542.0,6661.5,12378.0,20638.0
V1,11958.0,-0.216356,1.58392,-27.670569,-0.978957,-0.340925,1.161229,1.960497
V2,11958.0,0.277139,1.308931,-34.607649,-0.261503,0.256551,0.883785,9.092123
V3,11958.0,0.88952,1.331879,-22.804686,0.417182,0.951225,1.613701,4.101716
V4,11958.0,0.282689,1.478195,-4.657545,-0.621757,0.21305,1.159181,11.927512
V5,11958.0,-0.086537,1.191815,-32.092129,-0.688117,-0.183779,0.346298,34.099309
V6,11958.0,0.140021,1.306334,-23.496714,-0.622575,-0.146773,0.508444,21.393069
V7,11958.0,-0.121902,1.153939,-26.548144,-0.591325,-0.094697,0.431693,34.303177
V8,11958.0,-0.048722,1.246875,-23.632502,-0.185271,0.013642,0.267658,5.499963
V9,11958.0,0.911402,1.192241,-7.175097,0.196736,0.894775,1.588965,10.392889


In [11]:
data.drop_duplicates(inplace= True)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(data.drop('Class' , axis=1), data['Class'], test_size=0.2, random_state=101)

In [13]:
smote = SMOTE(random_state=101)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [14]:
log_reg = LogisticRegression(solver='liblinear')
log_reg_param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2']
}

In [25]:
rf = RandomForestClassifier()
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'bootstrap': [True, False]
}

In [16]:
ada = AdaBoostClassifier()
ada_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0, 1.5]
}

In [19]:
def grid_search(model, param_grid):
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid,
                               cv=3, n_jobs=-1, scoring='roc_auc', verbose=2)
    grid_search.fit(X_train_smote, y_train_smote)
    print(f"Best parameters for {model.__class__.__name__}: {grid_search.best_params_}")
    print(f"Best ROC AUC for {model.__class__.__name__}: {grid_search.best_score_}")
    return grid_search.best_estimator_

In [23]:
best_log_reg = grid_search(log_reg, log_reg_param_grid)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters for LogisticRegression: {'C': 0.1, 'penalty': 'l1'}
Best ROC AUC for LogisticRegression: 0.9995243903052581


In [26]:
best_rf = grid_search(rf, rf_param_grid)

Fitting 3 folds for each of 54 candidates, totalling 162 fits
Best parameters for RandomForestClassifier: {'bootstrap': False, 'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 100}
Best ROC AUC for RandomForestClassifier: 0.9999999833356763


In [27]:
best_ada = grid_search(ada, ada_param_grid)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best parameters for AdaBoostClassifier: {'learning_rate': 1.5, 'n_estimators': 200}
Best ROC AUC for AdaBoostClassifier: 0.9999991001265222


In [31]:
models = {
    'Random Forest': best_rf,
    'AdaBoost': best_ada,
    'Logistic Regression': best_log_reg
}

for name, model in models.items():
    Y_pred = model.predict(X_test)
    print(f"--- {name} ---")
    print(f"Accuracy: {accuracy_score(y_test, Y_pred):.4f}")
    print(f"Confusion Matrix:\n{confusion_matrix(y_test, Y_pred)}\n")
    print(classification_report(y_test, Y_pred))

--- Random Forest ---
Accuracy: 1.0000
Confusion Matrix:
[[2375    0]
 [   0    8]]

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      2375
         1.0       1.00      1.00      1.00         8

    accuracy                           1.00      2383
   macro avg       1.00      1.00      1.00      2383
weighted avg       1.00      1.00      1.00      2383

--- AdaBoost ---
Accuracy: 1.0000
Confusion Matrix:
[[2375    0]
 [   0    8]]

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      2375
         1.0       1.00      1.00      1.00         8

    accuracy                           1.00      2383
   macro avg       1.00      1.00      1.00      2383
weighted avg       1.00      1.00      1.00      2383

--- Logistic Regression ---
Accuracy: 0.9933
Confusion Matrix:
[[2359   16]
 [   0    8]]

              precision    recall  f1-score   support

         0.0       1.00      0.99  