## Model Training

### Import Data and Required Packages

In [5]:
!pip install lightgbm 

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------------------------------------ --- 1.3/1.5 MB 6.7 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 6.4 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0


In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, classification_report, accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

### Importing the DataFrame

In [7]:
df = pd.read_csv('creditcard.csv/creditcard.csv')

In [8]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


### Preparing X and Y variables

In [9]:
X = df.drop('Class', axis=1)
y = df['Class']

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)


In [11]:
X_train.shape, X_test.shape

((227845, 30), (56962, 30))

#### Create an Evaluate Function to give all metrics after model Training

In [17]:
def evaluate_model(true, predicted):
    acc = accuracy_score(true, predicted)
    cm = confusion_matrix(true, predicted)
    precision = precision_score(true, predicted, average='weighted')
    recall = recall_score(true, predicted, average='weighted')
    f1 = f1_score(true, predicted, average='weighted')
    report = classification_report(true, predicted)
    return acc,cm, precision, recall, f1, report

#### Training Various Models

In [19]:
models = {
    "Random Forest Classifier": RandomForestClassifier(n_estimators=100,n_jobs=4,criterion='gini', verbose=False),
    "svm": svm.SVC(),
    "XGBClassifier": XGBClassifier(), 
    "CatBoosting Classifier": CatBoostClassifier(iterations=500, learning_rate=0.02,depth=12, eval_metric='AUC', 
                                                bagging_temperature = 0.2, od_type='Iter', od_wait=100,verbose=False),
    "AdaBoost Classifier": AdaBoostClassifier(learning_rate=0.8, n_estimators=100,algorithm='SAMME.R', random_state=42),
    "LightBoost Classifier": LGBMClassifier()
}
model_list = []
acc_list =[]
f1_list=[]
prec_list = []
recall_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print("Training Set Evaluation:")
    train_acc, train_cm, train_precision, train_recall, train_f1, train_report = evaluate_model(y_train, y_train_pred)
    print("Accuracy Score: ", train_acc)
    print("Precision Score: ", train_precision)
    print("Recall Score: ", train_recall)
    print("F1 Score: ", train_f1)
    print("\nConfusion Matrix: ", train_cm)
    print("\nClassification report: ",train_report)

    print('----------------------------------')
    
    print("Test Set Evaluation:")
    test_acc, test_cm, test_precision, test_recall, test_f1, test_report = evaluate_model(y_test, y_test_pred)
    print("Accuracy Score: ", test_acc)
    print("Precision Score: ", test_precision)
    print("Recall Score: ", test_recall)
    print("F1 Score: ", test_f1)
    print("\nConfusion Matrix: ", test_cm)
    print("\nClassification report: ",test_report)
    
    acc_list.append(test_acc)
    f1_list.append(test_f1)
    prec_list.append(test_precision)
    recall_list.append(test_recall)
    
    
    print("="*40, "\n")



Random Forest Regressor
Training Set Evaluation:
Accuracy Score:  0.9999956110513727
Precision Score:  0.9999956110706688
Recall Score:  0.9999956110513727
F1 Score:  0.9999956082677922

Confusion Matrix:  [[227451      0]
 [     1    393]]

Classification report:                precision    recall  f1-score   support

           0       1.00      1.00      1.00    227451
           1       1.00      1.00      1.00       394

    accuracy                           1.00    227845
   macro avg       1.00      1.00      1.00    227845
weighted avg       1.00      1.00      1.00    227845

----------------------------------
Test Set Evaluation:
Accuracy Score:  0.9996137776061234
Precision Score:  0.9996059887098661
Recall Score:  0.9996137776061234
F1 Score:  0.9995942800668866

Confusion Matrix:  [[56862     2]
 [   20    78]]

Classification report:                precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.97      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy Score:  0.9982707542408216
Precision Score:  0.9965444987725388
Recall Score:  0.9982707542408216
F1 Score:  0.9974068795808843

Confusion Matrix:  [[227451      0]
 [   394      0]]

Classification report:                precision    recall  f1-score   support

           0       1.00      1.00      1.00    227451
           1       0.00      0.00      0.00       394

    accuracy                           1.00    227845
   macro avg       0.50      0.50      0.50    227845
weighted avg       1.00      1.00      1.00    227845

----------------------------------
Test Set Evaluation:
Accuracy Score:  0.9982795547909132
Precision Score:  0.996562069513544
Recall Score:  0.9982795547909132
F1 Score:  0.9974200728063972

Confusion Matrix:  [[56864     0]
 [   98     0]]

Classification report:                precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.00      0.00      0.00        98

    accuracy            

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


XGBRegressor
Training Set Evaluation:
Accuracy Score:  1.0
Precision Score:  1.0
Recall Score:  1.0
F1 Score:  1.0

Confusion Matrix:  [[227451      0]
 [     0    394]]

Classification report:                precision    recall  f1-score   support

           0       1.00      1.00      1.00    227451
           1       1.00      1.00      1.00       394

    accuracy                           1.00    227845
   macro avg       1.00      1.00      1.00    227845
weighted avg       1.00      1.00      1.00    227845

----------------------------------
Test Set Evaluation:
Accuracy Score:  0.9995786664794073
Precision Score:  0.9995641189449643
Recall Score:  0.9995786664794073
F1 Score:  0.9995624872740402

Confusion Matrix:  [[56859     5]
 [   19    79]]

Classification report:                precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.94      0.81      0.87        98

    accuracy                           1.00  



AdaBoost Regressor
Training Set Evaluation:
Accuracy Score:  0.9994645482674626
Precision Score:  0.9994412791504099
Recall Score:  0.9994645482674626
F1 Score:  0.9994456634533334

Confusion Matrix:  [[227416     35]
 [    87    307]]

Classification report:                precision    recall  f1-score   support

           0       1.00      1.00      1.00    227451
           1       0.90      0.78      0.83       394

    accuracy                           1.00    227845
   macro avg       0.95      0.89      0.92    227845
weighted avg       1.00      1.00      1.00    227845

----------------------------------
Test Set Evaluation:
Accuracy Score:  0.999385555282469
Precision Score:  0.9993525311913192
Recall Score:  0.999385555282469
F1 Score:  0.9993564236565851

Confusion Matrix:  [[56855     9]
 [   26    72]]

Classification report:                precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.89      0.73   

#### From the above results, Random Forest and XGBoost Classifier has best performances among all

### Using Sampling techniques to handle the Imbalance data and then check the performance of both algorithms

In [44]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn->imblearn)
  Downloading sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
Downloading sklearn_compat-0.1.3-py3-none-any.whl (18 kB)
Installing collected packages: sklearn-compat, imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.13.0 imblearn-0.0 sklearn-compat-0.1.3


In [45]:
from sklearn.metrics import roc_auc_score
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

In [46]:
smt = SMOTE(random_state=42)

In [47]:
X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

In [48]:
print("Before SMOTE:", np.bincount(y_train))
print("After SMOTE:", np.bincount(y_train_res))

Before SMOTE: [227451    394]
After SMOTE: [227451 227451]


In [49]:
rf = RandomForestClassifier(random_state=42,n_jobs=4,verbose=False)
rf.fit(X_train_res, y_train_res)

In [50]:
y_rf_pred_train = rf.predict(X_train)
y_rf_pred_test = rf.predict(X_test)

In [55]:
print("Train set Evaluation results")
print("-----------------------------------------------------------")
print("Accuracy Score:", accuracy_score(y_train, y_rf_pred_train))
print("\nClassification Report:")
print(classification_report(y_train, y_rf_pred_train))

print("Confusion Matrix:")
print(confusion_matrix(y_train, y_rf_pred_train))

print("ROC-AUC Score:", roc_auc_score(y_train, rf.predict_proba(X_train)[:, 1]))

Train set Evaluation results
-----------------------------------------------------------
Accuracy Score: 1.0

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    227451
           1       1.00      1.00      1.00       394

    accuracy                           1.00    227845
   macro avg       1.00      1.00      1.00    227845
weighted avg       1.00      1.00      1.00    227845

Confusion Matrix:
[[227451      0]
 [     0    394]]
ROC-AUC Score: 1.0


In [60]:
print("Test set Evaluation results")
print("-----------------------------------------------------------")
print("Accuracy Score:", accuracy_score(y_test, y_rf_pred_test))
print("Recall Score:", recall_score(y_test, y_rf_pred_test))
print("Precision score:", precision_score(y_test, y_rf_pred_test))
print("\nClassification Report:")
print(classification_report(y_test, y_rf_pred_test))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_rf_pred_test))

print("\nROC-AUC Score:", roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1]))

Test set Evaluation results
-----------------------------------------------------------
Accuracy Score: 0.9995259997893332
Recall Score: 0.8469387755102041
Precision score: 0.8736842105263158

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.87      0.85      0.86        98

    accuracy                           1.00     56962
   macro avg       0.94      0.92      0.93     56962
weighted avg       1.00      1.00      1.00     56962

Confusion Matrix:
[[56852    12]
 [   15    83]]

ROC-AUC Score: 0.9848788875426366


In [61]:
xgbc = XGBClassifier(random_state=42)
xgbc.fit(X_train_res, y_train_res)

In [62]:
y_xg_pred_train = xgbc.predict(X_train_res)
y_xg_pred_test = xgbc.predict(X_test)

In [65]:
print("Test set Evaluation results")
print("-----------------------------------------------------------")
print("Accuracy Score:", accuracy_score(y_test, y_xg_pred_test))
print("Recall Score:", recall_score(y_test, y_xg_pred_test))
print("Precision score:", precision_score(y_test, y_xg_pred_test))
print("\nClassification Report:")
print(classification_report(y_test, y_xg_pred_test))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_xg_pred_test))

print("\nROC-AUC Score:", roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1]))

Test set Evaluation results
-----------------------------------------------------------
Accuracy Score: 0.999385555282469
Recall Score: 0.8571428571428571
Precision score: 0.8

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.80      0.86      0.83        98

    accuracy                           1.00     56962
   macro avg       0.90      0.93      0.91     56962
weighted avg       1.00      1.00      1.00     56962

Confusion Matrix:
[[56843    21]
 [   14    84]]

ROC-AUC Score: 0.9848788875426366


--------------------------------------------------------------------------------------------------------------------------------------------------------------

### We are going to Use RandomForest and prior to that we are using a pipeline to scale and sample the data.

In [66]:
from sklearn.model_selection import train_test_split
A_train, A_test, b_train, b_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [69]:
from sklearn.preprocessing import StandardScaler

In [70]:
pipeline = Pipeline(
    [
         ("StandardScaler", StandardScaler()),
        ("SMOTE", SMOTE(random_state=42)),
        ('clf', RandomForestClassifier(random_state=42,n_jobs=4,verbose=False))
    ]
)

In [71]:
pipeline.fit(A_train, b_train)

In [72]:
b_pred = pipeline.predict(A_test)
b_proba = pipeline.predict_proba(A_test)[:, 1]

print("Classification Report:")
print(classification_report(b_test, b_pred))

print("ROC AUC Score:", roc_auc_score(b_test, b_proba))

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.91      0.85      0.88        98

    accuracy                           1.00     56962
   macro avg       0.96      0.92      0.94     56962
weighted avg       1.00      1.00      1.00     56962

ROC AUC Score: 0.9911498469674871


In [74]:
print("Confusion matrix: ")
print(confusion_matrix(b_test, b_pred))

Confusion matrix: 
[[56856     8]
 [   15    83]]


# The END