In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline

In [16]:
df = pd.read_csv(r'D:\Python\MLProjectsPW\CreditCardFraud1\notebooks\data\creditcard.csv')

In [17]:
#separate into independent and dependent features
X = df.drop('Class',axis=1)
y = df['Class']

In [18]:
#rectifying target imbalance
from imblearn.over_sampling import SMOTE
resampler = SMOTE(random_state=11)
X , y = resampler.fit_resample(X, y)

In [19]:
X[y==1].shape , X[y==0].shape

((284315, 30), (284315, 30))

In [20]:
numerical_cols = X.columns
numerical_cols

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'],
      dtype='object')

In [21]:
# since all the features are only numerical in nature creating only numerical pipeline
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [22]:
numerical_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]
)

preprocessor = ColumnTransformer([
    ('numerical_pipeline',numerical_pipeline , numerical_cols)
])

In [23]:
## Train test split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=30)

In [24]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [25]:
X_train.shape , X_test.shape

((398041, 30), (170589, 30))

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import  accuracy_score , roc_auc_score , f1_score

In [13]:
# a function to evaluate the model
def evaluate_model(true, predicted):
    accuracy = accuracy_score(true, predicted)
    roc_score = roc_auc_score(true, predicted)
    f_1_score = f1_score(true, predicted)

    return accuracy , roc_score , f_1_score

In [14]:
models={
    'LogisticRegression':LogisticRegression(),
    'RidgeClassifier':RidgeClassifier(),
    'BernoulliNB':BernoulliNB(),
    'DecisionTreeClassifier':DecisionTreeClassifier(),
    'KNeighborsClassifier':KNeighborsClassifier(),
    'AdaBoostClassifier':AdaBoostClassifier(),
    'GradientBoostingClassifier':GradientBoostingClassifier(),
    'BaggingClassifier':BaggingClassifier(),
    'RandomForestClassifier':RandomForestClassifier(),
    'SVC':SVC(),
    'XGBClassifier':XGBClassifier()
}
model_list=[]
roc=[]
acc = []
f1 = []
performance = []
for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    accuracy , roc_score  , f_1_score =evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    roc.append(roc_score)
    acc.append(accuracy)
    f1.append(f_1_score)
    performance.append((list(models.keys())[i] , accuracy ))


    print('Model Training Performance')
    print("ROC:",roc_score)
    print("Accuracy:",accuracy)

    
    print('='*35)
    print('\n')

print("Best Model" , sorted(performance , key = lambda x: x[1])[-1])
model = models[sorted(performance , key = lambda x: x[1])[-1][0]]
print(model)

metrics = pd.DataFrame({"models": model_list , "accuracy" : acc , "roc_auc_score": roc , "f1_score" : f1})
print(metrics.sort_values('accuracy',ascending=False))


LogisticRegression
Model Training Performance
ROC: 0.9271995610023779
Accuracy: 0.9256756756756757


RidgeClassifier
Model Training Performance
ROC: 0.8882385220413391
Accuracy: 0.8851351351351351


BernoulliNB
Model Training Performance
ROC: 0.9050210353027255
Accuracy: 0.902027027027027


DecisionTreeClassifier
Model Training Performance
ROC: 0.9158130601792573
Accuracy: 0.9155405405405406


KNeighborsClassifier
Model Training Performance
ROC: 0.9144869215291751
Accuracy: 0.9121621621621622


AdaBoostClassifier
Model Training Performance
ROC: 0.9125663069325041
Accuracy: 0.9121621621621622


GradientBoostingClassifier
Model Training Performance
ROC: 0.9242271812694349
Accuracy: 0.9222972972972973


BaggingClassifier
Model Training Performance
ROC: 0.920157307481251
Accuracy: 0.918918918918919


RandomForestClassifier
Model Training Performance
ROC: 0.9209804280226815
Accuracy: 0.918918918918919


SVC
Model Training Performance
ROC: 0.9242271812694349
Accuracy: 0.9222972972972973


XG