In [18]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use("ggplot")

In [19]:
df = pd.read_csv(r'data/data.csv')

In [20]:
df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
1,50000,1,1,2,37,0,0,0,0,0,...,19394,19619,20024,2500,1815,657,1000,1000,800,0
2,500000,1,1,2,29,0,0,0,0,0,...,542653,483003,473944,55000,40000,38000,20239,13750,13770,0
3,100000,2,2,2,23,0,-1,-1,0,0,...,221,-159,567,380,601,0,581,1687,1542,0
4,140000,2,3,1,28,0,0,2,0,0,...,12211,11793,3719,3329,0,432,1000,1000,1000,0


In [21]:
#separate into independent and dependent features
X = df.drop('default payment next month',axis=1)
y = df['default payment next month']

In [22]:
#rectifying target imbalance
from imblearn.combine import SMOTETomek
resampler = SMOTETomek(random_state=42)
X , y = resampler.fit_resample(X, y)

In [23]:
X[y==1].shape , X[y==0].shape

((739, 23), (739, 23))

In [24]:
numerical_cols = df.columns[:-1]
numerical_cols


Index(['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6'],
      dtype='object')

In [25]:
# sex_cat = [1,2]
# edu_cat = [1,2,3,4,5,6]
# marriage_cat = [0,1,2,3]
# pay_0 = [-1,  0, -2,  1,  2,  3,  4,  8]
# pay_2 = [0, -1, -2,  2,  3,  5,  7,  4,  1]
# pay_3 = [-1,  0,  2, -2,  3,  4,  6,  7,  1,  5]
# pay_4 = [0, -2, -1,  2,  3,  4,  5,  7]
# pay_5 = [0, -1,  2, -2,  3,  5,  4,  7]
# pay_6 = [0, -1,  2, -2,  3,  6,  4,  7]

In [26]:
# since all the features are only numerical in nature creating only numerical pipeline
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler #OneHotEncoder # HAndling Feature Scaling
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [27]:
numerical_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]
)

# cat_pipeline=Pipeline(
#     steps=[
#     ('imputer',SimpleImputer(strategy='most_frequent')),
#     ('ordinalencoder',OneHotEncoder(categories=[sex_cat ,edu_cat, marriage_cat ,pay_0 , pay_2, pay_3, pay_4, pay_5, pay_6])),
#     ('scaler',StandardScaler(with_mean=False))
#     ]

# )

preprocessor = ColumnTransformer([
    ('numerical_pipeline',numerical_pipeline , numerical_cols)
    #('categorical_pipeline',cat_pipeline , categorical_cols)
])

In [28]:
## Train test split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=30)

In [29]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [30]:
X_train.shape , X_test.shape

((1034, 23), (444, 23))

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import  accuracy_score , roc_auc_score , f1_score

In [32]:
# a function to evaluate the model
def evaluate_model(true, predicted):
    accuracy = accuracy_score(true, predicted)
    roc_score = roc_auc_score(true, predicted)
    f_1_score = f1_score(true, predicted)

    return accuracy , roc_score , f_1_score

In [33]:
models={
    'LogisticRegression':LogisticRegression(),
    'RidgeClassifier':RidgeClassifier(),
    'BernoulliNB':BernoulliNB(),
    'DecisionTreeClassifier':DecisionTreeClassifier(),
    'KNeighborsClassifier':KNeighborsClassifier(),
    'AdaBoostClassifier':AdaBoostClassifier(),
    'GradientBoostingClassifier':GradientBoostingClassifier(),
    'BaggingClassifier':BaggingClassifier(),
    'RandomForestClassifier':RandomForestClassifier(),
    'SVC':SVC(),
    'XGBClassifier':XGBClassifier()
}
model_list=[]
roc=[]
acc = []
f1 = []
performance = []
for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    accuracy , roc_score  , f_1_score =evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    roc.append(roc_score)
    acc.append(accuracy)
    f1.append(f_1_score)
    performance.append((list(models.keys())[i] , accuracy ))


    print('Model Training Performance')
    print("ROC:",roc_score)
    print("Accuracy:",accuracy)

    
    print('='*35)
    print('\n')

print("Best Model" , sorted(performance , key = lambda x: x[1])[-1])
model = models[sorted(performance , key = lambda x: x[1])[-1][0]]
print(model)

metrics = pd.DataFrame({"models": model_list , "accuracy" : acc , "roc_auc_score": roc , "f1_score" : f1}).sort_values('accuracy',ascending=False)
print(metrics)


LogisticRegression
Model Training Performance
ROC: 0.7111743193823649
Accuracy: 0.7117117117117117


RidgeClassifier
Model Training Performance
ROC: 0.7176960585127996
Accuracy: 0.7184684684684685


BernoulliNB
Model Training Performance
ROC: 0.6881349045103617
Accuracy: 0.6891891891891891


DecisionTreeClassifier
Model Training Performance
ROC: 0.7375863470134092
Accuracy: 0.7387387387387387


KNeighborsClassifier
Model Training Performance
ROC: 0.7468305566842748
Accuracy: 0.75




AdaBoostClassifier
Model Training Performance
ROC: 0.7567655424624136
Accuracy: 0.7567567567567568


GradientBoostingClassifier
Model Training Performance
ROC: 0.8130637952052011
Accuracy: 0.8130630630630631


BaggingClassifier
Model Training Performance
ROC: 0.7978464039008533
Accuracy: 0.7972972972972973


RandomForestClassifier
Model Training Performance
ROC: 0.8399634295002032
Accuracy: 0.8400900900900901


SVC
Model Training Performance
ROC: 0.7674725721251524
Accuracy: 0.7680180180180181


XGBClassifier
Model Training Performance
ROC: 0.8287687931735067
Accuracy: 0.8288288288288288


Best Model ('RandomForestClassifier', 0.8400900900900901)
RandomForestClassifier()
                        models  accuracy  roc_auc_score  f1_score
8       RandomForestClassifier  0.840090       0.839963  0.845316
10               XGBClassifier  0.828829       0.828769  0.834061
6   GradientBoostingClassifier  0.813063       0.813064  0.818381
7            BaggingClassifier  0.797297       0.797846 

In [34]:
metrics

Unnamed: 0,models,accuracy,roc_auc_score,f1_score
8,RandomForestClassifier,0.84009,0.839963,0.845316
10,XGBClassifier,0.828829,0.828769,0.834061
6,GradientBoostingClassifier,0.813063,0.813064,0.818381
7,BaggingClassifier,0.797297,0.797846,0.8
9,SVC,0.768018,0.767473,0.777538
5,AdaBoostClassifier,0.756757,0.756766,0.763158
4,KNeighborsClassifier,0.75,0.746831,0.775758
3,DecisionTreeClassifier,0.738739,0.737586,0.753191
1,RidgeClassifier,0.718468,0.717696,0.731183
0,LogisticRegression,0.711712,0.711174,0.722944
