In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings("ignore")

In [2]:
df=pd.read_csv(r"D:\Credit_Card\notebook\data\raw.csv")

In [3]:
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [4]:
df.drop(columns=['ID'], axis=1, inplace=True)

In [5]:
df.columns

Index(['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default.payment.next.month'],
      dtype='object')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   LIMIT_BAL                   30000 non-null  float64
 1   SEX                         30000 non-null  int64  
 2   EDUCATION                   30000 non-null  int64  
 3   MARRIAGE                    30000 non-null  int64  
 4   AGE                         30000 non-null  int64  
 5   PAY_0                       30000 non-null  int64  
 6   PAY_2                       30000 non-null  int64  
 7   PAY_3                       30000 non-null  int64  
 8   PAY_4                       30000 non-null  int64  
 9   PAY_5                       30000 non-null  int64  
 10  PAY_6                       30000 non-null  int64  
 11  BILL_AMT1                   30000 non-null  float64
 12  BILL_AMT2                   30000 non-null  float64
 13  BILL_AMT3                   300

In [7]:
X=df.drop(columns=["default.payment.next.month"], axis=1)
y=df['default.payment.next.month']

In [8]:

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Preprocessing for numerical data
preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

In [9]:
X = preprocessor.fit_transform(X)

In [10]:
X.shape

(30000, 23)

In [11]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(24000, 23)
(6000, 23)
(24000,)
(6000,)


#### Create an Evaluate Function to give all metrics after model Training

In [13]:
# Define classifiers
models = {
    'Logistic Regression': LogisticRegression(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Support Vector Machine': SVC(probability=True),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'XGBoost': XGBClassifier(),
    'CatBoost': CatBoostClassifier(verbose=0)
}




In [14]:
# Iterate over the models and train/evaluate each one
from sklearn.metrics import precision_score

results_list=[]
for name, model in  models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    confusion_mat = confusion_matrix(y_test, y_pred)
    precision = precision_score(y_test, y_pred)


    print(f"\n{name}:")
    print("Accuracy:", acc)
    print("Confusion Matrix:")
    print(confusion_mat)
    print("Precision:", precision)

        # Append results to the list
    results_list.append({
        "Model": name,
        "Accuracy": acc,
        "Confusion Matrix": confusion_mat,
        "Precision": precision
    })

# Create a DataFrame from the list
results_df = pd.DataFrame(results_list)



Logistic Regression:
Accuracy: 0.8096666666666666
Confusion Matrix:
[[4549  138]
 [1004  309]]
Precision: 0.6912751677852349

K-Nearest Neighbors:
Accuracy: 0.795
Confusion Matrix:
[[4302  385]
 [ 845  468]]
Precision: 0.5486518171160609

Support Vector Machine:
Accuracy: 0.8196666666666667
Confusion Matrix:
[[4482  205]
 [ 877  436]]
Precision: 0.6801872074882995

Decision Tree:
Accuracy: 0.7231666666666666
Confusion Matrix:
[[3810  877]
 [ 784  529]]
Precision: 0.3762446657183499

Random Forest:
Accuracy: 0.8161666666666667
Confusion Matrix:
[[4423  264]
 [ 839  474]]
Precision: 0.6422764227642277

Gradient Boosting:
Accuracy: 0.82
Confusion Matrix:
[[4455  232]
 [ 848  465]]
Precision: 0.667144906743185

AdaBoost:
Accuracy: 0.816
Confusion Matrix:
[[4491  196]
 [ 908  405]]
Precision: 0.6738768718801996

XGBoost:
Accuracy: 0.8166666666666667
Confusion Matrix:
[[4406  281]
 [ 819  494]]
Precision: 0.6374193548387097

CatBoost:
Accuracy: 0.8228333333333333
Confusion Matrix:
[[4453  2

In [15]:
results_df

Unnamed: 0,Model,Accuracy,Confusion Matrix,Precision
0,Logistic Regression,0.809667,"[[4549, 138], [1004, 309]]",0.691275
1,K-Nearest Neighbors,0.795,"[[4302, 385], [845, 468]]",0.548652
2,Support Vector Machine,0.819667,"[[4482, 205], [877, 436]]",0.680187
3,Decision Tree,0.723167,"[[3810, 877], [784, 529]]",0.376245
4,Random Forest,0.816167,"[[4423, 264], [839, 474]]",0.642276
5,Gradient Boosting,0.82,"[[4455, 232], [848, 465]]",0.667145
6,AdaBoost,0.816,"[[4491, 196], [908, 405]]",0.673877
7,XGBoost,0.816667,"[[4406, 281], [819, 494]]",0.637419
8,CatBoost,0.822833,"[[4453, 234], [829, 484]]",0.674095


In [16]:
results_df=results_df.drop(columns=['Confusion Matrix'])

In [17]:
sorted_results_df = results_df.sort_values(by='Accuracy', ascending=False, ignore_index=True)
print(sorted_results_df)

                    Model  Accuracy  Precision
0                CatBoost  0.822833   0.674095
1       Gradient Boosting  0.820000   0.667145
2  Support Vector Machine  0.819667   0.680187
3                 XGBoost  0.816667   0.637419
4           Random Forest  0.816167   0.642276
5                AdaBoost  0.816000   0.673877
6     Logistic Regression  0.809667   0.691275
7     K-Nearest Neighbors  0.795000   0.548652
8           Decision Tree  0.723167   0.376245


In [18]:

# # This project g

# from sklearn.ensemble import StackingClassifier

# # Define base estimators
# estimators = [
#     ('catboost', CatBoostClassifier(verbose=0)),
#     ('gradient_boosting', GradientBoostingClassifier()),
#     ('svc', SVC(probability=True)),
#     ('xgboost', XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
#     ('adaboost', AdaBoostClassifier())
# ]

# # Create a StackingClassifier with a final estimator
# clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

# # Fit the classifier on the training data
# clf.fit(X_train, y_train)

# # Make predictions on the testing data
# y_pred = clf.predict(X_test)

# # Evaluate the classifier
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print("Precision:", precision_score(y_test, y_pred))

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit a logistic regression model
log_model = LogisticRegression()
log_model = log_model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = log_model.predict(X_test)

# Evaluate the classifier
score = accuracy_score(y_test, y_pred) * 100
print("Accuracy of the model is %.2f" % score)


Accuracy of the model is 80.97
