# Model Building

## Loading Data

In [1]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv('Credit-Data-Cleaned')

In [4]:
df

Unnamed: 0,duration,credit_amount,age,checking_status,credit_history,purpose,employment,property_magnitude,housing,gender,marital_status,savings_status,class
0,6.0,1169.0,67.0,<0,critical/other existing credit,radio/tv,>=7,real estate,own,male,single,no known savings,good
1,48.0,5951.0,22.0,0<=X<200,existing paid,radio/tv,1<=X<4,real estate,own,female,divorced/dependent/married,<100,bad
2,12.0,2096.0,49.0,no checking,critical/other existing credit,education,4<=X<7,real estate,own,male,single,<100,good
3,42.0,7882.0,45.0,<0,existing paid,furniture/equipment,4<=X<7,life insurance,for free,male,single,<100,good
4,24.0,4870.0,53.0,<0,delayed previously,new car,1<=X<4,no known property,for free,male,single,<100,bad
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,12.0,1736.0,31.0,no checking,existing paid,furniture/equipment,4<=X<7,real estate,own,female,divorced/dependent/married,<100,good
996,30.0,3857.0,40.0,<0,existing paid,used car,1<=X<4,life insurance,own,male,divorced/separated,<100,good
997,12.0,804.0,38.0,no checking,existing paid,radio/tv,>=7,car,own,male,single,<100,good
998,45.0,1845.0,23.0,<0,existing paid,radio/tv,1<=X<4,no known property,for free,male,single,<100,bad


In [95]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   duration            1000 non-null   float64
 1   credit_amount       1000 non-null   float64
 2   age                 1000 non-null   float64
 3   checking_status     1000 non-null   object 
 4   credit_history      1000 non-null   object 
 5   purpose             1000 non-null   object 
 6   employment          1000 non-null   object 
 7   property_magnitude  1000 non-null   object 
 8   housing             1000 non-null   object 
 9   gender              1000 non-null   object 
 10  marital_status      1000 non-null   object 
 11  savings_status      1000 non-null   object 
 12  class               1000 non-null   object 
dtypes: float64(3), object(10)
memory usage: 101.7+ KB


In [96]:
df.describe()

Unnamed: 0,duration,credit_amount,age
count,1000.0,1000.0,1000.0
mean,20.903,3271.258,35.546
std,12.058814,2822.736876,11.375469
min,4.0,250.0,19.0
25%,12.0,1365.5,27.0
50%,18.0,2319.5,33.0
75%,24.0,3972.25,42.0
max,72.0,18424.0,75.0


In [97]:
df['class'].value_counts()

good    700
bad     300
Name: class, dtype: int64

## Normalizing Numerical Attributes

In [98]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

scaler = StandardScaler()

x_scaled = scaler.fit_transform(df[['duration', 'credit_amount', 'age']])

df_scaled = pd.DataFrame(x_scaled, columns=['duration', 'credit_amount', 'age'])

df[['duration', 'credit_amount', 'age']] = df_scaled

df

Unnamed: 0,duration,credit_amount,age,checking_status,credit_history,purpose,employment,property_magnitude,housing,gender,marital_status,savings_status,class
0,-1.236478,-0.745131,2.766456,<0,critical/other existing credit,radio/tv,>=7,real estate,own,male,single,no known savings,good
1,2.248194,0.949817,-1.191404,0<=X<200,existing paid,radio/tv,1<=X<4,real estate,own,female,divorced/dependent/married,<100,bad
2,-0.738668,-0.416562,1.183312,no checking,critical/other existing credit,education,4<=X<7,real estate,own,male,single,<100,good
3,1.750384,1.634247,0.831502,<0,existing paid,furniture/equipment,4<=X<7,life insurance,for free,male,single,<100,good
4,0.256953,0.566664,1.535122,<0,delayed previously,new car,1<=X<4,no known property,for free,male,single,<100,bad
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-0.738668,-0.544162,-0.399832,no checking,existing paid,furniture/equipment,4<=X<7,real estate,own,female,divorced/dependent/married,<100,good
996,0.754763,0.207612,0.391740,<0,existing paid,used car,1<=X<4,life insurance,own,male,divorced/separated,<100,good
997,-0.738668,-0.874503,0.215835,no checking,existing paid,radio/tv,>=7,car,own,male,single,<100,good
998,1.999289,-0.505528,-1.103451,<0,existing paid,radio/tv,1<=X<4,no known property,for free,male,single,<100,bad


## Encoding Categorical Variables

In [99]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder

In [100]:
nominal_columns = ['purpose', 'property_magnitude', 'housing', 'gender', 'marital_status']
ordinal_columns = ['credit_history', 'employment', 'savings_status', 'checking_status']
target_column = 'class'

In [101]:
one_hot_encoder = OneHotEncoder(drop='first', sparse_output=False)

X_nominal_encoded = one_hot_encoder.fit_transform(df[nominal_columns])

X_nominal_encoded_df = pd.DataFrame(X_nominal_encoded, columns=one_hot_encoder.get_feature_names_out(nominal_columns))

df_encoded = pd.concat([df.drop(columns=nominal_columns), X_nominal_encoded_df], axis=1)

df_encoded

Unnamed: 0,duration,credit_amount,age,checking_status,credit_history,employment,savings_status,class,purpose_domestic appliance,purpose_education,...,purpose_used car,property_magnitude_life insurance,property_magnitude_no known property,property_magnitude_real estate,housing_own,housing_rent,gender_male,marital_status_divorced/separated,marital_status_married/widowed,marital_status_single
0,-1.236478,-0.745131,2.766456,<0,critical/other existing credit,>=7,no known savings,good,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
1,2.248194,0.949817,-1.191404,0<=X<200,existing paid,1<=X<4,<100,bad,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
2,-0.738668,-0.416562,1.183312,no checking,critical/other existing credit,4<=X<7,<100,good,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
3,1.750384,1.634247,0.831502,<0,existing paid,4<=X<7,<100,good,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.256953,0.566664,1.535122,<0,delayed previously,1<=X<4,<100,bad,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-0.738668,-0.544162,-0.399832,no checking,existing paid,4<=X<7,<100,good,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
996,0.754763,0.207612,0.391740,<0,existing paid,1<=X<4,<100,good,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
997,-0.738668,-0.874503,0.215835,no checking,existing paid,>=7,<100,good,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
998,1.999289,-0.505528,-1.103451,<0,existing paid,1<=X<4,<100,bad,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [102]:
categories = {
    'credit_history': ['delayed previously','critical/other existing credit','existing paid','all paid','no credits'],
    'employment': ['unemployed', '<1', '1<=X<4','4<=X<7', '>=7'],
    'savings_status' : ['<100', '100<=X<500', '500<=X<1000', '>=1000', 'no known savings'],
    'checking_status': ['<0', '0<=X<200', '>=200', 'no checking']
}

ordinal_encoder = OrdinalEncoder(categories=[categories[col] for col in ordinal_columns])

X_ordinal_encoded = ordinal_encoder.fit_transform(df[ordinal_columns])

X_ordinal_encoded_df = pd.DataFrame(X_ordinal_encoded, columns=ordinal_columns)

df_encoded = pd.concat([df_encoded.drop(columns=ordinal_columns), X_ordinal_encoded_df], axis=1)

df_encoded

Unnamed: 0,duration,credit_amount,age,class,purpose_domestic appliance,purpose_education,purpose_furniture/equipment,purpose_new car,purpose_other,purpose_radio/tv,...,housing_own,housing_rent,gender_male,marital_status_divorced/separated,marital_status_married/widowed,marital_status_single,credit_history,employment,savings_status,checking_status
0,-1.236478,-0.745131,2.766456,good,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,4.0,4.0,0.0
1,2.248194,0.949817,-1.191404,bad,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,1.0
2,-0.738668,-0.416562,1.183312,good,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,3.0,0.0,3.0
3,1.750384,1.634247,0.831502,good,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,2.0,3.0,0.0,0.0
4,0.256953,0.566664,1.535122,bad,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-0.738668,-0.544162,-0.399832,good,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,0.0,3.0
996,0.754763,0.207612,0.391740,good,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,2.0,2.0,0.0,0.0
997,-0.738668,-0.874503,0.215835,good,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,2.0,4.0,0.0,3.0
998,1.999289,-0.505528,-1.103451,bad,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,2.0,2.0,0.0,0.0


In [103]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

y_encoded = label_encoder.fit_transform(df[target_column])

y_encoded

array([1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,

In [104]:
x = df_encoded.drop(target_column, axis=1)
x

Unnamed: 0,duration,credit_amount,age,purpose_domestic appliance,purpose_education,purpose_furniture/equipment,purpose_new car,purpose_other,purpose_radio/tv,purpose_repairs,...,housing_own,housing_rent,gender_male,marital_status_divorced/separated,marital_status_married/widowed,marital_status_single,credit_history,employment,savings_status,checking_status
0,-1.236478,-0.745131,2.766456,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,4.0,4.0,0.0
1,2.248194,0.949817,-1.191404,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,1.0
2,-0.738668,-0.416562,1.183312,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,3.0,0.0,3.0
3,1.750384,1.634247,0.831502,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,2.0,3.0,0.0,0.0
4,0.256953,0.566664,1.535122,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-0.738668,-0.544162,-0.399832,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,0.0,3.0
996,0.754763,0.207612,0.391740,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,2.0,2.0,0.0,0.0
997,-0.738668,-0.874503,0.215835,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,2.0,4.0,0.0,3.0
998,1.999289,-0.505528,-1.103451,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,2.0,2.0,0.0,0.0


In [105]:
x.shape, y_encoded.shape

((1000, 25), (1000,))

In [106]:
np.unique(y_encoded, return_counts=True)

(array([0, 1]), array([300, 700], dtype=int64))

## Training and Testing Dataset

In [107]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded)

In [108]:
x_train.shape, y_train.shape

((700, 25), (700,))

In [109]:
np.unique(y_train, return_counts=True)

(array([0, 1]), array([210, 490], dtype=int64))

In [110]:
x_test.shape, y_test.shape

((300, 25), (300,))

In [111]:
np.unique(y_test, return_counts=True)

(array([0, 1]), array([ 90, 210], dtype=int64))

## Elimination of class imbalance

### 3 Techniques:

1) Oversampling minority class either by duplicating data or using SMOTE

2) Undersampling majority class

3) Combined Over- and Undersampling

Using 1st technique since we have less data

In [112]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='minority', random_state=42)

x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

In [113]:
x_train_resampled.shape, y_train_resampled.shape

((980, 25), (980,))

In [114]:
np.unique(y_train_resampled, return_counts=True)

(array([0, 1]), array([490, 490], dtype=int64))

## Hyperarameter Tuning and Model Building

In [115]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [116]:
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=10000),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Support Vector Machine': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=42)
}

param_grids = {
    'Logistic Regression': {
        'classifier__C': [0.1, 1, 10]
    },
    'Random Forest': {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 10, 20]
    },
    'Gradient Boosting': {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__learning_rate': [0.01, 0.1, 1],
        'classifier__max_depth': [3, 5, 7]
    },
    'AdaBoost': {
        'classifier__n_estimators': [50, 100, 200]
    },
    'Support Vector Machine': {
        'classifier__C': [0.1, 1, 10],
        'classifier__kernel': ['linear', 'rbf']
    },
    'K-Nearest Neighbors': {
        'classifier__n_neighbors': [3, 5, 7],
        'classifier__p': [1, 2]
    },
    'Naive Bayes': {
    },
    'Decision Tree': {
        'classifier__max_depth': [None, 10, 20],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4]
    }
}

### 1) HPT with all the combined attributes (from EDA + other techniques) and SMOTE


- duration

- credit_amount

- age

- checking_status

- credit_history

- purpose

- employment

- property_magnitude

- housing

- gender

- marital_status

- savings_status 

In [117]:
import pandas as pd
from sklearn.pipeline import Pipeline

results_list = []

for name, classifier in classifiers.items():
    print(f"Processing {name}...")
    
    pipe = Pipeline([
        ('classifier', classifier)
    ])

    param_grid = param_grids.get(name, {})

    grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

    grid_search.fit(x_train_resampled, y_train_resampled)

    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    y_pred = grid_search.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)  # Store as dict for easier processing

    results_list.append({
        'Classifier': name,
        'Best Parameters': best_params,
        'Best Cross-Validation Score': best_score,
        'Test Accuracy': accuracy,
        'Precision': report['weighted avg']['precision'],
        'Recall': report['weighted avg']['recall'],
        'F1-Score': report['weighted avg']['f1-score']
    })

results_df = pd.DataFrame(results_list)

print(results_df)

Processing Logistic Regression...
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Processing Random Forest...
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Processing Gradient Boosting...
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Processing AdaBoost...
Fitting 5 folds for each of 3 candidates, totalling 15 fits




Processing Support Vector Machine...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Processing K-Nearest Neighbors...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Processing Naive Bayes...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Processing Decision Tree...
Fitting 5 folds for each of 27 candidates, totalling 135 fits
               Classifier                                    Best Parameters  \
0     Logistic Regression                             {'classifier__C': 0.1}   
1           Random Forest  {'classifier__max_depth': 20, 'classifier__n_e...   
2       Gradient Boosting  {'classifier__learning_rate': 0.1, 'classifier...   
3                AdaBoost                  {'classifier__n_estimators': 100}   
4  Support Vector Machine  {'classifier__C': 10, 'classifier__kernel': 'r...   
5     K-Nearest Neighbors  {'classifier__n_neighbors': 3, 'classifier__p'...   
6             Naive Bayes                                                 

In [118]:
results_df.sort_values(by='Test Accuracy', ascending=False)

Unnamed: 0,Classifier,Best Parameters,Best Cross-Validation Score,Test Accuracy,Precision,Recall,F1-Score
0,Logistic Regression,{'classifier__C': 0.1},0.740816,0.733333,0.765278,0.733333,0.742125
3,AdaBoost,{'classifier__n_estimators': 100},0.785714,0.726667,0.726667,0.726667,0.726667
1,Random Forest,"{'classifier__max_depth': 20, 'classifier__n_e...",0.834694,0.723333,0.715931,0.723333,0.718955
2,Gradient Boosting,"{'classifier__learning_rate': 0.1, 'classifier...",0.816327,0.72,0.713269,0.72,0.716105
4,Support Vector Machine,"{'classifier__C': 10, 'classifier__kernel': 'r...",0.812245,0.713333,0.723,0.713333,0.717381
5,K-Nearest Neighbors,"{'classifier__n_neighbors': 3, 'classifier__p'...",0.816327,0.676667,0.684003,0.676667,0.679978
6,Naive Bayes,{},0.697959,0.643333,0.705568,0.643333,0.657939
7,Decision Tree,"{'classifier__max_depth': None, 'classifier__m...",0.747959,0.613333,0.647718,0.613333,0.625549


#### Highest Accuracy:
Logistic Regression - 73.3%

### 2) HPT with only EDA based important attributes and SMOTE

- duration

- credit_amount

- age

- checking_status

- credit_history

- purpose

- employment

- property_magnitude

- housing

- gender

In [119]:
x_train_resampled.columns

Index(['duration', 'credit_amount', 'age', 'purpose_domestic appliance',
       'purpose_education', 'purpose_furniture/equipment', 'purpose_new car',
       'purpose_other', 'purpose_radio/tv', 'purpose_repairs',
       'purpose_retraining', 'purpose_used car',
       'property_magnitude_life insurance',
       'property_magnitude_no known property',
       'property_magnitude_real estate', 'housing_own', 'housing_rent',
       'gender_male', 'marital_status_divorced/separated',
       'marital_status_married/widowed', 'marital_status_single',
       'credit_history', 'employment', 'savings_status', 'checking_status'],
      dtype='object')

In [120]:
temp_x_train = x_train_resampled.drop(['marital_status_divorced/separated', 'marital_status_married/widowed', 'marital_status_single', 'savings_status'], axis=1)
temp_x_test = x_test.drop(['marital_status_divorced/separated', 'marital_status_married/widowed', 'marital_status_single', 'savings_status'], axis=1)

In [121]:
import pandas as pd
from sklearn.pipeline import Pipeline

results_list = []

for name, classifier in classifiers.items():
    print(f"Processing {name}...")
    
    pipe = Pipeline([
        ('classifier', classifier)
    ])

    param_grid = param_grids.get(name, {})

    grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

    grid_search.fit(temp_x_train, y_train_resampled)

    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    y_pred = grid_search.predict(temp_x_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)  # Store as dict for easier processing

    results_list.append({
        'Classifier': name,
        'Best Parameters': best_params,
        'Best Cross-Validation Score': best_score,
        'Test Accuracy': accuracy,
        'Precision': report['weighted avg']['precision'],
        'Recall': report['weighted avg']['recall'],
        'F1-Score': report['weighted avg']['f1-score']
    })

results_df = pd.DataFrame(results_list)

print(results_df)

Processing Logistic Regression...
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Processing Random Forest...
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Processing Gradient Boosting...
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Processing AdaBoost...
Fitting 5 folds for each of 3 candidates, totalling 15 fits




Processing Support Vector Machine...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Processing K-Nearest Neighbors...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Processing Naive Bayes...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Processing Decision Tree...
Fitting 5 folds for each of 27 candidates, totalling 135 fits
               Classifier                                    Best Parameters  \
0     Logistic Regression                               {'classifier__C': 1}   
1           Random Forest  {'classifier__max_depth': None, 'classifier__n...   
2       Gradient Boosting  {'classifier__learning_rate': 0.1, 'classifier...   
3                AdaBoost                  {'classifier__n_estimators': 200}   
4  Support Vector Machine  {'classifier__C': 10, 'classifier__kernel': 'r...   
5     K-Nearest Neighbors  {'classifier__n_neighbors': 5, 'classifier__p'...   
6             Naive Bayes                                                 

In [122]:
results_df.sort_values(by='Test Accuracy', ascending=False)

Unnamed: 0,Classifier,Best Parameters,Best Cross-Validation Score,Test Accuracy,Precision,Recall,F1-Score
2,Gradient Boosting,"{'classifier__learning_rate': 0.1, 'classifier...",0.814286,0.743333,0.73801,0.743333,0.740244
3,AdaBoost,{'classifier__n_estimators': 200},0.778571,0.723333,0.720754,0.723333,0.72197
1,Random Forest,"{'classifier__max_depth': None, 'classifier__n...",0.820408,0.72,0.708717,0.72,0.712709
0,Logistic Regression,{'classifier__C': 1},0.729592,0.703333,0.732703,0.703333,0.71249
4,Support Vector Machine,"{'classifier__C': 10, 'classifier__kernel': 'r...",0.786735,0.693333,0.7035,0.693333,0.697664
5,K-Nearest Neighbors,"{'classifier__n_neighbors': 5, 'classifier__p'...",0.785714,0.663333,0.664404,0.663333,0.663862
7,Decision Tree,"{'classifier__max_depth': None, 'classifier__m...",0.734694,0.65,0.660109,0.65,0.654501
6,Naive Bayes,{},0.685714,0.633333,0.691356,0.633333,0.6481


#### Highest Accuracy:
Gradient Boosting - 74.3%

### 3) HPT with only onther techniques based important attributes and SMOTE

- duration

- credit_amount

- age

- checking_status       

- savings_status 

- credit_history        

- purpose               

- marital_status  

- property_magnitude    

- gender         

In [123]:
temp_x_train = x_train_resampled.drop(['housing_own', 'housing_rent', 'employment'], axis=1)
temp_x_test = x_test.drop(['housing_own', 'housing_rent', 'employment'], axis=1)

In [124]:
import pandas as pd
from sklearn.pipeline import Pipeline

results_list = []

for name, classifier in classifiers.items():
    print(f"Processing {name}...")
    
    pipe = Pipeline([
        ('classifier', classifier)
    ])

    param_grid = param_grids.get(name, {})

    grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

    grid_search.fit(temp_x_train, y_train_resampled)

    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    y_pred = grid_search.predict(temp_x_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)  # Store as dict for easier processing

    results_list.append({
        'Classifier': name,
        'Best Parameters': best_params,
        'Best Cross-Validation Score': best_score,
        'Test Accuracy': accuracy,
        'Precision': report['weighted avg']['precision'],
        'Recall': report['weighted avg']['recall'],
        'F1-Score': report['weighted avg']['f1-score']
    })

results_df = pd.DataFrame(results_list)

print(results_df)

Processing Logistic Regression...
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Processing Random Forest...
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Processing Gradient Boosting...
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Processing AdaBoost...
Fitting 5 folds for each of 3 candidates, totalling 15 fits




Processing Support Vector Machine...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Processing K-Nearest Neighbors...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Processing Naive Bayes...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Processing Decision Tree...
Fitting 5 folds for each of 27 candidates, totalling 135 fits
               Classifier                                    Best Parameters  \
0     Logistic Regression                               {'classifier__C': 1}   
1           Random Forest  {'classifier__max_depth': None, 'classifier__n...   
2       Gradient Boosting  {'classifier__learning_rate': 0.1, 'classifier...   
3                AdaBoost                  {'classifier__n_estimators': 100}   
4  Support Vector Machine  {'classifier__C': 10, 'classifier__kernel': 'r...   
5     K-Nearest Neighbors  {'classifier__n_neighbors': 3, 'classifier__p'...   
6             Naive Bayes                                                 

In [125]:
results_df.sort_values(by='Test Accuracy', ascending=False)

Unnamed: 0,Classifier,Best Parameters,Best Cross-Validation Score,Test Accuracy,Precision,Recall,F1-Score
1,Random Forest,"{'classifier__max_depth': None, 'classifier__n...",0.826531,0.743333,0.731778,0.743333,0.734941
5,K-Nearest Neighbors,"{'classifier__n_neighbors': 3, 'classifier__p'...",0.789796,0.726667,0.732108,0.726667,0.729095
0,Logistic Regression,{'classifier__C': 1},0.738776,0.716667,0.748015,0.716667,0.725817
2,Gradient Boosting,"{'classifier__learning_rate': 0.1, 'classifier...",0.817347,0.716667,0.712312,0.716667,0.714286
3,AdaBoost,{'classifier__n_estimators': 100},0.77551,0.716667,0.715773,0.716667,0.716212
4,Support Vector Machine,"{'classifier__C': 10, 'classifier__kernel': 'r...",0.8,0.716667,0.735938,0.716667,0.72348
7,Decision Tree,"{'classifier__max_depth': 10, 'classifier__min...",0.743878,0.663333,0.710309,0.663333,0.67608
6,Naive Bayes,{},0.709184,0.623333,0.686049,0.623333,0.638758


#### Highest Accuracy:
Random Forest - 74.3%

### 4) HPT with all the combined attributes (from EDA + other techniques) without SMOTE


- duration

- credit_amount

- age

- checking_status

- credit_history

- purpose

- employment

- property_magnitude

- housing

- gender

- marital_status

- savings_status 

In [126]:
import pandas as pd
from sklearn.pipeline import Pipeline

results_list = []

for name, classifier in classifiers.items():
    print(f"Processing {name}...")
    
    pipe = Pipeline([
        ('classifier', classifier)
    ])

    param_grid = param_grids.get(name, {})

    grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

    grid_search.fit(x_train, y_train)

    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    y_pred = grid_search.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)  # Store as dict for easier processing

    results_list.append({
        'Classifier': name,
        'Best Parameters': best_params,
        'Best Cross-Validation Score': best_score,
        'Test Accuracy': accuracy,
        'Precision': report['weighted avg']['precision'],
        'Recall': report['weighted avg']['recall'],
        'F1-Score': report['weighted avg']['f1-score']
    })

results_df = pd.DataFrame(results_list)

print(results_df)

Processing Logistic Regression...
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Processing Random Forest...
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Processing Gradient Boosting...
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Processing AdaBoost...
Fitting 5 folds for each of 3 candidates, totalling 15 fits




Processing Support Vector Machine...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Processing K-Nearest Neighbors...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Processing Naive Bayes...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Processing Decision Tree...
Fitting 5 folds for each of 27 candidates, totalling 135 fits
               Classifier                                    Best Parameters  \
0     Logistic Regression                               {'classifier__C': 1}   
1           Random Forest  {'classifier__max_depth': 20, 'classifier__n_e...   
2       Gradient Boosting  {'classifier__learning_rate': 0.1, 'classifier...   
3                AdaBoost                   {'classifier__n_estimators': 50}   
4  Support Vector Machine  {'classifier__C': 1, 'classifier__kernel': 'rbf'}   
5     K-Nearest Neighbors  {'classifier__n_neighbors': 7, 'classifier__p'...   
6             Naive Bayes                                                 

In [127]:
results_df.sort_values(by='Test Accuracy', ascending=False)

Unnamed: 0,Classifier,Best Parameters,Best Cross-Validation Score,Test Accuracy,Precision,Recall,F1-Score
4,Support Vector Machine,"{'classifier__C': 1, 'classifier__kernel': 'rbf'}",0.751429,0.776667,0.769811,0.776667,0.75366
5,K-Nearest Neighbors,"{'classifier__n_neighbors': 7, 'classifier__p'...",0.744286,0.77,0.758701,0.77,0.751088
0,Logistic Regression,{'classifier__C': 1},0.751429,0.756667,0.744532,0.756667,0.746391
1,Random Forest,"{'classifier__max_depth': 20, 'classifier__n_e...",0.761429,0.753333,0.738333,0.753333,0.736889
3,AdaBoost,{'classifier__n_estimators': 50},0.735714,0.736667,0.725778,0.736667,0.729237
2,Gradient Boosting,"{'classifier__learning_rate': 0.1, 'classifier...",0.752857,0.706667,0.689938,0.706667,0.695
6,Naive Bayes,{},0.632857,0.706667,0.704824,0.706667,0.705714
7,Decision Tree,"{'classifier__max_depth': None, 'classifier__m...",0.684286,0.643333,0.653596,0.643333,0.64792


#### Highest Accuracy:
Support Vector Machine: 77.6%

### 5) HPT with only EDA based important attributes without SMOTE

- duration

- credit_amount

- age

- checking_status

- credit_history

- purpose

- employment

- property_magnitude

- housing

- gender

In [128]:
temp_x_train = x_train.drop(['marital_status_divorced/separated', 'marital_status_married/widowed', 'marital_status_single', 'savings_status'], axis=1)
temp_x_test = x_test.drop(['marital_status_divorced/separated', 'marital_status_married/widowed', 'marital_status_single', 'savings_status'], axis=1)

In [129]:
import pandas as pd
from sklearn.pipeline import Pipeline

results_list = []

for name, classifier in classifiers.items():
    print(f"Processing {name}...")
    
    pipe = Pipeline([
        ('classifier', classifier)
    ])

    param_grid = param_grids.get(name, {})

    grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

    grid_search.fit(temp_x_train, y_train)

    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    y_pred = grid_search.predict(temp_x_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)  # Store as dict for easier processing

    results_list.append({
        'Classifier': name,
        'Best Parameters': best_params,
        'Best Cross-Validation Score': best_score,
        'Test Accuracy': accuracy,
        'Precision': report['weighted avg']['precision'],
        'Recall': report['weighted avg']['recall'],
        'F1-Score': report['weighted avg']['f1-score']
    })

results_df = pd.DataFrame(results_list)

print(results_df)

Processing Logistic Regression...
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Processing Random Forest...
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Processing Gradient Boosting...
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Processing AdaBoost...
Fitting 5 folds for each of 3 candidates, totalling 15 fits




Processing Support Vector Machine...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Processing K-Nearest Neighbors...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Processing Naive Bayes...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Processing Decision Tree...
Fitting 5 folds for each of 27 candidates, totalling 135 fits
               Classifier                                    Best Parameters  \
0     Logistic Regression                             {'classifier__C': 0.1}   
1           Random Forest  {'classifier__max_depth': 20, 'classifier__n_e...   
2       Gradient Boosting  {'classifier__learning_rate': 0.1, 'classifier...   
3                AdaBoost                   {'classifier__n_estimators': 50}   
4  Support Vector Machine  {'classifier__C': 10, 'classifier__kernel': 'r...   
5     K-Nearest Neighbors  {'classifier__n_neighbors': 5, 'classifier__p'...   
6             Naive Bayes                                                 

In [130]:
results_df.sort_values(by='Test Accuracy', ascending=False)

Unnamed: 0,Classifier,Best Parameters,Best Cross-Validation Score,Test Accuracy,Precision,Recall,F1-Score
0,Logistic Regression,{'classifier__C': 0.1},0.751429,0.77,0.758125,0.77,0.755342
4,Support Vector Machine,"{'classifier__C': 10, 'classifier__kernel': 'r...",0.761429,0.75,0.734584,0.75,0.734067
2,Gradient Boosting,"{'classifier__learning_rate': 0.1, 'classifier...",0.751429,0.743333,0.72889,0.743333,0.731196
3,AdaBoost,{'classifier__n_estimators': 50},0.742857,0.723333,0.710199,0.723333,0.714287
5,K-Nearest Neighbors,"{'classifier__n_neighbors': 5, 'classifier__p'...",0.747143,0.72,0.696809,0.72,0.697882
1,Random Forest,"{'classifier__max_depth': 20, 'classifier__n_e...",0.767143,0.7,0.677966,0.7,0.68342
6,Naive Bayes,{},0.642857,0.7,0.692661,0.7,0.695827
7,Decision Tree,"{'classifier__max_depth': 10, 'classifier__min...",0.688571,0.66,0.629791,0.66,0.639309


#### Highest Accuracy:
Logistic Regression: 77%

### 6) HPT with only onther techniques based important attributes without SMOTE

- duration

- credit_amount

- age

- checking_status       

- savings_status 

- credit_history        

- purpose               

- marital_status  

- property_magnitude    

- gender         

In [131]:
temp_x_train = x_train.drop(['housing_own', 'housing_rent', 'employment'], axis=1)
temp_x_test = x_test.drop(['housing_own', 'housing_rent', 'employment'], axis=1)

In [132]:
import pandas as pd
from sklearn.pipeline import Pipeline

results_list = []

for name, classifier in classifiers.items():
    print(f"Processing {name}...")
    
    pipe = Pipeline([
        ('classifier', classifier)
    ])

    param_grid = param_grids.get(name, {})

    grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

    grid_search.fit(temp_x_train, y_train)

    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    y_pred = grid_search.predict(temp_x_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)  # Store as dict for easier processing

    results_list.append({
        'Classifier': name,
        'Best Parameters': best_params,
        'Best Cross-Validation Score': best_score,
        'Test Accuracy': accuracy,
        'Precision': report['weighted avg']['precision'],
        'Recall': report['weighted avg']['recall'],
        'F1-Score': report['weighted avg']['f1-score']
    })

results_df = pd.DataFrame(results_list)

print(results_df)

Processing Logistic Regression...
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Processing Random Forest...
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Processing Gradient Boosting...
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Processing AdaBoost...
Fitting 5 folds for each of 3 candidates, totalling 15 fits




Processing Support Vector Machine...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Processing K-Nearest Neighbors...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Processing Naive Bayes...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Processing Decision Tree...
Fitting 5 folds for each of 27 candidates, totalling 135 fits
               Classifier                                    Best Parameters  \
0     Logistic Regression                               {'classifier__C': 1}   
1           Random Forest  {'classifier__max_depth': 10, 'classifier__n_e...   
2       Gradient Boosting  {'classifier__learning_rate': 0.1, 'classifier...   
3                AdaBoost                   {'classifier__n_estimators': 50}   
4  Support Vector Machine  {'classifier__C': 1, 'classifier__kernel': 'rbf'}   
5     K-Nearest Neighbors  {'classifier__n_neighbors': 7, 'classifier__p'...   
6             Naive Bayes                                                 

In [133]:
results_df.sort_values(by='Test Accuracy', ascending=False)

Unnamed: 0,Classifier,Best Parameters,Best Cross-Validation Score,Test Accuracy,Precision,Recall,F1-Score
4,Support Vector Machine,"{'classifier__C': 1, 'classifier__kernel': 'rbf'}",0.75,0.78,0.7736,0.78,0.758137
3,AdaBoost,{'classifier__n_estimators': 50},0.734286,0.756667,0.747111,0.756667,0.749801
5,K-Nearest Neighbors,"{'classifier__n_neighbors': 7, 'classifier__p'...",0.755714,0.753333,0.73871,0.753333,0.738322
2,Gradient Boosting,"{'classifier__learning_rate': 0.1, 'classifier...",0.735714,0.75,0.735081,0.75,0.735492
0,Logistic Regression,{'classifier__C': 1},0.747143,0.746667,0.732246,0.746667,0.734026
1,Random Forest,"{'classifier__max_depth': 10, 'classifier__n_e...",0.748571,0.736667,0.71676,0.736667,0.713259
7,Decision Tree,"{'classifier__max_depth': 10, 'classifier__min...",0.681429,0.72,0.72,0.72,0.72
6,Naive Bayes,{},0.631429,0.703333,0.696969,0.703333,0.699763


#### Highest Accuracy:
Support Vector Machine: 78%

## Final Conclusion

Without oversampling the minority class using SMOTE and using stratification while generating train and test splits results in more accurate model in comparision to other methods performed in this file.

### Ranking of all the methods based on test accuracy and corresponding model:

1) HPT with only onther techniques based important attributes without SMOTE - SVM (c=1, kernel='rbf') - 78%

2) HPT with all the combined attributes (from EDA + other techniques) without SMOTE - SVM (c=1, kernel='rbf') - 77.6%

3) HPT with only EDA based important attributes without SMOTE - Logistic Regression (c=0.1) - 77%

4) HPT with only EDA based important attributes and SMOTE - Gradient Boosting - 74.3%

4) HPT with only onther techniques based important attributes and SMOTE - Random Forest - 74.3%

5) HPT with all the combined attributes (from EDA + other techniques) and SMOTE - Logistic Regression (c=0.1) - 73.3% 