In [284]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score,classification_report
import imblearn
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

In [285]:
df=pd.read_csv(r"C:\Users\prashant\Downloads\train_ctrUa4K (1).csv")
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [286]:
df.drop(['Loan_ID'],axis=1,inplace=True)

In [287]:
df.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [288]:
for i in ['LoanAmount','Loan_Amount_Term']:
    df[i].replace(0,np.nan,inplace=True)
    df[i].fillna(df[i].mean(),inplace=True)

In [289]:
df.dropna(inplace=True)

In [290]:
categorical_columns = []
numeric_columns = []

for col in df.columns:
    if df[col].dtype == 'object':
        categorical_columns.append(col)
    else:
        numeric_columns.append(col)

print("Categorical columns:", categorical_columns)
print("Numeric columns:", numeric_columns)

Categorical columns: ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']
Numeric columns: ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']


In [291]:
for col in categorical_columns:
    df[col] = LabelEncoder().fit_transform(df[col])

In [292]:
scaler = StandardScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

In [293]:
df.corr()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Gender,1.0,0.363499,0.200692,0.044667,-0.007948,0.035377,0.157993,0.102137,-0.088109,0.024682,-0.020576,0.052533
Married,0.363499,1.0,0.375597,-0.002516,0.01975,0.040226,0.107327,0.181882,-0.110044,0.020519,0.029479,0.09856
Dependents,0.200692,0.375597,1.0,0.03925,0.05118,0.124717,-0.010586,0.168145,-0.099282,-0.039492,-0.000748,0.016984
Education,0.044667,-0.002516,0.03925,1.0,-0.014796,-0.142747,-0.067085,-0.171444,-0.112531,-0.07572,-0.033095,-0.088699
Self_Employed,-0.007948,0.01975,0.05118,-0.014796,1.0,0.167491,-0.012554,0.115941,-0.02834,-0.016306,-0.052259,-0.018705
ApplicantIncome,0.035377,0.040226,0.124717,-0.142747,0.167491,1.0,-0.114302,0.490153,-0.007649,-0.044954,-0.066311,-0.042166
CoapplicantIncome,0.157993,0.107327,-0.010586,-0.067085,-0.012554,-0.114302,1.0,0.192269,-0.008529,0.000391,0.000368,-0.039323
LoanAmount,0.102137,0.181882,0.168145,-0.171444,0.115941,0.490153,0.192269,1.0,0.049991,-0.027274,-0.113254,-0.062882
Loan_Amount_Term,-0.088109,-0.110044,-0.099282,-0.112531,-0.02834,-0.007649,-0.008529,0.049991,1.0,0.024302,-0.067434,0.004054
Credit_History,0.024682,0.020519,-0.039492,-0.07572,-0.016306,-0.044954,0.000391,-0.027274,0.024302,1.0,-0.003404,0.545934


In [294]:
skewness = df.skew()

In [295]:
for column, skew_val in skewness.items():
    print(f"Skewness of '{column}': {skew_val:.2f}")

Skewness of 'Gender': -1.69
Skewness of 'Married': -0.62
Skewness of 'Dependents': 0.96
Skewness of 'Education': 1.39
Skewness of 'Self_Employed': 2.12
Skewness of 'ApplicantIncome': 6.94
Skewness of 'CoapplicantIncome': 5.93
Skewness of 'LoanAmount': 2.43
Skewness of 'Loan_Amount_Term': -2.37
Skewness of 'Credit_History': -1.90
Skewness of 'Property_Area': -0.05
Skewness of 'Loan_Status': -0.77


In [296]:
for column, skew_val in skewness.items():
    if skew_val > 0.5:  # You can adjust this threshold as needed
        df[column] = np.log1p(df[column])  # Applying log(1+x) transformation

# Print skewness after transformation
skewness_after = df.skew()
for column, skew_val in skewness_after.items():
    print(f"Skewness of '{column}' after transformation: {skew_val:.2f}")

Skewness of 'Gender' after transformation: -1.69
Skewness of 'Married' after transformation: -0.62
Skewness of 'Dependents' after transformation: 0.62
Skewness of 'Education' after transformation: 1.39
Skewness of 'Self_Employed' after transformation: 2.12
Skewness of 'ApplicantIncome' after transformation: 0.89
Skewness of 'CoapplicantIncome' after transformation: 0.58
Skewness of 'LoanAmount' after transformation: -1.01
Skewness of 'Loan_Amount_Term' after transformation: -2.37
Skewness of 'Credit_History' after transformation: -1.90
Skewness of 'Property_Area' after transformation: -0.05
Skewness of 'Loan_Status' after transformation: -0.77


In [297]:
df.isnull().sum()

Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           40
Loan_Amount_Term      0
Credit_History        0
Property_Area         0
Loan_Status           0
dtype: int64

In [298]:
for i in ['LoanAmount','Loan_Amount_Term']:
    df[i].replace(0,np.nan,inplace=True)
    df[i].fillna(df[i].mean(),inplace=True)

In [299]:
df.isnull().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [300]:
df.dropna(inplace=True)

In [301]:
df.corr()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Gender,1.0,0.363499,0.198653,0.044667,-0.007948,0.048716,0.228805,0.130118,-0.088109,0.024682,-0.020576,0.052533
Married,0.363499,1.0,0.396187,-0.002516,0.01975,0.018413,0.245707,0.171627,-0.110044,0.020519,0.029479,0.09856
Dependents,0.198653,0.396187,1.0,0.030597,0.06765,0.142643,-0.013889,0.124595,-0.104441,-0.032219,0.01952,0.018981
Education,0.044667,-0.002516,0.030597,1.0,-0.014796,-0.188659,-0.02328,-0.11873,-0.112531,-0.07572,-0.033095,-0.088699
Self_Employed,-0.007948,0.01975,0.06765,-0.014796,1.0,0.235503,-0.063461,0.118115,-0.02834,-0.016306,-0.052259,-0.018705
ApplicantIncome,0.048716,0.018413,0.142643,-0.188659,0.235503,1.0,-0.288727,0.41626,-0.021801,0.01125,-0.073278,-0.007115
CoapplicantIncome,0.228805,0.245707,-0.013889,-0.02328,-0.063461,-0.288727,1.0,0.202952,-0.005052,-0.003105,-0.050993,0.025545
LoanAmount,0.130118,0.171627,0.124595,-0.11873,0.118115,0.41626,0.202952,1.0,0.02893,-0.005624,-0.151566,-0.001777
Loan_Amount_Term,-0.088109,-0.110044,-0.104441,-0.112531,-0.02834,-0.021801,-0.005052,0.02893,1.0,0.024302,-0.067434,0.004054
Credit_History,0.024682,0.020519,-0.032219,-0.07572,-0.016306,0.01125,-0.003105,-0.005624,0.024302,1.0,-0.003404,0.545934


In [302]:
X = df.drop('Loan_Status', axis=1)  # Features
y = df['Loan_Status']  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [303]:
models = [
        ('Logistic Regression', LogisticRegression()),
        ('SVM', SVC()),
        ('KNN', KNeighborsClassifier()),
        ('Decision Tree', DecisionTreeClassifier()),
        ('Random Forest',RandomForestClassifier())
    ]
    
for name, model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        print(f"Model: {name}")
        print("Accuracy:", accuracy_score(y_test, y_pred))
        print("Classification Report:\n", classification_report(y_test, y_pred))

Model: Logistic Regression
Accuracy: 0.8181818181818182
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.52      0.66        52
           1       0.80      0.97      0.88       102

    accuracy                           0.82       154
   macro avg       0.85      0.74      0.77       154
weighted avg       0.83      0.82      0.80       154

Model: SVM
Accuracy: 0.8051948051948052
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.52      0.64        52
           1       0.80      0.95      0.87       102

    accuracy                           0.81       154
   macro avg       0.82      0.74      0.75       154
weighted avg       0.81      0.81      0.79       154

Model: KNN
Accuracy: 0.8051948051948052
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.56      0.66        52
           1       0.81      

Model: Random Forest
Accuracy: 0.7857142857142857
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.54      0.63        52
           1       0.79      0.91      0.85       102

    accuracy                           0.79       154
   macro avg       0.78      0.73      0.74       154
weighted avg       0.78      0.79      0.77       154



In [304]:
# For Training Score
models = [
        ('Logistic Regression', LogisticRegression()),
        ('SVM', SVC()),
        ('KNN', KNeighborsClassifier()),
        ('Decision Tree', DecisionTreeClassifier()),
        ('Random Forest',RandomForestClassifier())
    ]
    
for name, model in models:
        model.fit(X_train, y_train)
        y_pred_train = model.predict(X_train)
        
        print(f"Model: {name}")
        print("Accuracy:", accuracy_score(y_train,y_pred_train))
        print("Classification Report:\n", classification_report(y_train,y_pred_train))

Model: Logistic Regression
Accuracy: 0.803921568627451
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.41      0.57       112
           1       0.79      0.98      0.87       245

    accuracy                           0.80       357
   macro avg       0.85      0.70      0.72       357
weighted avg       0.83      0.80      0.78       357

Model: SVM
Accuracy: 0.8207282913165266
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.46      0.62       112
           1       0.80      0.98      0.88       245

    accuracy                           0.82       357
   macro avg       0.86      0.72      0.75       357
weighted avg       0.84      0.82      0.80       357

Model: KNN
Accuracy: 0.8235294117647058
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.52      0.65       112
           1       0.81      0

In [305]:
# Model Selection and Hyperparameter Tuning
best_model = None
best_accuracy = 0.0
best_params = None
models = [('Logistic Regression', LogisticRegression(), {'C': [0.001, 0.01, 0.1, 1, 10], 
                                                       'penalty': ['l1', 'l2', 'elasticnet', 'none'],
                                                       'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                                                       'max_iter': [100, 200, 300]}),
        ('SVM', SVC(), {'C': [0.1, 1, 10], 
                        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 
                        'gamma': ['scale', 'auto']}),
        ('KNN', KNeighborsClassifier(), {'n_neighbors': [3, 5, 7],
                                         'weights': ['uniform', 'distance'],
                                         'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                                         'p': [1, 2]}),
        ('Decision Tree', DecisionTreeClassifier(), {'max_depth': [None, 10, 20], 
                                                     'min_samples_split': [2, 5, 10]}),
        ('Random Forest',RandomForestClassifier(),{'n_estimators': [50, 100, 200],
                                                   'max_depth': [None, 10, 20],
                                                   'min_samples_split': [2, 5, 10],
                                                   'min_samples_leaf': [1, 2, 4],
                                                   'max_features': ['auto', 'sqrt', 'log2']})

    ]
for name, model, param_grid in models:
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train,y_train)
        
    y_pred = grid_search.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
        
    print(f"Model: {name}")
    print("Best Parameters:", grid_search.best_params_)
    print("Accuracy:", accuracy)
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("="*50)
        
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = name
        best_params = grid_search.best_params_
    
print("Best Model:", best_model)
print("Best Parameters:", best_params)
print("Best Accuracy:", best_accuracy)

Model: Logistic Regression
Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
Accuracy: 0.8181818181818182
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.52      0.66        52
           1       0.80      0.97      0.88       102

    accuracy                           0.82       154
   macro avg       0.85      0.74      0.77       154
weighted avg       0.83      0.82      0.80       154

Model: SVM
Best Parameters: {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}
Accuracy: 0.8116883116883117
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.54      0.66        52
           1       0.80      0.95      0.87       102

    accuracy                           0.81       154
   macro avg       0.83      0.74      0.76       154
weighted avg       0.82      0.81      0.80       154

Model: KNN
Best Parameters: {'algorithm': 'auto', 'n_ne

In [306]:
from sklearn.ensemble import BaggingClassifier

In [307]:
bg=BaggingClassifier(DecisionTreeClassifier())
bg.fit(X_train,y_train)
ypred=bg.predict(X_test)
print(classification_report(y_test,ypred))

              precision    recall  f1-score   support

           0       0.62      0.58      0.60        52
           1       0.79      0.82      0.81       102

    accuracy                           0.74       154
   macro avg       0.71      0.70      0.70       154
weighted avg       0.74      0.74      0.74       154



In [308]:
bg=BaggingClassifier(SVC())
bg.fit(X_train,y_train)
ypred=bg.predict(X_test)
print(classification_report(y_test,ypred))

              precision    recall  f1-score   support

           0       0.84      0.52      0.64        52
           1       0.80      0.95      0.87       102

    accuracy                           0.81       154
   macro avg       0.82      0.74      0.75       154
weighted avg       0.81      0.81      0.79       154



In [309]:
bg=BaggingClassifier(KNeighborsClassifier())
bg.fit(X_train,y_train)
ypred=bg.predict(X_test)
print(classification_report(y_test,ypred))

              precision    recall  f1-score   support

           0       0.75      0.52      0.61        52
           1       0.79      0.91      0.85       102

    accuracy                           0.78       154
   macro avg       0.77      0.72      0.73       154
weighted avg       0.78      0.78      0.77       154



In [310]:
bg=BaggingClassifier(RandomForestClassifier())
bg.fit(X_train,y_train)
ypred=bg.predict(X_test)
print(classification_report(y_test,ypred))

              precision    recall  f1-score   support

           0       0.82      0.54      0.65        52
           1       0.80      0.94      0.86       102

    accuracy                           0.81       154
   macro avg       0.81      0.74      0.76       154
weighted avg       0.81      0.81      0.79       154



### UnderSampling

In [311]:
rus= RandomUnderSampler(random_state=1,replacement=True)
x_rus,y_rus=rus.fit_resample(X,y)

In [312]:
X_train, X_test, y_train, y_test = train_test_split(x_rus, y_rus, test_size=0.3, random_state=42)

In [313]:
models = [
        ('Logistic Regression', LogisticRegression()),
        ('SVM', SVC()),
        ('KNN', KNeighborsClassifier()),
        ('Decision Tree', DecisionTreeClassifier()),
        ('Random Forest',RandomForestClassifier())
    ]
    
for name, model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        print(f"Model: {name}")
        print("Accuracy:", accuracy_score(y_test, y_pred))
        print("Classification Report:\n", classification_report(y_test, y_pred))

Model: Logistic Regression
Accuracy: 0.6161616161616161
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.43      0.55        53
           1       0.56      0.83      0.67        46

    accuracy                           0.62        99
   macro avg       0.65      0.63      0.61        99
weighted avg       0.66      0.62      0.60        99

Model: SVM
Accuracy: 0.6666666666666666
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.47      0.60        53
           1       0.59      0.89      0.71        46

    accuracy                           0.67        99
   macro avg       0.71      0.68      0.66        99
weighted avg       0.72      0.67      0.65        99

Model: KNN
Accuracy: 0.6767676767676768
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.53      0.64        53
           1       0.61      

### OverSampling

In [314]:
ros=RandomOverSampler(random_state=1)
x_ros,y_ros=ros.fit_resample(X,y)

In [315]:
X_train, X_test, y_train, y_test = train_test_split(x_ros, y_ros, test_size=0.3, random_state=42)

In [316]:
models = [
        ('Logistic Regression', LogisticRegression()),
        ('SVM', SVC()),
        ('KNN', KNeighborsClassifier()),
        ('Decision Tree', DecisionTreeClassifier()),
        ('Random Forest',RandomForestClassifier())
    ]
    
for name, model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        print(f"Model: {name}")
        print("Accuracy:", accuracy_score(y_test, y_pred))
        print("Classification Report:\n", classification_report(y_test, y_pred))

Model: Logistic Regression
Accuracy: 0.7081339712918661
Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.61      0.66        99
           1       0.69      0.80      0.74       110

    accuracy                           0.71       209
   macro avg       0.71      0.70      0.70       209
weighted avg       0.71      0.71      0.70       209

Model: SVM
Accuracy: 0.7272727272727273
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.65      0.69        99
           1       0.72      0.80      0.76       110

    accuracy                           0.73       209
   macro avg       0.73      0.72      0.72       209
weighted avg       0.73      0.73      0.73       209

Model: KNN
Accuracy: 0.7799043062200957
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.80      0.77        99
           1       0.81      

In [317]:
# Model Selection and Hyperparameter Tuning
best_model = None
best_accuracy = 0.0
best_params = None
models = [('Logistic Regression', LogisticRegression(), {'C': [0.001, 0.01, 0.1, 1, 10], 
                                                       'penalty': ['l1', 'l2', 'elasticnet', 'none'],
                                                       'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                                                       'max_iter': [100, 200, 300]}),
        ('SVM', SVC(), {'C': [0.1, 1, 10], 
                        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 
                        'gamma': ['scale', 'auto']}),
        ('KNN', KNeighborsClassifier(), {'n_neighbors': [3, 5, 7],
                                         'weights': ['uniform', 'distance'],
                                         'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                                         'p': [1, 2]}),
        ('Decision Tree', DecisionTreeClassifier(), {'max_depth': [None, 10, 20], 
                                                     'min_samples_split': [2, 5, 10]}),
        ('Random Forest',RandomForestClassifier(),{'n_estimators': [50, 100, 200],
                                                   'max_depth': [None, 10, 20],
                                                   'min_samples_split': [2, 5, 10],
                                                   'min_samples_leaf': [1, 2, 4],
                                                   'max_features': ['auto', 'sqrt', 'log2']})

    ]
for name, model, param_grid in models:
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train,y_train)
        
    y_pred = grid_search.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
        
    print(f"Model: {name}")
    print("Best Parameters:", grid_search.best_params_)
    print("Accuracy:", accuracy)
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("="*50)
        
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = name
        best_params = grid_search.best_params_
    
print("Best Model:", best_model)
print("Best Parameters:", best_params)
print("Best Accuracy:", best_accuracy)

Model: Logistic Regression
Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
Accuracy: 0.7607655502392344
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.53      0.68        99
           1       0.69      0.97      0.81       110

    accuracy                           0.76       209
   macro avg       0.82      0.75      0.74       209
weighted avg       0.81      0.76      0.75       209

Model: SVM
Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Accuracy: 0.7511961722488039
Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.76      0.74        99
           1       0.77      0.75      0.76       110

    accuracy                           0.75       209
   macro avg       0.75      0.75      0.75       209
weighted avg       0.75      0.75      0.75       209

Model: KNN
Best Parameters: {'algorithm': 'auto', 'n_n

### SMOTE

In [318]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [319]:
sm=SMOTE()

In [320]:
xtrain1,ytrain1=sm.fit_resample(X_train,y_train)

In [321]:
models = [
        ('Logistic Regression', LogisticRegression()),
        ('SVM', SVC()),
        ('KNN', KNeighborsClassifier()),
        ('Decision Tree', DecisionTreeClassifier()),
        ('Random Forest',RandomForestClassifier())
    ]
    
for name, model in models:
        model.fit(xtrain1, ytrain1)
        y_pred = model.predict(X_test)
        
        print(f"Model: {name}")
        print("Accuracy:", accuracy_score(y_test, y_pred))
        print("Classification Report:\n", classification_report(y_test, y_pred))

Model: Logistic Regression
Accuracy: 0.6883116883116883
Classification Report:
               precision    recall  f1-score   support

           0       0.54      0.54      0.54        52
           1       0.76      0.76      0.76       102

    accuracy                           0.69       154
   macro avg       0.65      0.65      0.65       154
weighted avg       0.69      0.69      0.69       154

Model: SVM
Accuracy: 0.7467532467532467
Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.60      0.61        52
           1       0.80      0.82      0.81       102

    accuracy                           0.75       154
   macro avg       0.72      0.71      0.71       154
weighted avg       0.74      0.75      0.74       154

Model: KNN
Accuracy: 0.6688311688311688
Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.60      0.55        52
           1       0.77      

Classification Report:
               precision    recall  f1-score   support

           0       0.48      0.58      0.52        52
           1       0.76      0.68      0.72       102

    accuracy                           0.64       154
   macro avg       0.62      0.63      0.62       154
weighted avg       0.66      0.64      0.65       154

Model: Random Forest
Accuracy: 0.7597402597402597
Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.58      0.62        52
           1       0.80      0.85      0.82       102

    accuracy                           0.76       154
   macro avg       0.73      0.71      0.72       154
weighted avg       0.75      0.76      0.76       154



In [322]:
best_model = None
best_accuracy = 0.0
best_params = None
models = [('Logistic Regression', LogisticRegression(), {'C': [0.001, 0.01, 0.1, 1, 10], 
                                                       'penalty': ['l1', 'l2', 'elasticnet', 'none'],
                                                       'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                                                       'max_iter': [100, 200, 300]}),
        ('SVM', SVC(), {'C': [0.1, 1, 10], 
                        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 
                        'gamma': ['scale', 'auto']}),
        ('KNN', KNeighborsClassifier(), {'n_neighbors': [3, 5, 7],
                                         'weights': ['uniform', 'distance'],
                                         'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                                         'p': [1, 2]}),
        ('Decision Tree', DecisionTreeClassifier(), {'max_depth': [None, 10, 20], 
                                                     'min_samples_split': [2, 5, 10]}),
        ('Random Forest',RandomForestClassifier(),{'n_estimators': [50, 100, 200],
                                                   'max_depth': [None, 10, 20],
                                                   'min_samples_split': [2, 5, 10],
                                                   'min_samples_leaf': [1, 2, 4],
                                                   'max_features': ['auto', 'sqrt', 'log2']})

    ]
for name, model, param_grid in models:
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(xtrain1,ytrain1)
        
    y_pred = grid_search.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
        
    print(f"Model: {name}")
    print("Best Parameters:", grid_search.best_params_)
    print("Accuracy:", accuracy)
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("="*50)
        
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = name
        best_params = grid_search.best_params_
    
print("Best Model:", best_model)
print("Best Parameters:", best_params)
print("Best Accuracy:", best_accuracy)

Model: Logistic Regression
Best Parameters: {'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy: 0.7467532467532467
Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.54      0.59        52
           1       0.78      0.85      0.82       102

    accuracy                           0.75       154
   macro avg       0.72      0.70      0.70       154
weighted avg       0.74      0.75      0.74       154

Model: SVM
Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Accuracy: 0.7467532467532467
Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.62      0.62        52
           1       0.81      0.81      0.81       102

    accuracy                           0.75       154
   macro avg       0.72      0.71      0.72       154
weighted avg       0.75      0.75      0.75       154

Model: KNN
Best Parameters: {'algorithm': 'auto', 'n_ne

### Oversample and SMOTE

In [323]:
X_train, X_test, y_train, y_test = train_test_split(x_ros, y_ros, test_size=0.3, random_state=42)

In [324]:
models = [
        ('Logistic Regression', LogisticRegression()),
        ('SVM', SVC()),
        ('KNN', KNeighborsClassifier()),
        ('Decision Tree', DecisionTreeClassifier()),
        ('Random Forest',RandomForestClassifier())
    ]
    
for name, model in models:
        model.fit(xtrain1, ytrain1)
        y_pred = model.predict(X_test)
        
        print(f"Model: {name}")
        print("Accuracy:", accuracy_score(y_test, y_pred))
        print("Classification Report:\n", classification_report(y_test, y_pred))
        print('='*50)

Model: Logistic Regression
Accuracy: 0.722488038277512
Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.69      0.70        99
           1       0.73      0.75      0.74       110

    accuracy                           0.72       209
   macro avg       0.72      0.72      0.72       209
weighted avg       0.72      0.72      0.72       209

Model: SVM
Accuracy: 0.7655502392344498
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.74      0.75        99
           1       0.77      0.79      0.78       110

    accuracy                           0.77       209
   macro avg       0.77      0.76      0.76       209
weighted avg       0.77      0.77      0.77       209

Model: KNN
Accuracy: 0.7751196172248804
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.79      0.77        99
           1       0.80      0

In [325]:
best_model = None
best_accuracy = 0.0
best_params = None
models = [('Logistic Regression', LogisticRegression(), {'C': [0.001, 0.01, 0.1, 1, 10], 
                                                       'penalty': ['l1', 'l2', 'elasticnet', 'none'],
                                                       'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                                                       'max_iter': [100, 200, 300]}),
        ('SVM', SVC(), {'C': [0.1, 1, 10], 
                        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 
                        'gamma': ['scale', 'auto']}),
        ('KNN', KNeighborsClassifier(), {'n_neighbors': [3, 5, 7],
                                         'weights': ['uniform', 'distance'],
                                         'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                                         'p': [1, 2]}),
        ('Decision Tree', DecisionTreeClassifier(), {'max_depth': [None, 10, 20], 
                                                     'min_samples_split': [2, 5, 10]}),
        ('Random Forest',RandomForestClassifier(),{'n_estimators': [50, 100, 200],
                                                   'max_depth': [None, 10, 20],
                                                   'min_samples_split': [2, 5, 10],
                                                   'min_samples_leaf': [1, 2, 4],
                                                   'max_features': ['auto', 'sqrt', 'log2']})

    ]
for name, model, param_grid in models:
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(xtrain1,ytrain1)
        
    y_pred = grid_search.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
        
    print(f"Model: {name}")
    print("Best Parameters:", grid_search.best_params_)
    print("Accuracy:", accuracy)
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("="*50)
        
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = name
        best_params = grid_search.best_params_
    
print("Best Model:", best_model)
print("Best Parameters:", best_params)
print("Best Accuracy:", best_accuracy)

Model: Logistic Regression
Best Parameters: {'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy: 0.7416267942583732
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.67      0.71        99
           1       0.73      0.81      0.77       110

    accuracy                           0.74       209
   macro avg       0.74      0.74      0.74       209
weighted avg       0.74      0.74      0.74       209

Model: SVM
Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Accuracy: 0.7799043062200957
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.74      0.76        99
           1       0.78      0.82      0.80       110

    accuracy                           0.78       209
   macro avg       0.78      0.78      0.78       209
weighted avg       0.78      0.78      0.78       209

Model: KNN
Best Parameters: {'algorithm': 'auto', 'n_ne

In [326]:
bg=BaggingClassifier(KNeighborsClassifier(algorithm='auto', n_neighbors= 7, p= 1, weights= 'distance'))
bg.fit(xtrain1,ytrain1)
ypred=bg.predict(X_test)
print(classification_report(y_test,ypred))

              precision    recall  f1-score   support

           0       0.88      0.85      0.86        99
           1       0.87      0.89      0.88       110

    accuracy                           0.87       209
   macro avg       0.87      0.87      0.87       209
weighted avg       0.87      0.87      0.87       209



In [327]:
bg=BaggingClassifier(SVC())
bg.fit(xtrain1,ytrain1)
ypred=bg.predict(X_test)
print(classification_report(y_test,ypred))

              precision    recall  f1-score   support

           0       0.75      0.73      0.74        99
           1       0.76      0.78      0.77       110

    accuracy                           0.76       209
   macro avg       0.76      0.75      0.75       209
weighted avg       0.76      0.76      0.76       209



In [328]:
bg=BaggingClassifier(RandomForestClassifier())
bg.fit(xtrain1,ytrain1)
ypred=bg.predict(X_test)
print(classification_report(y_test,ypred))

              precision    recall  f1-score   support

           0       0.90      0.85      0.88        99
           1       0.87      0.92      0.89       110

    accuracy                           0.89       209
   macro avg       0.89      0.88      0.88       209
weighted avg       0.89      0.89      0.88       209

