# DATASET LOAN

In [6]:
import pandas as pd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.impute import SimpleImputer
# Load the dataset
data = pd.read_csv("loan_data.csv")

# Check for any missing values
print(data.isnull().sum())


# Check the data types of each column
print(data.dtypes)


credit.policy        0
purpose              0
int.rate             0
installment          0
log.annual.inc       0
dti                  0
fico                 0
days.with.cr.line    0
revol.bal            0
revol.util           0
inq.last.6mths       0
delinq.2yrs          0
pub.rec              0
not.fully.paid       0
dtype: int64
credit.policy          int64
purpose               object
int.rate             float64
installment          float64
log.annual.inc       float64
dti                  float64
fico                   int64
days.with.cr.line    float64
revol.bal              int64
revol.util           float64
inq.last.6mths         int64
delinq.2yrs            int64
pub.rec                int64
not.fully.paid         int64
dtype: object


In [16]:
# Encode the 'purpose' column using LabelEncoder
label_encoder = LabelEncoder()
data['purpose'] = label_encoder.fit_transform(data['purpose'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [13]:
print(data.dtypes)


credit.policy          int64
purpose                int64
int.rate             float64
installment          float64
log.annual.inc       float64
dti                  float64
fico                   int64
days.with.cr.line    float64
revol.bal              int64
revol.util           float64
inq.last.6mths         int64
delinq.2yrs            int64
pub.rec                int64
not.fully.paid         int64
dtype: object


In [14]:
data

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,2,0.1189,829.10,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,1,0.1071,228.22,11.082143,14.29,707,2760.000000,33623,76.7,0,0,0,0
2,1,2,0.1357,366.86,10.373491,11.63,682,4710.000000,3511,25.6,1,0,0,0
3,1,2,0.1008,162.34,11.350407,8.10,712,2699.958333,33667,73.2,1,0,0,0
4,1,1,0.1426,102.92,11.299732,14.97,667,4066.000000,4740,39.5,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9573,0,0,0.1461,344.76,12.180755,10.39,672,10474.000000,215372,82.1,2,0,0,1
9574,0,0,0.1253,257.70,11.141862,0.21,722,4380.000000,184,1.1,5,0,0,1
9575,0,2,0.1071,97.81,10.596635,13.09,687,3450.041667,10036,82.9,8,0,0,1
9576,0,4,0.1600,351.58,10.819778,19.18,692,1800.000000,0,3.2,5,0,0,1


In [66]:
# Count the occurrences of each class label in the resampled training and testing sets
train_balance = pd.Series(y_train).value_counts()
test_balance = pd.Series(y_test).value_counts()

# Print the class distribution
print("Class distribution in the Original training set:")
print(train_balance)

print("\nClass distribution in the Original testing set:")
print(test_balance)

Class distribution in the Original training set:
not.fully.paid
0    6434
1    1228
Name: count, dtype: int64

Class distribution in the Original testing set:
not.fully.paid
0    1611
1     305
Name: count, dtype: int64


# BASELINE CLASSIFICATION ALGORITHMS WITH IMBALANCED DATASET

## KNN

In [25]:
def correlation_method(X_train, X_test, threshold):
    corr_matrix = X_train.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    X_train_selected = X_train.drop(to_drop, axis=1)
    X_test_selected = X_test.drop(to_drop, axis=1)
    
    return X_train_selected, X_test_selected

def mutual_information(X_train, y_train, X_test, n_features):
    selector = SelectKBest(score_func=mutual_info_classif, k=n_features)
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)
    return X_train_selected, X_test_selected

def covariance_method(X_train, X_test, threshold):
    cov_matrix = X_train.cov().abs()
    upper = cov_matrix.where(np.triu(np.ones(cov_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    X_train_selected = X_train.drop(to_drop, axis=1)
    X_test_selected = X_test.drop(to_drop, axis=1)
    
    return X_train_selected, X_test_selected

from sklearn.decomposition import PCA

# PCA Dimensionality Reduction
def perform_pca(X_train, X_test, n_components):
    pca = PCA(n_components=n_components)
    pca.fit(X_train)
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    return X_train_pca, X_test_pca

In [26]:
# Initialize KNN classifier
knn = KNeighborsClassifier()

# Loop through feature selection methods
for name, method in methods.items():
    if name == 'PCA':
        # No need for feature selection, so we skip this part
        selector = None
    else:
        # Select features using the feature selection method
        selector = SelectKBest(method)
    
    # Initialize scaler
    scaler = MinMaxScaler()
    
    # Initialize PCA
    pca = PCA(n_components=5)
    
    # Create a pipeline
    pipe = Pipeline([
        ("scaler", scaler),
        ("selector", selector),
        ("pca", pca),
        ("knn", knn)
    ])
    
    # Evaluate the model using cross-validation
    cv_accuracy = cross_val_score(pipe, X, y, cv=5, scoring='accuracy')
    cv_precision = cross_val_score(pipe, X, y, cv=5, scoring='precision')
    cv_recall = cross_val_score(pipe, X, y, cv=5, scoring='recall')
    cv_f1 = cross_val_score(pipe, X, y, cv=5, scoring='f1')
    
    print("Feature Selection Method:", name)
    print("Cross-validation Metrics:")
    print("Mean Accuracy:", cv_accuracy.mean())
    print("Mean Precision:", cv_precision.mean())
    print("Mean Recall:", cv_recall.mean())
    print("Mean F1 Score:", cv_f1.mean())
    
    # Add a space between results for each method
    print()

# Fit the KNN classifier on the training data
knn.fit(X_train, y_train)

# Predict using the trained KNN classifier
y_pred_knn = knn.predict(X_test)

# KNN Classifier Metrics
print("KNN Classifier Metrics:")
print("Accuracy:", accuracy_score(y_test, y_pred_knn))
print("Precision:", precision_score(y_test, y_pred_knn))
print("Recall:", recall_score(y_test, y_pred_knn))
print("F1 Score:", f1_score(y_test, y_pred_knn))

# Cross-validation for KNN Classifier
cv_accuracy_knn = cross_val_score(knn, X, y, cv=5, scoring='accuracy')
cv_precision_knn = cross_val_score(knn, X, y, cv=5, scoring='precision')
cv_recall_knn = cross_val_score(knn, X, y, cv=5, scoring='recall')
cv_f1_knn = cross_val_score(knn, X, y, cv=5, scoring='f1')

print()

print("Cross-validation Metrics for KNN Classifier:")
print("Mean Accuracy:", cv_accuracy_knn.mean())
print("Mean Precision:", cv_precision_knn.mean())
print("Mean Recall:", cv_recall_knn.mean())
print("Mean F1 Score:", cv_f1_knn.mean())

Feature Selection Method: Chi-Square
Cross-validation Metrics:
Mean Accuracy: 0.7205757207411001
Mean Precision: 0.25398451471169364
Mean Recall: 0.2339124140427072
Mean F1 Score: 0.12031195377862694

Feature Selection Method: Correlation
Cross-validation Metrics:
Mean Accuracy: 0.7186968063360897
Mean Precision: 0.217317848045027
Mean Recall: 0.23130655085052482
Mean F1 Score: 0.1153589440898148

Feature Selection Method: Covariance
Cross-validation Metrics:
Mean Accuracy: 0.7186968063360897
Mean Precision: 0.217317848045027
Mean Recall: 0.23130655085052482
Mean F1 Score: 0.1153589440898148

Feature Selection Method: Mutual Information
Cross-validation Metrics:
Mean Accuracy: 0.7197410292330082
Mean Precision: 0.19692325781608955
Mean Recall: 0.22478763492367632
Mean F1 Score: 0.1158937839941184

Feature Selection Method: PCA
Cross-validation Metrics:
Mean Accuracy: 0.7256937047918587
Mean Precision: 0.2144609725560512
Mean Recall: 0.21431521577143348
Mean F1 Score: 0.1110163641906261

## LOGISTIC REGRESSION

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def logistic_regression_classifier(X, y, cv=5):
    # Standardizing the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Initialize Logistic Regression classifier
    logistic_reg = LogisticRegression()
    
    # Perform cross-validation
    accuracy_scores = cross_val_score(logistic_reg, X_scaled, y, cv=cv, scoring='accuracy')
    precision_scores = cross_val_score(logistic_reg, X_scaled, y, cv=cv, scoring='precision')
    recall_scores = cross_val_score(logistic_reg, X_scaled, y, cv=cv, scoring='recall')
    f1_scores = cross_val_score(logistic_reg, X_scaled, y, cv=cv, scoring='f1')
    
    # Compute mean scores
    accuracy = accuracy_scores.mean()
    precision = precision_scores.mean()
    recall = recall_scores.mean()
    f1 = f1_scores.mean()
    
    return accuracy, precision, recall, f1

# Example usage:
accuracy, precision, recall, f1 = logistic_regression_classifier(X_train, y_train)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print()

X_train_corr, X_test_corr = correlation_method(X_train, X_test, threshold=0.8)
accuracy_corr, precision_corr, recall_corr, f1_corr = logistic_regression_classifier(X_train_corr, y_train)
print("Accuracy with Correlation method:", accuracy_corr)
print("Precision with Correlation method:", precision_corr)
print("Recall with Correlation method:", recall_corr)
print("F1 Score with Correlation method:", f1_corr)
print()

X_train_mi, X_test_mi = mutual_information(X_train, y_train, X_test, n_features=10)
accuracy_mi, precision_mi, recall_mi, f1_mi = logistic_regression_classifier(X_train_mi, y_train)
print("Accuracy with Mutual Information method:", accuracy_mi)
print("Precision with Mutual Information method:", precision_mi)
print("Recall with Mutual Information method:", recall_mi)
print("F1 Score with Mutual Information method:", f1_mi)
print()


X_train_cov, X_test_cov = covariance_method(X_train, X_test, threshold=0.8)
accuracy_cov, precision_cov, recall_cov, f1_cov = logistic_regression_classifier(X_train_cov, y_train)
print("Accuracy with Covariance method:", accuracy_cov)
print("Precision with Covariance method:", precision_cov)
print("Recall with Covariance method:", recall_cov)
print("F1 Score with Covariance method:", f1_cov)
print()


def main(X_train, X_test, y_train, y_test):
    # Perform PCA
    n_components = 2  # Number of components to retain
    X_train_pca, X_test_pca = perform_pca(X_train, X_test, n_components)
    
    # Evaluate Logistic classifier using PCA
    accuracy, precision, recall, f1 = logistic_regression_classifier(X_train_pca, y_train)
    
    print("Accuracy with PCA:", accuracy)
    print("Precision with PCA:", precision)
    print("Recall with PCA:", recall)
    print("F1 Score with PCA:", f1)
    print()

if __name__ == "__main__":
    main(X_train, X_test, y_train, y_test)

Accuracy: 0.8393371927260835
Precision: 0.4709523809523809
Recall: 0.016290028206404513
F1 Score: 0.03142379382070213

Accuracy with Correlation method: 0.8393371927260835
Precision with Correlation method: 0.4709523809523809
Recall with Correlation method: 0.016290028206404513
F1 Score with Correlation method: 0.03142379382070213

Accuracy with Mutual Information method: 0.839598374490538
Precision with Mutual Information method: 0.5257142857142858
Recall with Mutual Information method: 0.015480338476854155
F1 Score with Mutual Information method: 0.029931052745352356

Accuracy with Covariance method: 0.8395982041731175
Precision with Covariance method: 0.2
Recall with Covariance method: 0.0016326530612244899
F1 Score with Covariance method: 0.0032388663967611335



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy with PCA: 0.8395981190144072
Precision with PCA: 0.06666666666666667
Recall with PCA: 0.0008130081300813009
F1 Score with PCA: 0.0016064257028112448



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## RANDOM FOREST

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score

def random_forest_classifier(X, y, cv=5):
    # Initialize Random Forest classifier
    rf_classifier = RandomForestClassifier(random_state=42)
    
    # Perform cross-validation
    accuracy_scores = cross_val_score(rf_classifier, X, y, cv=cv, scoring='accuracy')
    precision_scores = cross_val_score(rf_classifier, X, y, cv=cv, scoring='precision')
    recall_scores = cross_val_score(rf_classifier, X, y, cv=cv, scoring='recall')
    f1_scores = cross_val_score(rf_classifier, X, y, cv=cv, scoring='f1')
    
    # Compute mean scores
    accuracy = accuracy_scores.mean()
    precision = precision_scores.mean()
    recall = recall_scores.mean()
    f1 = f1_scores.mean()
    
    return accuracy, precision, recall, f1

# Assuming you have already split your data into X_train, X_test, y_train, y_test
accuracy, precision, recall, f1 = random_forest_classifier(X_train, y_train)

# Print the results
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print()


X_train_corr, X_test_corr = correlation_method(X_train, X_test, threshold=0.8)
accuracy_corr, precision_corr, recall_corr, f1_corr = random_forest_classifier(X_train_corr, y_train)
print("Accuracy with Correlation method:", accuracy_corr)
print("Precision with Correlation method:", precision_corr)
print("Recall with Correlation method:", recall_corr)
print("F1 Score with Correlation method:", f1_corr)
print()

X_train_mi, X_test_mi = mutual_information(X_train, y_train, X_test, n_features=10)
accuracy_mi, precision_mi, recall_mi, f1_mi = random_forest_classifier(X_train_mi, y_train)
print("Accuracy with Mutual Information method:", accuracy_mi)
print("Precision with Mutual Information method:", precision_mi)
print("Recall with Mutual Information method:", recall_mi)
print("F1 Score with Mutual Information method:", f1_mi)
print()


X_train_cov, X_test_cov = covariance_method(X_train, X_test, threshold=0.8)
accuracy_cov, precision_cov, recall_cov, f1_cov = random_forest_classifier(X_train_cov, y_train)
print("Accuracy with Covariance method:", accuracy_cov)
print("Precision with Covariance method:", precision_cov)
print("Recall with Covariance method:", recall_cov)
print("F1 Score with Covariance method:", f1_cov)
print()


def main(X_train, X_test, y_train, y_test):
    # Perform PCA
    n_components = 2  # Number of components to retain
    X_train_pca, X_test_pca = perform_pca(X_train, X_test, n_components)
    
    # Evaluate Logistic classifier using PCA
    accuracy, precision, recall, f1 = random_forest_classifier(X_train_pca, y_train)
    
    print("Accuracy with PCA:", accuracy)
    print("Precision with PCA:", precision)
    print("Recall with PCA:", recall)
    print("F1 Score with PCA:", f1)
    print()

if __name__ == "__main__":
    main(X_train, X_test, y_train, y_test)

Accuracy: 0.8380311987451012
Precision: 0.40166666666666667
Recall: 0.021174713787954203
F1 Score: 0.040078977439165386

Accuracy with Correlation method: 0.8380311987451012
Precision with Correlation method: 0.40166666666666667
Recall with Correlation method: 0.021174713787954203
F1 Score with Correlation method: 0.040078977439165386

Accuracy with Mutual Information method: 0.836857541399907
Precision with Mutual Information method: 0.41213423831070894
Recall with Mutual Information method: 0.03176372988219678
F1 Score with Mutual Information method: 0.0587698931167981

Accuracy with Covariance method: 0.8115372169111573
Precision with Covariance method: 0.26393796750725157
Recall with Covariance method: 0.09608428737348598
F1 Score with Covariance method: 0.14075831006277745

Accuracy with PCA: 0.8136250530112971
Precision with PCA: 0.20370294817123175
Recall with PCA: 0.05536087605774017
F1 Score with PCA: 0.08703955821486682



## SVM

In [32]:
from sklearn.svm import SVC, LinearSVC
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score

def svm_classifier(X, y, cv=5):
    # Initialize SVM classifier
    svm_classifier = SVC(kernel='rbf', random_state=42)
    
    # Perform cross-validation
    accuracy_scores = cross_val_score(svm_classifier, X, y, cv=cv, scoring='accuracy')
    precision_scores = cross_val_score(svm_classifier, X, y, cv=cv, scoring='precision')
    recall_scores = cross_val_score(svm_classifier, X, y, cv=cv, scoring='recall')
    f1_scores = cross_val_score(svm_classifier, X, y, cv=cv, scoring='f1')
    
    # Compute mean scores
    accuracy = accuracy_scores.mean()
    precision = precision_scores.mean()
    recall = recall_scores.mean()
    f1 = f1_scores.mean()
    
    return accuracy, precision, recall, f1

# Example usage:
accuracy, precision, recall, f1 = svm_classifier(X_train, y_train)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


X_train_corr, X_test_corr = correlation_method(X_train, X_test, threshold=0.8)
accuracy_corr, precision_corr, recall_corr, f1_corr = svm_classifier(X_train_corr, y_train)
print("Accuracy with Correlation method:", accuracy_corr)
print("Precision with Correlation method:", precision_corr)
print("Recall with Correlation method:", recall_corr)
print("F1 Score with Correlation method:", f1_corr)
print()

X_train_mi, X_test_mi = mutual_information(X_train, y_train, X_test, n_features=10)
accuracy_mi, precision_mi, recall_mi, f1_mi = svm_classifier(X_train_mi, y_train)
print("Accuracy with Mutual Information method:", accuracy_mi)
print("Precision with Mutual Information method:", precision_mi)
print("Recall with Mutual Information method:", recall_mi)
print("F1 Score with Mutual Information method:", f1_mi)
print()


X_train_cov, X_test_cov = covariance_method(X_train, X_test, threshold=0.8)
accuracy_cov, precision_cov, recall_cov, f1_cov = svm_classifier(X_train_cov, y_train)
print("Accuracy with Covariance method:", accuracy_cov)
print("Precision with Covariance method:", precision_cov)
print("Recall with Covariance method:", recall_cov)
print("F1 Score with Covariance method:", f1_cov)
print()


def main(X_train, X_test, y_train, y_test):
    # Perform PCA
    n_components = 2  # Number of components to retain
    X_train_pca, X_test_pca = perform_pca(X_train, X_test, n_components)
    
    # Evaluate Logistic classifier using PCA
    accuracy, precision, recall, f1 = svm_classifier(X_train_pca, y_train)
    
    print("Accuracy with PCA:", accuracy)
    print("Precision with PCA:", precision)
    print("Recall with PCA:", recall)
    print("F1 Score with PCA:", f1)
    print()

if __name__ == "__main__":
    main(X_train, X_test, y_train, y_test)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.8397285821585688
Precision: 0.2
Recall: 0.0008130081300813009
F1 Score: 0.0016194331983805667


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy with Correlation method: 0.8397285821585688
Precision with Correlation method: 0.2
Recall with Correlation method: 0.0008130081300813009
F1 Score with Correlation method: 0.0016194331983805667



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy with Mutual Information method: 0.8397285821585688
Precision with Mutual Information method: 0.2
Recall with Mutual Information method: 0.0008130081300813009
F1 Score with Mutual Information method: 0.0016194331983805667



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy with Covariance method: 0.8397285821585688
Precision with Covariance method: 0.0
Recall with Covariance method: 0.0
F1 Score with Covariance method: 0.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy with PCA: 0.8399895936056028
Precision with PCA: 0.4
Recall with PCA: 0.0016260162601626018
F1 Score with PCA: 0.0032388663967611335



## NAIVE BAYES

In [30]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from statistics import mean

def naive_bayes_classifier(X, y, cv=5):
    # Initialize Naive Bayes classifier
    nb_classifier = GaussianNB()
    
    # Perform cross-validation
    accuracy_scores = cross_val_score(nb_classifier, X, y, cv=cv, scoring='accuracy')
    precision_scores = cross_val_score(nb_classifier, X, y, cv=cv, scoring='precision')
    recall_scores = cross_val_score(nb_classifier, X, y, cv=cv, scoring='recall')
    f1_scores = cross_val_score(nb_classifier, X, y, cv=cv, scoring='f1')
    
    # Compute mean scores
    accuracy = mean(accuracy_scores)
    precision = mean(precision_scores)
    recall = mean(recall_scores)
    f1 = mean(f1_scores)
    
    return accuracy, precision, recall, f1

# Example usage:
accuracy, precision, recall, f1 = naive_bayes_classifier(X_train, y_train)

# Print the results
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


X_train_corr, X_test_corr = correlation_method(X_train, X_test, threshold=0.8)
accuracy_corr, precision_corr, recall_corr, f1_corr = naive_bayes_classifier(X_train_corr, y_train)
print("Accuracy with Correlation method:", accuracy_corr)
print("Precision with Correlation method:", precision_corr)
print("Recall with Correlation method:", recall_corr)
print("F1 Score with Correlation method:", f1_corr)
print()

X_train_mi, X_test_mi = mutual_information(X_train, y_train, X_test, n_features=10)
accuracy_mi, precision_mi, recall_mi, f1_mi = naive_bayes_classifier(X_train_mi, y_train)
print("Accuracy with Mutual Information method:", accuracy_mi)
print("Precision with Mutual Information method:", precision_mi)
print("Recall with Mutual Information method:", recall_mi)
print("F1 Score with Mutual Information method:", f1_mi)
print()


X_train_cov, X_test_cov = covariance_method(X_train, X_test, threshold=0.8)
accuracy_cov, precision_cov, recall_cov, f1_cov = naive_bayes_classifier(X_train_cov, y_train)
print("Accuracy with Covariance method:", accuracy_cov)
print("Precision with Covariance method:", precision_cov)
print("Recall with Covariance method:", recall_cov)
print("F1 Score with Covariance method:", f1_cov)
print()


def main(X_train, X_test, y_train, y_test):
    # Perform PCA
    n_components = 2  # Number of components to retain
    X_train_pca, X_test_pca = perform_pca(X_train, X_test, n_components)
    
    # Evaluate Logistic classifier using PCA
    accuracy, precision, recall, f1 = naive_bayes_classifier(X_train_pca, y_train)
    
    print("Accuracy with PCA:", accuracy)
    print("Precision with PCA:", precision)
    print("Recall with PCA:", recall)
    print("F1 Score with PCA:", f1)
    print()

if __name__ == "__main__":
    main(X_train, X_test, y_train, y_test)

Accuracy: 0.8228916832300358
Precision: 0.32770772148692545
Recall: 0.09934627509540402
F1 Score: 0.1521681241279163
Accuracy with Correlation method: 0.8228916832300358
Precision with Correlation method: 0.32770772148692545
Recall with Correlation method: 0.09934627509540402
F1 Score with Correlation method: 0.1521681241279163

Accuracy with Mutual Information method: 0.8228915980713255
Precision with Mutual Information method: 0.32931397578460947
Recall with Mutual Information method: 0.10097229135556662
F1 Score with Mutual Information method: 0.15435650744990337

Accuracy with Covariance method: 0.8204110099993357
Precision with Covariance method: 0.3218147229114971
Recall with Covariance method: 0.10669653227144517
F1 Score with Covariance method: 0.15927314308800292

Accuracy with PCA: 0.829939758728342
Precision with PCA: 0.26937903225806453
Recall with PCA: 0.034196117471378795
F1 Score with PCA: 0.06053546362742611



## DECISION TREE

In [31]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from statistics import mean

def decision_tree_classifier(X, y, cv=5):
    # Initialize Decision Tree classifier
    dt_classifier = DecisionTreeClassifier(random_state=42)
    
    # Perform cross-validation
    accuracy_scores = cross_val_score(dt_classifier, X, y, cv=cv, scoring='accuracy')
    precision_scores = cross_val_score(dt_classifier, X, y, cv=cv, scoring='precision')
    recall_scores = cross_val_score(dt_classifier, X, y, cv=cv, scoring='recall')
    f1_scores = cross_val_score(dt_classifier, X, y, cv=cv, scoring='f1')
    
    # Compute mean scores
    accuracy = mean(accuracy_scores)
    precision = mean(precision_scores)
    recall = mean(recall_scores)
    f1 = mean(f1_scores)
    
    return accuracy, precision, recall, f1

# Example usage:
accuracy, precision, recall, f1 = decision_tree_classifier(X_train, y_train)

# Print the results
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print()

X_train_corr, X_test_corr = correlation_method(X_train, X_test, threshold=0.8)
accuracy_corr, precision_corr, recall_corr, f1_corr = decision_tree_classifier(X_train_corr, y_train)
print("Accuracy with Correlation method:", accuracy_corr)
print("Precision with Correlation method:", precision_corr)
print("Recall with Correlation method:", recall_corr)
print("F1 Score with Correlation method:", f1_corr)
print()

X_train_mi, X_test_mi = mutual_information(X_train, y_train, X_test, n_features=10)
accuracy_mi, precision_mi, recall_mi, f1_mi = decision_tree_classifier(X_train_mi, y_train)
print("Accuracy with Mutual Information method:", accuracy_mi)
print("Precision with Mutual Information method:", precision_mi)
print("Recall with Mutual Information method:", recall_mi)
print("F1 Score with Mutual Information method:", f1_mi)
print()


X_train_cov, X_test_cov = covariance_method(X_train, X_test, threshold=0.8)
accuracy_cov, precision_cov, recall_cov, f1_cov = decision_tree_classifier(X_train_cov, y_train)
print("Accuracy with Covariance method:", accuracy_cov)
print("Precision with Covariance method:", precision_cov)
print("Recall with Covariance method:", recall_cov)
print("F1 Score with Covariance method:", f1_cov)
print()


def main(X_train, X_test, y_train, y_test):
    # Perform PCA
    n_components = 2  # Number of components to retain
    X_train_pca, X_test_pca = perform_pca(X_train, X_test, n_components)
    
    # Evaluate Logistic classifier using PCA
    accuracy, precision, recall, f1 = decision_tree_classifier(X_train_pca, y_train)
    
    print("Accuracy with PCA:", accuracy)
    print("Precision with PCA:", precision)
    print("Recall with PCA:", recall)
    print("F1 Score with PCA:", f1)
    print()

if __name__ == "__main__":
    main(X_train, X_test, y_train, y_test)


Accuracy: 0.7342718674794214
Precision: 0.20168497283082398
Recall: 0.22148000663680106
F1 Score: 0.21103974498329137

Accuracy with Correlation method: 0.7342718674794214
Precision with Correlation method: 0.20168497283082398
Recall with Correlation method: 0.22148000663680106
F1 Score with Correlation method: 0.21103974498329137

Accuracy with Mutual Information method: 0.741190416579379
Precision with Mutual Information method: 0.22696498507846494
Recall with Mutual Information method: 0.2557059897129583
F1 Score with Mutual Information method: 0.24038864998743795

Accuracy with Covariance method: 0.8179313586731591
Precision with Covariance method: 0.28847492891554455
Recall with Covariance method: 0.0887473037995686
F1 Score with Covariance method: 0.13543908998195306

Accuracy with PCA: 0.7366197782807818
Precision with PCA: 0.18745845536342234
Recall with PCA: 0.1921718931475029
F1 Score with PCA: 0.18955038567636542



# 1- BALANCING TECHNIQUE: SMOTE

## SAMPLING

In [33]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to the entire dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split the resampled data into training and testing sets
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

import pandas as pd

# Count the occurrences of each class label in the resampled training and testing sets
train_balance = pd.Series(y_train_s).value_counts()
test_balance = pd.Series(y_test_s).value_counts()

# Print the class distribution
print("Class distribution in the resampled training set:")
print(train_balance)

print("\nClass distribution in the resampled testing set:")
print(test_balance)

import pandas as pd

# Create a DataFrame for the resampled data
resampled_data = pd.concat([pd.DataFrame(X_resampled), pd.DataFrame(y_resampled)], axis=1)

# Define the column names
column_names = list(X.columns) + ['purpose']

# Set the column names
resampled_data.columns = column_names

# Save the resampled data to a CSV file
resampled_data.to_excel('loan_SMOTE.xlsx', index=False)


Class distribution in the resampled training set:
not.fully.paid
1    6441
0    6431
Name: count, dtype: int64

Class distribution in the resampled testing set:
not.fully.paid
0    1614
1    1604
Name: count, dtype: int64


## KNN

In [69]:
from sklearn.model_selection import cross_val_score

def knn_classifier(X, y, cv=5):
    # Initialize KNN classifier
    knn = KNeighborsClassifier()
    
    # Standardizing the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Perform cross-validation
    accuracy_scores = cross_val_score(knn, X_scaled, y, cv=cv, scoring='accuracy')
    precision_scores = cross_val_score(knn, X_scaled, y, cv=cv, scoring='precision')
    recall_scores = cross_val_score(knn, X_scaled, y, cv=cv, scoring='recall')
    f1_scores = cross_val_score(knn, X_scaled, y, cv=cv, scoring='f1')
    
    # Compute mean scores
    accuracy = accuracy_scores.mean()
    precision = precision_scores.mean()
    recall = recall_scores.mean()
    f1 = f1_scores.mean()
    
    return accuracy, precision, recall, f1

# Example usage:
accuracy, precision, recall, f1 = knn_classifier(X_train, y_train)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print()

X_train_corr, X_test_corr = correlation_method(X_train_s, X_test_s, threshold=0.8)
accuracy_corr, precision_corr, recall_corr, f1_corr = knn_classifier(X_train_corr, y_train_s)
print("Accuracy with Correlation method:", accuracy_corr)
print("Precision with Correlation method:", precision_corr)
print("Recall with Correlation method:", recall_corr)
print("F1 Score with Correlation method:", f1_corr)
print()

X_train_mi, X_test_mi = mutual_information(X_train_s, y_train_s, X_test_s, n_features=10)
accuracy_mi, precision_mi, recall_mi, f1_mi = knn_classifier(X_train_mi, y_train_s)
print("Accuracy with Mutual Information method:", accuracy_mi)
print("Precision with Mutual Information method:", precision_mi)
print("Recall with Mutual Information method:", recall_mi)
print("F1 Score with Mutual Information method:", f1_mi)
print()

X_train_cov, X_test_cov = covariance_method(X_train_s, X_test_s, threshold=0.8)
accuracy_cov, precision_cov, recall_cov, f1_cov = knn_classifier(X_train_cov, y_train_s)
print("Accuracy with Covariance method:", accuracy_cov)
print("Precision with Covariance method:", precision_cov)
print("Recall with Covariance method:", recall_cov)
print("F1 Score with Covariance method:", f1_cov)
print()

def main(X_train_s, X_test_s, y_train_s, y_test_s):
    # Perform PCA
    n_components = 2  # Number of components to retain
    X_train_pca, X_test_pca = perform_pca(X_train_s, X_test_s, n_components)
    
    # Evaluate Logistic classifier using PCA
    accuracy, precision, recall, f1 = knn_classifier(X_train_pca, y_train_s)
    
    print("Accuracy with PCA:", accuracy)
    print("Precision with PCA:", precision)
    print("Recall with PCA:", recall)
    print("F1 Score with PCA:", f1)
    print()

if __name__ == "__main__":
    main(X_train_s, X_test_s, y_train_s, y_test_s)


Accuracy: 0.8218478077593211
Precision: 0.2787018589693309
Recall: 0.0692218350754936
F1 Score: 0.11077855836379942

Accuracy with Correlation method: 0.7440175919010871
Precision with Correlation method: 0.7095559966166499
Recall with Correlation method: 0.827044292604889
F1 Score with Correlation method: 0.763780955300253

Accuracy with Mutual Information method: 0.7322091112770723
Precision with Mutual Information method: 0.69593370211991
Recall with Mutual Information method: 0.8259563723624168
F1 Score with Mutual Information method: 0.7553225056476818

Accuracy with Covariance method: 0.744871568560889
Precision with Covariance method: 0.7790188748156985
Recall with Covariance method: 0.6843676064550015
F1 Score with Covariance method: 0.7285424716146494

Accuracy with PCA: 0.6294298021288312
Precision with PCA: 0.619350222192067
Recall with PCA: 0.6736550072519985
F1 Score with PCA: 0.6453257091963703



## LOGISTIC REGRESISION

In [40]:
accuracy, precision, recall, f1 = logistic_regression_classifier(X_train_s, y_train_s)
print('Logistic regression')
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print()

X_train_corr, X_test_corr = correlation_method(X_train_s, X_test_s, threshold=0.8)
accuracy_corr, precision_corr, recall_corr, f1_corr = logistic_regression_classifier(X_train_corr, y_train_s)
print("Accuracy with Correlation method:", accuracy_corr)
print("Precision with Correlation method:", precision_corr)
print("Recall with Correlation method:", recall_corr)
print("F1 Score with Correlation method:", f1_corr)
print()

X_train_mi, X_test_mi = mutual_information(X_train_s, y_train_s, X_test_s, n_features=10)
accuracy_mi, precision_mi, recall_mi, f1_mi = logistic_regression_classifier(X_train_mi, y_train_s)
print("Accuracy with Mutual Information method:", accuracy_mi)
print("Precision with Mutual Information method:", precision_mi)
print("Recall with Mutual Information method:", recall_mi)
print("F1 Score with Mutual Information method:", f1_mi)
print()

X_train_cov, X_test_cov = covariance_method(X_train_s, X_test_s, threshold=0.8)
accuracy_cov, precision_cov, recall_cov, f1_cov = logistic_regression_classifier(X_train_cov, y_train_s)
print("Accuracy with Covariance method:", accuracy_cov)
print("Precision with Covariance method:", precision_cov)
print("Recall with Covariance method:", recall_cov)
print("F1 Score with Covariance method:", f1_cov)
print()

def main(X_train_s, X_test_s, y_train_s, y_test_s):
    # Perform PCA
    n_components = 2  # Number of components to retain
    X_train_pca, X_test_pca = perform_pca(X_train_s, X_test_s, n_components)
    
    # Evaluate Logistic classifier using PCA
    accuracy, precision, recall, f1 = logistic_regression_classifier(X_train_pca, y_train_s)
    
    print("Accuracy with PCA:", accuracy)
    print("Precision with PCA:", precision)
    print("Recall with PCA:", recall)
    print("F1 Score with PCA:", f1)
    print()
    
if __name__ == "__main__":
    main(X_train_s, X_test_s, y_train_s, y_test_s)

Logistic regression
Accuracy: 0.6689712962334322
Precision: 0.6987554858739301
Recall: 0.5949372135942447
F1 Score: 0.64262705909632

Accuracy with Correlation method: 0.6689712962334322
Precision with Correlation method: 0.6987554858739301
Recall with Correlation method: 0.5949372135942447
F1 Score with Correlation method: 0.64262705909632

Accuracy with Mutual Information method: 0.6593380858623576
Precision with Mutual Information method: 0.7005928546134486
Recall with Mutual Information method: 0.5575207561352872
F1 Score with Mutual Information method: 0.6208636058409167

Accuracy with Covariance method: 0.6615915691643848
Precision with Covariance method: 0.7242079896713348
Recall with Covariance method: 0.5228992092671386
F1 Score with Covariance method: 0.6072454528142218

Accuracy with PCA: 0.5219870701035749
Precision with PCA: 0.5217082823587559
Recall with PCA: 0.5460321208120311
F1 Score with PCA: 0.5334975578268246



## RANDOM FOREST

In [41]:
accuracy, precision, recall, f1 = random_forest_classifier(X_train_s, y_train_s)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print()

X_train_corr, X_test_corr = correlation_method(X_train_s, X_test_s, threshold=0.8)
accuracy_corr, precision_corr, recall_corr, f1_corr = random_forest_classifier(X_train_corr, y_train_s)
print("Accuracy with Correlation method:", accuracy_corr)
print("Precision with Correlation method:", precision_corr)
print("Recall with Correlation method:", recall_corr)
print("F1 Score with Correlation method:", f1_corr)
print()

X_train_mi, X_test_mi = mutual_information(X_train_s, y_train_s, X_test_s, n_features=10)
accuracy_mi, precision_mi, recall_mi, f1_mi = random_forest_classifier(X_train_mi, y_train_s)
print("Accuracy with Mutual Information method:", accuracy_mi)
print("Precision with Mutual Information method:", precision_mi)
print("Recall with Mutual Information method:", recall_mi)
print("F1 Score with Mutual Information method:", f1_mi)
print()


X_train_cov, X_test_cov = covariance_method(X_train_s, X_test_s, threshold=0.8)
accuracy_cov, precision_cov, recall_cov, f1_cov = random_forest_classifier(X_train_cov, y_train_s)
print("Accuracy with Covariance method:", accuracy_cov)
print("Precision with Covariance method:", precision_cov)
print("Recall with Covariance method:", recall_cov)
print("F1 Score with Covariance method:", f1_cov)
print()

def main(X_train_s, X_test_s, y_train_s, y_test_s):
    # Perform PCA
    n_components = 2  # Number of components to retain
    X_train_pca, X_test_pca = perform_pca(X_train_s, X_test_s, n_components)
    
    # Evaluate Logistic classifier using PCA
    accuracy, precision, recall, f1 = random_forest_classifier(X_train_pca, y_train_s)
    
    print("Accuracy with PCA:", accuracy)
    print("Precision with PCA:", precision)
    print("Recall with PCA:", recall)
    print("F1 Score with PCA:", f1)
    print()
    
if __name__ == "__main__":
    main(X_train_s, X_test_s, y_train_s, y_test_s)

Accuracy: 0.8373990238456258
Precision: 0.8453069874436798
Recall: 0.8262692202053689
F1 Score: 0.8356548516710403

Accuracy with Correlation method: 0.8373990238456258
Precision with Correlation method: 0.8453069874436798
Recall with Correlation method: 0.8262692202053689
F1 Score with Correlation method: 0.8356548516710403

Accuracy with Mutual Information method: 0.837864741515227
Precision with Mutual Information method: 0.8486984289601593
Recall with Mutual Information method: 0.822697550703757
F1 Score with Mutual Information method: 0.835482465859819

Accuracy with Covariance method: 0.7902422884558806
Precision with Covariance method: 0.856775972573538
Recall with Covariance method: 0.6975641958473273
F1 Score with Covariance method: 0.7689169069422428

Accuracy with PCA: 0.7010586522431183
Precision with PCA: 0.6939915046160607
Recall with PCA: 0.7203865483860087
F1 Score with PCA: 0.7069077442086551



## SVM

In [45]:
accuracy, precision, recall, f1 = svm_classifier(X_train_s, y_train_s)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print()

X_train_corr, X_test_corr = correlation_method(X_train_s, X_test_s, threshold=0.8)
accuracy_corr, precision_corr, recall_corr, f1_corr = svm_classifier(X_train_corr, y_train_s)
print("Accuracy with Correlation method:", accuracy_corr)
print("Precision with Correlation method:", precision_corr)
print("Recall with Correlation method:", recall_corr)
print("F1 Score with Correlation method:", f1_corr)
print()

X_train_mi, X_test_mi = mutual_information(X_train_s, y_train_s, X_test_s, n_features=10)
accuracy_mi, precision_mi, recall_mi, f1_mi = svm_classifier(X_train_mi, y_train_s)
print("Accuracy with Mutual Information method:", accuracy_mi)
print("Precision with Mutual Information method:", precision_mi)
print("Recall with Mutual Information method:", recall_mi)
print("F1 Score with Mutual Information method:", f1_mi)
print()

X_train_cov, X_test_cov = covariance_method(X_train_s, X_test_s, threshold=0.8)
accuracy_cov, precision_cov, recall_cov, f1_cov = svm_classifier(X_train_cov, y_train_s)
print("Accuracy with Covariance method:", accuracy_cov)
print("Precision with Covariance method:", precision_cov)
print("Recall with Covariance method:", recall_cov)
print("F1 Score with Covariance method:", f1_cov)
print()

def main(X_train_s, X_test_s, y_train_s, y_test_s):
    # Perform PCA
    n_components = 2  # Number of components to retain
    X_train_pca, X_test_pca = perform_pca(X_train_s, X_test_s, n_components)
    
    # Evaluate Logistic classifier using PCA
    accuracy, precision, recall, f1 = svm_classifier(X_train_pca, y_train_s)
    
    print("Accuracy with PCA:", accuracy)
    print("Precision with PCA:", precision)
    print("Recall with PCA:", recall)
    print("F1 Score with PCA:", f1)
    print()

if __name__ == "__main__":
    main(X_train_s, X_test_s, y_train_s, y_test_s)

Accuracy: 0.5247836392302412
Precision: 0.5297862400133879
Recall: 0.49959355078085477
F1 Score: 0.5082827716169349

Accuracy with Correlation method: 0.5247836392302412
Precision with Correlation method: 0.5297862400133879
Recall with Correlation method: 0.49959355078085477
F1 Score with Correlation method: 0.5082827716169349

Accuracy with Mutual Information method: 0.5239291495990526
Precision with Mutual Information method: 0.5287821184002055
Recall with Mutual Information method: 0.4980414785403486
F1 Score with Mutual Information method: 0.5073312683393274

Accuracy with Covariance method: 0.6628344384849238
Precision with Covariance method: 0.7489536039896743
Recall with Covariance method: 0.4907609297977632
F1 Score with Covariance method: 0.5929120216287727

Accuracy with PCA: 0.5245497846274545
Precision with PCA: 0.5287741264818541
Recall with PCA: 0.46342246143912413
F1 Score with PCA: 0.49136177205374276



## NAIVE BAYES

In [43]:
accuracy, precision, recall, f1 = naive_bayes_classifier(X_train_s, y_train_s)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print()

X_train_corr, X_test_corr = correlation_method(X_train_s, X_test_s, threshold=0.8)
accuracy_corr, precision_corr, recall_corr, f1_corr = naive_bayes_classifier(X_train_corr, y_train_s)
print("Accuracy with Correlation method:", accuracy_corr)
print("Precision with Correlation method:", precision_corr)
print("Recall with Correlation method:", recall_corr)
print("F1 Score with Correlation method:", f1_corr)
print()

X_train_mi, X_test_mi = mutual_information(X_train_s, y_train_s, X_test_s, n_features=10)
accuracy_mi, precision_mi, recall_mi, f1_mi = naive_bayes_classifier(X_train_mi, y_train_s)
print("Accuracy with Mutual Information method:", accuracy_mi)
print("Precision with Mutual Information method:", precision_mi)
print("Recall with Mutual Information method:", recall_mi)
print("F1 Score with Mutual Information method:", f1_mi)
print()

X_train_cov, X_test_cov = covariance_method(X_train_s, X_test_s, threshold=0.8)
accuracy_cov, precision_cov, recall_cov, f1_cov = naive_bayes_classifier(X_train_cov, y_train_s)
print("Accuracy with Covariance method:", accuracy_cov)
print("Precision with Covariance method:", precision_cov)
print("Recall with Covariance method:", recall_cov)
print("F1 Score with Covariance method:", f1_cov)
print()

def main(X_train_s, X_test_s, y_train_s, y_test_s):
    # Perform PCA
    n_components = 2  # Number of components to retain
    X_train_pca, X_test_pca = perform_pca(X_train_s, X_test_s, n_components)
    
    # Evaluate Logistic classifier using PCA
    accuracy, precision, recall, f1 = naive_bayes_classifier(X_train_pca, y_train_s)
    
    print("Accuracy with PCA:", accuracy)
    print("Precision with PCA:", precision)
    print("Recall with PCA:", recall)
    print("F1 Score with PCA:", f1)
    print()

if __name__ == "__main__":
    main(X_train_s, X_test_s, y_train_s, y_test_s)

Accuracy: 0.610551036881134
Precision: 0.6194290571676648
Recall: 0.5787922410843785
F1 Score: 0.5975344942661783

Accuracy with Correlation method: 0.610551036881134
Precision with Correlation method: 0.6194290571676648
Recall with Correlation method: 0.5787922410843785
F1 Score with Correlation method: 0.5975344942661783

Accuracy with Mutual Information method: 0.6072102352879052
Precision with Mutual Information method: 0.6080662389166589
Recall with Mutual Information method: 0.6082873959783933
F1 Score with Mutual Information method: 0.6075708693802837

Accuracy with Covariance method: 0.6630674783684493
Precision with Covariance method: 0.749477496153962
Recall with Covariance method: 0.4907609297977632
F1 Score with Covariance method: 0.5930784900596344

Accuracy with PCA: 0.5106431001576632
Precision with PCA: 0.577628643422337
Recall with PCA: 0.082440646849356
F1 Score with PCA: 0.14423380481726142



## DESICION TREE

In [44]:
accuracy, precision, recall, f1 = decision_tree_classifier(X_train_s, y_train_s)

# Print the results
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print()

X_train_corr, X_test_corr = correlation_method(X_train_s, X_test_s, threshold=0.8)
accuracy_corr, precision_corr, recall_corr, f1_corr = decision_tree_classifier(X_train_corr, y_train_s)
print("Accuracy with Correlation method:", accuracy_corr)
print("Precision with Correlation method:", precision_corr)
print("Recall with Correlation method:", recall_corr)
print("F1 Score with Correlation method:", f1_corr)
print()

X_train_mi, X_test_mi = mutual_information(X_train_s, y_train_s, X_test_s, n_features=10)
accuracy_mi, precision_mi, recall_mi, f1_mi = decision_tree_classifier(X_train_mi, y_train_s)
print("Accuracy with Mutual Information method:", accuracy_mi)
print("Precision with Mutual Information method:", precision_mi)
print("Recall with Mutual Information method:", recall_mi)
print("F1 Score with Mutual Information method:", f1_mi)
print()

X_train_cov, X_test_cov = covariance_method(X_train_s, X_test_s, threshold=0.8)
accuracy_cov, precision_cov, recall_cov, f1_cov = decision_tree_classifier(X_train_cov, y_train_s)
print("Accuracy with Covariance method:", accuracy_cov)
print("Precision with Covariance method:", precision_cov)
print("Recall with Covariance method:", recall_cov)
print("F1 Score with Covariance method:", f1_cov)
print()

def main(X_train_s, X_test_s, y_train_s, y_test_s):
    # Perform PCA
    n_components = 2  # Number of components to retain
    X_train_pca, X_test_pca = perform_pca(X_train_s, X_test_s, n_components)
    
    # Evaluate Logistic classifier using PCA
    accuracy, precision, recall, f1 = decision_tree_classifier(X_train_pca, y_train_s)
    
    print("Accuracy with PCA:", accuracy)
    print("Precision with PCA:", precision)
    print("Recall with PCA:", recall)
    print("F1 Score with PCA:", f1)
    print()

if __name__ == "__main__":
    main(X_train_s, X_test_s, y_train_s, y_test_s)


Accuracy: 0.7513203129125459
Precision: 0.7405701674930063
Recall: 0.7744124917481412
F1 Score: 0.7570894145031395

Accuracy with Correlation method: 0.7513203129125459
Precision with Correlation method: 0.7405701674930063
Recall with Correlation method: 0.7744124917481412
F1 Score with Correlation method: 0.7570894145031395

Accuracy with Mutual Information method: 0.748521450502033
Precision with Mutual Information method: 0.7371196072331181
Recall with Mutual Information method: 0.7730132897089081
F1 Score with Mutual Information method: 0.7546161900632253

Accuracy with Covariance method: 0.8168117621321505
Precision with Covariance method: 0.900373772784873
Recall with Covariance method: 0.712779057384751
F1 Score with Covariance method: 0.7955837538909399

Accuracy with PCA: 0.675653774488726
Precision with PCA: 0.667303345351693
Recall with PCA: 0.7014444969136844
F1 Score with PCA: 0.6839090095060553



# 2- BALANCING TECHNIQUE: CLUSTER BASED OVERSAMPLING

## SAMPLING

In [46]:
from imblearn.over_sampling import KMeansSMOTE

# Adjust parameters
kmeans_smote = KMeansSMOTE(cluster_balance_threshold=0.1, k_neighbors=10, random_state=42)

# Apply K-means SMOTE for cluster-based oversampling
X_resampled, y_resampled = kmeans_smote.fit_resample(X, y)

X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Count the occurrences of each class label in the resampled training and testing sets
train_balance = pd.Series(y_train_c).value_counts()
test_balance = pd.Series(y_test_c).value_counts()

# Print the class distribution
print("Class distribution in the resampled training set:")
print(train_balance)

print("\nClass distribution in the resampled testing set:")
print(test_balance)

import pandas as pd

# Create a DataFrame for the resampled data
resampled_data = pd.concat([pd.DataFrame(X_resampled), pd.DataFrame(y_resampled)], axis=1)

# Define the column names
column_names = list(X.columns) + ['purpose']

# Set the column names
resampled_data.columns = column_names

# Save the resampled data to a CSV file
resampled_data.to_excel('loan_CLUSTER.xlsx', index=False)



Class distribution in the resampled training set:
not.fully.paid
1    6450
0    6426
Name: count, dtype: int64

Class distribution in the resampled testing set:
not.fully.paid
0    1619
1    1601
Name: count, dtype: int64


## KNN

In [53]:
accuracy, precision, recall, f1 = knn_classifier(X_train_c, y_train_c)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print()

X_train_corr, X_test_corr = correlation_method(X_train_c, X_test_c, threshold=0.8)
accuracy_corr, precision_corr, recall_corr, f1_corr = knn_classifier(X_train_corr, y_train_c)
print("Accuracy with Correlation method:", accuracy_corr)
print("Precision with Correlation method:", precision_corr)
print("Recall with Correlation method:", recall_corr)
print("F1 Score with Correlation method:", f1_corr)
print()

X_train_mi, X_test_mi = mutual_information(X_train_c, y_train_c, X_test_c, n_features=10)
accuracy_mi, precision_mi, recall_mi, f1_mi = knn_classifier(X_train_mi, y_train_c)
print("Accuracy with Mutual Information method:", accuracy_mi)
print("Precision with Mutual Information method:", precision_mi)
print("Recall with Mutual Information method:", recall_mi)
print("F1 Score with Mutual Information method:", f1_mi)
print()

X_train_cov, X_test_cov = covariance_method(X_train_c, X_test_c, threshold=0.8)
accuracy_cov, precision_cov, recall_cov, f1_cov = knn_classifier(X_train_cov, y_train_c)
print("Accuracy with Covariance method:", accuracy_cov)
print("Precision with Covariance method:", precision_cov)
print("Recall with Covariance method:", recall_cov)
print("F1 Score with Covariance method:", f1_cov)
print()

def main(X_train_c, X_test_c, y_train_c, y_test_c):
    # Perform PCA
    n_components = 2  # Number of components to retain
    X_train_pca, X_test_pca = perform_pca(X_train_c, X_test_c, n_components)
    
    # Evaluate Logistic classifier using PCA
    accuracy, precision, recall, f1 = knn_classifier(X_train_pca, y_train_c)
    
    print("Accuracy with PCA:", accuracy)
    print("Precision with PCA:", precision)
    print("Recall with PCA:", recall)
    print("F1 Score with PCA:", f1)
    print()

if __name__ == "__main__":
    main(X_train_c, X_test_c, y_train_c, y_test_c)

Accuracy: 0.7972198637158536
Precision: 0.7638682990243562
Recall: 0.8615503875968992
F1 Score: 0.8097633330916405

Accuracy with Correlation method: 0.7972198637158536
Precision with Correlation method: 0.7638682990243562
Recall with Correlation method: 0.8615503875968992
F1 Score with Correlation method: 0.8097633330916405

Accuracy with Mutual Information method: 0.7971414400289454
Precision with Mutual Information method: 0.7644374347906835
Recall with Mutual Information method: 0.8601550387596898
F1 Score with Mutual Information method: 0.8094626908017961

Accuracy with Covariance method: 0.7488357655430259
Precision with Covariance method: 0.7891365697441726
Recall with Covariance method: 0.6804651162790697
F1 Score with Covariance method: 0.7307512257605076

Accuracy with PCA: 0.7517092805885545
Precision with PCA: 0.7372485300051494
Recall with PCA: 0.7837209302325581
F1 Score with PCA: 0.7597569863652539



## LOGISTIC REGRESSION

In [54]:

accuracy, precision, recall, f1 = logistic_regression_classifier(X_train_c, y_train_c)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print()

X_train_corr, X_test_corr = correlation_method(X_train_c, X_test_c, threshold=0.8)
accuracy_corr, precision_corr, recall_corr, f1_corr = logistic_regression_classifier(X_train_corr, y_train_c)
print("Accuracy with Correlation method:", accuracy_corr)
print("Precision with Correlation method:", precision_corr)
print("Recall with Correlation method:", recall_corr)
print("F1 Score with Correlation method:", f1_corr)
print()

X_train_mi, X_test_mi = mutual_information(X_train_c, y_train_c, X_test_c, n_features=10)
accuracy_mi, precision_mi, recall_mi, f1_mi = logistic_regression_classifier(X_train_mi, y_train_c)
print("Accuracy with Mutual Information method:", accuracy_mi)
print("Precision with Mutual Information method:", precision_mi)
print("Recall with Mutual Information method:", recall_mi)
print("F1 Score with Mutual Information method:", f1_mi)
print()

X_train_cov, X_test_cov = covariance_method(X_train_c, X_test_c, threshold=0.8)
accuracy_cov, precision_cov, recall_cov, f1_cov = logistic_regression_classifier(X_train_cov, y_train_c)
print("Accuracy with Covariance method:", accuracy_cov)
print("Precision with Covariance method:", precision_cov)
print("Recall with Covariance method:", recall_cov)
print("F1 Score with Covariance method:", f1_cov)
print()

def main(X_train_c, X_test_c, y_train_c, y_test_c):
    # Perform PCA
    n_components = 2  # Number of components to retain
    X_train_pca, X_test_pca = perform_pca(X_train_c, X_test_c, n_components)
    
    # Evaluate Logistic classifier using PCA
    accuracy, precision, recall, f1 = logistic_regression_classifier(X_train_pca, y_train_c)
    
    print("Accuracy with PCA:", accuracy)
    print("Precision with PCA:", precision)
    print("Recall with PCA:", recall)
    print("F1 Score with PCA:", f1)

if __name__ == "__main__":
    main(X_train_c, X_test_c, y_train_c, y_test_c)

Accuracy: 0.7300398600976904
Precision: 0.7451145922868315
Recall: 0.7010852713178295
F1 Score: 0.7222603729039921

Accuracy with Correlation method: 0.7300398600976904
Precision with Correlation method: 0.7451145922868315
Recall with Correlation method: 0.7010852713178295
F1 Score with Correlation method: 0.7222603729039921

Accuracy with Mutual Information method: 0.7207202556835313
Precision with Mutual Information method: 0.7395438085481186
Recall with Mutual Information method: 0.6834108527131784
F1 Score with Mutual Information method: 0.7101674859151721

Accuracy with Covariance method: 0.6677540855092565
Precision with Covariance method: 0.7463173108952464
Recall with Covariance method: 0.5102325581395349
F1 Score with Covariance method: 0.6060589519231401

Accuracy with PCA: 0.6792483869022493
Precision with PCA: 0.6433840956281667
Recall with PCA: 0.8071317829457365
F1 Score with PCA: 0.7160081974635732


## RANDOM FOREST

In [64]:
accuracy, precision, recall, f1 = random_forest_classifier(X_train_c, y_train_c)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print()

X_train_corr, X_test_corr = correlation_method(X_train_c, X_test_c, threshold=0.8)
accuracy_corr, precision_corr, recall_corr, f1_corr = random_forest_classifier(X_train_corr, y_train_c)
print("Accuracy with Correlation method:", accuracy_corr)
print("Precision with Correlation method:", precision_corr)
print("Recall with Correlation method:", recall_corr)
print("F1 Score with Correlation method:", f1_corr)
print()

X_train_mi, X_test_mi = mutual_information(X_train_c, y_train_c, X_test_c, n_features=10)
accuracy_mi, precision_mi, recall_mi, f1_mi = random_forest_classifier(X_train_mi, y_train_c)
print("Accuracy with Mutual Information method:", accuracy_mi)
print("Precision with Mutual Information method:", precision_mi)
print("Recall with Mutual Information method:", recall_mi)
print("F1 Score with Mutual Information method:", f1_mi)
print()

X_train_cov, X_test_cov = covariance_method(X_train_c, X_test_c, threshold=0.8)
accuracy_cov, precision_cov, recall_cov, f1_cov = random_forest_classifier(X_train_cov, y_train_c)
print("Accuracy with Covariance method:", accuracy_cov)
print("Precision with Covariance method:", precision_cov)
print("Recall with Covariance method:", recall_cov)
print("F1 Score with Covariance method:", f1_cov)
print()

def main(X_train_c, X_test_c, y_train_c, y_test_c):
    # Perform PCA
    n_components = 2  # Number of components to retain
    X_train_pca, X_test_pca = perform_pca(X_train_c, X_test_c, n_components)
    
    # Evaluate Logistic classifier using PCA
    accuracy, precision, recall, f1 = random_forest_classifier(X_train_pca, y_train_c)
    
    print("Accuracy with PCA:", accuracy)
    print("Precision with PCA:", precision)
    print("Recall with PCA:", recall)
    print("F1 Score with PCA:", f1)
    print()

if __name__ == "__main__":
    main(X_train_c, X_test_c, y_train_c, y_test_c)



Accuracy: 0.8637772116022433
Precision: 0.8664614710389898
Recall: 0.8607751937984496
F1 Score: 0.8635945644349196

Accuracy with Correlation method: 0.8637772116022433
Precision with Correlation method: 0.8664614710389898
Recall with Correlation method: 0.8607751937984496
F1 Score with Correlation method: 0.8635945644349196

Accuracy with Mutual Information method: 0.8595062111801243
Precision with Mutual Information method: 0.8626130500371335
Recall with Mutual Information method: 0.855968992248062
F1 Score with Mutual Information method: 0.8592521899737445

Accuracy with Covariance method: 0.7910067840559609
Precision with Covariance method: 0.8586007232825337
Recall with Covariance method: 0.697829457364341
F1 Score with Covariance method: 0.7698011429311091

Accuracy with PCA: 0.7769496170777301
Precision with PCA: 0.7733708152974044
Recall with PCA: 0.7846511627906977
F1 Score with PCA: 0.7789568323133395



## SVM

In [67]:
accuracy, precision, recall, f1 = svm_classifier(X_train_c, y_train_c)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print()

X_train_corr, X_test_corr = correlation_method(X_train_c, X_test_c, threshold=0.8)
accuracy_corr, precision_corr, recall_corr, f1_corr = svm_classifier(X_train_corr, y_train_c)
print("Accuracy with Correlation method:", accuracy_corr)
print("Precision with Correlation method:", precision_corr)
print("Recall with Correlation method:", recall_corr)
print("F1 Score with Correlation method:", f1_corr)
print()

X_train_mi, X_test_mi = mutual_information(X_train_c, y_train_c, X_test_c, n_features=10)
accuracy_mi, precision_mi, recall_mi, f1_mi = svm_classifier(X_train_mi, y_train_c)
print("Accuracy with Mutual Information method:", accuracy_mi)
print("Precision with Mutual Information method:", precision_mi)
print("Recall with Mutual Information method:", recall_mi)
print("F1 Score with Mutual Information method:", f1_mi)
print()

X_train_cov, X_test_cov = covariance_method(X_train_c, X_test_c, threshold=0.8)
accuracy_cov, precision_cov, recall_cov, f1_cov = svm_classifier(X_train_cov, y_train_c)
print("Accuracy with Covariance method:", accuracy_cov)
print("Precision with Covariance method:", precision_cov)
print("Recall with Covariance method:", recall_cov)
print("F1 Score with Covariance method:", f1_cov)
print()

def main(X_train_c, X_test_c, y_train_c, y_test_c):
    # Perform PCA
    n_components = 2  # Number of components to retain
    X_train_pca, X_test_pca = perform_pca(X_train_c, X_test_c, n_components)
    
    # Evaluate Logistic classifier using PCA
    accuracy, precision, recall, f1 = svm_classifier(X_train_pca, y_train_c)
    
    print("Accuracy with PCA:", accuracy)
    print("Precision with PCA:", precision)
    print("Recall with PCA:", recall)
    print("F1 Score with PCA:", f1)
    print()

if __name__ == "__main__":
    main(X_train_c, X_test_c, y_train_c, y_test_c)


Accuracy: 0.748990984743412
Precision: 0.6972941004003251
Recall: 0.881860465116279
F1 Score: 0.77877034935572

Accuracy with Correlation method: 0.748990984743412
Precision with Correlation method: 0.6972941004003251
Recall with Correlation method: 0.881860465116279
F1 Score with Correlation method: 0.77877034935572

Accuracy with Mutual Information method: 0.748990984743412
Precision with Mutual Information method: 0.6972941004003251
Recall with Mutual Information method: 0.881860465116279
F1 Score with Mutual Information method: 0.77877034935572

Accuracy with Covariance method: 0.6675211361032383
Precision with Covariance method: 0.7517547858084618
Recall with Covariance method: 0.5021705426356589
F1 Score with Covariance method: 0.6020741111899942

Accuracy with PCA: 0.7458071217511911
Precision with PCA: 0.6932219933335304
Recall with PCA: 0.884031007751938
F1 Score with PCA: 0.7770386269761757



## NAIVE BAYES

In [56]:
accuracy, precision, recall, f1 = naive_bayes_classifier(X_train_s, y_train_s)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print()

X_train_corr, X_test_corr = correlation_method(X_train_c, X_test_c, threshold=0.8)
accuracy_corr, precision_corr, recall_corr, f1_corr = naive_bayes_classifier(X_train_corr, y_train_c)
print("Accuracy with Correlation method:", accuracy_corr)
print("Precision with Correlation method:", precision_corr)
print("Recall with Correlation method:", recall_corr)
print("F1 Score with Correlation method:", f1_corr)
print()

X_train_mi, X_test_mi = mutual_information(X_train_c, y_train_c, X_test_c, n_features=10)
accuracy_mi, precision_mi, recall_mi, f1_mi = naive_bayes_classifier(X_train_mi, y_train_c)
print("Accuracy with Mutual Information method:", accuracy_mi)
print("Precision with Mutual Information method:", precision_mi)
print("Recall with Mutual Information method:", recall_mi)
print("F1 Score with Mutual Information method:", f1_mi)
print()

X_train_cov, X_test_cov = covariance_method(X_train_c, X_test_c, threshold=0.8)
accuracy_cov, precision_cov, recall_cov, f1_cov = naive_bayes_classifier(X_train_cov, y_train_c)
print("Accuracy with Covariance method:", accuracy_cov)
print("Precision with Covariance method:", precision_cov)
print("Recall with Covariance method:", recall_cov)
print("F1 Score with Covariance method:", f1_cov)
print()

def main(X_train_c, X_test_c, y_train_c, y_test_c):
    # Perform PCA
    n_components = 2  # Number of components to retain
    X_train_pca, X_test_pca = perform_pca(X_train_c, X_test_c, n_components)
    
    # Evaluate Logistic classifier using PCA
    accuracy, precision, recall, f1 = naive_bayes_classifier(X_train_pca, y_train_c)
    
    print("Accuracy with PCA:", accuracy)
    print("Precision with PCA:", precision)
    print("Recall with PCA:", recall)
    print("F1 Score with PCA:", f1)

if __name__ == "__main__":
    main(X_train_c, X_test_c, y_train_c, y_test_c)



Accuracy: 0.610551036881134
Precision: 0.6194290571676648
Recall: 0.5787922410843785
F1 Score: 0.5975344942661783

Accuracy with Correlation method: 0.6859287523367303
Precision with Correlation method: 0.6555030289785851
Recall with Correlation method: 0.7883720930232558
F1 Score with Correlation method: 0.7155323848093232

Accuracy with Mutual Information method: 0.6698518362178134
Precision with Mutual Information method: 0.6381526980895135
Recall with Mutual Information method: 0.7893023255813953
F1 Score with Mutual Information method: 0.7054357849846729

Accuracy with Covariance method: 0.6675988060061508
Precision with Covariance method: 0.7519331444022184
Recall with Covariance method: 0.5021705426356589
F1 Score with Covariance method: 0.6021297894488216

Accuracy with PCA: 0.5937411807272508
Precision with PCA: 0.5616971075385326
Recall with PCA: 0.8623255813953489
F1 Score with PCA: 0.6801239244021832


## DECISON TREE

In [57]:
accuracy, precision, recall, f1 = decision_tree_classifier(X_train_s, y_train_s)
# Print the results
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print()

X_train_corr, X_test_corr = correlation_method(X_train_c, X_test_c, threshold=0.8)
accuracy_corr, precision_corr, recall_corr, f1_corr = decision_tree_classifier(X_train_corr, y_train_c)
print("Accuracy with Correlation method:", accuracy_corr)
print("Precision with Correlation method:", precision_corr)
print("Recall with Correlation method:", recall_corr)
print("F1 Score with Correlation method:", f1_corr)
print()

X_train_mi, X_test_mi = mutual_information(X_train_c, y_train_c, X_test_c, n_features=10)
accuracy_mi, precision_mi, recall_mi, f1_mi = decision_tree_classifier(X_train_mi, y_train_c)
print("Accuracy with Mutual Information method:", accuracy_mi)
print("Precision with Mutual Information method:", precision_mi)
print("Recall with Mutual Information method:", recall_mi)
print("F1 Score with Mutual Information method:", f1_mi)
print()

X_train_cov, X_test_cov = covariance_method(X_train_c, X_test_c, threshold=0.8)
accuracy_cov, precision_cov, recall_cov, f1_cov = decision_tree_classifier(X_train_cov, y_train_c)
print("Accuracy with Covariance method:", accuracy_cov)
print("Precision with Covariance method:", precision_cov)
print("Recall with Covariance method:", recall_cov)
print("F1 Score with Covariance method:", f1_cov)
print()

def main(X_train_c, X_test_c, y_train_c, y_test_c):
    # Perform PCA
    n_components = 2  # Number of components to retain
    X_train_pca, X_test_pca = perform_pca(X_train_c, X_test_c, n_components)
    
    # Evaluate Logistic classifier using PCA
    accuracy, precision, recall, f1 = decision_tree_classifier(X_train_pca, y_train_c)
    
    print("Accuracy with PCA:", accuracy)
    print("Precision with PCA:", precision)
    print("Recall with PCA:", recall)
    print("F1 Score with PCA:", f1)

if __name__ == "__main__":
    main(X_train_c, X_test_c, y_train_c, y_test_c)



Accuracy: 0.7513203129125459
Precision: 0.7405701674930063
Recall: 0.7744124917481412
F1 Score: 0.7570894145031395

Accuracy with Correlation method: 0.794500512573117
Precision with Correlation method: 0.784690352950575
Recall with Correlation method: 0.8130232558139535
F1 Score with Correlation method: 0.7985706149764931

Accuracy with Mutual Information method: 0.778658505698607
Precision with Mutual Information method: 0.7682350183554248
Recall with Mutual Information method: 0.7993798449612404
F1 Score with Mutual Information method: 0.7834796848986458

Accuracy with Covariance method: 0.8102674727130194
Precision with Covariance method: 0.8938520988678637
Recall with Covariance method: 0.7051162790697675
F1 Score with Covariance method: 0.788241403744235

Accuracy with PCA: 0.7178480069951155
Precision with PCA: 0.7163032607469184
Recall with PCA: 0.7232558139534884
F1 Score with PCA: 0.7197418019311401


# 3- BALANCING TECHNIQUE: ENSEMBLE METHODS

## SAMPLING

In [58]:
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
import pandas as pd

# Define your ensemble classifier, for example, Balanced Random Forest
ensemble_clf = BalancedRandomForestClassifier(n_estimators=100, random_state=42)

# Fit the ensemble classifier to your original dataset
ensemble_clf.fit(X, y)

# Use RandomOverSampler to generate synthetic samples to balance the dataset
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Split the resampled data into training and testing sets
X_train_e, X_test_e, y_train_e, y_test_e = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Count the occurrences of each class label in the resampled training and testing sets
train_balance = pd.Series(y_train_s).value_counts()
test_balance = pd.Series(y_test_s).value_counts()

# Print the class distribution
print("Class distribution in the resampled training set:")
print(train_balance)

print("\nClass distribution in the resampled testing set:")
print(test_balance)

# Create a DataFrame for the resampled data
resampled_data = pd.concat([pd.DataFrame(X_resampled), pd.DataFrame(y_resampled)], axis=1)

# Define the column names
column_names = list(X.columns) + ['purpose']

# Set the column names
resampled_data.columns = column_names

# Save the resampled data to a CSV file
resampled_data.to_excel('Loan_ENSEMBLE.xlsx', index=False)

  warn(
  warn(


Class distribution in the resampled training set:
not.fully.paid
1    6441
0    6431
Name: count, dtype: int64

Class distribution in the resampled testing set:
not.fully.paid
0    1614
1    1604
Name: count, dtype: int64


## KNN

In [59]:
accuracy, precision, recall, f1 = knn_classifier(X_train_e, y_train_e)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print()

X_train_corr, X_test_corr = correlation_method(X_train_e, X_test_e, threshold=0.8)
accuracy_corr, precision_corr, recall_corr, f1_corr = knn_classifier(X_train_corr, y_train_e)
print("Accuracy with Correlation method:", accuracy_corr)
print("Precision with Correlation method:", precision_corr)
print("Recall with Correlation method:", recall_corr)
print("F1 Score with Correlation method:", f1_corr)
print()

X_train_mi, X_test_mi = mutual_information(X_train_e, y_train_e, X_test_e, n_features=10)
accuracy_mi, precision_mi, recall_mi, f1_mi = knn_classifier(X_train_mi, y_train_e)
print("Accuracy with Mutual Information method:", accuracy_mi)
print("Precision with Mutual Information method:", precision_mi)
print("Recall with Mutual Information method:", recall_mi)
print("F1 Score with Mutual Information method:", f1_mi)
print()

X_train_cov, X_test_cov = covariance_method(X_train_e, X_test_e, threshold=0.8)
accuracy_cov, precision_cov, recall_cov, f1_cov = knn_classifier(X_train_cov, y_train_e)
print("Accuracy with Covariance method:", accuracy_cov)
print("Precision with Covariance method:", precision_cov)
print("Recall with Covariance method:", recall_cov)
print("F1 Score with Covariance method:", f1_cov)
print()

def main(X_train_e, X_test_e, y_train_e, y_test_e):
    # Perform PCA
    n_components = 2  # Number of components to retain
    X_train_pca, X_test_pca = perform_pca(X_train_e, X_test_e, n_components)
    
    # Evaluate Logistic classifier using PCA
    accuracy, precision, recall, f1 = knn_classifier(X_train_pca, y_train_e)
    
    print("Accuracy with PCA:", accuracy)
    print("Precision with PCA:", precision)
    print("Recall with PCA:", recall)
    print("F1 Score with PCA:", f1)

if __name__ == "__main__":
    main(X_train_e, X_test_e, y_train_e, y_test_e)

Accuracy: 0.7389679015698433
Precision: 0.6969706395110119
Recall: 0.8467622597323746
F1 Score: 0.7645207368643423

Accuracy with Correlation method: 0.7389679015698433
Precision with Correlation method: 0.6969706395110119
Recall with Correlation method: 0.8467622597323746
F1 Score with Correlation method: 0.7645207368643423

Accuracy with Mutual Information method: 0.7304997397424582
Precision with Mutual Information method: 0.6906972175427204
Recall with Mutual Information method: 0.835583822020055
F1 Score with Mutual Information method: 0.7562452965378825

Accuracy with Covariance method: 0.6520357269483483
Precision with Covariance method: 0.6457142879723945
Recall with Covariance method: 0.6755229389627473
F1 Score with Covariance method: 0.6599840249438989

Accuracy with PCA: 0.6998921251348436
Precision with PCA: 0.660515413811289
Recall with PCA: 0.8237841458302213
F1 Score with PCA: 0.7331394013082522


## LOGISTIC REGRESSION

In [60]:
accuracy, precision, recall, f1 = logistic_regression_classifier(X_train_e, y_train_e)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print()

X_train_corr, X_test_corr = correlation_method(X_train_e, X_test_e, threshold=0.8)
accuracy_corr, precision_corr, recall_corr, f1_corr = logistic_regression_classifier(X_train_corr, y_train_e)
print("Accuracy with Correlation method:", accuracy_corr)
print("Precision with Correlation method:", precision_corr)
print("Recall with Correlation method:", recall_corr)
print("F1 Score with Correlation method:", f1_corr)
print()

X_train_mi, X_test_mi = mutual_information(X_train_e, y_train_e, X_test_e, n_features=10)
accuracy_mi, precision_mi, recall_mi, f1_mi = logistic_regression_classifier(X_train_mi, y_train_e)
print("Accuracy with Mutual Information method:", accuracy_mi)
print("Precision with Mutual Information method:", precision_mi)
print("Recall with Mutual Information method:", recall_mi)
print("F1 Score with Mutual Information method:", f1_mi)
print()

X_train_cov, X_test_cov = covariance_method(X_train_e, X_test_e, threshold=0.8)
accuracy_cov, precision_cov, recall_cov, f1_cov = logistic_regression_classifier(X_train_cov, y_train_e)
print("Accuracy with Covariance method:", accuracy_cov)
print("Precision with Covariance method:", precision_cov)
print("Recall with Covariance method:", recall_cov)
print("F1 Score with Covariance method:", f1_cov)
print()

def main(X_train_e, X_test_e, y_train_e, y_test_e):
    # Perform PCA
    n_components = 2  # Number of components to retain
    X_train_pca, X_test_pca = perform_pca(X_train_e, X_test_e, n_components)
    
    # Evaluate Logistic classifier using PCA
    accuracy, precision, recall, f1 = logistic_regression_classifier(X_train_pca, y_train_e)
    
    print("Accuracy with PCA:", accuracy)
    print("Precision with PCA:", precision)
    print("Recall with PCA:", recall)
    print("F1 Score with PCA:", f1)

if __name__ == "__main__":
    main(X_train_e, X_test_e, y_train_e, y_test_e)

Accuracy: 0.6235250186706497
Precision: 0.6298253532965863
Recall: 0.6006847235808008
F1 Score: 0.6149089674295256

Accuracy with Correlation method: 0.6235250186706497
Precision with Correlation method: 0.6298253532965863
Recall with Correlation method: 0.6006847235808008
F1 Score with Correlation method: 0.6149089674295256

Accuracy with Mutual Information method: 0.6206498743974473
Precision with Mutual Information method: 0.6275924021623089
Recall with Mutual Information method: 0.5949396228960773
F1 Score with Mutual Information method: 0.6108243405151336

Accuracy with Covariance method: 0.6105509463567715
Precision with Covariance method: 0.6311129969889635
Recall with Covariance method: 0.5334617089659759
F1 Score with Covariance method: 0.5781229351223531

Accuracy with PCA: 0.5219093398510875
Precision with PCA: 0.5213835829318644
Recall with PCA: 0.5525550645933821
F1 Score with PCA: 0.536417505240929


## RANDOM FOREST

In [63]:
accuracy, precision, recall, f1 = random_forest_classifier(X_train_e, y_train_e)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print()

X_train_corr, X_test_corr = correlation_method(X_train_e, X_test_e, threshold=0.8)
accuracy_corr, precision_corr, recall_corr, f1_corr = random_forest_classifier(X_train_corr, y_train_e)
print("Accuracy with Correlation method:", accuracy_corr)
print("Precision with Correlation method:", precision_corr)
print("Recall with Correlation method:", recall_corr)
print("F1 Score with Correlation method:", f1_corr)
print()

X_train_mi, X_test_mi = mutual_information(X_train_e, y_train_e, X_test_e, n_features=10)
accuracy_mi, precision_mi, recall_mi, f1_mi = random_forest_classifier(X_train_mi, y_train_e)
print("Accuracy with Mutual Information method:", accuracy_mi)
print("Precision with Mutual Information method:", precision_mi)
print("Recall with Mutual Information method:", recall_mi)
print("F1 Score with Mutual Information method:", f1_mi)
print()

X_train_cov, X_test_cov = covariance_method(X_train_e, X_test_e, threshold=0.8)
accuracy_cov, precision_cov, recall_cov, f1_cov = random_forest_classifier(X_train_cov, y_train_e)
print("Accuracy with Covariance method:", accuracy_cov)
print("Precision with Covariance method:", precision_cov)
print("Recall with Covariance method:", recall_cov)
print("F1 Score with Covariance method:", f1_cov)
print()

def main(X_train_e, X_test_e, y_train_e, y_test_e):
    # Perform PCA
    n_components = 2  # Number of components to retain
    X_train_pca, X_test_pca = perform_pca(X_train_e, X_test_e, n_components)
    
    # Evaluate Logistic classifier using PCA
    accuracy, precision, recall, f1 = random_forest_classifier(X_train_pca, y_train_e)
    
    print("Accuracy with PCA:", accuracy)
    print("Precision with PCA:", precision)
    print("Recall with PCA:", recall)
    print("F1 Score with PCA:", f1)

if __name__ == "__main__":
    main(X_train_e, X_test_e, y_train_e, y_test_e)

Accuracy: 0.9543193548630441
Precision: 0.9367551678584348
Recall: 0.9745375345132488
F1 Score: 0.9552503322612143

Accuracy with Correlation method: 0.9543193548630441
Precision with Correlation method: 0.9367551678584348
Recall with Correlation method: 0.9745375345132488
F1 Score with Correlation method: 0.9552503322612143

Accuracy with Mutual Information method: 0.9529210250375298
Precision with Mutual Information method: 0.9344981689775482
Recall with Mutual Information method: 0.9742269755070376
F1 Score with Mutual Information method: 0.9539349381508929

Accuracy with Covariance method: 0.7009017433483453
Precision with Covariance method: 0.7024597700629867
Recall with Covariance method: 0.6983410752232218
F1 Score with Covariance method: 0.700214871362678

Accuracy with PCA: 0.8977622981118127
Precision with PCA: 0.8432892788047652
Recall with PCA: 0.9773325655691494
F1 Score with PCA: 0.9053693250383146


## SVM

In [70]:
accuracy, precision, recall, f1 = svm_classifier(X_train_e, y_train_e)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print()

X_train_corr, X_test_corr = correlation_method(X_train_e, X_test_e, threshold=0.8)
accuracy_corr, precision_corr, recall_corr, f1_corr = svm_classifier(X_train_corr, y_train_e)
print("Accuracy with Correlation method:", accuracy_corr)
print("Precision with Correlation method:", precision_corr)
print("Recall with Correlation method:", recall_corr)
print("F1 Score with Correlation method:", f1_corr)
print()

X_train_mi, X_test_mi = mutual_information(X_train_e, y_train_e, X_test_e, n_features=10)
accuracy_mi, precision_mi, recall_mi, f1_mi = svm_classifier(X_train_mi, y_train_e)
print("Accuracy with Mutual Information method:", accuracy_mi)
print("Precision with Mutual Information method:", precision_mi)
print("Recall with Mutual Information method:", recall_mi)
print("F1 Score with Mutual Information method:", f1_mi)
print()

X_train_cov, X_test_cov = covariance_method(X_train_e, X_test_e, threshold=0.8)
accuracy_cov, precision_cov, recall_cov, f1_cov = svm_classifier(X_train_cov, y_train_e)
print("Accuracy with Covariance method:", accuracy_cov)
print("Precision with Covariance method:", precision_cov)
print("Recall with Covariance method:", recall_cov)
print("F1 Score with Covariance method:", f1_cov)
print()

def main(X_train_e, X_test_e, y_train_e, y_test_e):
    # Perform PCA
    n_components = 2  # Number of components to retain
    X_train_pca, X_test_pca = perform_pca(X_train_e, X_test_e, n_components)
    
    # Evaluate Logistic classifier using PCA
    accuracy, precision, recall, f1 = svm_classifier(X_train_pca, y_train_e)
    
    print("Accuracy with PCA:", accuracy)
    print("Precision with PCA:", precision)
    print("Recall with PCA:", recall)
    print("F1 Score with PCA:", f1)

if __name__ == "__main__":
    main(X_train_e, X_test_e, y_train_e, y_test_e)

Accuracy: 0.5332520122811385
Precision: 0.5341418738001099
Recall: 0.5360885707539669
F1 Score: 0.5343578405498877

Accuracy with Correlation method: 0.5332520122811385
Precision with Correlation method: 0.5341418738001099
Recall with Correlation method: 0.5360885707539669
F1 Score with Correlation method: 0.5343578405498877

Accuracy with Mutual Information method: 0.5324752830772249
Precision with Mutual Information method: 0.533478892692074
Recall with Mutual Information method: 0.5339151395708551
F1 Score with Mutual Information method: 0.5329207946661308

Accuracy with Covariance method: 0.6032483158696751
Precision with Covariance method: 0.6647945258513248
Recall with Covariance method: 0.41779486240477237
F1 Score with Covariance method: 0.5130968559823589

Accuracy with PCA: 0.5285901283182837
Precision with PCA: 0.532699609411701
Recall with PCA: 0.4811290229317349
F1 Score with PCA: 0.5049174720322578


## NAIVE BAYES 

In [61]:
accuracy, precision, recall, f1 = naive_bayes_classifier(X_train_e, y_train_e)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print()

X_train_corr, X_test_corr = correlation_method(X_train_e, X_test_e, threshold=0.8)
accuracy_corr, precision_corr, recall_corr, f1_corr = naive_bayes_classifier(X_train_corr, y_train_e)
print("Accuracy with Correlation method:", accuracy_corr)
print("Precision with Correlation method:", precision_corr)
print("Recall with Correlation method:", recall_corr)
print("F1 Score with Correlation method:", f1_corr)
print()

X_train_mi, X_test_mi = mutual_information(X_train_e, y_train_e, X_test_e, n_features=10)
accuracy_mi, precision_mi, recall_mi, f1_mi = naive_bayes_classifier(X_train_mi, y_train_e)
print("Accuracy with Mutual Information method:", accuracy_mi)
print("Precision with Mutual Information method:", precision_mi)
print("Recall with Mutual Information method:", recall_mi)
print("F1 Score with Mutual Information method:", f1_mi)
print()

X_train_cov, X_test_cov = covariance_method(X_train_e, X_test_e, threshold=0.8)
accuracy_cov, precision_cov, recall_cov, f1_cov = naive_bayes_classifier(X_train_cov, y_train_e)
print("Accuracy with Covariance method:", accuracy_cov)
print("Precision with Covariance method:", precision_cov)
print("Recall with Covariance method:", recall_cov)
print("F1 Score with Covariance method:", f1_cov)
print()

def main(X_train_e, X_test_e, y_train_e, y_test_e):
    # Perform PCA
    n_components = 2  # Number of components to retain
    X_train_pca, X_test_pca = perform_pca(X_train_e, X_test_e, n_components)
    
    # Evaluate Logistic classifier using PCA
    accuracy, precision, recall, f1 = naive_bayes_classifier(X_train_pca, y_train_e)
    
    print("Accuracy with PCA:", accuracy)
    print("Precision with PCA:", precision)
    print("Recall with PCA:", recall)
    print("F1 Score with PCA:", f1)

if __name__ == "__main__":
    main(X_train_e, X_test_e, y_train_e, y_test_e)

Accuracy: 0.5752803313191662
Precision: 0.6767868982440396
Recall: 0.28986442858588435
F1 Score: 0.4057168902840556

Accuracy with Correlation method: 0.5752803313191662
Precision with Correlation method: 0.6767868982440396
Recall with Correlation method: 0.28986442858588435
F1 Score with Correlation method: 0.4057168902840556

Accuracy with Mutual Information method: 0.5698423518229343
Precision with Mutual Information method: 0.6625348151436157
Recall with Mutual Information method: 0.285984247984619
F1 Score with Mutual Information method: 0.39930938208916483

Accuracy with Covariance method: 0.614047570552425
Precision with Covariance method: 0.6557018384765475
Recall with Covariance method: 0.4812949033629035
F1 Score with Covariance method: 0.5551008613143397

Accuracy with PCA: 0.5090899434977105
Precision with PCA: 0.5739315092651125
Recall with PCA: 0.07297184971738889
F1 Score with PCA: 0.12920181009816653


## DECISION TREE

In [62]:
accuracy, precision, recall, f1 = decision_tree_classifier(X_train_e, y_train_e)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print()

X_train_corr, X_test_corr = correlation_method(X_train_e, X_test_e, threshold=0.8)
accuracy_corr, precision_corr, recall_corr, f1_corr = decision_tree_classifier(X_train_corr, y_train_e)
print("Accuracy with Correlation method:", accuracy_corr)
print("Precision with Correlation method:", precision_corr)
print("Recall with Correlation method:", recall_corr)
print("F1 Score with Correlation method:", f1_corr)
print()

X_train_mi, X_test_mi = mutual_information(X_train_e, y_train_e, X_test_e, n_features=10)
accuracy_mi, precision_mi, recall_mi, f1_mi = decision_tree_classifier(X_train_mi, y_train_e)
print("Accuracy with Mutual Information method:", accuracy_mi)
print("Precision with Mutual Information method:", precision_mi)
print("Recall with Mutual Information method:", recall_mi)
print("F1 Score with Mutual Information method:", f1_mi)
print()

X_train_cov, X_test_cov = covariance_method(X_train_e, X_test_e, threshold=0.8)
accuracy_cov, precision_cov, recall_cov, f1_cov = decision_tree_classifier(X_train_cov, y_train_e)
print("Accuracy with Covariance method:", accuracy_cov)
print("Precision with Covariance method:", precision_cov)
print("Recall with Covariance method:", recall_cov)
print("F1 Score with Covariance method:", f1_cov)
print()

def main(X_train_e, X_test_e, y_train_e, y_test_e):
    # Perform PCA
    n_components = 2  # Number of components to retain
    X_train_pca, X_test_pca = perform_pca(X_train_e, X_test_e, n_components)
    
    # Evaluate Logistic classifier using PCA
    accuracy, precision, recall, f1 = decision_tree_classifier(X_train_pca, y_train_e)
    
    print("Accuracy with PCA:", accuracy)
    print("Precision with PCA:", precision)
    print("Recall with PCA:", recall)
    print("F1 Score with PCA:", f1)

if __name__ == "__main__":
    main(X_train_e, X_test_e, y_train_e, y_test_e)

Accuracy: 0.8884394354297267
Precision: 0.8295980624847245
Recall: 0.9779528403259303
F1 Score: 0.8976797722904271

Accuracy with Correlation method: 0.8884394354297267
Precision with Correlation method: 0.8295980624847245
Recall with Correlation method: 0.9779528403259303
F1 Score with Correlation method: 0.8976797722904271

Accuracy with Mutual Information method: 0.88836288199395
Precision with Mutual Information method: 0.8284936951834588
Recall with Mutual Information method: 0.9799723171219444
F1 Score with Mutual Information method: 0.8978449263819395

Accuracy with Covariance method: 0.6979498947654287
Precision with Covariance method: 0.7054594608557766
Recall with Covariance method: 0.6807976234646724
F1 Score with Covariance method: 0.6927422838147158

Accuracy with PCA: 0.8794281274281274
Precision with PCA: 0.8170494871312307
Recall with PCA: 0.9781087221544941
F1 Score with PCA: 0.8903341906240044
