# Loan Eligibility Prediction

### import require packages

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.pandas.set_option('display.max_columns',None)

# 1. Load the Dataset

In [13]:
df = pd.read_csv("Loan_default.csv")

# 2. Perform Exploratory Data Analysis or Data preprocessing

In [14]:
df.shape

(255347, 18)

In [15]:
df.head()

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,I38PQUQS96,56.0,85994.0,50587.0,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0
1,HPSK72WA7R,69.0,50432.0,124440.0,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes,0
2,C1OZ6DPJ8Y,46.0,84208.0,129188.0,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No,1
3,V2KKSFM3UN,32.0,31713.0,44799.0,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No,0
4,EY08JDHTZP,60.0,20437.0,9139.0,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,,0


## 1. Missing Values

In [16]:
df.isnull().sum()


LoanID             0
Age               21
Income            51
LoanAmount         1
CreditScore        0
MonthsEmployed     0
NumCreditLines     0
InterestRate       0
LoanTerm           0
DTIRatio           1
Education         14
EmploymentType     0
MaritalStatus     26
HasMortgage       15
HasDependents     25
LoanPurpose       34
HasCoSigner       12
Default            0
dtype: int64

In [20]:

df['Income'].fillna(df['Income'].mean(), inplace=True)

df['Age'].fillna(df['Age'].mode()[0], inplace=True)
df['Education'].fillna(df['Education'].mode()[0],inplace=True)
df['MaritalStatus'].fillna(df['MaritalStatus'].mode()[0],inplace=True)
df['HasMortgage'].fillna(df['HasMortgage'].mode()[0],inplace=True)
df['HasDependents'].fillna(df['HasDependents'].mode()[0],inplace=True) 
df['HasDependents'].fillna(df['HasDependents'].mode()[0],inplace=True)                        

df.dropna(axis=0, inplace=True)

In [21]:
df.isnull().sum()


LoanID            0
Age               0
Income            0
LoanAmount        0
CreditScore       0
MonthsEmployed    0
NumCreditLines    0
InterestRate      0
LoanTerm          0
DTIRatio          0
Education         0
EmploymentType    0
MaritalStatus     0
HasMortgage       0
HasDependents     0
LoanPurpose       0
HasCoSigner       0
Default           0
dtype: int64

In [37]:
df.drop("LoanID",axis=1,inplace=True)

In [38]:
import sweetviz

In [39]:
my_report = sweetviz.analyze([df, "Data_set Visualization"],target_feat="Default")

                                             |      | [  0%]   00:00 -> (? left)

In [40]:
my_report.show_html("Report.html")

Report Report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


## 4. Label encoding

## 5. Deal with imbalanced Data (SMOTE)

In [8]:
# pip install imbalanced-Learn

class_0_count, class_1_count = df['Default'].value_counts()

# separate  the class 1, 0 into 2 df
df_class_0 = df[df['Default']==0]
df_class_1 = df[df['Default']==1]

print(df_class_0['Default'].value_counts())
print(df_class_1['Default'].value_counts())

0    225694
Name: Default, dtype: int64
1    29653
Name: Default, dtype: int64


In [9]:
## class 0 ==> 60000
#  class 1 ==> 29649
# merge together

df_class_0 = df_class_0.sample(60000)

df2 = pd.concat([df_class_1, df_class_0],axis= 0)
df2.value_counts()

LoanID      Age   Income    LoanAmount  CreditScore  MonthsEmployed  NumCreditLines  InterestRate  LoanTerm  DTIRatio  Education    EmploymentType  MaritalStatus  HasMortgage  HasDependents  LoanPurpose  HasCoSigner  Default
000ELHLBPV  37.0  81658.0   152598.0    792          23              4               14.06         24        0.86      PhD          Self-employed   Married        Yes          No             Auto         No           1          1
O22PN76VC1  22.0  66682.0   5172.0      755          40              4               4.54          36        0.24      Bachelor's   Unemployed      Single         No           Yes            Home         No           0          1
O25AGSOVAF  42.0  109920.0  209477.0    304          110             3               22.25         60        0.82      High School  Unemployed      Divorced       No           No             Education    No           1          1
O24OF1PXT6  26.0  91881.0   160067.0    626          35              4               

In [10]:
x = df2.drop(['LoanID','Default'],axis = 'columns')
y = df2['Default']
print(y.value_counts())
x.value_counts

0    60000
1    29653
Name: Default, dtype: int64


<bound method DataFrame.value_counts of          Age    Income  LoanAmount  CreditScore  MonthsEmployed  \
2       46.0   84208.0    129188.0          451              26   
5       25.0   90298.0     90448.0          720              18   
8       36.0   42053.0     92357.0          827              83   
11      28.0  149227.0    139759.0          375              56   
18      19.0   40718.0     78515.0          319             119   
...      ...       ...         ...          ...             ...   
108292  43.0  109963.0    180180.0          451              94   
232127  32.0   30804.0     27681.0          545              84   
111997  40.0   78053.0    233472.0          522              16   
173216  39.0   20095.0    151300.0          738              12   
19592   57.0   39443.0    190431.0          561              44   

        NumCreditLines  InterestRate  LoanTerm  DTIRatio    Education  \
2                    3         21.17        24      0.31     Master's   
5        

In [11]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy= 'minority',random_state=12345)
x_sm, y_sm = smote.fit_resample(x,y)
y_sm.value_counts()
df2 = pd.concat(x_sm, y_sm, axis = 1)



KeyboardInterrupt



# 2. Feature Selection

### Scaling the data

In [None]:
from sklearn.model_selection import train_test_split

X_train, x_test, Y_train, y_test = train_test_split(x_sm, y_sm, train_size= 0.7, random_state=12350, stratify=y_sm)

# 3. Model Building

In [None]:
class Models:
    def __init__(self ,X_train, Y_train):
        self.X_train = X_train
        self.Y_train = Y_train
        

    # Built the Logistic Regression model
    def model_LG(self):
        from sklearn.linear_model import LogisticRegressionCV
        model_lg = LogisticRegressionCV().fit(self.X_train, self.Y_train)
        return model_lg

    # Built the Naive Bayes 
    def model_NB(self):
        from sklearn.naive_bayes import GaussianNB
        model_nb = GaussianNB().fit(self.X_train, self.Y_train)
        return model_nb

    # Built the KNN model
    def model_KNN(self):
        from sklearn.neighbors import KNeighborsClassifier
        model_knn = KNeighborsClassifier().fit(self.X_train, self.Y_train)
        return model_knn

    # Built the Support Vector Machine
    def model_SVM(self):
        from sklearn.svm import SVC
        model_svm = SVC(C=3.0).fit(self.X_train, self.Y_train)
        return model_svm
        
    # Built the Decison tree model 
    def model_DT(self):
        from sklearn.tree import DecisionTreeClassifier
        model_dt = DecisionTreeClassifier().fit(self.X_train, self.Y_train)
        return model_dt

    # Built the model for Random forest
    def model_Rand_forest(self):
        from sklearn.ensemble import RandomForestClassifier
        model_rand_forest = RandomForestClassifier().fit(self.X_train, self.Y_train)
        return model_rand_forest

    # Built the catboost model
    def model_Cat_Boost(self):
        from catboost import CatBoostClassifier
        model_catboost = CatBoostClassifier().fit(self.X_train, self.Y_train)
        return model_catboost

    # Built the adaboost model
    def model_Ada_Boost(self):
        from sklearn.ensemble import AdaBoostClassifier
        model_adaboost = AdaBoostClassifier().fit(self.X_train, self.Y_train)
        return model_adaboost

    def model_XG_Boost(self):
        from xgboost import XGBClassifier
        model_xgboost = XGBClassifier().fit(self.X_train, self.Y_train)
        return model_xgboost

## Model Evaluation code

In [None]:
class Model_Evaluation_class:
    def __init__(self ,x_test, y_test):
        self.x_test = x_test
        self.y_test = y_test

    def Classification_Report(self, model):
        from sklearn.metrics import classification_report
        return classification_report(self.y_test, model.predict(self.x_test))

    def model_evaluate(self, model, model_name):
        from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score,f1_score
        
        Accuracy = accuracy_score(self.y_test, model.predict(self.x_test))
        Precision = precision_score(self.y_test, model.predict(self.x_test))
        Recall = recall_score(self.y_test, model.predict(self.x_test))
        f1_Score = f1_score(self.y_test, model.predict(self.x_test))
        return model_name,Accuracy,Precision,Recall,f1_Score

### Build multiple model

In [None]:
m2 = Models(X_train,Y_train)
models = [(m2.model_LG(),"LogisticRegression",(0,0)),(m2.model_KNN(), "KNeighborsClassifier",(0,1)),
          (m2.model_NB(),"Naive Bayes",(1,0)),(m2.model_SVM(),"Support Vector Machine",(1,1)),
          (m2.model_DT(),"DecisionTreeClassifier",(2,0)),(m2.model_Rand_forest(),"RandomForest",(2,1)),
         (m2.model_Ada_Boost(), "AdaBoost",(3,0)), (m2.model_Cat_Boost(),"Cat_Boost",(3,1)),
         (m2.model_XG_Boost(), "XGBoost",(4,0))]

## Evaluate all models


In [None]:
model_evaluation_obj = Model_Evaluation_class(x_test, y_test)
Results2 = []
for model,model_name,_ in models:
    Results2.append(model_evaluation_obj.model_evaluate(model, model_name))

df2 = pd.DataFrame(Results2,columns=['Algorithm','Accuracy', 'Precision', 'Recall', 'F1_Score'])
df2

## Classification report for all models

In [None]:
for model,model_name,_ in models:
    print(f"Classification report for {model_name} is :")
    print(model_evaluation_obj.Classification_Report(model))
    print("-"*80)

In [None]:
# Initialize the Models class


# List of models with their corresponding labels

from sklearn.metrics import roc_curve, auc

plt.figure(figsize=(10, 6))

for model, name, _ in models:
    # Fit the model
    
    
    # Predict probabilities or decision function
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(x_test)[:, 1]
    else:
        y_prob = model.decision_function(x_test)
    
    # Compute ROC curve and ROC area
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)
    
    # Plot ROC curve
    plt.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {roc_auc:.2f})')

# Plotting the diagonal line
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='No Discrimination')

# Customizing the plot
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curves')
plt.legend(loc="lower right")

plt.show()

In [None]:

from catboost import CatBoostClassifier, Pool

# Get feature importances


m2 = Models(X_train,Y_train)
model = m2.model_Cat_Boost()
feature_importances = model.get_feature_importance(Pool(X_train, label=Y_train))
feature_names = x.columns

# Create a DataFrame for plotting
importance_df = pd.DataFrame({
    'Features': feature_names,
    'Importance': feature_importances
})

# Sort by importance
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Plot the feature importances
plt.figure(figsize=(10, 6))
plt.barh(importance_df['Features'], importance_df['Importance'], color='skyblue')
plt.xlabel('F score')
plt.ylabel('Features')
plt.title('Feature Importances: CatBoost')
plt.gca().invert_yaxis()  # To display the most important feature at the top
plt.show()

In [None]:
# Get feature importances
feature_importances = model.get_feature_importance(Pool(X_train, label=Y_train))
feature_names = x.columns

# Create a DataFrame for plotting
importance_df = pd.DataFrame({
    'Features': feature_names,
    'Importance': feature_importances
})

# Sort by importance
importance_df = importance_df.sort_values(by='Importance', ascending=True)

# Plot the feature importances
plt.figure(figsize=(8, 5))
plt.barh(importance_df['Features'], importance_df['Importance'], color='skyblue')
plt.xlabel('F score', fontsize=15)
plt.ylabel('Features', fontsize=15)
plt.title('plot_importance: importance_type="weighted"', fontsize=15)

# Add text labels for each bar, aligned to the right, with increased font size
for i, v in enumerate(importance_df['Importance']):
    plt.text(v , i, str(int(v)), color='black', va='center', fontsize=15)

# Increase font size of tick labels
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

plt.tight_layout()
plt.show()