In [3]:
# import all the necessary libraries here
import pandas as pd
import numpy as np

In [4]:
# read dataset
df = pd.read_csv('cross-validation.csv')
print(df.shape)

(614, 13)


In [5]:
print(df.columns)

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')


In [6]:
print(df.head())

    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N  
2             1.0   

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [8]:
#remove all rows containing the null values
df_clean = df.dropna()
print(df_clean.shape)

(480, 13)


In [9]:
# Data Preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
X = df_clean.drop(columns=['Loan_Status'])
y = df_clean['Loan_Status']


In [10]:
# Normalize numerical feature using StandardScaler
scale = StandardScaler()
numerical = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']
X[numerical] = scale.fit_transform(X[numerical])
# Encode categorical variables using Label Encoding
categorical= ['Loan_ID','Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']
encoder = LabelEncoder()

for f in categorical:
    X[f] = encoder.fit_transform(X[f])

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 384 entries, 172 to 137
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            384 non-null    int64  
 1   Gender             384 non-null    int64  
 2   Married            384 non-null    int64  
 3   Dependents         384 non-null    int64  
 4   Education          384 non-null    int64  
 5   Self_Employed      384 non-null    int64  
 6   ApplicantIncome    384 non-null    float64
 7   CoapplicantIncome  384 non-null    float64
 8   LoanAmount         384 non-null    float64
 9   Loan_Amount_Term   384 non-null    float64
 10  Credit_History     384 non-null    float64
 11  Property_Area      384 non-null    int64  
dtypes: float64(5), int64(7)
memory usage: 39.0 KB


In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Train a Logistic Regression
logistic = LogisticRegression(solver='saga', random_state=42, max_iter=4000)

# Train the model on the training data
logistic.fit(X_train, y_train)
y_pred = logistic.predict(X_test)
a = accuracy_score(y_test, y_pred)
p = precision_score(y_test, y_pred, pos_label='Y')
r = recall_score(y_test, y_pred, pos_label='Y')
print("Accuracy = ", a)
print("Precision = ", p)
print("Recall = ", r)

Accuracy =  0.8229166666666666
Precision =  0.8
Recall =  1.0


In [18]:
#Cross Validate the classifier with 5-folds
# print the mean accuracy, precision and recall
# k fold validation

k=5
accuracy= []
precision= []
recall= []

# split in equal part/folds len(X_train/k)
fold=len(X_train) // k

# take each part as validation and rest 4 as training
for f in range(k):
    start = f * fold
    end = (f + 1) * fold

    # intially take 0 to fold as validation and rest as train
    # at every itration increase the fold size by equal number
    Xf_train = np.concatenate((X_train[:start], X_train[end:]), axis=0)
    yf_train = np.concatenate((y_train[:start], y_train[end:]), axis=0)
    X_val = X_train[start:end]
    y_val = y_train[start:end]

    #train the logistic regression on subpart
    logistic = LogisticRegression(solver='saga', random_state=42,max_iter=5000)

    # Train the model on the training data
    logistic.fit(Xf_train, yf_train)

    #predict on the validation
    y_pred = logistic.predict(X_val)
    a = accuracy_score(y_val, y_pred)
    p = precision_score(y_val, y_pred,pos_label='Y')
    r = recall_score(y_val, y_pred,pos_label='Y')
    accuracy.append(a)
    precision.append(p)
    recall.append(r)

#  print the mean accuracy, precision and recall for the class 1(good) for the classifier.
print("mean accuracy = ", np.mean(accuracy))
print("mean precision = ", np.mean(precision))
print("mean recall = ", np.mean(recall))



mean accuracy =  0.8026315789473685
mean precision =  0.7900510102147127
mean recall =  0.9735603641641377


