<h1><b>Loan Status Prediction - Logistic Regression</b></h1>

<b>1. Import Libraries</b>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt 
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE


<b>2. Data Cleaning</b>

In [None]:
# Read the loan records from the CSV file and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='loan-dataset.csv')
df.head(n=5)

In [None]:
# Remove the Loan_ID column, then discard every row that has a missing value.
# Both operations modify `df` in place.
df.drop(columns=["Loan_ID"], inplace=True)
df.dropna(axis="index", how="any", inplace=True)

In [None]:
# Dependents is stored as text because of the open-ended '3+' bucket.
# Collapse that bucket to '3' and convert the column to integers.
# pd.to_numeric is left at its default (errors='raise') so that an unexpected
# value fails here with a clear message instead of becoming NaN and breaking
# the integer cast afterwards.
df['Dependents'] = pd.to_numeric(df['Dependents'].replace('3+', '3')).astype(int)

# Sanity check: show the distinct values after conversion.
print(df['Dependents'].unique())

df.head()

In [None]:
# Binary text columns and the integer code assigned to each label.
binary_encodings = {
    'Gender': {'Male': 1, 'Female': 0},
    'Married': {'Yes': 1, 'No': 0},
    'Self_Employed': {'Yes': 1, 'No': 0},
    'Loan_Status': {'Y': 1, 'N': 0},
    'Education': {'Graduate': 1, 'Not Graduate': 0},
}

# Series.map followed by an explicit astype(int) guarantees an integer dtype.
# The previous Series.replace relied on pandas silently downcasting the object
# column, which is deprecated. Any label missing from a mapping becomes NaN
# and makes the cast fail loudly.
for column, mapping in binary_encodings.items():
    df[column] = df[column].map(mapping).astype(int)

# Property_Area gets integer codes 1..k in order of first appearance.
df['Property_Area'] = pd.factorize(df['Property_Area'])[0] + 1

# Cast the remaining numeric columns to integers. to_numeric is left at its
# default (errors='raise') so bad values are reported where they occur.
df["Credit_History"] = pd.to_numeric(df['Credit_History']).astype(int)
df["LoanAmount"] = pd.to_numeric(df['LoanAmount']).astype(int)

In [None]:
# Multiply LoanAmount by 1000 (presumably the raw column is expressed in
# thousands - TODO confirm). Re-running this cell scales the column again.
df["LoanAmount"] = df["LoanAmount"].mul(1000)

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap.
corr = df.corr()
sns.heatmap(data=corr, annot=True, cmap='coolwarm')
plt.show()

In [None]:
# Drop the features that are negatively correlated with Loan_Status.
df.drop(columns=["CoapplicantIncome", "Loan_Amount_Term", "Self_Employed"], inplace=True)

In [None]:
# Recompute the correlation matrix on the remaining columns and plot it again.
corr = df.corr()
sns.heatmap(data=corr, annot=True, cmap='coolwarm')
plt.show()

In [None]:
# One bar chart per categorical feature, showing how many loans fall into each
# Loan_Status class for every category of that feature.
categorical_features = [
    'Gender', 'Married', 'Dependents', 'Education', 'Credit_History', 'Property_Area',
]
for feature in categorical_features:
    ax = pd.crosstab(df[feature], df['Loan_Status']).plot(kind="bar")
    # Label each figure so it can be read on its own.
    ax.set_title(f"Loan_Status by {feature}")
    ax.set_ylabel("Count")
plt.show()

<b>3. Imbalance Handling</b>

In [None]:
# (rows, columns) of the subset whose Loan_Status equals 1.
df.loc[df['Loan_Status'] == 1].shape

In [None]:
# (rows, columns) of the subset whose Loan_Status equals 0.
df.loc[df['Loan_Status'] == 0].shape

In [None]:
# Separate the target column from the predictors.
y = df['Loan_Status']
X = df.drop(columns=["Loan_Status"])

# Oversample with SMOTE (fixed seed) to obtain the resampled pair Xr, yr.
smote = SMOTE(random_state=42)
Xr, yr = smote.fit_resample(X, y)

In [None]:
# Same per-feature bar charts as before, drawn from the SMOTE-resampled data
# (Xr, yr) so the class distribution after oversampling can be inspected.
categorical_features = [
    'Gender', 'Married', 'Dependents', 'Education', 'Credit_History', 'Property_Area',
]
for feature in categorical_features:
    ax = pd.crosstab(Xr[feature], yr).plot(kind="bar")
    # Label each figure so it can be read on its own.
    ax.set_title(f"Loan_Status by {feature} (after SMOTE)")
    ax.set_ylabel("Count")
plt.show()

<b>4. Data Preparation</b>

In [None]:
# Preview the first five rows of the resampled feature matrix.
Xr.head(n=5)

In [None]:
# Split the original (non-resampled) data first so that the test partition
# contains only real observations. Stratify to keep the class ratio the same
# in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=64, stratify=y
)

# Oversample the minority class on the training partition only. Resampling
# before the split lets synthetic points derived from test rows leak into the
# training data and puts synthetic rows into the test set.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

# Standardise the features using statistics learned from the training
# partition only. `scaler` is pickled at the end of the notebook so that
# inference can apply the same transformation.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

<b>5. Create the Logistic Regression Model</b>

In [None]:
# Hyper-parameter grid for LogisticRegression.
# 'liblinear' supports both the L1 and L2 penalties, while 'lbfgs' supports L2
# only. Separate sub-grids keep the invalid lbfgs + l1 combination out of the
# search; otherwise those fits fail and are scored as NaN.
param_grid = [
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': [0.1, 1, 10]},
    {'solver': ['lbfgs'], 'penalty': ['l2'], 'C': [0.1, 1, 10]},
]

lr = LogisticRegression(max_iter=1000)

# 5-fold cross-validated search, ranking candidates by ROC AUC.
grid_search = GridSearchCV(lr, param_grid, cv=5, scoring='roc_auc')

grid_search.fit(X_train, y_train)

print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)


In [None]:
# Use the estimator that GridSearchCV refitted on the whole training set with
# the best parameters it found (refit=True is the default). This avoids
# hard-coding hyper-parameters copied from an earlier run, which go stale as
# soon as the data or the grid changes.
regression = grid_search.best_estimator_

# Mean accuracy on the held-out test set.
regression.score(X_test, y_test)

In [None]:
# Per-class precision, recall and F1 on the held-out test set.
y_pred = regression.predict(X_test)
print(classification_report(y_test, y_pred))

<b>6. Pickling the Logistic Model</b>

In [None]:
# Serialise the trained classifier to disk.
with open('logistic.pkl', 'wb') as model_file:
    pickle.dump(regression, model_file)

# Serialise the feature scaler next to it. `scaler` has to exist in the kernel
# before this cell runs, and should be the scaler that was fitted on the
# training features.
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [None]:
# Display the training feature matrix as the cell output.
X_train

In [None]:
# Display the training target values as the cell output.
y_train