## Import required library packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix, roc_auc_score, roc_curve,ConfusionMatrixDisplay, classification_report

In [6]:
!pip install --upgrade scikit-learn



## Import dataset

In [None]:
# Read the CSV data file into an in-memory DataFrame
loan_df = pd.read_csv("Loan Prediction Dataset.csv")

In [None]:
# Show the first 5 rows of the data
loan_df.head()

In [None]:
# dimension of data set
loan_df.shape

In [None]:
# data types of columns in data set
loan_df.dtypes

### Analyse and fill in missing data

In [None]:
# sum up missing data per column
loan_df.isnull().sum()

#### Fill in categorical missing features with the mode

In [None]:
# Create separate memory for the dataset that will have its missing data filled in.
# .copy() is required: a plain assignment only creates a second name for the same
# DataFrame, so the imputation below would also overwrite the raw `loan_df`.
loan_no_missing_data_df = loan_df.copy()

In [None]:
# Gender, Married, Dependents, Self_Employed, Loan_Amount_Term and Credit_History are categorical features,
# therefore the mode (most frequent value) of each column is used to fill in its missing data.
#
# The filled column is assigned back to the DataFrame instead of calling
# `df.col.fillna(..., inplace=True)`: that form operates on an intermediate Series, emits a
# FutureWarning on recent pandas and does not update the DataFrame once Copy-on-Write is enabled.
categorical_cols = ["Gender", "Married", "Dependents", "Self_Employed", "Loan_Amount_Term", "Credit_History"]

for col in categorical_cols:
    # mode() returns a Series (there can be ties); take the first value
    col_mode = loan_no_missing_data_df[col].mode()[0]
    loan_no_missing_data_df[col] = loan_no_missing_data_df[col].fillna(col_mode)

#### Fill in continuous missing features with the mean

In [None]:
# LoanAmount is a continuous feature, therefore the column mean is used to fill in its missing data.
# The result is assigned back rather than using `inplace=True` on the attribute-accessed Series,
# which does not reliably update the DataFrame (chained assignment; no-op under Copy-on-Write).
loan_no_missing_data_df["LoanAmount"] = loan_no_missing_data_df["LoanAmount"].fillna(
    loan_no_missing_data_df["LoanAmount"].mean()
)

In [None]:
# confirm there is no more missing data
loan_no_missing_data_df.isnull().sum()

In [None]:
# save your new dataset with no missing data
loan_no_missing_data_df.to_csv("loan-no-missing-data.csv", index = False)

In [None]:
# Reload the imputed dataset from the CSV file written above; the following steps work on this freshly read DataFrame
loan_no_missing_data_df = pd.read_csv("loan-no-missing-data.csv")

In [None]:
loan_no_missing_data_df.head()

## Check for outliers in continuous features

In [None]:
loan_no_missing_data_describe_df = loan_no_missing_data_df[["ApplicantIncome","CoapplicantIncome", "LoanAmount"]].describe()

In [None]:
loan_no_missing_data_describe_df

In [None]:
# Interquartile-range check: find the lower and upper outlier limits for each continuous feature
def IQR_check(df):
    """Compute the IQR outlier limits for every column of a `describe()` table.

    Parameters
    ----------
    df : DataFrame whose index contains the labels "25%" and "75%"
        (as produced by `DataFrame.describe()`), one column per feature.

    Returns
    -------
    dict mapping each column name to `[lower_limit, upper_limit]`, where
    lower_limit = Q1 - 1.5 * IQR, upper_limit = Q3 + 1.5 * IQR and IQR = Q3 - Q1.
    """
    IQR_info = {}
    for col in df.columns:
        q1 = df[col].loc["25%"]
        q3 = df[col].loc["75%"]
        # The fences are based on the spread between the quartiles, not on the quartile values themselves
        iqr = q3 - q1
        IQR_info[col] = [q1 - 1.5 * iqr, q3 + 1.5 * iqr]
    return IQR_info

In [None]:
# call the IQR function with data
IQR_check(loan_no_missing_data_describe_df)

In [None]:
IQR_range = IQR_check(loan_no_missing_data_describe_df)

In [None]:
# For each continuous feature, add a "<feature>_is_outlier" column holding "Yes" when the value lies
# below the lower limit or above the upper limit stored in IQR_range, and "No" otherwise
for feature in ("ApplicantIncome", "CoapplicantIncome", "LoanAmount"):
    lower_limit, upper_limit = IQR_range[feature]
    feature_values = loan_no_missing_data_df[feature]
    outside_limits = (feature_values < lower_limit) | (feature_values > upper_limit)
    loan_no_missing_data_df[f"{feature}_is_outlier"] = np.where(outside_limits, "Yes", "No")

# "All_outliers" is "Yes" only for rows flagged in all three of the columns created above
flag_columns = ["ApplicantIncome_is_outlier", "CoapplicantIncome_is_outlier", "LoanAmount_is_outlier"]
flagged_everywhere = loan_no_missing_data_df[flag_columns].eq("Yes").all(axis=1)
loan_no_missing_data_df['All_outliers'] = np.where(flagged_everywhere, "Yes", "No")

In [None]:
loan_no_missing_data_df

In [None]:
# Count the "Yes"/"No" outlier flags for each continuous feature, and for rows flagged on all of them
loan_no_missing_data_df.ApplicantIncome_is_outlier.value_counts(), loan_no_missing_data_df.CoapplicantIncome_is_outlier.value_counts(), loan_no_missing_data_df.LoanAmount_is_outlier.value_counts(), loan_no_missing_data_df.All_outliers.value_counts()

In [None]:
# Focus on the rows where all continuous columns are outliers
loan_no_missing_data_df[loan_no_missing_data_df["All_outliers"] == "Yes"]

In [None]:
# Remove every row in which all of the continuous features were flagged as outliers
all_outlier_rows = loan_no_missing_data_df.index[loan_no_missing_data_df["All_outliers"] == "Yes"]
loan_no_missing_data_df = loan_no_missing_data_df.drop(index=all_outlier_rows)

In [None]:
loan_no_missing_data_df.shape

In [None]:
# Remove Loan_ID, as it does not give additional information to our data, and remove the outlier
# flag columns added above, as these are no longer needed.
# Note: this drops in place, so re-running the cell raises a KeyError because the columns are already gone.
loan_no_missing_data_df.drop(columns=["Loan_ID", "All_outliers", "ApplicantIncome_is_outlier", "CoapplicantIncome_is_outlier", "LoanAmount_is_outlier"], inplace=True)


In [None]:
# New feature `TotalIncome`: the applicant's income plus the co-applicant's income, row by row
applicant_income = loan_no_missing_data_df['ApplicantIncome']
coapplicant_income = loan_no_missing_data_df['CoapplicantIncome']
loan_no_missing_data_df['TotalIncome'] = applicant_income.add(coapplicant_income)

In [None]:
loan_no_missing_data_df

## Graphical analysis of features that are continuous 

In [None]:
# The style sheet must be selected BEFORE the figure is created: plt.style.use only changes the
# defaults for figures created afterwards, so calling it after plotting leaves this figure unstyled.
plt.style.use("seaborn-v0_8")

# Create a 2x2 grid of subplots, one histogram + density curve per continuous feature
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# axes.flat walks the grid row by row: top-left, top-right, bottom-left, bottom-right
continuous_features = ["ApplicantIncome", "CoapplicantIncome", "TotalIncome", "LoanAmount"]
for ax, feature in zip(axes.flat, continuous_features):
    ax.set_title(feature)
    sns.histplot(loan_no_missing_data_df[feature], color="blue", kde=True,
                 stat="density", ax=ax)

In [None]:
# Create table for just features that are continuous 
loan_no_missing_data_cf =  loan_no_missing_data_df[['ApplicantIncome','CoapplicantIncome', 'LoanAmount', 'TotalIncome']].copy()

In [None]:
corr = loan_no_missing_data_cf.corr()

In [None]:
# see the correlation heatmap
plt.figure(figsize=(10,5))
sns.heatmap(corr, annot = True, cmap = "BuPu");

In [None]:
# Drop the "ApplicantIncome" and "CoapplicantIncome" columns now that the TotalIncome column holds their sum.
# Note: dropped in place, so this cell cannot be re-run without first recreating the DataFrame.
loan_no_missing_data_df.drop(columns=['ApplicantIncome','CoapplicantIncome'], inplace=True)
loan_no_missing_data_df.head()

In [None]:
# save the latest dataframe
loan_no_missing_data_df.to_csv("loan-no-missing-data.csv", index = False)

In [None]:
# Select the style sheet before creating the figure; applying it afterwards has no effect on this plot
plt.style.use("seaborn-v0_8")

# Histogram and density curve showing the distribution of both continuous columns (TotalIncome & LoanAmount)
fig, axes = plt.subplots(1, 2, figsize=(15, 4))
for ax, feature in zip(axes, ["TotalIncome", "LoanAmount"]):
    ax.set_title(feature)
    sns.histplot(loan_no_missing_data_df[feature], color="blue", kde=True,
                 stat="density", ax=ax)

## Feature Scaling

In [None]:
# Apply normalisation to the LoanAmount column.
# min/max are computed once and the arithmetic is vectorised; the previous per-row lambda
# recomputed both for every row. The resulting values are unchanged.
# NOTE(review): the numerator ADDS the minimum, so this is not a standard min-max scaling
# ((x - min) / (max - min)) and the result is not confined to [0, 1]. It is deliberately left as-is here
# because the later transformation cells take np.log of this column, and a true min-max would map the
# minimum to 0 (log -> -inf). Change both steps together if a real min-max scaling is intended.
loan_amount = loan_no_missing_data_df["LoanAmount"]
loan_amount_min = loan_amount.min()
loan_amount_max = loan_amount.max()
loan_no_missing_data_df["LoanAmount"] = (loan_amount + loan_amount_min) / (loan_amount_max - loan_amount_min)

In [None]:
# Apply normalisation to the TotalIncome column.
# min/max are computed once and the arithmetic is vectorised; the previous per-row lambda
# recomputed both for every row. The resulting values are unchanged.
# NOTE(review): the numerator ADDS the minimum, so this is not a standard min-max scaling
# ((x - min) / (max - min)) and the result is not confined to [0, 1]. It is deliberately left as-is here
# because the later transformation cells take np.log of this column, and a true min-max would map the
# minimum to 0 (log -> -inf). Change both steps together if a real min-max scaling is intended.
total_income = loan_no_missing_data_df["TotalIncome"]
total_income_min = total_income.min()
total_income_max = total_income.max()
loan_no_missing_data_df["TotalIncome"] = (total_income + total_income_min) / (total_income_max - total_income_min)

In [None]:
# Let's see what these two continuous columns look like after the transformation
loan_no_missing_data_df[["LoanAmount","TotalIncome"]].head()

In [None]:
# save the latest dataframe
loan_no_missing_data_df.to_csv("loan-no-missing-data.csv", index = False)

## Data Transformation

In [None]:
# Select the style sheet before creating the figure; applying it afterwards has no effect on this plot
plt.style.use("seaborn-v0_8")

# Adjust the skewness of both continuous columns using a logarithm and redraw the graphs.
# The transformed values are kept in separate Series instead of overwriting the DataFrame columns,
# so re-running this cell does not take the log of already log-transformed data.
log_total_income = np.log(loan_no_missing_data_df['TotalIncome'])
log_loan_amount = np.log(loan_no_missing_data_df['LoanAmount'])

fig, axes = plt.subplots(1, 2, figsize=(15, 4))
axes[0].set_title("TotalIncome")
sns.histplot(log_total_income, color="blue", kde=True,
    stat="density",ax=axes[0]);

axes[1].set_title("LoanAmount")
sns.histplot(log_loan_amount, color="blue", kde=True,
    stat="density",ax=axes[1]);

In [None]:
# With the logarithm transformation LoanAmount ends up skewed to the left, so try a square-root
# transformation on LoanAmount instead (TotalIncome keeps the logarithm).
# The DataFrame is first re-read from the last saved CSV so the transformations start from the saved values.
loan_no_missing_data_df = pd.read_csv("loan-no-missing-data.csv")
loan_no_missing_data_df['TotalIncome'] = np.log(loan_no_missing_data_df['TotalIncome'])
loan_no_missing_data_df['LoanAmount'] = np.sqrt(loan_no_missing_data_df['LoanAmount'])

In [None]:
# Draw the histogram and density curve of 'TotalIncome' and 'LoanAmount' side by side
fig, axes = plt.subplots(1, 2, figsize=(15, 4))
for ax, feature in zip(axes, ["TotalIncome", "LoanAmount"]):
    ax.set_title(feature)
    sns.histplot(loan_no_missing_data_df[feature], color="blue", kde=True,
                 stat="density", ax=ax)

In [None]:
# save the latest dataframe
loan_no_missing_data_df.to_csv("loan-no-missing-data.csv", index = False)

In [None]:
# Get unique values in column "Dependents" & "Loan_Amount_Term"
loan_no_missing_data_df["Dependents"].unique(),loan_no_missing_data_df["Loan_Amount_Term"].unique()

In [None]:
# A hand-written label encoder
class LabelEncoding:
    """Map the distinct values of a column to integer codes.

    Codes are assigned in order of first appearance (0, 1, 2, ...). The learned
    mapping is stored in `self.out` once `fit` has been called.
    """

    def __init__(self):
        pass

    def fit(self, col):
        """Learn the value -> code mapping from the unique values of `col`."""
        self.out = {label: code for code, label in enumerate(col.unique())}

    def transform(self, col):
        """Return `col` with every value replaced by its learned code.

        A value that was not present during `fit` raises a KeyError.
        """
        def encode(label):
            return self.out[label]

        return col.apply(encode)

    def fit_transform(self, col):
        """Learn the mapping from `col`, then return the encoded column."""
        self.fit(col)
        return self.transform(col)

In [None]:
# Apply label encoding to the categorical columns to make them numeric
cols = [
    "Gender",
    "Dependents",
    "Married",
    "Education",
    "Self_Employed",
    "Property_Area",
    "Loan_Status",
    "Loan_Amount_Term",
]

# A single encoder is re-fitted for each column, so afterwards it only remembers the last column's classes
le = LabelEncoder()

for col in cols:
    column_codes = le.fit_transform(loan_no_missing_data_df[col])
    loan_no_missing_data_df[col] = column_codes

In [None]:
loan_no_missing_data_df.head()

## Data Modelling

#### Model training

In [None]:
# Proceed to build and test our model.
# Separate the label (`y`, the Loan_Status column) from the features (`X`, every other column)
y = loan_no_missing_data_df["Loan_Status"]
X = loan_no_missing_data_df.drop(columns="Loan_Status")

In [None]:
# Split into train and test data: 20% is held out for testing, the split is stratified on y,
# and random_state is fixed so the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=1000,stratify=y)

In [None]:
# Proportion of each label in the full dataset
y.value_counts(normalize=True)

In [None]:
# Proportion of each label in the training split
y_train.value_counts(normalize=True)

In [None]:
# function to fit and score model
def classify(model, x_t, y_t):
    """Fit `model` and print its hold-out accuracy and 5-fold cross-validation score.

    Parameters
    ----------
    model : estimator exposing `fit` and `score`.
    x_t : feature data.
    y_t : labels matching `x_t`.

    The data passed in is split 80/20 (stratified on `y_t`, fixed random_state) for the
    hold-out accuracy; the cross-validation score is computed on all of `x_t`, `y_t`.
    Nothing is returned; both scores are printed as percentages. `model` is left fitted
    on the 80% training portion.
    """
    # Split the data handed to the function, not the notebook-level X and y,
    # so the arguments are actually what gets evaluated
    x_fit, x_holdout, y_fit, y_holdout = train_test_split(
        x_t, y_t, test_size=0.20, random_state=1000, stratify=y_t
    )

    model.fit(x_fit, y_fit)
    print("The accuracy: ", model.score(x_holdout, y_holdout) * 100)

    score = cross_val_score(model, x_t, y_t, cv=5)
    print("Cross Validate score: ", np.mean(score) * 100)

#### Using logistic regression algorithm

In [None]:
# Using logistic Regression
log_model = LogisticRegression()

In [None]:
# fit / train model
log_model.fit(X_train, y_train);

In [None]:
# Make a prediction with your test features
log_model.predict(X_test)

In [None]:
# actual test label
y_test.to_numpy()

In [None]:
# accuracy score for logistic Regression 
accuracy_score(y_test, log_model.predict(X_test))

In [None]:
# precision score for logistic Regression 
precision_score(y_test, log_model.predict(X_test))

In [None]:
# recall score for logistic Regression 
recall_score(y_test, log_model.predict(X_test))

In [None]:
score = cross_val_score(log_model, X, y, cv=5)
print("Cross Validate score: ", np.mean(score) * 100)

In [None]:
# confusion_matrix for logistic Regression 
confusion_matrix(y_test, log_model.predict(X_test))

In [None]:
# produce more colourful confusion matrix
def customise_confusion_matrix(y_true, y_pred, normalize=False, title=None, cmap=plt.cm.Purples):
    """Plot the confusion matrix of `y_pred` against `y_true` as a heatmap.

    Parameters
    ----------
    y_true : actual labels.
    y_pred : predicted labels.
    normalize : when True, each row (actual label) is scaled to sum to 1 and the cells are
        shown with two decimals; when False (default) raw counts are shown.
    title : plot title; defaults to 'Confusion Matrix' when None.
    cmap : matplotlib colormap used for the heatmap.

    The figure is shown with plt.show(); nothing is returned.
    """
    # `normalize` and `title` used to be accepted but ignored; they are now honoured,
    # and the defaults reproduce the previous output exactly
    cm = confusion_matrix(y_true, y_pred, normalize="true" if normalize else None)
    plt.figure(figsize=(4, 3))
    sns.heatmap(cm, annot=True, fmt='.2f' if normalize else 'g', cmap=cmap)
    plt.xlabel('Predicted labels')
    plt.ylabel('Actual labels')
    plt.title('Confusion Matrix' if title is None else title)
    plt.show()

In [None]:
# confusion_matrix for logistic Regression 
customise_confusion_matrix(y_test, log_model.predict(X_test))

In [None]:
# OR
import sklearn

# Note: this display is built from the fitted model rather than from precomputed predictions.
# Also note that all of the X and y data is passed in (not only the test split),
# which is why the matrix contains more samples than the ones above
ConfusionMatrixDisplay.from_estimator(estimator=log_model,X=X, y=y);

In [None]:
# Note: this display is built from predictions made on the test features
ConfusionMatrixDisplay.from_predictions(y_true=y_test,y_pred=log_model.predict(X_test));

In [None]:
# ROC AUC score for logistic regression.
# The score is computed from the predicted probability of the positive class (column 1 of predict_proba):
# feeding hard 0/1 predictions collapses the ROC curve to a single threshold and misstates the AUC.
roc_auc_score(y_test, log_model.predict_proba(X_test)[:, 1])

#### Using RandomForestClassifier algorithm

In [None]:
# RandomForestClassifier algorithm
# Try different values of n_estimators to see which is better
# NOTE(review): the candidates are compared on the test set, so the value picked here is tuned to that
# split; consider comparing them with cross-validation on the training data instead.
np.random.seed(1000) # so our results are replicable
for i in range(10, 200, 10):
    print(f"Trying model with {i} estimators ...")
    rand_model = RandomForestClassifier(n_estimators=i).fit(X_train, y_train)
    # Predict once per model and reuse the result for all three metrics
    rand_preds = rand_model.predict(X_test)
    print(f"Model accuracy on the test set: {accuracy_score(y_test, rand_preds) * 100:.2f}%")
    print(f"Model precision on the test set: {precision_score(y_test, rand_preds) * 100:.2f}%")
    print(f"Model recall on the test set: {recall_score(y_test, rand_preds) * 100:.2f}%")

**The results above indicate that the best number of estimators in the chosen range is `90`. For this model training, `n_estimators = 90` will be used.**

In [None]:
# Fit the random forest that is used for the rest of the analysis
# NOTE(review): the markdown conclusion above states that n_estimators = 90 will be used, whereas this
# cell fits 130 estimators -- confirm which value is intended.
np.random.seed(1000) # so our results are replicable
rand_model = RandomForestClassifier(n_estimators=130).fit(X_train, y_train)
# Predict once and reuse the result for all three metrics
rand_preds = rand_model.predict(X_test)
print(f"Model accuracy on the test set: {accuracy_score(y_test, rand_preds) * 100:.2f}%")
print(f"Model precision on the test set: {precision_score(y_test, rand_preds) * 100:.2f}%")
print(f"Model recall on the test set: {recall_score(y_test, rand_preds) * 100:.2f}%")

In [None]:
# confusion_matrix for RandomForestClassifier 
confusion_matrix(y_test, rand_model.predict(X_test))

In [None]:
# confusion_matrix for RandomForestClassifier
customise_confusion_matrix(y_test, rand_model.predict(X_test))

In [None]:
score = cross_val_score(rand_model, X, y, cv=5)
print("Cross Validate score: ", np.mean(score) * 100)

In [None]:
# ROC AUC score for RandomForestClassifier.
# The score is computed from the predicted probability of the positive class (column 1 of predict_proba):
# feeding hard 0/1 predictions collapses the ROC curve to a single threshold and misstates the AUC.
roc_auc_score(y_test, rand_model.predict_proba(X_test)[:, 1])

**Area under the receiver operating characteristic curve (AUC/ROC) using the results of the RFC model**

* Area under curve (AUC)
* ROC curve

ROC curve: compares a model's true positive rate (`tpr`) against its false positive rate (`fpr`) at different classification thresholds

* True positive = model predicts 1 when truth is 1
* False positive = model predicts 1 when truth is 0
* True negative = model predicts 0 when truth is 0
* False negative = model predicts 0 when truth is 1

In [None]:
# Make predictions with probabilities
y_probs = rand_model.predict_proba(X_test)
y_probs[:10], len(y_probs)

In [None]:
y_probs_positive = y_probs[:,1]
y_probs_positive[:10]

In [None]:
# Calculate fpr, tpr and thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_probs_positive)

fpr

In [None]:
# Create a function for plotting the ROC curve.
# It is named plot_roc_curve so that it does not replace sklearn's `roc_curve` imported at the top
# of the notebook; with the old name, re-running the cell that computes fpr/tpr after this one
# called this plotting function instead of sklearn's and failed.

def plot_roc_curve(fpr, tpr):
    """Plot a ROC curve from false positive rates and true positive rates.

    Parameters
    ----------
    fpr : false positive rates (x-axis values).
    tpr : true positive rates (y-axis values), same length as `fpr`.

    The figure is shown with plt.show(); nothing is returned.
    """
    plt.plot(fpr, tpr, color="orange", label="ROC")
    # Plot line with no predictive power (baseline)
    plt.plot([0, 1], [0, 1], color="darkblue", linestyle="--", label="Random guess")

    # Customize the plot
    plt.xlabel("False positive rate (fpr)")
    plt.ylabel("True positive rate (tpr)")
    plt.title("Receiver Operating Characteristic (ROC) Curve")
    plt.legend()
    plt.show()

plot_roc_curve(fpr, tpr)

In [None]:
# Using the logistics regression print classification report
print(classification_report(y_test,log_model.predict(X_test)))

In [None]:
pd.DataFrame(classification_report(y_test,log_model.predict(X_test), output_dict=True))