# Machine Learning Modeling

In [None]:
#For the "loan approval" dataset, we can approach it as a binary classification problem where we aim to predict whether a loan
#will be approved or not based on the given features. Two suitable machine learning models for this problem type are Logistic 
#Regression and Decision Trees. Logistic Regression is commonly used for binary classification tasks, and Decision Trees can
#handle both classification and regression problems.


In [27]:
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

In [28]:
#Importing the data
ld = pd.read_csv("C:\\Users\\oyina\\Documents\\Data Analytics _ Durham college\\Project\\Datasets\\archive (3)\\loan.csv")
ld

Unnamed: 0,LOAN_ID,NO_OF_DEPENDENTS,EDUCATION,SELF_EMPLOYED,INCOME_ANNUM,LOAN_AMOUNT,LOAN_TERM,CIBIL_SCORE,RESIDENTIAL_ASSETS_VALUE,COMMERCIAL_ASSETS_VALUE,LUXURY_ASSETS_VALUE,BANK_ASSET_VALUE,LOAN_STATUS
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4264,4265,5,Graduate,Yes,1000000,2300000,12,317,2800000,500000,3300000,800000,Rejected
4265,4266,0,Not Graduate,Yes,3300000,11300000,20,559,4200000,2900000,11000000,1900000,Approved
4266,4267,2,Not Graduate,No,6500000,23900000,18,457,1200000,12400000,18100000,7300000,Rejected
4267,4268,1,Not Graduate,No,4100000,12800000,8,780,8200000,700000,14100000,5800000,Approved


In [29]:
ld.columns = ld.columns.str.strip()
ld.columns

Index(['LOAN_ID', 'NO_OF_DEPENDENTS', 'EDUCATION', 'SELF_EMPLOYED',
       'INCOME_ANNUM', 'LOAN_AMOUNT', 'LOAN_TERM', 'CIBIL_SCORE',
       'RESIDENTIAL_ASSETS_VALUE', 'COMMERCIAL_ASSETS_VALUE',
       'LUXURY_ASSETS_VALUE', 'BANK_ASSET_VALUE', 'LOAN_STATUS'],
      dtype='object')

In [30]:
# Features (all columns except 'loan_status')
X = ld.drop('LOAN_STATUS', axis=1) 
# Target variable
y = ld['LOAN_STATUS']  

In [31]:
#Converting categorical variables to numerical with one-hot encoding
X = pd.get_dummies(X)

In [32]:
#Splitting the Dataset
#Split the dataset into training and testing sets to train the models on a subset of the data and evaluate their performance.
##from sklearn.model_selection import train_test_split

# Split the dataset into 80% training and 20% testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Model 1: Logistic Regression:
Applying the Logistic Regression model on the training data, train it, and then evaluate its performance on the testing data.

In [33]:
##from sklearn.linear_model import LogisticRegression
##from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Initialize the Logistic Regression model
logreg_model = LogisticRegression()

# Train the model using the training data
logreg_model.fit(X_train, y_train)

# Predict on the testing data
y_pred_logreg = logreg_model.predict(X_test)

# Evaluate the model's performance
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
conf_matrix_logreg = confusion_matrix(y_test, y_pred_logreg)
classification_report_logreg = classification_report(y_test, y_pred_logreg)

print("Logistic Regression Accuracy:", accuracy_logreg)
print("Confusion Matrix for Logistic Regression:\n", conf_matrix_logreg)
print("Classification Report for Logistic Regression:\n", classification_report_logreg)


Logistic Regression Accuracy: 0.6370023419203747
Confusion Matrix for Logistic Regression:
 [[510  26]
 [284  34]]
Classification Report for Logistic Regression:
               precision    recall  f1-score   support

    Approved       0.64      0.95      0.77       536
    Rejected       0.57      0.11      0.18       318

    accuracy                           0.64       854
   macro avg       0.60      0.53      0.47       854
weighted avg       0.61      0.64      0.55       854



# Model 2: Decision Trees:
Applying the Decision Trees model on the training data, train it, and then evaluate its performance on the testing data.

In [34]:
##from sklearn.tree import DecisionTreeClassifier

# Initialize the Decision Tree model
dt_model = DecisionTreeClassifier()

# Train the model using the training data
dt_model.fit(X_train, y_train)

# Predict on the testing data
y_pred_dt = dt_model.predict(X_test)

# Evaluate the model's performance
accuracy_dt = accuracy_score(y_test, y_pred_dt)
conf_matrix_dt = confusion_matrix(y_test, y_pred_dt)
classification_report_dt = classification_report(y_test, y_pred_dt)

print("Decision Tree Accuracy:", accuracy_dt)
print("Confusion Matrix for Decision Tree:\n", conf_matrix_dt)
print("Classification Report for Decision Tree:\n", classification_report_dt)


Decision Tree Accuracy: 0.9730679156908665
Confusion Matrix for Decision Tree:
 [[528   8]
 [ 15 303]]
Classification Report for Decision Tree:
               precision    recall  f1-score   support

    Approved       0.97      0.99      0.98       536
    Rejected       0.97      0.95      0.96       318

    accuracy                           0.97       854
   macro avg       0.97      0.97      0.97       854
weighted avg       0.97      0.97      0.97       854



# Model Evaluation:
Compare the performance of both models based on the accuracy, confusion matrix, and classification report. The evaluation metrics provide insights into how well each model is performing in predicting loan approvals.
Remember that this is just a basic implementation, and there are many other techniques and strategies to improve the model performance, such as hyperparameter tuning, feature engineering, and ensemble methods. Depending on the dataset and specific requirements, you can explore other machine learning models as well