In [None]:
import pandas as pd
# Load the raw loan applications.
file_path = './loan_data_set.csv'
data = pd.read_csv(file_path)

# Impute missing values by assigning the filled column back to the frame.
# Calling `data[col].fillna(..., inplace=True)` is chained assignment on a
# column selection: recent pandas versions warn about it, and with
# Copy-on-Write enabled it does not update `data` at all, which would let
# NaNs slip through to the model.
mode_imputed_columns = [
    'Gender',
    'Married',
    'Dependents',
    'Self_Employed',
    'Loan_Amount_Term',
    'Credit_History',
]
for column in mode_imputed_columns:
    # Fill with the most frequent value of the column.
    data[column] = data[column].fillna(data[column].mode()[0])

# LoanAmount is filled with its median rather than its mode.
data['LoanAmount'] = data['LoanAmount'].fillna(data['LoanAmount'].median())

# Sanity check: every imputed column must be free of missing values now.
assert not data[mode_imputed_columns + ['LoanAmount']].isna().any().any()

# One-hot encode the categorical columns, dropping the first level of each.
# pandas >= 2.0 produces boolean dummies by default; request an integer dtype
# explicitly so the indicators are 0/1 regardless of the pandas version.
data = pd.get_dummies(
    data,
    columns=['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area'],
    drop_first=True,
    dtype='uint8',
)

# Encode the target: 'Y' -> 1, 'N' -> 0.
data['Loan_Status'] = data['Loan_Status'].map({'Y': 1, 'N': 0})
data.head()


In [None]:
   Loan_ID  ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0  LP001002            5849                0.0       146.0             360.0   
1  LP001003            4583             1508.0       128.0             360.0   
2  LP001005            3000                0.0        66.0             360.0   
3  LP001006            2583                0.0       120.0             360.0   
4  LP001008            6000                0.0       141.0             360.0   

   Credit_History  Gender_Male  Married_Yes  Dependents_1  Dependents_2  \
0             1.0            1            1             0             0   
1             1.0            1            1             1             0   
2             1.0            1            0             0             0   
3             1.0            1            1             0             0   
4             1.0            1            1             1             0   

   Dependents_3+  Education_Not Graduate  Self_Employed_Yes  \
0              0                       0                  0   
1              0                       0                  0   
2              0                       0                  0   
3              0                       0                  0   
4              0                       0                  0   

   Property_Area_Semiurban  Property_Area_Urban  Loan_Status  
0                        0                    1            1  
1                        0                    0            0  
2                        1                    0            1  
3                        1                    0            1  
4                        0                    1            1  


In [None]:
## Next, let's split the dataset into train and test sets to train a model.

In [None]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the label: X keeps every column except the
# row identifier and the target; y is the target column on its own.
non_feature_columns = ['Loan_ID', 'Loan_Status']
X = data.drop(columns=non_feature_columns)
y = data['Loan_Status']

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
## Now that we have a train and test set, we can use the train set to train a model.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Fit a logistic regression classifier on the training split.
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the metrics instead of ending the cell with a bare tuple: the tuple
# is rendered through repr(), which shows the multi-line report as a single
# string full of "\n" escapes.
print(f"Accuracy: {accuracy:.4f}")
print("Confusion matrix (rows = actual, columns = predicted):")
print(conf_matrix)
print(class_report)

In [None]:
              precision    recall  f1-score   support

           0       0.95      0.42      0.58        43
           1       0.76      0.99      0.86        80

    accuracy                           0.79       123
   macro avg       0.85      0.70      0.72       123
weighted avg       0.83      0.79      0.76       123


In [None]:
## Interpretation: The model performs well in identifying loan approvals (class 1) with high recall (0.99), meaning it correctly identifies most approved loans. However, it has a lower precision (0.76) for class 1, indicating some false positives.
## For non-approved loans (class 0), the model has high precision (0.95) but low recall (0.42), meaning it misses a significant number of actual non-approved loans.
## The overall accuracy of 78.86% suggests that the model is reasonably effective, but the imbalance between precision and recall for class 0 indicates areas for potential improvement, possibly through techniques such as balancing the dataset, tuning the model, or exploring different algorithms.