 # Logistics Regression Project on Loan datasets

## The aim of this project is to predict whether a person will get their loan approved or not using logistic regression. 

In [10]:
import warnings # I am importing warnings to control non-critical messages
warnings.filterwarnings('ignore') # I am hiding non-critical warnings for cleaner output

In [28]:
import pandas as pd # I am using pandas to load and manipulate tabular data
import numpy as np # I am using numpy for numeric operations and arrays

##### Step 1 : Load the Loan dataset

In [30]:
loan = pd.read_csv('loan.csv') # I am reading the CSV file into a DataFrame

In [20]:
print(loan.head())  # I am previewing the first few rows to understand structure

   age  gender  occupation education_level marital_status  income  \
0   32    Male    Engineer      Bachelor's        Married   85000   
1   45  Female     Teacher        Master's         Single   62000   
2   28    Male     Student     High School         Single   25000   
3   51  Female     Manager      Bachelor's        Married  105000   
4   36    Male  Accountant      Bachelor's        Married   75000   

   credit_score loan_status  
0           720    Approved  
1           680    Approved  
2           590      Denied  
3           780    Approved  
4           710    Approved  


#### Step 2: Checking the Data description

In [22]:
print(loan.shape) # I am checking how many rows and columns I have

(61, 8)


In [24]:
print(loan.dtypes) # I am inspecting data types of each column

age                 int64
gender             object
occupation         object
education_level    object
marital_status     object
income              int64
credit_score        int64
loan_status        object
dtype: object


In [26]:
print(loan.duplicated().sum()) # I am checking for duplicate rows

0


In [28]:
print(loan.isna().sum()) # I am checking for missing values per column

age                0
gender             0
occupation         0
education_level    0
marital_status     0
income             0
credit_score       0
loan_status        0
dtype: int64


#### Step 3:Drop non-predictive identifiers

In [32]:
if 'Loan_ID' in loan.columns: # I am checking if Loan_ID exists before dropping it
     loan.drop(columns=['Loan_ID'], axis=1, inplace=True) # I am removing unique ID that doesn't help prediction

In [38]:
loan

Unnamed: 0,age,gender,occupation,education_level,marital_status,income,credit_score,loan_status
0,32,Male,Engineer,Bachelor's,Married,85000,720,Approved
1,45,Female,Teacher,Master's,Single,62000,680,Approved
2,28,Male,Student,High School,Single,25000,590,Denied
3,51,Female,Manager,Bachelor's,Married,105000,780,Approved
4,36,Male,Accountant,Bachelor's,Married,75000,710,Approved
...,...,...,...,...,...,...,...,...
56,39,Male,Architect,Master's,Married,100000,770,Approved
57,25,Female,Receptionist,High School,Single,32000,570,Denied
58,43,Male,Banker,Bachelor's,Married,95000,760,Approved
59,30,Female,Writer,Master's,Single,55000,650,Approved


In [54]:
print("Columns in dataset before cleaning:", loan.columns.tolist())  # I am listing all columns

Columns in dataset before cleaning: ['age', 'gender', 'occupation', 'education_level', 'marital_status', 'income', 'credit_score', 'loan_status']


In [58]:
loan.columns = loan.columns.str.strip().str.replace(" ", "_")  # I am normalizing column names and Standardize column names (remove spaces, unify format)

In [34]:
target_col = None
for col in loan.columns:
    if col.lower() == "loan_status":  # I am matching regardless of case
        target_col = col
        break

if target_col is None:  # I am failing early if target column missing
    raise ValueError("Target column 'Loan_Status' not found in the dataset")

print(" Using target column:", target_col)

 Using target column: loan_status


In [62]:
print("Missing values per column:\n", loan.isnull().sum())  # I am checking for columns that contain null cells

Missing values per column:
 age                0
gender             0
occupation         0
education_level    0
marital_status     0
income             0
credit_score       0
loan_status        0
dtype: int64


#### Step 4: Handle missing values (simple strategy: fill with mode for categorical, median for numeric)

In [38]:
for col in loan.columns:
    if loan[col].dtype == "object":
        loan[col].fillna(loan[col].mode()[0], inplace=True)  # I am filling categorical NaNs with most common value
    else:
        loan[col].fillna(loan[col].median(), inplace=True)   # I am filling numeric NaNs with median

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  loan[col].fillna(loan[col].median(), inplace=True)   # I am filling numeric NaNs with median
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  loan[col].fillna(loan[col].mode()[0], inplace=True)  # I am filling categorical NaNs with most common value


#### Step 5: Encode categorical variables into numeric

In [40]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

for col in loan.select_dtypes(include="object").columns:
    loan[col] = encoder.fit_transform(loan[col])  # I am converting categories to numbers

#### Step 6: Define features (X) and target (y)

In [42]:
X = loan.drop(columns=[target_col], axis=1)  # I am using all other columns as features
y = loan[target_col]  # I am using Loan_Status as label

print("Shape of features (X):", X.shape)
print("Shape of target (y):", y.shape)

Shape of features (X): (61, 7)
Shape of target (y): (61,)


##### Step 7: Split into training and testing sets

In [44]:
from sklearn.model_selection import train_test_split

# I am splitting into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train set size:", X_train.shape)
print("Test set size:", X_test.shape)

Train set size: (48, 7)
Test set size: (13, 7)


##### Step 8: Train logistic regression model

In [46]:
from sklearn.linear_model import LogisticRegression # I am importing Logistic Regression model

In [48]:
# I am creating a logistic regression model
lr = LogisticRegression(max_iter=1000, random_state=42)

In [50]:
# I am fitting the model to training data
lr.fit(X_train, y_train)

##### Step 9: Make predictions on the test set

In [52]:
# I am predicting loan approval for test data
y_pred = lr.predict(X_test)

In [54]:
# I am getting predicted probabilities (useful later for ROC, threshold tuning)
y_pred_proba = lr.predict_proba(X_test)[:, 1]

In [56]:
# I am getting predicted probabilities (useful later for ROC, threshold tuning)
y_pred_proba = lr.predict_proba(X_test)[:, 1]

##### Step 10: Evaluate results

In [58]:
# I am importing metrics to check how good the model is
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

##### Getting the Accuracy

In [60]:
# Accuracy
accuracy = accuracy_score(y_test, y_pred)  # I am measuring overall correctness
print("\nAccuracy:", accuracy)


Accuracy: 1.0


##### Getting the Confusion Matrix

In [98]:
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", cm)


Confusion Matrix:
 [[10  0]
 [ 0  3]]


##### Getting the Classification Report

In [100]:
# Classification Report
cr = classification_report(y_test, y_pred)
print("\nClassification Report:\n", cr)


Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         3

    accuracy                           1.00        13
   macro avg       1.00      1.00      1.00        13
weighted avg       1.00      1.00      1.00        13



In [18]:
##### Getting the Cross Validation Report

In [102]:
# Cross Validation (5-fold)
from sklearn.model_selection import cross_val_score

In [104]:
# I am validating the model across 5 folds to check generalization
cv_scores = cross_val_score(lr, X, y, cv=5, scoring='accuracy')
print("\nCross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", np.mean(cv_scores))


Cross-Validation Scores: [1.         0.91666667 0.91666667 1.         1.        ]
Mean CV Accuracy: 0.9666666666666666


##### Model interpretation

#### Conclusion

In [62]:
# After training your model (lr), we can check feature importance like this:
# Geting the feature names and coefficients
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': lr.coef_[0]
}).sort_values(by='Coefficient', ascending=False)

print(feature_importance)

           Feature  Coefficient
6     credit_score     0.411611
4   marital_status     0.001354
5           income    -0.005035
1           gender    -0.022865
3  education_level    -0.044755
0              age    -0.055862
2       occupation    -0.061789


#### By

## Ebenezer Adebiyi

### Linkedin : Ebenezer Adebiyi
### Email : Ebenezerdadebiyi@gmail.com