# Libraries

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load and Preprocess the Dataset

In [16]:
# Read the loan-approval CSV (expected in the working directory) into a
# DataFrame and show it as the cell's output.
df = pd.read_csv(filepath_or_buffer='loan_approval_dataset.csv')
df

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4264,4265,5,Graduate,Yes,1000000,2300000,12,317,2800000,500000,3300000,800000,Rejected
4265,4266,0,Not Graduate,Yes,3300000,11300000,20,559,4200000,2900000,11000000,1900000,Approved
4266,4267,2,Not Graduate,No,6500000,23900000,18,457,1200000,12400000,18100000,7300000,Rejected
4267,4268,1,Not Graduate,No,4100000,12800000,8,780,8200000,700000,14100000,5800000,Approved


In [17]:
# Inspect the raw column labels; the output below shows that every label
# after 'loan_id' carries a leading space.
df.columns

Index(['loan_id', ' no_of_dependents', ' education', ' self_employed',
       ' income_annum', ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')

In [18]:
# The CSV header carries stray whitespace around the column labels;
# normalise every label so columns can be addressed by their plain names.
df.columns = df.columns.map(str.strip)

# Show the cleaned labels
print(df.columns)

Index(['loan_id', 'no_of_dependents', 'education', 'self_employed',
       'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
       'residential_assets_value', 'commercial_assets_value',
       'luxury_assets_value', 'bank_asset_value', 'loan_status'],
      dtype='object')


In [19]:
# Handle missing values in the numeric loan fields.
# NOTE(review): these fill statistics are computed on the whole dataset,
# i.e. before the train/test split performed later in the notebook, so
# held-out rows influence the imputed values. Consider deriving them from
# the training split only.
df['loan_amount'] = df['loan_amount'].fillna(df['loan_amount'].median())
df['loan_term'] = df['loan_term'].fillna(df['loan_term'].median())
df['cibil_score'] = df['cibil_score'].fillna(df['cibil_score'].mode()[0])

# Drop remaining missing values
df.dropna(inplace=True)

# Encode categorical variables.
# Each column gets its OWN fitted LabelEncoder, kept in `encoders`, so the
# text -> integer mapping of every column stays available afterwards (for
# example to encode new applications exactly the same way, or to decode
# predictions with `encoders['loan_status'].inverse_transform`). Re-using a
# single encoder would overwrite the mapping on every iteration.
# In the processed frame the resulting codes are: education Graduate=0 /
# Not Graduate=1, self_employed No=0 / Yes=1, loan_status Approved=0 /
# Rejected=1.
encoders = {}
for col in ['education', 'self_employed', 'loan_status']:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

# Backward compatibility: `le` remains bound to the encoder fitted on
# 'loan_status', which is the state it previously ended up in.
le = encoders['loan_status']

# Display processed DataFrame
print(df.head())

   loan_id  no_of_dependents  education  self_employed  income_annum  \
0        1                 2          0              0       9600000   
1        2                 0          1              1       4100000   
2        3                 3          0              0       9100000   
3        4                 3          0              0       8200000   
4        5                 5          1              1       9800000   

   loan_amount  loan_term  cibil_score  residential_assets_value  \
0     29900000         12          778                   2400000   
1     12200000          8          417                   2700000   
2     29700000         20          506                   7100000   
3     30700000          8          467                  18200000   
4     24200000         20          382                  12400000   

   commercial_assets_value  luxury_assets_value  bank_asset_value  loan_status  
0                 17600000             22700000           8000000            

In [20]:
# Discard the loan_id column so it is not fed to the model as a feature,
# then show the resulting frame.
df = df.drop('loan_id', axis='columns')
df

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,0,0,9600000,29900000,12,778,2400000,17600000,22700000,8000000,0
1,0,1,1,4100000,12200000,8,417,2700000,2200000,8800000,3300000,1
2,3,0,0,9100000,29700000,20,506,7100000,4500000,33300000,12800000,1
3,3,0,0,8200000,30700000,8,467,18200000,3300000,23300000,7900000,1
4,5,1,1,9800000,24200000,20,382,12400000,8200000,29400000,5000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...
4264,5,0,1,1000000,2300000,12,317,2800000,500000,3300000,800000,1
4265,0,1,1,3300000,11300000,20,559,4200000,2900000,11000000,1900000,0
4266,2,1,0,6500000,23900000,18,457,1200000,12400000,18100000,7300000,1
4267,1,1,0,4100000,12800000,8,780,8200000,700000,14100000,5800000,0


# Feature Selection

In [21]:
# Target variable (y): the encoded loan_status column
y = df['loan_status']

# Feature matrix (X): every column except the target
X = df[[name for name in df.columns if name != 'loan_status']]


# Split and Scale the Data

In [22]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the standardisation parameters from the training rows only, then
# apply the same transformation to both splits.
# NOTE: from here on X_train / X_test hold the *scaled* arrays, and `scaler`
# is the object that was fitted on the raw training features.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


# Hyperparameter Tuning with GridSearchCV

In [23]:
# Base estimator whose hyperparameters are tuned below
model = LogisticRegression()

# Candidate settings explored by the search
param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],   # regularization strength values to try
    penalty=['l1', 'l2'],        # regularization type
    solver=['liblinear'],        # solver compatible with the l1 penalty
)

# Exhaustive 5-fold cross-validated search, scored by accuracy and run on
# all available cores, fitted on the training split.
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

# Report the winning configuration and its mean cross-validation score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)


Best Parameters: {'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'}
Best Cross-Validation Score: 0.9402635431918009


# Evaluate the Tuned Model

In [24]:
# Take the estimator refitted with the best parameters and score the
# held-out test split. In the encoded target, 0 = Approved and 1 = Rejected.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Print accuracy, the confusion matrix and the per-class report in turn
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Classification Report:\n", classification_report),
):
    print(heading, metric(y_test, y_pred))


Accuracy: 0.9344262295081968
Confusion Matrix:
 [[493  43]
 [ 13 305]]
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.92      0.95       536
           1       0.88      0.96      0.92       318

    accuracy                           0.93       854
   macro avg       0.93      0.94      0.93       854
weighted avg       0.94      0.93      0.93       854



# Saving the Tuned Model as a Pickle File

In [25]:
import joblib

# Serialise the tuned estimator to disk with joblib.
# NOTE: this estimator was trained on standardized features, so inputs must
# go through the same scaling before being passed to it.
joblib.dump(value=best_model, filename='loan_approval_model.pkl')

print("Model has been saved as 'loan_approval_model.pkl'")

Model has been saved as 'loan_approval_model.pkl'


# Loading the Pickle File

In [26]:
# Read the persisted estimator back from disk
loaded_model = joblib.load(filename='loan_approval_model.pkl')

# Round-trip check: score the reloaded estimator on the same (already
# scaled) test split used to evaluate the in-memory model.
y_loaded_pred = loaded_model.predict(X_test)
print("Loaded Model Accuracy:", accuracy_score(y_test, y_loaded_pred))

Loaded Model Accuracy: 0.9344262295081968


In [27]:
# Persist the scaler that was fitted on the RAW training features in the
# split-and-scale step.
#
# Do NOT create and fit a new StandardScaler here: by this point X_train has
# already been replaced by its standardized version, so a scaler fitted on it
# would learn mean ~0 / std ~1 and act as a no-op when applied to raw inputs
# at prediction time. The model would then receive unscaled features.
#
# joblib (imported by the model-saving cell) and numpy (imported at the top
# of the notebook) are already in scope.

# Guard against hidden state: if `scaler` was refitted on standardized data
# earlier in this kernel session, all of its learned means are ~0.
assert not np.allclose(scaler.mean_, 0.0), (
    "scaler appears to have been fitted on already-standardized data; "
    "re-run the split-and-scale cell before saving it"
)

# Save the scaler using joblib
joblib.dump(scaler, 'scaler.pkl')

print("Scaler has been saved as 'scaler.pkl'")

Scaler has been saved as 'scaler.pkl'


In [28]:
# Sanity check on dimensions.
print(df.shape)  # processed frame: rows x columns, INCLUDING the loan_status target
print(scaler.mean_.shape)  # one learned mean per feature seen by the scaler

# df has one more column than the scaler has features because it still
# contains the target, so validate against the feature matrix X instead of
# comparing the two printed numbers by eye.
assert scaler.mean_.shape == (X.shape[1],), (
    "scaler was fitted on a different number of features than X provides"
)


(4269, 12)
(11,)
