In [33]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier



Reading the Dataset

In [34]:
df = pd.read_csv('loan_approval_data.csv')

In [35]:
df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,employment_type,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,12th,No,Salaried,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,12th,No,Salaried,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,8th,No,Salaried,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,8th,No,Salaried,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Graduate,No,Salaried,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   loan_id                   4269 non-null   int64 
 1   no_of_dependents          4269 non-null   int64 
 2   education                 4269 non-null   object
 3   self_employed             4269 non-null   object
 4   employment_type           4269 non-null   object
 5   income_annum              4269 non-null   int64 
 6   loan_amount               4269 non-null   int64 
 7   loan_term                 4269 non-null   int64 
 8   cibil_score               4269 non-null   int64 
 9   residential_assets_value  4269 non-null   int64 
 10  commercial_assets_value   4269 non-null   int64 
 11  luxury_assets_value       4269 non-null   int64 
 12  bank_asset_value          4269 non-null   int64 
 13  loan_status               4269 non-null   object
dtypes: int64(10), object(4)


In [37]:
df.describe()

Unnamed: 0,loan_id,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
count,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0
mean,2135.0,2.498712,5059124.0,15133450.0,10.900445,599.936051,7472617.0,4973155.0,15126310.0,4976692.0
std,1232.498479,1.69591,2806840.0,9043363.0,5.709187,172.430401,6503637.0,4388966.0,9103754.0,3250185.0
min,1.0,0.0,200000.0,300000.0,2.0,300.0,-100000.0,0.0,300000.0,0.0
25%,1068.0,1.0,2700000.0,7700000.0,6.0,453.0,2200000.0,1300000.0,7500000.0,2300000.0
50%,2135.0,3.0,5100000.0,14500000.0,10.0,600.0,5600000.0,3700000.0,14600000.0,4600000.0
75%,3202.0,4.0,7500000.0,21500000.0,16.0,748.0,11300000.0,7600000.0,21700000.0,7100000.0
max,4269.0,5.0,9900000.0,39500000.0,20.0,900.0,29100000.0,19400000.0,39200000.0,14700000.0


In [38]:
df = df.drop('loan_id',axis=1)

In [39]:
df.head()

Unnamed: 0,no_of_dependents,education,self_employed,employment_type,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,12th,No,Salaried,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,0,12th,No,Salaried,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,8th,No,Salaried,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,3,8th,No,Salaried,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,Graduate,No,Salaried,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [40]:
df['total_collateral'] = df['residential_assets_value'] + df['commercial_assets_value'] + df['luxury_assets_value']  + df['bank_asset_value']	
df['income__loan_ratio'] =  df['income_annum'] / df['loan_amount']


In [41]:
df.columns

Index(['no_of_dependents', 'education', 'self_employed', 'employment_type',
       'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
       'residential_assets_value', 'commercial_assets_value',
       'luxury_assets_value', 'bank_asset_value', 'loan_status',
       'total_collateral', 'income__loan_ratio'],
      dtype='object')

In [42]:
df = df.drop(['income_annum','loan_amount','residential_assets_value', 'commercial_assets_value',
       'luxury_assets_value', 'bank_asset_value','no_of_dependents','education', 'self_employed', 'employment_type'],axis=1)

In [43]:
df.columns

Index(['loan_term', 'cibil_score', 'loan_status', 'total_collateral',
       'income__loan_ratio'],
      dtype='object')

In [44]:
X = df.drop('loan_status',axis=1)
y = df['loan_status']


In [45]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [46]:
numerical_cols = X.select_dtypes(include=np.number).columns

In [47]:
numerical_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler())
])



In [48]:
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_cols),
])

In [49]:
preprocessed_X_train = preprocessor.fit_transform(X_train)
preprocessed_X_test = preprocessor.transform(X_test)


print(preprocessed_X_train.shape)
print(preprocessed_X_test.shape)

(3415, 4)
(854, 4)


In [50]:
feature_names = preprocessor.get_feature_names_out().tolist()
print(feature_names)

['num__loan_term', 'num__cibil_score', 'num__total_collateral', 'num__income__loan_ratio']


In [51]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [52]:
print(y_train_encoded.shape)
print(y_test_encoded.shape)

(3415,)
(854,)


In [53]:
models = {
            "Random Forest": RandomForestClassifier(random_state=42),
            "Logistic Regression": LogisticRegression(random_state=42),
            "KNN": KNeighborsClassifier(),
            "Decision Tree": DecisionTreeClassifier(random_state=42),
            "SVM": SVC(random_state=42),
            "XGBoost": XGBClassifier(random_state=42)
        }

In [54]:
from sklearn.metrics import accuracy_score
for i in range(len(list(models))):
    model = list(models.values())[i]
    
    model.fit(preprocessed_X_train, y_train_encoded)

    y_train_pred = model.predict(preprocessed_X_train)
    y_test_pred = model.predict(preprocessed_X_test)

    train_accuracy = accuracy_score(y_train_encoded, y_train_pred)
    test_accuracy = accuracy_score(y_test_encoded, y_test_pred)

    print(f"Model: {list(models.keys())[i]}",(train_accuracy,test_accuracy))

Model: Random Forest (1.0, 0.9976580796252927)
Model: Logistic Regression (0.9168374816983894, 0.8981264637002342)
Model: KNN (0.9803806734992679, 0.9601873536299765)
Model: Decision Tree (1.0, 0.9941451990632318)
Model: SVM (0.9765739385065886, 0.9707259953161592)
Model: XGBoost (1.0, 0.9929742388758782)


In [55]:
from scipy.stats import randint, uniform
import numpy as np

models_params = {
    'LogisticRegression': {
        'model': LogisticRegression(),
        'params': {
            'C': np.logspace(-3, 3, 7),
            'penalty': ['l1', 'l2'],
            'solver': ['liblinear', 'saga'],
            'class_weight': [None, 'balanced']  
        }
    },
    'DecisionTreeClassifier': {
        'model': DecisionTreeClassifier(),
        'params': {
            'max_depth': [3,5,7,10,15],
            'min_samples_split': [3,5,10,12],
            'min_samples_leaf': [8,10,12,18,20,16],
            'criterion': ['gini', 'entropy'],
            'class_weight': [None, 'balanced']  
        }
    },
    'RandomForestClassifier': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [25, 50, 100, 150], 
            'max_features': ['sqrt', 'log2', None], 
            'max_depth': [3, 6, 9,12], 
            'min_samples_split': [3,5,10,12],
            'class_weight': [None, 'balanced']  
        }
    },
    'KNeighborsClassifier': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [3,5,7,9,11,13,15],
            'weights': ['uniform', 'distance'],
            'metric' : ['minkowski','euclidean','manhattan']
        }
    },
    'SVC': {
        'model': SVC(),
        'params': {
            'C': np.logspace(-2, 2, 5),
            'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
            'gamma': ['scale', 'auto'],
            'degree': randint(2, 5)  ,
            'class_weight': [None, 'balanced']  
        }
    },
    'XGBClassifier': {
        'model': XGBClassifier(),
        'params': {
            'n_estimators': [25, 50, 100, 150],
            'learning_rate': [0.1, 0.01, 0.001,10],
            'max_depth': [3,5,7,9],
            'subsample': [0.5, 0.7, 1],
            'colsample_bytree': [0.4,0.6,0.8]
        }
    }
}

In [56]:
from sklearn.model_selection import RandomizedSearchCV
cv_folds = 5
n = 10
best_results = {}

for model_name, model_info in models_params.items():
    print(f"RandomizedSearchCV for {model_name}:")

    random_search = RandomizedSearchCV(
        estimator=model_info['model'], 
        param_distributions=model_info['params'], 
        n_iter=n,  
        scoring='accuracy',  
        n_jobs=-1,             
        cv=cv_folds,          
        verbose=1,             
        random_state=42      
    )
    
   
    random_search.fit(preprocessed_X_train, y_train_encoded)  
    
    
    best_results[model_name] = {
        'best_score': random_search.best_score_,
        'best_params': random_search.best_params_
    }

    print(f"Best parameters for {model_name}: {random_search.best_params_}")
    print(f"Best cross-validation score for {model_name}: {random_search.best_score_}")
    print("="*30)


RandomizedSearchCV for LogisticRegression:
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters for LogisticRegression: {'solver': 'saga', 'penalty': 'l1', 'class_weight': 'balanced', 'C': np.float64(0.01)}
Best cross-validation score for LogisticRegression: 0.9420204978038067
RandomizedSearchCV for DecisionTreeClassifier:
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters for DecisionTreeClassifier: {'min_samples_split': 5, 'min_samples_leaf': 8, 'max_depth': 10, 'criterion': 'gini', 'class_weight': None}
Best cross-validation score for DecisionTreeClassifier: 0.9956076134699854
RandomizedSearchCV for RandomForestClassifier:
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters for RandomForestClassifier: {'n_estimators': 50, 'min_samples_split': 3, 'max_features': None, 'max_depth': 3, 'class_weight': None}
Best cross-validation score for RandomForestClassifier: 0.995900439238653
RandomizedSearchCV for KNeighbor

In [57]:
from sklearn.metrics import accuracy_score  
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score



for model_name, model_info in models_params.items():
    
    
    random_search = RandomizedSearchCV(
        estimator=model_info['model'], 
        param_distributions=model_info['params'], 
        n_iter=10,  
        scoring='accuracy',  
        n_jobs=-1,  
        cv=cv_folds,  
        verbose=1,  
        random_state=42  
    )
    
    random_search.fit(preprocessed_X_train, y_train_encoded)
    

    best_model = random_search.best_estimator_
    
    
    y_train_pred = best_model.predict(preprocessed_X_train)
    
    
    y_test_pred = best_model.predict(preprocessed_X_test)
    
    
    train_accuracy = accuracy_score(y_train_encoded, y_train_pred)
    test_accuracy = accuracy_score(y_test_encoded, y_test_pred)
    
    scores = cross_val_score(best_model, preprocessed_X_train, y_train_encoded, cv=10, scoring='accuracy', n_jobs=-1)
    
    # Print the results
    print(f"{model_name} - Training Accuracy: {train_accuracy:.4f}, Test Accuracy: {test_accuracy:.4f}")
    print(f"{model_name} - Cross-validation Best Score: {random_search.best_score_:.4f}")
    print(f"{model_name} - Mean Cross-Validation Accuracy: {scores.mean():.4f}")
    print(f"{model_name} - Test Classification Report:\n {classification_report(y_test_encoded, y_test_pred,digits=20)}")
    print("="*60)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
LogisticRegression - Training Accuracy: 0.9394, Test Accuracy: 0.9368
LogisticRegression - Cross-validation Best Score: 0.9420
LogisticRegression - Mean Cross-Validation Accuracy: 0.9414
LogisticRegression - Test Classification Report:
                       precision    recall  f1-score   support

                   0  0.98785425101214574539 0.91044776119402981429 0.94757281553398053919       536
                   1  0.86666666666666669627 0.98113207547169811740 0.92035398230088494298       318

            accuracy                      0.93676814988290402209       854
           macro avg  0.92726045883940622083 0.94578991833286396584 0.93396339891743274109       854
        weighted avg  0.94272819501464888425 0.93676814988290402209 0.93743746545420958416       854

Fitting 5 folds for each of 10 candidates, totalling 50 fits
DecisionTreeClassifier - Training Accuracy: 0.9956, Test Accuracy: 0.9988
DecisionTreeClassifier 

In [60]:
from sklearn.feature_selection import RFE
rfe = RFE(
    XGBClassifier(n_estimators=50, max_depth=5, learning_rate=0.1, subsample=0.7, colsample_bytree= 0.8,random_state=42),
    n_features_to_select=4,
)
rfe.fit(preprocessed_X_train, y_train_encoded)
selected_features = [feature for feature, support in zip(feature_names, rfe.support_) if support]
print("Selected Features:", selected_features)

Selected Features: ['num__loan_term', 'num__cibil_score', 'num__total_collateral', 'num__income__loan_ratio']


In [67]:
import joblib
xgb = XGBClassifier(n_estimators=50, max_depth=5, learning_rate=0.1, subsample=0.7, colsample_bytree= 0.8,random_state=42)
xgb.fit(preprocessed_X_train, y_train_encoded)
joblib.dump(xgb, 'xgb_model.pkl')

['xgb_model.pkl']

In [68]:

def predict_new_data(new_data, preprocessor, model):

    preprocessed_new_data = preprocessor.transform(new_data)

    predictions = model.predict(preprocessed_new_data)
    
    return predictions


new_data = {
    'no_of_dependents': 2,
    'self_employed': 'Yes',
    'income_annum': 500000,
    'loan_amount': 100000,
    'loan_term': 15,
    'cibil_score': 750,
    'residential_assets_value': 500000,
    'commercial_assets_value': 100000,
    'luxury_assets_value': 200000,
    'bank_asset_value': 300000
}

df = pd.DataFrame([new_data])

df['total_collateral'] = df['residential_assets_value'] + df['commercial_assets_value'] + df['luxury_assets_value']  + df['bank_asset_value']	
df['income__loan_ratio'] =  df['income_annum'] / df['loan_amount']

xgb = joblib.load('xgb_model.pkl')
predicted_loan_status = predict_new_data(df, preprocessor, xgb)  


loan_status_decoded = label_encoder.inverse_transform(predicted_loan_status)

print(f"Predicted Loan Status: {loan_status_decoded[0]}")


Predicted Loan Status: Approved
