In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import roc_auc_score, roc_curve

In [2]:
loan_data = pd.read_csv('loan_data_encoded.csv')

In [3]:
value = loan_data.drop('TARGET', axis = 1)

tar = loan_data['TARGET']

In [7]:
tar.value_counts()

TARGET
0    1291341
1     122360
Name: count, dtype: int64

In [9]:
os = SMOTE(random_state= 56)

In [11]:
value_gen, tar_gen = os.fit_resample(value, tar)

In [12]:
tar_gen.value_counts()

TARGET
1    1291341
0    1291341
Name: count, dtype: int64

In [15]:
tar_gen.shape

(2582682,)

In [17]:
# Train data test splitting 
train_data, test_data, train_lab, test_lab = train_test_split(value_gen, tar_gen, test_size= 0.20, random_state= 50)

# Model Building

## Logistic Regression

In [45]:
model1 = LogisticRegression()

In [47]:
model1.fit(train_data, train_lab)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [49]:
train_pred = model1.predict(train_data)
test_pred = model1.predict(test_data)

In [29]:
def get_perform(actual, pred):
    acc = accuracy_score(actual, pred) * 100
    prec = precision_score(actual, pred, average='weighted', zero_division=0.0) * 100 
    rec = recall_score(actual, pred, average='weighted') * 100 
    f1 = f1_score(actual, pred, average='weighted') * 100
    return pd.DataFrame({
        "accuracy": [acc],
        "precision": [prec],
        "recall": [rec],
        "F1 score" : [f1]
    })

In [51]:
get_perform(train_lab, train_pred)

Unnamed: 0,accuracy,precision,recall,F1 score
0,58.56075,58.565144,58.56075,58.555688


In [53]:
get_perform(test_lab, test_pred)

Unnamed: 0,accuracy,precision,recall,F1 score
0,58.616711,58.619222,58.616711,58.612849


In [55]:
train_pred_proba = model1.predict_proba(train_data)[:, 1]
test_pred_proba = model1.predict_proba(test_data)[:, 1]

In [57]:
roc_auc_train = roc_auc_score(train_lab, train_pred_proba)
print(f"ROC-AUC Score (Train): {roc_auc_train:.4f}")

ROC-AUC Score (Train): 0.6172


In [59]:
roc_auc_test = roc_auc_score(test_lab, test_pred_proba)
print(f"ROC-AUC Score (Test): {roc_auc_test:.4f}")

ROC-AUC Score (Test): 0.6174


# Decision Tree Classifier

In [61]:
model2 = DecisionTreeClassifier(max_depth= 25,criterion='gini',random_state= 56)

In [63]:
model2.fit(train_data, train_lab)

In [65]:
train_predd = model2.predict(train_data)
test_predd = model2.predict(test_data)

In [67]:
get_perform(train_lab, train_predd)

Unnamed: 0,accuracy,precision,recall,F1 score
0,93.472336,93.513856,93.472336,93.470796


In [69]:
get_perform(test_lab, test_predd)

Unnamed: 0,accuracy,precision,recall,F1 score
0,92.216627,92.243738,92.216627,92.215309


In [71]:
train_pred_proba = model2.predict_proba(train_data)[:, 1]
test_pred_proba = model2.predict_proba(test_data)[:, 1]

In [73]:
roc_auc_train2 = roc_auc_score(train_lab, train_pred_proba)
print(f"ROC-AUC Score (Train): {roc_auc_train2:.4f}")

ROC-AUC Score (Train): 0.9873


In [75]:
roc_auc_test2 = roc_auc_score(test_lab, test_pred_proba)
print(f"ROC-AUC Score (Test): {roc_auc_test2:.4f}")

ROC-AUC Score (Test): 0.9733


# Random Forest Classifier

In [21]:
model3 = RandomForestClassifier(n_estimators= 150, criterion = 'entropy', max_depth = 25, random_state= 56)

In [23]:
model3.fit(train_data, train_lab)

In [25]:
train_predr = model3.predict(train_data)
test_predr = model3.predict(test_data)

In [31]:
get_perform(train_lab, train_predr)

Unnamed: 0,accuracy,precision,recall,F1 score
0,96.655898,96.662468,96.655898,96.655777


In [33]:
get_perform(test_lab, test_predr)

Unnamed: 0,accuracy,precision,recall,F1 score
0,95.922654,95.936989,95.922654,95.92236


In [35]:
train_pred_proba = model3.predict_proba(train_data)[:, 1]
test_pred_proba = model3.predict_proba(test_data)[:, 1]

In [37]:
roc_auc_train3 = roc_auc_score(train_lab, train_pred_proba)
print(f"ROC-AUC Score (Train): {roc_auc_train3:.4f}")

ROC-AUC Score (Train): 0.9964


In [39]:
roc_auc_test3 = roc_auc_score(test_lab, test_pred_proba)
print(f"ROC-AUC Score (Test): {roc_auc_test3:.4f}")

ROC-AUC Score (Test): 0.9943


In [41]:
from sklearn.metrics import classification_report

In [43]:
report = classification_report(test_lab, test_predr)
print(report)

              precision    recall  f1-score   support

           0       0.95      0.97      0.96    258102
           1       0.97      0.95      0.96    258435

    accuracy                           0.96    516537
   macro avg       0.96      0.96      0.96    516537
weighted avg       0.96      0.96      0.96    516537



# XGBoost classifier

In [77]:
model4 = XGBClassifier(n_estimators=200, random_state=56, max_depth=10)

In [79]:
model4.fit(train_data, train_lab)

In [81]:
train_predx = model4.predict(train_data)
test_predx = model4.predict(test_data)

In [83]:
get_perform(train_lab, train_predx)

Unnamed: 0,accuracy,precision,recall,F1 score
0,92.679217,92.679669,92.679217,92.679199


In [85]:
get_perform(test_lab, test_predx)

Unnamed: 0,accuracy,precision,recall,F1 score
0,91.824593,91.825599,91.824593,91.82453


In [87]:
train_pred_proba = model4.predict_proba(train_data)[:, 1]
test_pred_proba = model4.predict_proba(test_data)[:, 1]

In [89]:
roc_auc_train4 = roc_auc_score(train_lab, train_pred_proba)
print(f"ROC-AUC Score (Train): {roc_auc_train4:.4f}")

ROC-AUC Score (Train): 0.9802


In [91]:
roc_auc_test4 = roc_auc_score(test_lab, test_pred_proba)
print(f"ROC-AUC Score (Test): {roc_auc_test4:.4f}")

ROC-AUC Score (Test): 0.9750


In [93]:
model1_perf = get_perform(test_lab, test_pred)
model2_perf = get_perform(test_lab, test_predd)
model3_perf = get_perform(test_lab, test_predr)
model4_perf = get_perform(test_lab, test_predx)

In [95]:
performance_df = pd.concat([model1_perf, model2_perf, model3_perf, model4_perf], 
                          keys=['Model 1', 'Model 2', 'Model 3', 'Model 4'], 
                          names=['Model', None])

In [97]:
performance_df

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,precision,recall,F1 score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Model 1,0,58.616711,58.619222,58.616711,58.612849
Model 2,0,92.216627,92.243738,92.216627,92.215309
Model 3,0,95.922654,95.936989,95.922654,95.92236
Model 4,0,91.824593,91.825599,91.824593,91.82453


In [99]:
performance_df.to_csv('model_performance.csv', index=False)

In [101]:
roc_auc_series1 = pd.Series([roc_auc_test])
roc_auc_series2 = pd.Series([roc_auc_test2])
roc_auc_series3 = pd.Series([roc_auc_test3])
roc_auc_series4 = pd.Series([roc_auc_test4])

# Concatenate the Series into a DataFrame
ROC_AUC_df = pd.concat([roc_auc_series1, roc_auc_series2, roc_auc_series3, roc_auc_series4],
                       keys=['Model 1', 'Model 2', 'Model 3', 'Model 4'],
                       names=['Model', None])

print(ROC_AUC_df)

Model     
Model 1  0    0.617441
Model 2  0    0.973346
Model 3  0    0.994326
Model 4  0    0.975047
dtype: float64


In [103]:
ROC_AUC_df.to_csv('ROC_AUC_performance.csv', index=False)

In [234]:
import pickle as pk
pk.dump(model3,open('banking_RFmodel.pkl', 'wb'))

In [105]:
import pickle as pk
pk.dump(model3,open('bankingRFmodel3.pkl', 'wb'))

# Testing Prediction

In [108]:
model = pk.load(open('bankingRFmodel3.pkl', 'rb'))

In [110]:
age_encoder = pk.load(open('AGE_GROUP.pkl', 'rb'))
children_encoder = pk.load(open('CHILDREN_CATEGORY.pkl', 'rb'))
gender_encoder = pk.load(open('label_encoder_CODE_GENDER.pkl', 'rb'))
flag_own_car_encoder = pk.load(open('label_encoder_FLAG_OWN_CAR.pkl', 'rb'))
flag_own_realty_encoder = pk.load(open('label_encoder_FLAG_OWN_REALTY.pkl', 'rb'))
income_type_encoder = pk.load(open('label_encoder_NAME_INCOME_TYPE.pkl', 'rb'))
education_type_encoder = pk.load(open('label_encoder_NAME_EDUCATION_TYPE.pkl', 'rb'))
family_status_encoder = pk.load(open('label_encoder_NAME_FAMILY_STATUS.pkl', 'rb'))
housing_type_encoder = pk.load(open('label_encoder_NAME_HOUSING_TYPE.pkl', 'rb'))
occupation_encoder = pk.load(open('label_encoder_OCCUPATION_TYPE.pkl', 'rb'))
organization_encoder = pk.load(open('label_encoder_ORGANIZATION_TYPE.pkl', 'rb'))

In [138]:
sample_data = pd.DataFrame({
    'CODE_GENDER': ['F'],
    'FLAG_OWN_CAR': ['N'],
    'FLAG_OWN_REALTY': ['Y'],
    'NAME_INCOME_TYPE': ['Commercial associate'],
    'NAME_EDUCATION_TYPE': ['Higher education'],
    'NAME_FAMILY_STATUS': ['Married'],
    'NAME_HOUSING_TYPE': ['House / apartment'],
    'OCCUPATION_TYPE': ['Accountants'],
    'ORGANIZATION_TYPE': ['Business Entity Type 3'],
    'AGE_GROUP': ['25-34'],
    'AMT_INCOME_TOTAL_LOG': [202500.00],
    'CHILDREN_CATEGORY': ['0'],
    'AMT_CREDIT_log': [416000.00]
})

In [140]:
processed_data = sample_data.copy()

In [142]:
processed_data['CODE_GENDER'] = gender_encoder.transform(processed_data[['CODE_GENDER']])  # Keep [0] to get the single encoded value
processed_data['FLAG_OWN_CAR'] = flag_own_car_encoder.transform(processed_data[['FLAG_OWN_CAR']])
processed_data['FLAG_OWN_REALTY'] = flag_own_realty_encoder.transform(processed_data[['FLAG_OWN_REALTY']])
processed_data['NAME_INCOME_TYPE'] = income_type_encoder.transform(processed_data[['NAME_INCOME_TYPE']])
processed_data['NAME_EDUCATION_TYPE'] = education_type_encoder.transform(processed_data[['NAME_EDUCATION_TYPE']])
processed_data['NAME_FAMILY_STATUS'] = family_status_encoder.transform(processed_data[['NAME_FAMILY_STATUS']])
processed_data['NAME_HOUSING_TYPE'] = housing_type_encoder.transform(processed_data[['NAME_HOUSING_TYPE']])
processed_data['OCCUPATION_TYPE'] = occupation_encoder.transform(processed_data[['OCCUPATION_TYPE']])
processed_data['ORGANIZATION_TYPE'] = organization_encoder.transform(processed_data[['ORGANIZATION_TYPE']])
if 'AGE_GROUP' in processed_data.columns and 'age_encoder' in globals():
    processed_data['Age_group_encoded'] = age_encoder.transform(processed_data[['AGE_GROUP']])
    processed_data.drop('AGE_GROUP', axis=1, inplace=True)
processed_data['AMT_INCOME_TOTAL_LOG'] = np.log1p(processed_data[['AMT_INCOME_TOTAL_LOG']])
if 'CHILDREN_CATEGORY' in processed_data.columns and 'children_encoder' in globals():
    processed_data['CHILDREN_encoded'] = children_encoder.transform(processed_data[['CHILDREN_CATEGORY']])
    processed_data['CHILDREN_encoded'] = processed_data['CHILDREN_encoded'].astype(int)
    processed_data.drop('CHILDREN_CATEGORY', axis=1, inplace=True)
processed_data['AMT_CREDIT_log'] = np.log1p(processed_data[['AMT_CREDIT_log']])

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [144]:
feature_columns_trained = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_INCOME_TYPE',
                           'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE',
                           'OCCUPATION_TYPE', 'ORGANIZATION_TYPE',
                           'AMT_INCOME_TOTAL_LOG', 'AMT_CREDIT_log', 'Age_group_encoded',
                           'CHILDREN_encoded']

user_df = pd.DataFrame([processed_data[feature_columns_trained].iloc[0].values], columns=feature_columns_trained)

print(user_df)

   CODE_GENDER  FLAG_OWN_CAR  FLAG_OWN_REALTY  NAME_INCOME_TYPE  \
0          0.0           0.0              1.0               0.0   

   NAME_EDUCATION_TYPE  NAME_FAMILY_STATUS  NAME_HOUSING_TYPE  \
0                  1.0                 1.0                1.0   

   OCCUPATION_TYPE  ORGANIZATION_TYPE  AMT_INCOME_TOTAL_LOG  AMT_CREDIT_log  \
0              0.0                5.0               12.2185       12.938443   

   Age_group_encoded  CHILDREN_encoded  
0                1.0               0.0  


In [146]:
user_df

Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,OCCUPATION_TYPE,ORGANIZATION_TYPE,AMT_INCOME_TOTAL_LOG,AMT_CREDIT_log,Age_group_encoded,CHILDREN_encoded
0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,5.0,12.2185,12.938443,1.0,0.0


In [148]:
prediction = model.predict(user_df)

In [150]:
if prediction[0] == 1:
    print("Prediction: Likely to default on the loan.")
else:
    print("Prediction: Likely not to default on the loan.")


Prediction: Likely to default on the loan.


In [3]:
import sys
print(sys.version)

3.12.4 | packaged by Anaconda, Inc. | (main, Jun 18 2024, 15:03:56) [MSC v.1929 64 bit (AMD64)]


In [None]:
importances = model.feature_importances_

feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)