## Model Building

In [81]:
# importing required packages

import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [82]:
# reading the data 

data = pd.read_csv('cleaned_loan_data.csv')

data = data.iloc[:, 1:]
data.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE_x,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_x,AMT_ANNUITY_x,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,100004,0,1,1,1,1,0,67500.0,135000.0,6750.0,...,2,4.0,4,14,365243.0,-784.0,-694.0,-724.0,-714.0,0.0
1,100006,0,0,0,0,1,0,135000.0,312682.5,29686.5,...,10,16.051936,0,0,342257.65671,13488.741567,33274.831806,76665.634755,82353.171672,0.0
2,100006,0,0,0,0,1,0,135000.0,312682.5,29686.5,...,10,16.051936,0,2,342257.65671,13488.741567,33274.831806,76665.634755,82353.171672,0.0
3,100006,0,0,0,0,1,0,135000.0,312682.5,29686.5,...,3,12.0,4,11,365243.0,-545.0,-215.0,-425.0,-416.0,0.0
4,100006,0,0,0,0,1,0,135000.0,312682.5,29686.5,...,10,16.051936,0,2,342257.65671,13488.741567,33274.831806,76665.634755,82353.171672,0.0


In [83]:
tar = data['TARGET']
val = data.drop('TARGET', axis=1)

In [84]:
## undersampling 

rus = RandomOverSampler(random_state=42)

val_usample, tar_usample = rus.fit_resample(val, tar)

In [85]:
# oversampling the model 
ros = RandomOverSampler(random_state=42)

val_resample, tar_resample = ros.fit_resample(val, tar)

In [86]:
# train test split

train_data, test_data, train_lab, test_lab = train_test_split(val_usample, tar_usample, test_size=0.2, random_state=42)

## Logistic model

In [88]:
logistic = LogisticRegression()

logistic.fit(train_data, train_lab)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [89]:
## model predictions 

train_predicted = logistic.predict(train_data)
test_predicted = logistic.predict(test_data)

# scores of train data
print(classification_report(train_lab, train_predicted))

              precision    recall  f1-score   support

           0       0.60      0.60      0.60    178597
           1       0.60      0.60      0.60    178428

    accuracy                           0.60    357025
   macro avg       0.60      0.60      0.60    357025
weighted avg       0.60      0.60      0.60    357025



In [90]:
print(classification_report(test_lab, test_predicted))

              precision    recall  f1-score   support

           0       0.60      0.60      0.60     44544
           1       0.60      0.60      0.60     44713

    accuracy                           0.60     89257
   macro avg       0.60      0.60      0.60     89257
weighted avg       0.60      0.60      0.60     89257



In [91]:
## random forest model

forest = RandomForestClassifier(n_estimators=700, max_depth=10, random_state=54)

forest.fit(train_data, train_lab)

In [92]:
## model predictions 

train_predicted = forest.predict(train_data)
test_predicted = forest.predict(test_data)

# scores of train data
print(classification_report(train_lab, train_predicted))

              precision    recall  f1-score   support

           0       0.80      0.78      0.79    178597
           1       0.78      0.80      0.79    178428

    accuracy                           0.79    357025
   macro avg       0.79      0.79      0.79    357025
weighted avg       0.79      0.79      0.79    357025



In [93]:
print(classification_report(test_lab, test_predicted))

              precision    recall  f1-score   support

           0       0.79      0.77      0.78     44544
           1       0.78      0.79      0.78     44713

    accuracy                           0.78     89257
   macro avg       0.78      0.78      0.78     89257
weighted avg       0.78      0.78      0.78     89257



## Feature Importance

In [95]:
len(forest.feature_importances_)

155

In [96]:
importance = pd.DataFrame({
    'features':train_data.columns,
    'importance':forest.feature_importances_
}).sort_values('importance', ascending=False).iloc[:25, :]['features'].to_list()

importance

['EXT_SOURCE_3',
 'EXT_SOURCE_2',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_LAST_PHONE_CHANGE',
 'DAYS_REGISTRATION',
 'AMT_CREDIT_x',
 'DAYS_ID_PUBLISH',
 'AMT_GOODS_PRICE_x',
 'AMT_ANNUITY_x',
 'ORGANIZATION_TYPE',
 'SK_ID_CURR',
 'OWN_CAR_AGE',
 'REGION_POPULATION_RELATIVE',
 'NAME_INCOME_TYPE',
 'AMT_INCOME_TOTAL',
 'CODE_GENDER',
 'FLAG_EMP_PHONE',
 'HOUR_APPR_PROCESS_START_x',
 'AMT_REQ_CREDIT_BUREAU_YEAR',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'DAYS_DECISION',
 'DAYS_LAST_DUE_1ST_VERSION',
 'NAME_EDUCATION_TYPE']

In [97]:
# train test split

train_data, test_data, train_lab, test_lab = train_test_split(val_usample[importance], tar_usample, test_size=0.2, random_state=42)

## Decision Tree Classifier

In [99]:
tree = DecisionTreeClassifier(criterion='entropy', splitter='best', max_depth=10, random_state=54)

tree.fit(train_data, train_lab)

In [100]:
# predictions

train_predicted = tree.predict(train_data)
test_predict = tree.predict(test_data)


In [101]:
print(classification_report(train_lab, train_predicted))

              precision    recall  f1-score   support

           0       0.75      0.70      0.73    178597
           1       0.72      0.77      0.74    178428

    accuracy                           0.74    357025
   macro avg       0.74      0.74      0.74    357025
weighted avg       0.74      0.74      0.74    357025



In [103]:
print(classification_report(test_lab, test_predicted))

              precision    recall  f1-score   support

           0       0.79      0.77      0.78     44544
           1       0.78      0.79      0.78     44713

    accuracy                           0.78     89257
   macro avg       0.78      0.78      0.78     89257
weighted avg       0.78      0.78      0.78     89257



## Deciision Tree using Grid Search

In [105]:
param_grid = {
    'max_depth': [5, 10, 15, 20],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 5, 10],
    'max_features': ['sqrt', 'log2', None],
}


tree = DecisionTreeClassifier(criterion='entropy', random_state=54)
grid_search = GridSearchCV(tree, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(train_data, train_lab)

print("Best parameters:", grid_search.best_params_)
print("Best accuracy:", grid_search.best_score_)

Best parameters: {'max_depth': 20, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best accuracy: 0.9379287164764373


In [106]:
# predictions

train_predicted = grid_search.predict(train_data)
test_predicted = grid_search.predict(test_data)

In [107]:
# predicted scores

print(classification_report(train_lab, train_predicted))

              precision    recall  f1-score   support

           0       0.97      0.92      0.95    178597
           1       0.92      0.97      0.95    178428

    accuracy                           0.95    357025
   macro avg       0.95      0.95      0.95    357025
weighted avg       0.95      0.95      0.95    357025



In [108]:
print(classification_report(test_lab, test_predicted))

              precision    recall  f1-score   support

           0       0.97      0.91      0.94     44544
           1       0.91      0.97      0.94     44713

    accuracy                           0.94     89257
   macro avg       0.94      0.94      0.94     89257
weighted avg       0.94      0.94      0.94     89257



## saving model and data

In [113]:
# data 

features = importance + ['TARGET']

imp_data = data[features]
imp_data.to_csv('important_data.csv')

In [114]:
# important raw data

raw_data = pd.read_csv('loan_data.csv')

new_raw = raw_data[features]
new_raw.to_csv('imp_raw_data.csv')


In [115]:
# model 

import pickle

with open('model.pkl', 'wb') as file:
    pickle.dump(grid_search, file)