In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer 
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [30]:
from xgboost import XGBClassifier as xgb

In [51]:
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# Load the loan dataset from a CSV file in the current working directory.
data = pd.read_csv("loan_data.csv")

In [3]:
# Preview the first rows to sanity-check the load.
data.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [4]:
# Frequency of each loan purpose (the only text-typed column).
data["purpose"].value_counts()

debt_consolidation    3957
all_other             2331
credit_card           1262
home_improvement       629
small_business         619
major_purchase         437
educational            343
Name: purpose, dtype: int64

In [38]:
# Class balance of `credit.policy`, the column used as the prediction target `y`.
data["credit.policy"].value_counts()

1    7710
0    1868
Name: credit.policy, dtype: int64

In [6]:
# Count missing values per column.
data.isna().sum()

credit.policy        0
purpose              0
int.rate             0
installment          0
log.annual.inc       0
dti                  0
fico                 0
days.with.cr.line    0
revol.bal            0
revol.util           0
inq.last.6mths       0
delinq.2yrs          0
pub.rec              0
not.fully.paid       0
dtype: int64

In [8]:
# Summarise column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   credit.policy      9578 non-null   int64  
 1   purpose            9578 non-null   object 
 2   int.rate           9578 non-null   float64
 3   installment        9578 non-null   float64
 4   log.annual.inc     9578 non-null   float64
 5   dti                9578 non-null   float64
 6   fico               9578 non-null   int64  
 7   days.with.cr.line  9578 non-null   float64
 8   revol.bal          9578 non-null   int64  
 9   revol.util         9578 non-null   float64
 10  inq.last.6mths     9578 non-null   int64  
 11  delinq.2yrs        9578 non-null   int64  
 12  pub.rec            9578 non-null   int64  
 13  not.fully.paid     9578 non-null   int64  
dtypes: float64(6), int64(7), object(1)
memory usage: 1.0+ MB


In [18]:
# Columns handed to the one-hot encoder when the feature matrix is built.
cat_features_1 = [
    "purpose",
    "not.fully.paid",
]
# Names the `credit.policy` column; not referenced by any other visible cell.
cat_features_2 = [
    "credit.policy",
]

In [17]:
# Target is the `credit.policy` column; the features are every other column.
y = data["credit.policy"]
x = data.drop(columns=["credit.policy"])

In [26]:
# One-hot encode the columns listed in `cat_features_1`; every other column is
# passed through unchanged.
# NOTE(review): the transformer is fitted on all rows of `x`, i.e. before the
# train/test split is made — consider fitting on the training rows only.
onehot = OneHotEncoder()
transformer_1 = ColumnTransformer(
    transformers=[("onehot", onehot, cat_features_1)],
    remainder="passthrough",
)

x_t = transformer_1.fit_transform(x)

In [29]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_t,
    y,
    test_size=0.2,
    random_state=42,
)

In [31]:
# Candidate classifiers, keyed by the display name that is printed next to
# each score. Every estimator is seeded (same seed as the train/test split) so
# that repeated runs of the notebook produce the same fitted models and scores.
# `xgb` is the alias under which XGBClassifier was imported.
models = {
    "RandomForestClassifier": RandomForestClassifier(random_state=42),
    "LogisticRegression": LogisticRegression(random_state=42),
    "XGBoost": xgb(random_state=42),
}

In [53]:
# Hyper-parameter search space for the random forest.
rf_grid = {
    "n_estimators": [100, 200],
    "max_depth": [10, 20],
    "max_features": ["sqrt"],
    "min_samples_split": [2],
    "min_samples_leaf": [2, 4],
}

# Search space for logistic regression: both penalties over 20 values of C
# spaced logarithmically between 1e-4 and 1e4, always with the liblinear solver.
lr_grid = {
    "penalty": ["l1", "l2"],
    "C": np.logspace(start=-4, stop=4, num=20),
    "solver": ["liblinear"],
}

# Search space for the XGBoost classifier.
xg_grid = {
    "min_child_weight": [1, 5, 10],
    "gamma": [0.5, 1, 1.5, 2, 5],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "max_depth": [3, 4, 5, 6, 7],
}

In [54]:
## Tune every candidate model with a randomized search and print its test score
def _effective_n_iter(param_grid, n_iter):
    """Cap ``n_iter`` at the number of combinations in an all-list grid.

    If any entry of ``param_grid`` is a distribution (has an ``rvs`` method)
    the space is not finite and ``n_iter`` is returned unchanged.
    """
    if any(hasattr(values, "rvs") for values in param_grid.values()):
        return n_iter
    grid_size = 1
    for values in param_grid.values():
        grid_size *= len(values)
    return min(n_iter, grid_size)


def fit_data_and_score_model(x_train, y_train, x_test, y_test, *, n_iter=10, random_state=42):
    """Tune each estimator in ``models`` and print its score on the test split.

    For every ``name -> estimator`` pair in the module-level ``models`` dict a
    5-fold ``RandomizedSearchCV`` is run over the matching module-level grid
    (``rf_grid`` for "RandomForestClassifier", ``lr_grid`` for
    "LogisticRegression", ``xg_grid`` for anything else). The model name and
    the value of ``RandomizedSearchCV.score`` on the test split are printed.

    Parameters
    ----------
    x_train, y_train
        Training features and labels, passed to ``RandomizedSearchCV.fit``.
    x_test, y_test
        Held-out features and labels, passed to ``RandomizedSearchCV.score``.
    n_iter : int, default 10
        Maximum number of parameter settings tried per model. It is capped at
        the size of the grid so that small grids (``rf_grid`` has only 8
        combinations) do not trigger scikit-learn's "space is smaller than
        n_iter" warning on every run.
    random_state : int or None, default 42
        Seed for the parameter sampling, so the same candidates are evaluated
        on every run. Pass ``None`` for unseeded sampling.

    Returns
    -------
    None
        Results are printed, one name line and one score line per model.
    """
    grids_by_name = {
        "RandomForestClassifier": rf_grid,
        "LogisticRegression": lr_grid,
    }

    for name, model in models.items():
        # Names without a dedicated grid fall back to the XGBoost grid.
        param_grid = grids_by_name.get(name, xg_grid)

        search = RandomizedSearchCV(
            estimator=model,
            param_distributions=param_grid,
            n_iter=_effective_n_iter(param_grid, n_iter),
            cv=5,
            random_state=random_state,
        )
        search.fit(x_train, y_train)

        print(name)
        print(search.score(x_test, y_test))
             
    

In [55]:
# Tune all candidate models on the training split and print their test scores.
fit_data_and_score_model(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)



RandomForestClassifier
0.9879958246346555




LogisticRegression
0.9039665970772442








































































































































































































XGBoost
0.9921711899791231
