In [1]:
import pickle
import os
import joblib as jlb
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV

os.chdir('/Users/calebward/LHL_project_IV')
df = pd.read_csv("data/data.csv")

In [2]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
X, y = df.drop('Loan_Status', axis = 1), df['Loan_Status'].map(dict(Y=1, N=0))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((429, 12), (185, 12), (429,), (185,))

In [4]:
X_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ApplicantIncome,429.0,5413.344988,6273.774127,210.0,2787.0,3800.0,5746.0,81000.0
CoapplicantIncome,429.0,1726.402797,3323.953944,0.0,0.0,1250.0,2254.0,41667.0
LoanAmount,414.0,145.272947,83.009993,9.0,100.25,126.0,163.5,650.0
Loan_Amount_Term,420.0,341.885714,64.685664,12.0,360.0,360.0,360.0,480.0
Credit_History,394.0,0.832487,0.373908,0.0,1.0,1.0,1.0,1.0


In [5]:
categorical_features = X_train.select_dtypes(exclude='number').columns.tolist()
quantitative_features = X_train.drop(categorical_features, axis = 1).columns.tolist()
categorical_features, quantitative_features

(['Loan_ID',
  'Gender',
  'Married',
  'Dependents',
  'Education',
  'Self_Employed',
  'Property_Area'],
 ['ApplicantIncome',
  'CoapplicantIncome',
  'LoanAmount',
  'Loan_Amount_Term',
  'Credit_History'])

In [6]:

def log_transform(x):
    return np.log(x+1)

data_clean_pipeline = Pipeline(steps = [
    ('impute', IterativeImputer()),
    ('scale', StandardScaler())
])

columns_transformer = ColumnTransformer(transformers=[
    ('cols', data_clean_pipeline, quantitative_features)
])

In [7]:
rf_classifier = RandomForestClassifier(n_estimators=11, criterion='entropy')

rf_model_pipeline = Pipeline(steps = [
    ('preprocessing', columns_transformer),
    ('rf_model', rf_classifier),
])

rf_model_pipeline.fit(X_train, y_train)

y_pred = rf_model_pipeline.predict(X_test)

# calculate accuracy
ac = accuracy_score(y_test, y_pred)
print(f"Accuracy= {ac}")

Accuracy= 0.7243243243243244


In [8]:

rf_classifier = RandomForestClassifier(random_state=0)

rf_model_pipeline = Pipeline(steps=[
    ('preprocessing', columns_transformer),
    ('rf_model', rf_classifier),
])

params_dict = {'rf_model__n_estimators' : np.arange(5, 7, 1), 'rf_model__criterion': ['gini', 'entropy'], 'rf_model__max_depth': np.arange(10, 25, 5)}

grid_search = GridSearchCV(rf_model_pipeline, params_dict, cv=10, n_jobs=-1)
grid_search.fit(X_train, y_train)

In [9]:
y_pred = grid_search.predict(X_test)
ac = accuracy_score(y_test, y_pred)
print(f'Accuracy = {ac}')

Accuracy = 0.7297297297297297


In [13]:
with open('data/pickles/simple_model.p', 'wb') as f:
    pickle.dump(grid_search, f)