In [1]:
#Import Packages
import pandas as pd
import numpy as np
import pickle

#SKLearn Packages
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Load the loan dataset from the project's data directory (path is relative to this notebook).
csv_path = "../data/data-2.csv"
data = pd.read_csv(csv_path)

In [3]:
# Preview the loaded frame; the notebook display truncates the middle rows.
data

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [4]:
# Define X and y
# Features: every column except the first and the last one (selected by position).
X = data.iloc[:, 1:-1]

# Target: the last column, recoded from 'N'/'Y' labels to 0/1.
# Any label missing from the mapping becomes NaN.
label_to_int = {'N': 0, 'Y': 1}
target_labels = data.iloc[:, -1]
y = target_labels.map(label_to_int)

In [5]:
# Split Data
# Hold out 30% of the rows for testing. A fixed random_state makes the shuffle
# identical on every run, so the scores further down are reproducible after
# Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [6]:
# Column-selector helpers meant for use inside a Pipeline
def numFeat(data):
    """Return the columns of `data` named in the notebook-level list `num_feats`.

    `num_feats` is looked up when the function is called, so it only has to
    exist by then, not when this cell is run.
    """
    numeric_subset = data[num_feats]
    return numeric_subset

def catFeat(data):
    """Return the columns of `data` named in the notebook-level list `cat_feats`.

    `cat_feats` is looked up when the function is called, so it only has to
    exist by then, not when this cell is run.
    """
    categorical_subset = data[cat_feats]
    return categorical_subset

In [7]:
# Separate features
# Boolean mask over the training columns: True where the column dtype is object.
is_object_col = X_train.dtypes == 'object'

# Categorical features: the object-dtype columns, in frame order.
cat_feats = is_object_col[is_object_col].index.tolist()

# Numeric features: every remaining column, in frame order.
num_feats = [col for col in X_train.columns if col not in cat_feats]

In [8]:
# Imputer steps
# Numeric branch: replace missing values with the column mean.
# FunctionTransformer() with no `func` is an identity pass-through; the step is
# kept so that existing step names / parameter paths stay valid.
num_transform = Pipeline([('FunctionTransformer', FunctionTransformer()),
                          ('impute_mean', SimpleImputer(strategy='mean'))])

# Categorical branch: fill missing values with the most frequent level, then
# one-hot encode.
# handle_unknown='ignore' encodes a category that was not seen during fit as an
# all-zero row instead of raising. Without it, a rare level that is absent from
# a training split or CV fold, or an unexpected value in a scoring request,
# makes transform()/predict() fail.
# NOTE: `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed
# in 1.4; switch the keyword when upgrading scikit-learn.
cat_transform = Pipeline([('FunctionTransformer', FunctionTransformer()),
                          ('impute_mode', SimpleImputer(strategy='most_frequent')),
                          ('one-hot-encode', OneHotEncoder(sparse=False,
                                                           handle_unknown='ignore'))])


In [9]:
# Send each column group through its own sub-pipeline:
# (step name, transformer, columns it receives).
column_routes = [
    ('numeric', num_transform, num_feats),
    ('categorical', cat_transform, cat_feats),
]
preprocess = ColumnTransformer(column_routes)

In [10]:
# Full model: preprocessing followed by a random-forest classifier.
# random_state pins the forest's bootstrap sampling and feature sub-sampling,
# so fitting twice on the same data yields the same model and the same score.
pipeline = Pipeline([('preprocessing', preprocess),
                     ('rfc', RandomForestClassifier(random_state=42))])

In [11]:
# Fit the preprocessing + random-forest pipeline on the training split.
pipeline.fit(X_train, y_train)

In [12]:
# Evaluate the untuned pipeline on the held-out test split.
# For a classifier, score() reports mean accuracy.
score = pipeline.score(X_test, y_test)
print(f'Test Score: {score}')

Test Score: 0.7675675675675676


In [13]:
# Test with grid search
# Set Parameters
# Keys use the "<step>__<parameter>" form so they reach the 'rfc' pipeline step.
tree_counts = np.arange(200, 600, 200)     # 200, 400
depth_limits = np.arange(2, 6, 2)          # 2, 4
split_minimums = np.arange(2, 6, 2)        # 2, 4

params = dict(
    rfc__n_estimators=tree_counts,
    rfc__max_depth=depth_limits,
    rfc__min_samples_split=split_minimums,
)

In [14]:
# Create grid search
# Exhaustive search over `params`, applied to the whole pipeline.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    n_jobs=-1,    # run fits in parallel on all available cores
    verbose=10,   # emit progress messages while fitting
    refit=True,   # retrain the best candidate on the full training data
)

In [15]:
# Fit model and tune
# Cross-validates every parameter combination on the training split only;
# because refit=True, the best combination is then retrained on all of X_train.
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [16]:
# best_score_ is the mean cross-validated score of the best parameter
# combination, computed on folds of the training data. It is not a held-out
# test score, so it is not directly comparable to the test score printed above.
print("Best Score:", grid_search.best_score_)
print("Best Parameters:\n", grid_search.best_params_)

Best Score: 0.8274418604651164
Best Parameters:
 {'rfc__max_depth': 4, 'rfc__min_samples_split': 2, 'rfc__n_estimators': 200}


In [17]:
# SAVE MODEL TO PICKLE (disabled; uncomment to write the fitted grid search to disk)
# with open("../src/model.p", "wb") as model_file:
#     pickle.dump(grid_search, model_file)

In [18]:
# Look at a single test-split record, selected by position (row 101 of X_test).
# Which applicant this is depends on how train_test_split shuffled the rows.
# Presumably used as a template for the request payload built below.
X_test.iloc[101, :]

Gender                       Male
Married                        No
Dependents                      0
Education            Not Graduate
Self_Employed                  No
ApplicantIncome              2346
CoapplicantIncome          1600.0
LoanAmount                  132.0
Loan_Amount_Term            360.0
Credit_History                1.0
Property_Area           Semiurban
Name: 386, dtype: object

In [19]:
# Example applicant to send to the scoring service.
# Keys mirror the feature columns of X.
json_data = {
    # categorical fields
    "Gender": "Female",
    "Married": "No",
    "Dependents": "1",          # kept as a string, like the other categorical fields
    "Education": "Graduate",
    "Self_Employed": "No",
    # numeric fields
    "ApplicantIncome": 2876,
    "CoapplicantIncome": 1560.0,
    "LoanAmount": 90.0,
    "Loan_Amount_Term": 360.0,
    "Credit_History": 1.0,
    # categorical field
    "Property_Area": "Urban",
}

Connect to app

In [22]:
import requests

# Scoring endpoint of the deployed model service.
url = "http://ec2-18-191-164-122.us-east-2.compute.amazonaws.com:4242/scoring"

# Send the applicant as the JSON body of a POST request and keep the response.
# The timeout (seconds) stops the cell from hanging indefinitely when the host
# is unreachable; requests has no timeout by default.
res = requests.post(url=url, json=json_data, timeout=10)

# Raise on 4xx/5xx here, so a server-side failure is reported as an HTTP error
# instead of a confusing JSON decoding error in a later cell.
res.raise_for_status()

In [23]:
# Return result
# Decode the JSON body of the scoring response and print it.
print(res.json())

[1]
