In [25]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn import grid_search
from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

from random import seed
from random import randrange

import random
import pickle
%matplotlib inline

## Application Decision Prediction

### Random Forest Classifier

In [3]:
f = open("data","rb")
df2 = pickle.load(f)
f.close()

In [4]:
X = df2.loc[:, df2.columns != 'case_status']
Y = df2.case_status

df2.iloc[0]

agent_firm_name              9941
agent_state                    57
case_status                     1
class_of_admission             28
country_of_citizenship         79
decision_date                2012
employer_country                5
employer_name               42973
employer_num_employees    22348.6
employer_state                 40
employer_yr_estab            1700
job_info_work_state            40
pw_soc_code                151031
pw_source_name_9089             3
pw_amount_9089_new          75629
Name: A-07323-97014, dtype: object

In [33]:
X.shape

(356168, 14)

In [34]:
Y.shape

(356168,)

In [38]:
skf = StratifiedKFold(n_splits = 5)

In [39]:
#Parameters for random forest. To perform hyper parameter optimisation a list of multiple elements can be entered
#and the optimal value in that list will be picked using grid search
def parameter_set_random_forest(n_estimators = [10], criterion = ['gini'], max_depth = [None],\
                                min_samples_split = [2], min_samples_leaf = [1], min_weight_fraction_leaf = [0.0],\
                                max_features = ['auto'], max_leaf_nodes = [None], bootstrap = [True],\
                                oob_score = [False], random_state = [None], verbose = [0],warm_start = [False],\
                                class_weight = [None]):
    
    parameters_random_forest = {}
    parameters_random_forest['criterion'] = criterion
    parameters_random_forest['n_estimators'] = n_estimators
    parameters_random_forest['max_depth'] = max_depth
    parameters_random_forest['min_samples_split'] = min_samples_split
    parameters_random_forest['min_samples_leaf'] = min_samples_leaf
    parameters_random_forest['min_weight_fraction_leaf'] = min_weight_fraction_leaf
    parameters_random_forest['max_features'] = max_features
    parameters_random_forest['random_state'] = random_state
    parameters_random_forest['max_leaf_nodes'] = max_leaf_nodes
    parameters_random_forest['class_weight'] = class_weight
    parameters_random_forest['bootstrap'] = bootstrap
    parameters_random_forest['oob_score'] = oob_score
    parameters_random_forest['warm_start'] = warm_start
    
    return parameters_random_forest

In [40]:
models = []
for train_index, test_index in skf.split(X, Y):
    x_train = X.iloc[train_index,:]
    x_test = X.iloc[test_index,:]
    y_train = Y.iloc[train_index]
    y_test = Y.iloc[test_index]
    print("TRAIN:", x_train.shape, "TEST:", x_test.shape)
    random_forest_model = RandomForestClassifier()
    parameters_random_forest = parameter_set_random_forest(n_estimators=[10,20,30,40],max_depth=[35,50,75,100,None])
    model_gs = grid_search.GridSearchCV(random_forest_model, parameters_random_forest, scoring = 'roc_auc')
    model_gs.fit(x_train,y_train)
    predictions = model_gs.predict_proba(x_test)[:,1]
    train_acc = roc_auc_score(y_train, model_gs.predict_proba(x_train)[:,1])
    test_acc = roc_auc_score(y_test, predictions)
    print ("Train Accuracy :: ", roc_auc_score(y_train, model_gs.predict_proba(x_train)[:,1]))
    print ("Test Accuracy  :: ", roc_auc_score(y_test, predictions))
    models.append((model_gs,x_train,y_train,x_test,y_test,train_acc,test_acc))

TRAIN: (284934, 14) TEST: (71234, 14)
Train Accuracy ::  0.99773293303
Test Accuracy  ::  0.714986463825
TRAIN: (284934, 14) TEST: (71234, 14)
Train Accuracy ::  0.99791283679
Test Accuracy  ::  0.621117014149
TRAIN: (284934, 14) TEST: (71234, 14)
Train Accuracy ::  0.997539971009
Test Accuracy  ::  0.714445411967
TRAIN: (284934, 14) TEST: (71234, 14)
Train Accuracy ::  0.997587547823
Test Accuracy  ::  0.654751727386
TRAIN: (284936, 14) TEST: (71232, 14)
Train Accuracy ::  0.997317194383
Test Accuracy  ::  0.711647109377


In [41]:
model_out = open("saved_models","wb")
pickle.dump(models, model_out)
model_out.close()

In [20]:
model_in = open("saved_models","rb")
models  = pickle.load(model_in)
model_in.close()

In [37]:
#Randomly Select one of the trained models

best_ind = random.randint(0,5)
best_model = models[best_ind][0]

In [38]:
best_model.best_params_

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 75,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 40,
 'oob_score': False,
 'random_state': None,
 'warm_start': False}

In [39]:
best_model_out = open("best_model","wb")
pickle.dump(best_model, best_model_out)
best_model_out.close()

In [40]:
preds = best_model.predict(X)

In [41]:
percentage_positive = sum(preds)/len(preds)

In [42]:
print (percentage_positive)

0.833381999506
