In [1]:
import pandas as pd
import random as rn
import numpy as np
import re
from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error,accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn import ensemble

In [2]:
totDF = pd.read_csv('../data/raw/Cleaned_data_set.csv')

In [3]:
#downsample
sample_per_year = 20000
dwnSmplDF = totDF.groupby('birth_year',group_keys = False).apply(lambda x: x.sample(sample_per_year))

In [4]:
r1 = re.compile('.*reporting')
r2 = re.compile('.*imputed')

#to drop reporting, imputed and target column
cols_to_drop1 = list(filter((r1.match), totDF.columns))
cols_to_drop2 = list(filter((r2.match), totDF.columns))
cols_to_drop3 = ['admit_NICU']
cols_to_drop = cols_to_drop1 + cols_to_drop2 + cols_to_drop3

#columns to keep
cols_to_keep = [col for col in totDF.columns if col not in cols_to_drop]

#create df w/ columns to keep and target admit_nicu
X = dwnSmplDF[cols_to_keep].copy()
target = dwnSmplDF[['admit_NICU']].copy()

#select numeric and cat columns
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
catDF = X.select_dtypes(include=object).copy()
numDF = X.select_dtypes(include=numerics).copy()

#Encode cat columns
le = LabelEncoder()
catDF = catDF.apply(le.fit_transform)
encoded_target = target.apply(le.fit_transform)

#concat numeric and encoded df
cl_df = pd.concat([numDF,catDF],axis=1)

In [5]:
randomForest = ensemble.RandomForestClassifier()
grid_para_forest = [{
    'n_estimators': np.linspace(50,int(np.sqrt(len(cl_df))),10,dtype=int),
   # 'n_estimators': range(1000,10000,1000),
    #'min_samples_split' : [100,10,2],
    'min_samples_leaf' : range(100,1000,100)
}]
randomForest.set_params(random_state=108)
grid_search_forest = GridSearchCV(randomForest, grid_para_forest, scoring='accuracy', cv=5, n_jobs=-1)
%time grid_search_forest.fit(cl_df, encoded_target)

  self.best_estimator_.fit(X, y, **fit_params)


Wall time: 40min 43s


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False, random_state=108,
                                              verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid=[{'

In [6]:
print(
    '''Random Forest
sample size: {0}
best param : {1}
best score : {2}
r2         : {3}'''\
      .format(len(encoded_target), \
              grid_search_forest.best_params_,\
              grid_search_forest.best_score_, \
              r2_score(encoded_target, grid_search_forest.predict(cl_df)))
     )

Random Forest
sample size: 100000
best param : {'min_samples_leaf': 100, 'n_estimators': 109}
best score : 0.94302
r2         : 0.33316310727927356


In [8]:
bestRF = ensemble.RandomForestClassifier()
best_params = grid_search_forest.best_params_
bestRF.set_params(random_state=108,n_jobs= -1,oob_score = True,**best_params)
bestRF.fit(cl_df,encoded_target)

  after removing the cwd from sys.path.


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=100, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=109,
                       n_jobs=-1, oob_score=True, random_state=108, verbose=0,
                       warm_start=False)

In [11]:
featureDF = pd.DataFrame({'ScorebestRF' : bestRF.feature_importances_, 'Features' : cl_df.columns})
#featureDF
print(featureDF.sort_values('ScoreBestRF').head(10))

Unnamed: 0,ScorebestRF,Features
0,0.000429,birth_year
1,0.000683,birth_month
2,0.001998,birth_time
3,0.000535,birth_day_of_wk
4,0.009370,birth_place
...,...,...
98,0.013351,suspect_chromo_disorder
99,0.003977,hypospadias
100,0.019625,infant_transferred
101,0.000732,infant_living_at_report


In [None]:
print(
'''Random Forest
OOB score: {0}
Feature Importances: {1}
Score: {2}'''\
        .format(bestRF.oob_score_,
                bestRF.feature_importances_,
                bestRF.score))