## Implementing hyperparameter tuning 

In [14]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [2]:
path="Data/"
numerical = pd.read_csv(path+'numerical.csv')
categorical = pd.read_csv(path+'categorical.csv')
targets = pd.read_csv(path+'target.csv')

In [3]:
# Since SMOTE works on numerical data only, we will first encode the categorical variables in this case 

In [3]:
encoder = OneHotEncoder(drop='first').fit(categorical)
encoded_categorical = encoder.transform(categorical).toarray()
encoded_categorical = pd.DataFrame(encoded_categorical)

In [4]:
data = pd.concat([numerical, encoded_categorical, targets], axis = 1)

In [5]:
regression_target = data['TARGET_D']
# data.head()
y = data['TARGET_B']
X = data.drop(['TARGET_B'], axis = 1)

In [6]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
y = data['TARGET_B']
X = data.drop(['TARGET_B'], axis=1)
X_sm, y_sm = smote.fit_resample(X, y)
y_sm.value_counts()

0    90569
1    90569
Name: TARGET_B, dtype: int64

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.20, random_state=0)

In [8]:
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

In [9]:
y_train_regression = X_train['TARGET_D']
y_test_regression = X_test['TARGET_D']

In [10]:
# Now we can remove the column target d from the set of features 
X_train = X_train.drop(['TARGET_D'], axis = 1)
X_test = X_test.drop(['TARGET_D'], axis = 1)

## Grid Search

In [11]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [50, 100,500],
    'min_samples_split': [2, 4],
    'min_samples_leaf' : [1, 2],
    'max_features': ['sqrt']
#    'max_samples' : ['None', 0.5]
    }
clf = RandomForestClassifier(random_state=100)

In [12]:
grid_search = GridSearchCV(clf, param_grid, cv=5,return_train_score=True,n_jobs=-1,)

In [13]:
grid_search.fit(X_train,y_train)









GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=100),
             n_jobs=-1,
             param_grid={'max_features': ['sqrt'], 'min_samples_leaf': [1, 2],
                         'min_samples_split': [2, 4],
                         'n_estimators': [50, 100]},
             return_train_score=True)

In [15]:
grid_search.best_params_ #To check the best set of parameters returned

{'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 100}

<b> Please check RandomSearch

## using the above results 

In [16]:
from sklearn.model_selection import cross_val_score
clf = RandomForestClassifier(random_state=0, max_features='sqrt', 
                             min_samples_leaf=1, min_samples_split=2, n_estimators=100)
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=10)
print(np.mean(cross_val_scores))

0.9730384376509559


## Feature Importance

In [20]:
#Higher the score, the more important the feature is

In [22]:
clf.fit( X_train, y_train)

RandomForestClassifier(max_features='sqrt', random_state=0)

In [28]:
len(X_train.columns)

635

In [24]:
feature_names = X_train.columns
feature_names = list(feature_names)

In [25]:
df = pd.DataFrame(list(zip(feature_names, clf.feature_importances_)))
df.columns = ['columns_name', 'score_feature_importance']
df.sort_values(by=['score_feature_importance'], ascending = False)

Unnamed: 0,columns_name,score_feature_importance
384,69,0.038959
571,256,0.032151
585,270,0.030386
383,68,0.028648
382,67,0.025817
...,...,...
422,107,0.000000
423,108,0.000000
424,109,0.000000
425,110,0.000000


In [26]:
clf.feature_importances_

array([1.20256928e-03, 2.07725526e-03, 1.93240931e-03, 4.21893481e-03,
       1.43356286e-03, 3.64527105e-04, 9.22141044e-04, 1.08168304e-03,
       8.75696746e-04, 8.82070445e-04, 8.32651605e-04, 7.14632192e-04,
       7.68836051e-03, 1.48324117e-03, 1.45912288e-03, 1.47860284e-03,
       1.57605792e-03, 6.57451441e-04, 2.49830765e-03, 7.68244476e-04,
       7.35454914e-04, 1.59982273e-03, 6.39501407e-04, 5.24051792e-04,
       5.65025216e-04, 8.33648372e-04, 2.59148147e-04, 2.82933570e-04,
       2.92517907e-04, 3.01492496e-04, 2.15568111e-04, 1.80890046e-04,
       7.60091286e-05, 6.85210654e-04, 3.12049767e-04, 2.08463989e-04,
       5.29004296e-04, 6.94475403e-04, 6.82834482e-04, 6.60906604e-04,
       6.13980099e-04, 5.87368516e-04, 6.22224876e-04, 7.74949187e-04,
       7.75913588e-04, 8.39862253e-04, 8.50963433e-04, 7.90543782e-04,
       7.08889138e-04, 7.11461659e-04, 7.33635030e-04, 7.28806229e-04,
       6.79070726e-04, 6.58791315e-04, 7.09746963e-04, 7.07410997e-04,
      