In [6]:
import pandas as pd
import numpy as np
from pprint import pprint
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from joblib import dump
from time import time

In [2]:
train_data = np.load("../data/interim/train_data.npy", allow_pickle=True)
dev_data = np.load("../data/interim/dev_data.npy", allow_pickle=True)

In [3]:
columns = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous',
       'target', 'job_admin.', 'job_blue-collar', 'job_entrepreneur',
       'job_housemaid', 'job_management', 'job_retired', 'job_self-employed',
       'job_services', 'job_student', 'job_technician', 'job_unemployed',
       'job_unknown', 'marital_divorced', 'marital_married', 'marital_single',
       'education_primary', 'education_secondary', 'education_tertiary',
       'education_unknown', 'credit_default_no', 'credit_default_yes',
       'housing_no', 'housing_yes', 'personal_loan_no', 'personal_loan_yes',
       'contact_type_cellular', 'contact_type_telephone',
       'contact_type_unknown', 'month_apr', 'month_aug', 'month_dec',
       'month_feb', 'month_jan', 'month_jul', 'month_jun', 'month_mar',
       'month_may', 'month_nov', 'month_oct', 'month_sep',
       'previous_campaign_failure', 'previous_campaign_other',
       'previous_campaign_success', 'previous_campaign_unknown']

In [10]:
train_data.shape
dev_data.shape

(4521, 52)

In [13]:
combined_data = np.concatenate([train_data,dev_data])
df = pd.DataFrame(combined_data, columns=columns)
y = df['target']
X = df.drop(['target'], axis=1)
print(X.shape, y.shape)

(36169, 51) (36169,)


In [18]:
pipeline = Pipeline([('clf', RandomForestClassifier()),])

parameters = {
    'clf__n_estimators': (100,1000,5000),
    'clf__max_depth': (None, 5,10, 15),
    'clf__min_samples_split': (2,4,6)
}
random_search = RandomizedSearchCV(pipeline, parameters, n_iter=100, n_jobs=-1, verbose=1, cv=5, scoring="f1_weighted", random_state=42)
print("Performing random search")
print("Pipeline: ", [name for name, _ in pipeline.steps])
print("parameters: ")
pprint(parameters)
t0 = time()
random_search.fit(X, y)
print("tuned in %0.4fs" % (time()-t0))
print('Best_score for Random Forest Classifier',random_search.best_score_)
# print('Best_Parameters for Random Forest classifier',random_search.best_params_)

best_parameters = random_search.best_estimator_.get_params()
for parameter_name in sorted(parameters.keys()):
    print("\t%s: %r" %(parameter_name, best_parameters[parameter_name]))

# random_search = RandomForestClassifier(n_estimators=8,max_depth=10,random_state=12,verbose=0)

Performing random search
Pipeline:  ['clf']
parameters: 
{'clf__max_depth': (None, 5, 10, 15),
 'clf__min_samples_split': (2, 4, 6),
 'clf__n_estimators': (100, 1000, 5000)}
Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 28.9min finished


tuned in 1887.1576s
Best_score for Random Forest Classifier 0.8945985379972147
	clf__max_depth: None
	clf__min_samples_split: 4
	clf__n_estimators: 5000


In [26]:
# def RF_Tuned_Param(X,Y):
#     # Using RandomSearchCV to find out the best set of parameters for RFR and use it for the regression model analysis and prediction:

#     # Number of trees in random forest:
#     n_estimators = [int(x) for x in np.linspace(start=5, stop= 50, num= 10)]

#     # Number of features to consider at every split:
#     max_features = ['auto','sqrt','log2']

#     # Maximun number of levels in tree:
#     max_depth = [int(x) for x in np.linspace(start=10, stop= 30, num= 5)]

#     # Minimum number of samples required to split a node:
#     min_samples_split = [5,10]

#     # Minimum number of samples required at each leaf node
#     min_samples_leaf = [1, 2, 5, 10]

#     # Bootstrap method:
#     bootstrap = [True,False]

#     # Create the parameter grid
#     param_grid_RF = {'n_estimators': n_estimators,
#                   'max_features': max_features,
#                   'max_depth': max_depth,
#                   'min_samples_split': min_samples_split,
#                   'min_samples_leaf': min_samples_leaf,
#                   'bootstrap': bootstrap}
#     pprint(param_grid_RF)

#     # forest = RandomForestRegressor()
#     random_search_forest = RandomizedSearchCV(RFR_Model, param_grid_RF, cv=3, n_jobs=6, verbose=1)
#     random_search_forest.fit(X, Y)

#     # The best parameters for the Random forest regressor obtained from GridSearch CV:
#     print('Best_Parameters for Random Forest class',random_search_forest.best_params_)

#     # The best score for Random forest regressor after GridSearch CV:
#     print('Best_score for Random Forest Regressor',random_search_forest.best_score_)

#     return random_search_forest, random_search_forest.best_params_, random_search_forest.best_score_


In [1]:
# # RF_Tuned_Param(X,y)
# dump(random_search.best_estimator_, "../models/model_randomforest.pkl")