In [1]:
from comet_ml import Experiment
import pandas as pd
import numpy as np
from pprint import pprint
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from joblib import dump
from time import time
import json
from sklearn.preprocessing import StandardScaler

In [2]:
with open('../data/raw/comet_creds.json') as file:
    comet_creds = json.load(file)
exp = Experiment(api_key=comet_creds['api_key'], project_name=comet_creds['project_name'],workspace=comet_creds["workspace"])    
print(exp)

COMET INFO: Experiment is live on comet.ml https://www.comet.ml/fazleem/bank-marketing/3f0e6abfe9dc497eab40deb2f7fd2700



<comet_ml.Experiment object at 0x7f7ee0e8ff10>


In [3]:
train_data = np.load("../data/interim/train_data.npy", allow_pickle=True)
dev_data = np.load("../data/interim/dev_data.npy", allow_pickle=True)
save_model = "../models/model_randomforest.pkl"
n_experiments = 5

In [4]:
columns = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous',
       'target', 'job_admin.', 'job_blue-collar', 'job_entrepreneur',
       'job_housemaid', 'job_management', 'job_retired', 'job_self-employed',
       'job_services', 'job_student', 'job_technician', 'job_unemployed',
       'job_unknown', 'marital_divorced', 'marital_married', 'marital_single',
       'education_primary', 'education_secondary', 'education_tertiary',
       'education_unknown', 'credit_default_no', 'credit_default_yes',
       'housing_no', 'housing_yes', 'personal_loan_no', 'personal_loan_yes',
       'contact_type_cellular', 'contact_type_telephone',
       'contact_type_unknown', 'month_apr', 'month_aug', 'month_dec',
       'month_feb', 'month_jan', 'month_jul', 'month_jun', 'month_mar',
       'month_may', 'month_nov', 'month_oct', 'month_sep',
       'previous_campaign_failure', 'previous_campaign_other',
       'previous_campaign_success', 'previous_campaign_unknown']

In [5]:
train_data.shape
dev_data.shape

(4521, 52)

In [6]:
combined_data = np.concatenate([train_data,dev_data])
df = pd.DataFrame(combined_data, columns=columns)
y = df['target']
X = df.drop(['target'], axis=1)
print(X.shape, y.shape)

(36169, 51) (36169,)


In [17]:
# create a pipeline
pipeline_rf = Pipeline([('clfrf', RandomForestClassifier()),])
pipeline_gboost = Pipeline([('clfgb', GradientBoostingClassifier()),])
rf_parameters = {
    'clfrf__n_estimators': (100,1000,5000),
    'clfrf__max_depth': (None, 5,10, 15),
    'clfrf__min_samples_split': (2,4,6)
}
gb_parameters = {
    'clfgb__n_estimators': (100,1000,5000),
    'clfgb__max_depth': (None, 5,10, 15),
    'clfgb__min_samples_split': (2,4,6)
        }
sorted(pipeline_rf.get_params().keys())


random_search_rf = RandomizedSearchCV(pipeline_rf, rf_parameters, n_iter=10, n_jobs=-1, verbose=1, cv=5, scoring="f1_weighted", random_state=42)
random_search_gb = RandomizedSearchCV(pipeline_gboost, gb_parameters, n_iter=10, scoring='f1_weighted', n_jobs=-1, cv=5, verbose=1, random_state=1001)
# list of searches that i want to run
pipelines = [random_search_rf,random_search_gb]

#create dict
randomcv_dict = {0:'Random Forest', 1:'Gradient Boost'}

best_accuracy = 0.0
best_model = 0
best_search = ''

#fit the pipelines
for pipe in pipelines:
    pipe.fit(X,y)

for index,model in enumerate(pipelines):
    print("{} Train acc: {}" .format(randomcv_dict[index], model.score(X,y)))
    
# print("Performing random search")
# print("Pipeline: ", [name for name, _ in pipeline.steps])
# print("parameters: ")
# pprint(parameters)
# t0 = time()
# random_search.fit(X, y)
# print("tuned in %0.4fs" % (time()-t0))
# print('Best_score for Random Forest Classifier',random_search.best_score_)
# # print('Best_Parameters for Random Forest classifier',random_search.best_params_)

# best_parameters = random_search.best_estimator_.get_params()
# for parameter_name in sorted(parameters.keys()):
#     print("\t%s: %r" %(parameter_name, best_parameters[parameter_name]))


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  9.0min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 76.3min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 159.8min finished


Random Forest Train acc: 0.9821051222746898
XG Boost Train acc: 0.9234283246570725


In [21]:
for i in range(len(pipe.cv_results_['params'])):
#     exp = Experiment(api_key=comet_creds['api_key'], project_name=comet_creds['project_name'],workspace=comet_creds["workspace"])
    for index,values in pipe.cv_results_.items():
        if index == "params":
            exp.log_parameters(values[i])
        else:
            exp.log_metric(index,values[i])

In [None]:
# def RF_Tuned_Param(X,Y):
#     # Using RandomSearchCV to find out the best set of parameters for RFR and use it for the regression model analysis and prediction:

#     # Number of trees in random forest:
#     n_estimators = [int(x) for x in np.linspace(start=5, stop= 50, num= 10)]

#     # Number of features to consider at every split:
#     max_features = ['auto','sqrt','log2']

#     # Maximun number of levels in tree:
#     max_depth = [int(x) for x in np.linspace(start=10, stop= 30, num= 5)]

#     # Minimum number of samples required to split a node:
#     min_samples_split = [5,10]

#     # Minimum number of samples required at each leaf node
#     min_samples_leaf = [1, 2, 5, 10]

#     # Bootstrap method:
#     bootstrap = [True,False]

#     # Create the parameter grid
#     param_grid_RF = {'n_estimators': n_estimators,
#                   'max_features': max_features,
#                   'max_depth': max_depth,
#                   'min_samples_split': min_samples_split,
#                   'min_samples_leaf': min_samples_leaf,
#                   'bootstrap': bootstrap}
#     pprint(param_grid_RF)

#     # forest = RandomForestRegressor()
#     random_search_forest = RandomizedSearchCV(RFR_Model, param_grid_RF, cv=3, n_jobs=6, verbose=1)
#     random_search_forest.fit(X, Y)

#     # The best parameters for the Random forest regressor obtained from GridSearch CV:
#     print('Best_Parameters for Random Forest class',random_search_forest.best_params_)

#     # The best score for Random forest regressor after GridSearch CV:
#     print('Best_score for Random Forest Regressor',random_search_forest.best_score_)

#     return random_search_forest, random_search_forest.best_params_, random_search_forest.best_score_


In [22]:
# # RF_Tuned_Param(X,y)
dump(pipe.best_estimator_, save_model)

['../models/model_randomforest.pkl']