In [1]:
from comet_ml import Experiment
import pandas as pd
import numpy as np
from pprint import pprint
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from joblib import dump
from time import time
import json
from sklearn.preprocessing import StandardScaler

In [2]:
with open('../data/raw/comet_creds.json') as file:
    comet_creds = json.load(file)
exp = Experiment(api_key=comet_creds['api_key'], project_name=comet_creds['project_name'],workspace=comet_creds["workspace"])    
print(exp)

COMET INFO: Experiment is live on comet.ml https://www.comet.ml/fazleem/bank-marketing/509cd777ed6e4282a75ebc800d62e8d7



<comet_ml.Experiment object at 0x7fc09c28ce50>


In [3]:
train_data = np.load("../data/interim/train_data.npy", allow_pickle=True)
dev_data = np.load("../data/interim/dev_data.npy", allow_pickle=True)
test_data = np.load("../data/interim/test_data.npy", allow_pickle=True)
save_model = "../models/model_randomforest.pkl"
n_experiments = 5

In [4]:
columns = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous',
       'target', 'job_admin.', 'job_blue-collar', 'job_entrepreneur',
       'job_housemaid', 'job_management', 'job_retired', 'job_self-employed',
       'job_services', 'job_student', 'job_technician', 'job_unemployed',
       'job_unknown', 'marital_divorced', 'marital_married', 'marital_single',
       'education_primary', 'education_secondary', 'education_tertiary',
       'education_unknown', 'credit_default_no', 'credit_default_yes',
       'housing_no', 'housing_yes', 'personal_loan_no', 'personal_loan_yes',
       'contact_type_cellular', 'contact_type_telephone',
       'contact_type_unknown', 'month_apr', 'month_aug', 'month_dec',
       'month_feb', 'month_jan', 'month_jul', 'month_jun', 'month_mar',
       'month_may', 'month_nov', 'month_oct', 'month_sep',
       'previous_campaign_failure', 'previous_campaign_other',
       'previous_campaign_success', 'previous_campaign_unknown']

In [8]:
train_data.shape
dev_data.shape

(4521, 52)

In [9]:
combined_data = np.concatenate([train_data,dev_data])
df = pd.DataFrame(combined_data, columns=columns)
df_test = pd.DataFrame(test_data, columns=columns)

(36169, 51) (36169,)


In [10]:
# store the x and y for test and train
y = df['target']
X = df.drop(['target'], axis=1)
print(X.shape, y.shape)

Y_test = df_test['target']
X_test = df_test.drop(['target'], axis=1)
print(X_test.shape, Y_test.shape)

(36169, 51) (36169,)
(9042, 51) (9042,)


In [12]:
#create pipelines for Random forest and Gradient Boost
## Pipeline creation steps
## 1. Data Preprocessing using Standard Scalar
## 2. Apply classifier
pipeline_rf = Pipeline([('scalar1', StandardScaler()), ('rf_classifier',RandomForestClassifier())])
pipeline_gb = Pipeline([('scalar1', StandardScaler()), ('gb_classifier',GradientBoostingClassifier())])
pipelines = [pipeline_rf, pipeline_gb]

In [14]:
best_accuracy = 0.0
best_model = 0
best_classifier = ''

# pipe_dict = {0:'Random Forest', 1:'Gradient Boost'}

In [None]:
"""Random Forest Parameter Tuning"""
def RF_Tuned_Param(X,Y):
    # Using RandomSearchCV to find out the best set of parameters for RFR and use it for the regression model analysis and prediction:

    # Number of trees in random forest:
    n_estimators = [int(x) for x in np.linspace(start=5, stop= 50, num= 10)]

    # Number of features to consider at every split:
    max_features = ['auto','sqrt','log2']

    # Maximun number of levels in tree:
    max_depth = [int(x) for x in np.linspace(start=10, stop= 30, num= 5)]

    # Minimum number of samples required to split a node:
    min_samples_split = [5,10]

    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 5, 10]

    # Bootstrap method:
    bootstrap = [True,False]

    # Create the parameter grid
    param_grid_RF = {'n_estimators': n_estimators,
                  'max_features': max_features,
                  'max_depth': max_depth,
                  'min_samples_split': min_samples_split,
                  'min_samples_leaf': min_samples_leaf,
                  'bootstrap': bootstrap}
    pprint(param_grid_RF)

    # forest = RandomForestRegressor()
    random_search_forest = RandomizedSearchCV(RFR_Model, param_grid_RF, cv=3, n_jobs=6, verbose=1)
    random_search_forest.fit(X, Y)

    # The best parameters for the Random forest regressor obtained from GridSearch CV:
    print('Best_Parameters for Random Forest class',random_search_forest.best_params_)

    # The best score for Random forest regressor after GridSearch CV:
    print('Best_score for Random Forest Regressor',random_search_forest.best_score_)

    return random_search_forest, random_search_forest.best_params_, random_search_forest.best_score_


In [15]:
#create a pipeline
pipe = Pipeline([('classifier', RandomForestClassifier())])
#create dictionary with learning algorithms and their hyper parameters

random_params = [
    {
    'classifier': [RandomForestClassifier()],
    'classifier__n_estimators': (100,1000,5000),
    'classifier__max_depth': (None, 5,10, 15),
    'classifier__min_samples_split': (2,4,6)
    },
    {
    'classifier': [GradientBoostingClassifier()],
    'classifier__n_estimators': (100,1000,5000),
    'classifier__max_depth': (None, 5,10, 15),
    'classifier__min_samples_split': (2,4,6)
    }
]
randomsearch = RandomizedSearchCV(pipe, random_params, n_iter=10, n_jobs=-1, verbose=1, cv=5, scoring="f1_weighted", random_state=42)
best_model = randomsearch.fit(X,y)

COMET ERROR: Failed to extract parameters from estimator


Fitting 5 folds for each of 10 candidates, totalling 50 fits


COMET ERROR: Failed to extract parameters from estimator
COMET ERROR: Failed to extract parameters from estimator


In [17]:
print(best_model.best_estimator_)
print(best_model.best_params_)
print(best_model.score(X_test,Y_test))

Pipeline(steps=[('classifier', GradientBoostingClassifier(max_depth=10))])
{'classifier__n_estimators': 100, 'classifier__min_samples_split': 2, 'classifier__max_depth': 10, 'classifier': GradientBoostingClassifier(max_depth=10)}
0.9019972496012364


In [20]:
dump(best_model.best_estimator_, open(save_model,'wb'))