### Running the random forest model with label enconder
* __November 2, 2019__
* last Update **Decemebr 4, 2019**

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn import preprocessing
from datetime import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

### Read train data

In [2]:
X_train = pd.read_csv("X_train_df_2.csv")
y_train = pd.read_csv("y_train_df_2.csv")

In [3]:
X_train.head()

Unnamed: 0,channel_title,category,views,likes,dislikes,comment_total,day,month,weekday_weekend
0,478.0,3.0,0.029405,0.007717,0.001975,0.013207,4.0,0.0,0.0
1,760.0,8.0,0.164226,0.014618,0.035958,0.028425,4.0,1.0,0.0
2,895.0,3.0,0.216898,0.019497,0.009092,0.005394,2.0,0.0,1.0
3,32.0,7.0,0.027132,0.025938,0.000206,0.019794,6.0,0.0,0.0
4,811.0,3.0,0.924722,0.458274,0.033325,0.243302,1.0,0.0,0.0


In [4]:
X_train.drop(columns=["views"], inplace=True)

In [5]:
print("Shape of predictor training data: ", X_train.shape)
print("Shape of response training data: ", y_train.shape)

Shape of predictor training data:  (4849, 8)
Shape of response training data:  (4849, 1)


In [6]:
print("========== info() of features of training dataframe ================")
print(X_train.info())
print("=========== info() of response of training dataframe ===============")
print(y_train.info())
print("==========================")
print("Training features/predictors:")
print("==========================")
print(X_train.head())
print("==========================")
print("Training response:")
print("==========================")
print(y_train.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4849 entries, 0 to 4848
Data columns (total 8 columns):
channel_title      4849 non-null float64
category           4849 non-null float64
likes              4849 non-null float64
dislikes           4849 non-null float64
comment_total      4849 non-null float64
day                4849 non-null float64
month              4849 non-null float64
weekday_weekend    4849 non-null float64
dtypes: float64(8)
memory usage: 303.1 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4849 entries, 0 to 4848
Data columns (total 1 columns):
viewer_class    4849 non-null int64
dtypes: int64(1)
memory usage: 38.0 KB
None
Training features/predictors:
   channel_title  category     likes  dislikes  comment_total  day  month  \
0          478.0       3.0  0.007717  0.001975       0.013207  4.0    0.0   
1          760.0       8.0  0.014618  0.035958       0.028425  4.0    1.0   
2          895.0       3.0  0.019497  0.009092       0.005394  2.0    0.

In [7]:
X_train.head()

Unnamed: 0,channel_title,category,likes,dislikes,comment_total,day,month,weekday_weekend
0,478.0,3.0,0.007717,0.001975,0.013207,4.0,0.0,0.0
1,760.0,8.0,0.014618,0.035958,0.028425,4.0,1.0,0.0
2,895.0,3.0,0.019497,0.009092,0.005394,2.0,0.0,1.0
3,32.0,7.0,0.025938,0.000206,0.019794,6.0,0.0,0.0
4,811.0,3.0,0.458274,0.033325,0.243302,1.0,0.0,0.0


In [8]:
y_train.head()

Unnamed: 0,viewer_class
0,2
1,3
2,1
3,2
4,0


### random forest hyperparameters

  * n_estimator:  number of trees in the forest
  * max_features: maximum number of features considered for splitting a node
  * max_depth:    maximum number of levels in each decision trees
  * min_sample_leafs: min. number of data point allowed in a leaf node
  * min_sample_spilts: min. number of data points placed in a leaf before before the node is split
  * bootstrap: method of sampling data point (with or without replacement)

In [9]:
from pprint import pprint

### Randomized search cross-validation

In [10]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 700, stop = 1000, num = 2)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(700, 1000, num = 2)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [ 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [ 4, 8]
# Method of selecting samples for training each tree
bootstrap = [True]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True],
 'max_depth': [700, 1000, None],
 'max_features': ['auto', 'sqrt', 'log2'],
 'min_samples_leaf': [4, 8],
 'min_samples_split': [5, 10],
 'n_estimators': [700, 1000]}


In [11]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf_cls = RandomForestClassifier(random_state=42, oob_score=True)
# Random search of parameters, using 5 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf_cls, param_distributions = random_grid, n_iter = 10, cv = 5, random_state=0) #, verbose=2,  n_jobs = -1)

In [12]:
print(rf_cls)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=True, random_state=42, verbose=0, warm_start=False)


In [13]:
print(rf_random)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=True, random_state=42, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'n_estimators': [700, 1000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [700, 1000, None], 'min_samples_split': [5, 10], 'min_samples_leaf': [4, 8], 'bootstrap': [True]},
          pre_dispatch='2*n_jobs', random_state=0, refit=True,
          return_train_score='warn', scoring=None, verbose=0)


In [14]:
# y_train.values.ravel()

In [15]:
t_start = datetime.now()

# Fit the random search model
rf_random.fit(X_train, y_train.values.ravel())

t_end = datetime.now()
print("Execution time: {}".format(t_end  - t_start))

Execution time: 0:03:38.269873


In [16]:
# dir(rf_random.cv_results_)

In [17]:
rf_random.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_n_estimators', 'param_min_samples_split', 'param_min_samples_leaf', 'param_max_features', 'param_max_depth', 'param_bootstrap', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'split3_train_score', 'split4_train_score', 'mean_train_score', 'std_train_score'])

In [18]:
# rf_random.cv_results_['params']

### mean train score

In [19]:
rf_random.cv_results_['mean_train_score']



array([0.8893069 , 0.89075055, 0.9015264 , 0.8309441 , 0.82217893,
       0.89075055, 0.8223338 , 0.89075055, 0.8893069 , 0.83099544])

### mean test score

In [20]:
rf_random.cv_results_['mean_test_score']

array([0.71581769, 0.71602392, 0.71973603, 0.70756857, 0.69375129,
       0.71602392, 0.69292638, 0.71602392, 0.71581769, 0.7077748 ])

In [21]:
rf_random.best_params_

{'n_estimators': 700,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': None,
 'bootstrap': True}

### best test score

In [22]:
rf_random.best_score_

0.71973602804702

### save the best parameters

In [23]:
import joblib

In [24]:
joblib.dump(rf_random.best_estimator_, 'RF_3.pkl')

['RF_3.pkl']