In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import os
import scipy
from sklearn.decomposition import PCA
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import MinMaxScaler, LabelBinarizer, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import *
import hyperopt
from hyperopt import *
from hyperopt import fmin, tpe, hp, space_eval
import matplotlib.pyplot as plt
%matplotlib inline 


<div class="alert alert-block alert-info">
<b>Loading the data:</b> We load the data from the mentioned path
</div>

In [2]:
path_of_input_file = r'D:\kaggle_trials\Facebook_metrics\dataset_Facebook.csv'
df                 = pd.read_csv(path_of_input_file,sep=';')
df.head(2)

Unnamed: 0,Page total likes,Type,Category,Post Month,Post Weekday,Post Hour,Paid,Lifetime Post Total Reach,Lifetime Post Total Impressions,Lifetime Engaged Users,Lifetime Post Consumers,Lifetime Post Consumptions,Lifetime Post Impressions by people who have liked your Page,Lifetime Post reach by people who like your Page,Lifetime People who have liked your Page and engaged with your post,comment,like,share,Total Interactions
0,139441,Photo,2,12,4,3,0.0,2752,5091,178,109,159,3078,1640,119,4,79.0,17.0,100
1,139441,Status,2,12,3,10,0.0,10460,19057,1457,1361,1674,11710,6112,1108,5,130.0,29.0,164


<div class="alert alert-block alert-info">
<b>Preprocessing data :</b> We separate out the numerical and categorical columns from the data to be used for scaling and encoding respectively 
</div>

In [3]:
cols_needed           = list(df.columns)
cols_needed           = cols_needed[:len(cols_needed)-1]

possible_numeric_cols = list(df._get_numeric_data().columns)
possible_numeric_cols.remove('Lifetime Post Total Reach')

categorical_columns   = list(set(cols_needed)- set(possible_numeric_cols))

numerical_columns     = []
for i in range(len(possible_numeric_cols)):
    col_name  = possible_numeric_cols[i]
    if len(df[col_name].unique())<10:
        categorical_columns.append(col_name)
    else:
        numerical_columns.append(col_name)

<div class="alert alert-block alert-info">
<b>Missing value Treatment:</b> We impute the numerical missing values with their respective means and the categorical values with their modes.
</div>

In [4]:
for i in range(len(categorical_columns)):
    df[categorical_columns[i]] = df[categorical_columns[i]].fillna(df[categorical_columns[i]].mode()[0])
mean_impute_dict    ={}
for i in range(len(numerical_columns)):
    mean_impute_dict[numerical_columns[i]] = np.nanmean(np.float_(df[numerical_columns[i]].values))
for i in range(len(numerical_columns)):
    df[numerical_columns[i]]   = df[numerical_columns[i]].fillna(mean_impute_dict[numerical_columns[i]])

<div class="alert alert-block alert-info">
<b>Scaling and Encoding:</b> We scale and one hot encode the data to get the matrix we need for calculations
</div>

In [5]:
ohe                  = OneHotEncoder()
scalar               = MinMaxScaler()
encoded_matrix       = ohe.fit_transform(df[categorical_columns])
scaled_matrix        = scalar.fit_transform(df[numerical_columns])
X_complete_matrix    = scipy.sparse.hstack((encoded_matrix,scaled_matrix)).A
Y                    = scalar.fit_transform(df[['Lifetime Post Total Reach']])


<div class="alert alert-block alert-info">
<b>Train Test Split :</b> We split the data to train and test set 
</div>

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_complete_matrix, Y, test_size=0.2, random_state=42)

<div class="alert alert-block alert-info">
<b>Parameter Tuning and setting Grid for parameters:</b> We set up the grid for parameter tuning and then tune the parameters to get the optimal list of parameters to use
</div>

In [7]:
elastic_net_grid  = {'alpha' : hp.uniform('alpha',0.01,5),
                      'l1_ratio': hp.uniform('l1_ratio',0.0,1.0),
                     'precompute':hp.choice('precompute',[True,False])
                     }

In [8]:
def hyperopt_train_test(params):
    reg = ElasticNet(**params,random_state=19)
    return cross_val_score(reg, X_train, y_train).mean()

def function_to_minimise(params):
    accuracy = hyperopt_train_test(params)
    return {'loss': -1*accuracy, 'status': STATUS_OK}


trials          = Trials()
best            = fmin(function_to_minimise, elastic_net_grid, algo=tpe.suggest, max_evals=30, trials=trials)
best_parameters = space_eval(elastic_net_grid, best)
print('The best parameter tuned on training set is given by :- ',best_parameters)

100%|████████████████████████████████████████████████| 30/30 [00:00<00:00, 60.65it/s, best loss: 0.0051953403282187045]
The best parameter tuned on training set is given by :-  {'alpha': 4.657603318145669, 'l1_ratio': 0.26295323521117164, 'precompute': False}


<div class="alert alert-block alert-info">
<b>Implementing the model:</b> We now implement the model with tuned parameters and get the R^2 score
</div>

In [9]:
model = ElasticNet(**best_parameters,random_state=19)
model.fit(X_train, y_train)

ElasticNet(alpha=4.657603318145669, copy_X=True, fit_intercept=True,
           l1_ratio=0.26295323521117164, max_iter=1000, normalize=False,
           positive=False, precompute=False, random_state=19,
           selection='cyclic', tol=0.0001, warm_start=False)

In [10]:
y_pred = model.predict(X_test)
print('The coefficient of determination is:- ',r2_score(y_pred,y_test))

The coefficient of determination is:-  -5.535177618157918e+31


<div class="alert alert-block alert-info">
<b>Conclusion:</b> We get a pretty bad R^2 score and we will see how Random Forest Regressor performs in this scenario
</div>

<div class="alert alert-block alert-info">
<b>Setting up a parameter gird for tuning :</b> We set up the parameter grid and tune the parameters 
</div>

In [11]:
random_forest_grid = {'n_estimators' : hp.choice('n_estimators',range(2,50)),
                      'min_samples_split' : hp.uniform('min_samples_split',0.01,0.95),
                      'min_samples_leaf'  : hp.choice('min_samples_leaf',range(1,10)),
                      'max_features'      : hp.choice('max_features',['auto','sqrt','log2',None])
                     }

In [12]:
def hyperopt_train_test(params):
    reg = RandomForestRegressor(**params,random_state=11)
    return cross_val_score(reg, X_train, y_train).mean()

def function_to_minimise(params):
    accuracy = hyperopt_train_test(params)
    return {'loss': -1*accuracy, 'status': STATUS_OK}


trials          = Trials()
best            = fmin(function_to_minimise, random_forest_grid, algo=tpe.suggest, max_evals=30, trials=trials)
best_parameters = space_eval(random_forest_grid, best)
print('The best parameter tuned on training set is given by :- ',best_parameters)

100%|████████████████████████████████████████████████████| 30/30 [00:03<00:00,  5.27it/s, best loss: -0.73182358588332]
The best parameter tuned on training set is given by :-  {'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 0.05805969714810617, 'n_estimators': 35}


In [13]:
rf_regressor = RandomForestRegressor(**best_parameters,random_state=11)
rf_regressor.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=0.05805969714810617,
                      min_weight_fraction_leaf=0.0, n_estimators=35,
                      n_jobs=None, oob_score=False, random_state=11, verbose=0,
                      warm_start=False)

In [14]:
y_pred = rf_regressor.predict(X_test)
print('The coefficient of determination is:- ',r2_score(y_pred,y_test))

The coefficient of determination is:-  0.794042960476264


<div class="alert alert-block alert-info">
<b>Conclusion:</b> We get a pretty nice R2 score as compared to Elastic Net and we can consider the model. But we will also like to see if we can implement Extra Trees classifier over here or not
</div>

<div class="alert alert-block alert-info">
<b>Parameter Grid for Extra Trees:</b> We set up the parameter grid for Extra trees classifier and then tune it to get the results needed
</div>

In [15]:
extra_trees_grid  = {'n_estimators' : hp.choice('n_estimators',range(5,50)),
                      'min_samples_split' : hp.uniform('min_samples_split',0.01,0.95),
                      'min_samples_leaf'  : hp.choice('min_samples_leaf',range(1,10)),
                      'max_features'      : hp.choice('max_features',['auto','sqrt','log2',None])
                     }

In [16]:
def hyperopt_train_test(params):
    reg = ExtraTreesRegressor(**params,random_state=11)
    return cross_val_score(reg, X_train, y_train).mean()

def function_to_minimise(params):
    accuracy = hyperopt_train_test(params)
    return {'loss': -1*accuracy, 'status': STATUS_OK}


trials          = Trials()
best            = fmin(function_to_minimise, extra_trees_grid, algo=tpe.suggest, max_evals=30, trials=trials)
best_parameters = space_eval(extra_trees_grid, best)
print('The best parameter tuned on training set is given by :- ',best_parameters)

100%|██████████████████████████████████████████████████| 30/30 [00:03<00:00,  8.84it/s, best loss: -0.7032988429034432]
The best parameter tuned on training set is given by :-  {'max_features': None, 'min_samples_leaf': 5, 'min_samples_split': 0.025913658432231645, 'n_estimators': 6}


In [17]:
et_regressor = ExtraTreesRegressor(**best_parameters,random_state=11)
et_regressor.fit(X_train,y_train)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
                    max_features=None, max_leaf_nodes=None,
                    min_impurity_decrease=0.0, min_impurity_split=None,
                    min_samples_leaf=5, min_samples_split=0.025913658432231645,
                    min_weight_fraction_leaf=0.0, n_estimators=6, n_jobs=None,
                    oob_score=False, random_state=11, verbose=0,
                    warm_start=False)

In [18]:
y_pred = et_regressor.predict(X_test)
print('The coefficient of determination is:- ',r2_score(y_pred,y_test))

The coefficient of determination is:-  0.7575208346925255


<div class="alert alert-block alert-info">
<b>Conclusion:</b> The Extra trees classifier gave pretty better results as compared to Elastic Net.  The results were even comparable with Random Forest Classifier as well.
</div>