In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
## loading libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import time
# import zipfile 

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from transform import TargetTransform   # custom class for handling target value transformation

In [None]:
# with zipfile.ZipFile('data.zip', 'r') as z:
#     z.extractall()

In [3]:
# setting paths
data_dir = os.path.abspath('./_data')
train_data_fp = os.path.join(data_dir, 'train_clean_all_scaled.csv')
test_data_fp = os.path.join(data_dir, 'test_clean_all_scaled.csv')
data_fp = os.path.join(data_dir, 'data_clean_all_scaled.csv')
os.listdir(data_dir)

['data.csv',
 'data.zip',
 'data_clean.csv',
 'data_clean_all_scaled.csv',
 'data_no_missing.csv',
 'Saved-Models',
 'test.csv',
 'test_clean.csv',
 'test_clean_all_scaled.csv',
 'train_clean.csv',
 'train_clean_all_scaled.csv']

In [4]:
# load the training set
train_set = pd.read_csv(train_data_fp)
train_set.head(1)

Unnamed: 0,ch_type,length,first_or_rerun,episodes_in_season,is_movie,overlaped_with_game,temperature,Is_month_end,Is_month_start,Is_quarter_end,...,Month_12,Dayofweek_0,Dayofweek_1,Dayofweek_2,Dayofweek_3,Dayofweek_4,Dayofweek_5,Dayofweek_6,market_share,market_share_transformed
0,-2.552048,0.906181,-0.16533,0.13963,-0.10246,-0.274233,1.094244,-0.184477,-0.184101,-0.103987,...,-0.299952,2.452482,-0.409959,-0.413056,-0.410212,-0.410479,-0.404872,-0.40136,0.9,-0.001165


In [5]:
# cut the dependant variable(s) form training set
target = train_set.loc[:, 'market_share_transformed']
target_src = train_set.loc[:, 'market_share']
targets_joint = train_set.loc[:, ['market_share_transformed', 'market_share']]

train_set.drop(columns=['market_share', 'market_share_transformed'], inplace=True)

In [6]:
# constant variables for later use
cv = 4    # number of folds for cross-validation
rs = 7    # random-state number

In [7]:
# splitting the train set into train and validation sets
X_train, X_test, y_train, y_test = train_test_split(train_set, targets_joint, test_size=0.2, random_state=rs)

In [8]:
# transformer object that will be used later for inverse transforming predicted values.
tfms = TargetTransform(data=target_src)

In [9]:
y_train.head()

Unnamed: 0,market_share_transformed,market_share
117942,-0.007722,0.5
105927,0.021902,7.1
219211,0.00775,2.0
526282,0.001075,1.1
345705,-0.017927,0.2


In [10]:
# helper function for evaluating model performance with 3 different metrics
def evaluate(actual, prediction):
    return {'MAE': mean_absolute_error(actual, prediction),
            'MSE': mean_squared_error(actual, prediction),
            'R^2': r2_score(actual, prediction)}

## 1. RandomForestRegressor model

#### Grid search results for RandomForestRegressor:
* Best `max_depth`: 30
* Best `n_estimators`: 100

### RandomForestRegressor with transformed DV

In [26]:
# instantiate a RandomForestRegressor object using the researched hyper-parameters
rf = RandomForestRegressor(n_estimators=100, max_depth=30, n_jobs=-1, random_state=rs)

In [27]:
%%time
# training the RandomForest model with transformed y values
print(time.ctime())
rf.fit(X_train, y_train.iloc[:, 0].values)

Sat Feb 22 09:17:56 2020
Wall time: 7min 37s


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=30, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_state=7, verbose=0, warm_start=False)

In [31]:
# top 10 most important features for the RandomForest model
sorted(zip(rf.feature_importances_, X_train.columns), reverse=True)[:10]

[(0.21202570849572155, 'station_Unis TV+'),
 (0.16655498456942996, 'temperature'),
 (0.10847851680941811, 'Elapsed'),
 (0.08593567817972778, 'ch_type'),
 (0.051311459250471834, 'running_time'),
 (0.0335779138944239, 'station_RDS2+'),
 (0.01866659001326775, 'station_MusiquePlus+ (retired Aug 25, 2019)'),
 (0.018441731835395755, 'station_Evasion+'),
 (0.01718808315744735, 'genre_Amateur Sports'),
 (0.01275779177053862, 'length')]

In [32]:
# make predictions
rf_preds = rf.predict(X_test)

In [33]:
# inverse transforming the predicted values
rf_preds_inverse_transformed = tfms.inverse(rf_preds)
rf_preds_inverse_transformed.shape

(123332,)

In [34]:
# evaluating the RandomForest model
evaluate(y_test.iloc[:, 1].values, rf_preds_inverse_transformed)

{'MAE': 1.2265391244681203,
 'MSE': 5.067843954897217,
 'R^2': 0.7908991497007714}

### RandomForestRegressor with non-transformed DV

In [11]:
# instantiate a RandomForestRegressor object using the researched hyper-parameters
rf = RandomForestRegressor(n_estimators=100, max_depth=30, n_jobs=-1, random_state=rs)

In [12]:
%%time
# training the RandomForest model with non-transformed y values

print(time.ctime())
rf.fit(X_train, y_train.iloc[:, 1].values)

Sat Feb 22 09:02:55 2020
Wall time: 6min 56s


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=30, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_state=7, verbose=0, warm_start=False)

In [13]:
# top 10 most important features for the RandomForest model
sorted(zip(rf.feature_importances_, X_train.columns), reverse=True)[:10]

[(0.4412274061965948, 'station_TVA Total'),
 (0.08327789022301008, 'ch_type'),
 (0.08175299476662161, 'running_time'),
 (0.07666412804037741, 'temperature'),
 (0.0515017298269713, 'Elapsed'),
 (0.02831438551727177, 'station_SRC Total'),
 (0.025352121645227522,
  'genre_Infomercials, Promotional and Corporate Videos'),
 (0.014262759823071278, 'length'),
 (0.013762769174650526, 'station_RDI+'),
 (0.013671670994428588, 'station_LCN+')]

In [14]:
# make predictions
rf_preds = rf.predict(X_test)

In [15]:
# evaluating the RandomForest model
evaluate(y_test.iloc[:, 1].values, rf_preds)

{'MAE': 1.2114750468805762,
 'MSE': 4.458395933940401,
 'R^2': 0.8160451685066793}

### Hyper=Parameter tunning:

#### - best one: `n_estimators`=150, `max_depth`=None, `min_samples_leaf`=2, max_features=0.5

In [81]:
# instantiate a RandomForestRegressor object using the researched hyper-parameters
rf = RandomForestRegressor(n_estimators=150, max_depth=None, min_samples_leaf=2, max_features=0.5 ,n_jobs=-1, random_state=rs)

In [82]:
%%time
# training the RandomForest model with non-transformed y values

print(time.ctime())
rf.fit(X_train, y_train.iloc[:, 1].values)

Sat Feb 22 10:45:19 2020
Wall time: 12min 8s


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features=0.5, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=2,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=300, n_jobs=-1, oob_score=False,
                      random_state=7, verbose=0, warm_start=False)

In [83]:
# top 10 most important features for the RandomForest model
sorted(zip(rf.feature_importances_, X_train.columns), reverse=True)[:10]

[(0.35256718922732455, 'station_TVA Total'),
 (0.14255550585430055, 'ch_type'),
 (0.0703688716934912, 'running_time'),
 (0.06328686011778054, 'temperature'),
 (0.04554889681959827, 'Elapsed'),
 (0.04192298964043359, 'station_SRC Total'),
 (0.03653152432134657, 'length'),
 (0.024960604448960724,
  'genre_Infomercials, Promotional and Corporate Videos'),
 (0.020969200944876358, 'station_V Total'),
 (0.01330968582139777, 'station_TQ Total')]

In [84]:
# make predictions
rf_preds = rf.predict(X_test)

In [39]:
# # evaluating the RandomForest model
# # min_samples_leaf=2
# evaluate(y_test.iloc[:, 1].values, rf_preds)

{'MAE': 1.2115713322595871,
 'MSE': 4.393888042785177,
 'R^2': 0.8187067845729223}

In [44]:
# # evaluating the RandomForest model
# # min_samples_leaf=3
# evaluate(y_test.iloc[:, 1].values, rf_preds)

{'MAE': 1.2166649954170135,
 'MSE': 4.400843838292128,
 'R^2': 0.8184197862422802}

In [49]:
# # evaluating the RandomForest model
# # min_samples_leaf=5
# evaluate(y_test.iloc[:, 1].values, rf_preds)

{'MAE': 1.2285477446347959, 'MSE': 4.471535361067341, 'R^2': 0.815503031572484}

In [54]:
# # evaluating the RandomForest model
# # min_samples_leaf=2, max_features='sqrt'
# evaluate(y_test.iloc[:, 1].values, rf_preds)

{'MAE': 1.2985397716893128, 'MSE': 4.851387204051864, 'R^2': 0.799830223951989}

In [59]:
# # evaluating the RandomForest model
# # min_samples_leaf=2, max_features=0.5
# evaluate(y_test.iloc[:, 1].values, rf_preds)

{'MAE': 1.216466919559925, 'MSE': 4.392587281836075, 'R^2': 0.8187604543825917}

In [69]:
# # evaluating the RandomForest model
# # min_samples_leaf=2, max_features='log2', max_depth=None
# evaluate(y_test.iloc[:, 1].values, rf_preds)

{'MAE': 1.2927334599485232,
 'MSE': 4.919374969034451,
 'R^2': 0.7970250271869889}

In [74]:
# # evaluating the RandomForest model
# # min_samples_leaf=5, max_features=0.5, max_depth=None
# evaluate(y_test.iloc[:, 1].values, rf_preds)

{'MAE': 1.2290669738109814, 'MSE': 4.4705419773212, 'R^2': 0.8155440189011864}

In [64]:
# # evaluating the RandomForest model
# # min_samples_leaf=2, max_features=0.5, max_depth=None
# evaluate(y_test.iloc[:, 1].values, rf_preds)

{'MAE': 1.206812669313148, 'MSE': 4.377800509231889, 'R^2': 0.8193705613141062}

##### best one is:

In [79]:
# evaluating the RandomForest model
# min_samples_leaf=2, max_features=0.5, max_depth=None, n_estimators=150
evaluate(y_test.iloc[:, 1].values, rf_preds)

{'MAE': 1.2055198269905107,
 'MSE': 4.367772026317077,
 'R^2': 0.8197843397025809}

### RandomForestRegressor Cross-Validation

In [None]:
rf = RandomForestRegressor(n_estimators=100, max_depth=30, random_state=rs)

In [16]:
%%time

NMAEs = cross_val_score(estimator=rf, X=train_set.values, y=target_src.values, cv=cv, scoring='neg_mean_absolute_error')

CPU times: user 58min 24s, sys: 3.97 s, total: 58min 28s
Wall time: 58min 31s


In [33]:
print(f'Mean MEA for {cv} folds cross-validation: {NMAEs.mean() * -1}')

Mean MEA for 4 folds cross-validation: 1.6073213199539949


## 2. XGBRegressor model

#### Grid search results for XGBRegressor:
* Best  `max_depth`: 5
* Best  `n_estimators`: 100
* Best  `learning_rate`: 0.1

### XGBRegressor with transformed DV

In [20]:
# instantiate a XGBRegressor object using the researched hyper-parameters
xgr = XGBRegressor(max_depth=5, n_estimators=100, learning_rate=0.1, n_jobs=-1, random_state=rs)

In [21]:
%%time
# training the XGBoost model with transformed y values
print(time.ctime())
xgr.fit(X_train.values, y_train.iloc[:, 0].values)

Sat Feb 22 09:14:53 2020
Wall time: 1min 37s


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=-1, nthread=None, objective='reg:linear', random_state=7,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [22]:
# top 10 most important features for the XGBoost model
sorted(zip(xgr.feature_importances_, X_train.columns), reverse=True)[:10]

[(0.29421237, 'station_Unis TV+'),
 (0.122598045, 'ch_type'),
 (0.050584905, 'station_RDS2+'),
 (0.04122238, 'station_Series++'),
 (0.039162338, 'station_MusiquePlus+ (retired Aug 25, 2019)'),
 (0.037397247, 'genre_Amateur Sports'),
 (0.037092704, 'station_RDI+'),
 (0.03607034, 'station_Evasion+'),
 (0.03411925, 'station_LCN+'),
 (0.025626002, 'station_TQ Total')]

In [23]:
# make predictions
xg_preds = xgr.predict(X_test.values)

In [24]:
# inverse transforming the predicted values
xg_preds_inverse_transformed = tfms.inverse(xg_preds)
xg_preds_inverse_transformed.shape

(123332,)

In [25]:
# evaluating the XGBoost model
evaluate(y_test.iloc[:, 1].values, xg_preds_inverse_transformed)

{'MAE': 1.5502820965879538,
 'MSE': 8.628997746145776,
 'R^2': 0.6439648138325909}

### XGBRegressor with non-transformed DV

In [16]:
# instantiate a XGBRegressor object using the researched hyper-parameters
xgr = XGBRegressor(max_depth=5, n_estimators=100, learning_rate=0.1, n_jobs=-1, random_state=rs)

In [17]:
%%time
# training the XGBoost model with transformed y values
print(time.ctime())
xgr.fit(X_train.values, y_train.iloc[:, 1].values)

Sat Feb 22 09:11:47 2020
Wall time: 1min 26s


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=-1, nthread=None, objective='reg:linear', random_state=7,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [18]:
# make predictions
xg_preds = xgr.predict(X_test.values)

In [19]:
# evaluating the XGBoost model
evaluate(y_test.iloc[:, 1].values, xg_preds)

{'MAE': 1.4833311409991994,
 'MSE': 6.210317784723229,
 'R^2': 0.7437603168189131}

## Result:

* Both RandomForestRegressor and XGBRegressor made better predictions when trained with non-transformed (original) target value.
* Results didn't changed when using *_all_scaled* version of data