In [1]:
# render matplotlib figures inline in the cell output
%matplotlib inline
# autoreload mode 2: re-import modules before each cell runs, so edits to local
# modules (e.g. `transform`) are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [54]:
# loading libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import time

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from transform import TargetTransform   # custom class for handling target value transformation

In [3]:
# resolve the data directory once, then derive every CSV path from it
data_dir = os.path.abspath('./_data')
train_data_fp, test_data_fp, data_fp = (
    os.path.join(data_dir, file_name)
    for file_name in ('train_clean.csv', 'test_clean.csv', 'data_clean.csv')
)
# list the directory contents as a quick check of what is available
os.listdir(data_dir)

['data.csv',
 'data.zip',
 'data_clean.csv',
 'data_no_missing.csv',
 'Saved-Models',
 'test.csv',
 'test_clean.csv',
 'train_clean.csv']

In [4]:
# load the training set from the train_clean.csv path built in the paths cell
train_set = pd.read_csv(train_data_fp)
# display a single row to inspect the column layout
train_set.head(1)

Unnamed: 0,ch_type,length,first_or_rerun,episodes_in_season,is_movie,overlaped_with_game,temperature,Is_month_end,Is_month_start,Is_quarter_end,...,Month_12,Dayofweek_0,Dayofweek_1,Dayofweek_2,Dayofweek_3,Dayofweek_4,Dayofweek_5,Dayofweek_6,market_share,market_share_transformed
0,0,0.906181,0,1,0,0,1.094244,0,0,0,...,0,1,0,0,0,0,0,0,0.9,-0.001165


In [5]:
# cut the dependent variable(s) from the training set
target_columns = ['market_share_transformed', 'market_share']
target = train_set.loc[:, 'market_share_transformed']   # transformed target
target_src = train_set.loc[:, 'market_share']           # raw (untransformed) target
targets_joint = train_set.loc[:, target_columns]        # both targets, so a single split covers both

# drop() returns a new frame; rebinding the name avoids inplace=True mutation.
# From here on `train_set` holds the feature columns only.
train_set = train_set.drop(columns=target_columns)

In [6]:
# shared settings reused by later cells:
#   cv - number of folds for cross-validation
#   rs - random-state number passed to the split and the estimators
cv, rs = 5, 7

In [7]:
# hold out 20% of the training data for validation; the features and the joint
# (transformed, raw) targets are split together so their rows stay aligned.
# Note: despite their names, X_test / y_test are this validation hold-out.
X_train, X_test, y_train, y_test = train_test_split(
    train_set,
    targets_joint,
    test_size=0.2,
    random_state=rs,
)

In [20]:
# transformer object that will be used later for inverse transforming predicted values.
# It is constructed from the full raw `market_share` column (target_src), which was
# taken from the training frame before the train/validation split.
# NOTE(review): TargetTransform is a project-local class; presumably it derives its
# parameters from `data` - confirm that including the validation rows is intended.
tfms = TargetTransform(data=target_src)

In [22]:
# preview the joint target frame: column 0 is the transformed target, column 1 the raw one
y_train.head()

Unnamed: 0,market_share_transformed,market_share
117942,-0.007722,0.5
105927,0.021902,7.1
219211,0.00775,2.0
526282,0.001075,1.1
345705,-0.017927,0.2


In [26]:
def evaluate(actual, prediction):
    """Score predictions against the true values.

    Parameters
    ----------
    actual : array-like
        Ground-truth target values.
    prediction : array-like
        Predicted values, on the same scale as `actual`.

    Returns
    -------
    dict
        Keys 'MAE', 'MSE' and 'R^2' mapped to the corresponding
        scikit-learn metric computed on (actual, prediction).
    """
    # (label, metric function) pairs; dict order follows this tuple
    metrics = (
        ('MAE', mean_absolute_error),
        ('MSE', mean_squared_error),
        ('R^2', r2_score),
    )
    return {label: metric(actual, prediction) for label, metric in metrics}

## 1. RandomForestRegressor model

#### Grid search results for RandomForestRegressor:
* Best `max_depth`: 30
* Best `n_estimators`: 100

In [21]:
# build the RandomForestRegressor with the hyper-parameters found by the grid search
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=30,
    n_jobs=-1,
    random_state=rs,
)

In [24]:
%%time
# training the RandomForest model with transformed y values
print(time.ctime())
rf.fit(X_train, y_train.iloc[:, 0].values)

Fri Feb 21 22:59:47 2020
Wall time: 7min 31s


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=30, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_state=7, verbose=0, warm_start=False)

In [39]:
# ten largest (importance, feature name) pairs of the fitted forest, highest first
sorted(
    zip(rf.feature_importances_, X_train.columns),
    reverse=True,
)[:10]

[(0.21202570849572155, 'station_Unis TV+'),
 (0.16655498456942996, 'temperature'),
 (0.10847851680941811, 'Elapsed'),
 (0.08593567817972778, 'ch_type'),
 (0.051311459250471834, 'running_time'),
 (0.0335779138944239, 'station_RDS2+'),
 (0.01866659001326775, 'station_MusiquePlus+ (retired Aug 25, 2019)'),
 (0.018441731835395755, 'station_Evasion+'),
 (0.01718808315744735, 'genre_Amateur Sports'),
 (0.01275779177053862, 'length')]

In [41]:
# make predictions on the held-out split; the forest was fit on the transformed
# target, so these predictions are on the transformed scale
rf_preds = rf.predict(X_test)

In [42]:
# inverse transforming the predicted values
# NOTE(review): tfms.inverse is a project-local method; presumably it maps values
# back to the raw market_share scale (the result is scored against raw values next)
rf_preds_inverse_transformed = tfms.inverse(rf_preds)
# sanity check on the shape of the result
rf_preds_inverse_transformed.shape

(123332,)

In [44]:
# score the inverse-transformed RandomForest predictions against the raw market share
evaluate(
    actual=y_test['market_share'].values,
    prediction=rf_preds_inverse_transformed,
)

{'MAE': 1.2265391244681203,
 'MSE': 5.067843954897217,
 'R^2': 0.7908991497007714}

In [48]:
# second experiment: a fresh, unfitted forest with the same hyper-parameters,
# replacing the previous `rf`; it is fit on the raw target in the next cell
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=30,
    n_jobs=-1,
    random_state=rs,
)

In [50]:
%%time
# training the RandomForest model with transformed y values

print(time.ctime())
rf.fit(X_train, y_train.iloc[:, 1].values)

Fri Feb 21 23:43:57 2020
Wall time: 7min 17s


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=30, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_state=7, verbose=0, warm_start=False)

In [51]:
# ten largest (importance, feature name) pairs of the raw-target forest, highest first
sorted(
    zip(rf.feature_importances_, X_train.columns),
    reverse=True,
)[:10]

[(0.4412274061965948, 'station_TVA Total'),
 (0.08327789022301008, 'ch_type'),
 (0.08175299476662161, 'running_time'),
 (0.07666412804037741, 'temperature'),
 (0.0515017298269713, 'Elapsed'),
 (0.02831438551727177, 'station_SRC Total'),
 (0.025352121645227522,
  'genre_Infomercials, Promotional and Corporate Videos'),
 (0.014262759823071278, 'length'),
 (0.013762769174650526, 'station_RDI+'),
 (0.013671670994428588, 'station_LCN+')]

In [52]:
# make predictions on the held-out split; this forest was fit on raw market_share,
# so the predictions are scored directly, without an inverse transform
rf_preds = rf.predict(X_test)

In [53]:
# score the raw-target RandomForest predictions against the raw market share
evaluate(
    actual=y_test['market_share'].values,
    prediction=rf_preds,
)

{'MAE': 1.2114750468805762,
 'MSE': 4.458395933940401,
 'R^2': 0.8160451685066793}

In [55]:
# fresh, unfitted forest with the same hyper-parameters, used as the cross-validation estimator
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=30,
    n_jobs=-1,
    random_state=rs,
)

In [None]:
# cv-fold cross-validation of the forest on the full feature frame against the raw
# market share; scikit-learn reports this scorer as negated MAE, so values closer
# to zero are better
cross_val_score(
    estimator=rf,
    X=train_set.values,
    y=target_src.values,
    scoring='neg_mean_absolute_error',
    cv=cv,
)

## 2. XGBRegressor model

#### Grid search results for XGBRegressor:
* Best `max_depth`: 5
* Best `n_estimators`: 100
* Best `learning_rate`: 0.1

In [34]:
# build the XGBRegressor with the hyper-parameters found by the grid search
xgr = XGBRegressor(
    max_depth=5,
    n_estimators=100,
    learning_rate=0.1,
    n_jobs=-1,
    random_state=rs,
)

In [35]:
%%time
# training the XGBoost model
print(time.ctime())
xgr.fit(X_train.values, y_train.iloc[:, 0].values)

Fri Feb 21 23:23:15 2020
Wall time: 1min 27s


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=-1, nthread=None, objective='reg:linear', random_state=7,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [40]:
# ten largest (importance, feature name) pairs of the fitted XGBoost model, highest first
sorted(
    zip(xgr.feature_importances_, X_train.columns),
    reverse=True,
)[:10]

[(0.29421237, 'station_Unis TV+'),
 (0.122598045, 'ch_type'),
 (0.050584905, 'station_RDS2+'),
 (0.04122238, 'station_Series++'),
 (0.039162338, 'station_MusiquePlus+ (retired Aug 25, 2019)'),
 (0.037397247, 'genre_Amateur Sports'),
 (0.037092704, 'station_RDI+'),
 (0.03607034, 'station_Evasion+'),
 (0.03411925, 'station_LCN+'),
 (0.025626002, 'station_TQ Total')]

In [36]:
# make predictions on the held-out split, passed as a NumPy array to match how the
# model was fit; the model was fit on the transformed target, so these predictions
# are on the transformed scale
xg_preds = xgr.predict(X_test.values)

In [37]:
# inverse transforming the predicted values
# NOTE(review): tfms.inverse is a project-local method; presumably it maps values
# back to the raw market_share scale (the result is scored against raw values next)
xg_preds_inverse_transformed = tfms.inverse(xg_preds)
# sanity check on the shape of the result
xg_preds_inverse_transformed.shape

(123332,)

In [38]:
# score the inverse-transformed XGBoost predictions against the raw market share
evaluate(
    actual=y_test['market_share'].values,
    prediction=xg_preds_inverse_transformed,
)

{'MAE': 1.5502820965879538,
 'MSE': 8.628997746145776,
 'R^2': 0.6439648138325909}