In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Enable the autoreload extension in mode 2 so edits to imported local modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# loading libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import zipfile
import time

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from transform import TargetTransform   # custom class for handle target value transformation

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Unpack every member of the bundled archive into the current working directory.
with zipfile.ZipFile('data.zip', mode='r') as archive:
    archive.extractall()

In [4]:
# Resolve the working directory once and derive every CSV location from it.
data_dir = os.path.abspath('./')
train_data_fp, test_data_fp, data_fp = (
    os.path.join(data_dir, csv_name)
    for csv_name in ('train_clean.csv', 'test_clean.csv', 'data_clean.csv')
)
# Show the directory contents as the cell output.
os.listdir(data_dir)

['.config',
 'transform.py',
 'test_clean.csv',
 'data_clean.csv',
 'train_clean.csv',
 '__pycache__',
 'data.zip',
 'sample_data']

In [5]:
# Read the cleaned training CSV into a DataFrame and preview its first row.
train_set = pd.read_csv(filepath_or_buffer=train_data_fp)
train_set.head(n=1)

Unnamed: 0,ch_type,length,first_or_rerun,episodes_in_season,is_movie,overlaped_with_game,temperature,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed,running_time,episode_name_na,temperature_na,station_ARTV+,station_Canal D+,station_Canal Vie+,station_Evasion+,station_Historia+,station_LCN+,station_MAX+,station_Meteomedia+,"station_MusiquePlus+ (retired Aug 25, 2019)",station_PYC Online Stream Fr CBC-8+,station_PYC Online Stream Fr CBC-9+,station_RDI+,station_RDS+,station_RDS2+,station_SRC Total,station_Series++,station_TQ Total,station_TV5+,station_TVA Total,station_Teletoon Fr.+,station_Unis TV+,station_V Total,station_VRAK+,...,genre_Music Video Programs,genre_Music and Dance other than Music Video Programs or Clips,genre_News,genre_Ongoing Comedy Series (Sitcoms),genre_Ongoing Dramatic Series,genre_Other Drama,genre_Professional Sports,"genre_Programs of Comedy Sketches, Improvision, Unscripted Works",genre_Reality Programs,genre_Religion,genre_Reporting and Actualities,"genre_Specials, Mini-Series or Made-for-TV Feature Films",genre_Theatre Feature Films Aired on TV,genre_Unknown,genre_Variety,Year_2016,Year_2017,Year_2018,Year_2019,Month_1,Month_2,Month_3,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9,Month_10,Month_11,Month_12,Dayofweek_0,Dayofweek_1,Dayofweek_2,Dayofweek_3,Dayofweek_4,Dayofweek_5,Dayofweek_6,market_share,market_share_transformed
0,0,0.906181,0,1,0,0,1.094244,0,0,0,0,0,0,-1.748154,0.886248,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0.9,-0.001165


In [None]:
# Separate the dependent variable(s) from the training set.
# `targets_joint` holds both target columns together; `target` (the transformed
# market share) is what the grid searches in this notebook are fitted against,
# and `target_src` is the `market_share` column.
targets_joint = train_set[['market_share_transformed', 'market_share']]
target = train_set['market_share_transformed']
target_src = train_set['market_share']

# Strip both target columns from the frame so only the remaining columns are left.
# NOTE: the drop is in place, so re-running this cell on the same frame fails
# because the target columns are already gone.
train_set.drop(columns=targets_joint.columns.tolist(), inplace=True)

In [None]:
# constant variables for later use
cv = 3    # number of folds for cross-validation
rs = 7    # random-state number

## Finding best hyper-parameters using GridSearchCV

### 1. RandomForest model

In [None]:
# Base estimator for the grid search. It is seeded with the notebook-wide `rs`
# constant so that the forest's internal randomness (and therefore the
# cross-validation scores) is repeatable from run to run.
rf = RandomForestRegressor(random_state=rs)

# Candidate hyper-parameter values; the grid search tries every combination (2 x 2 = 4).
rf_params = {
    'n_estimators': [15, 100],   # number of trees in the forest
    'max_depth': [30, None],     # None = no depth limit
}

In [17]:
%%time
print(time.ctime())

rf_gs = GridSearchCV(estimator=rf, param_grid=rf_params, scoring='r2', cv=cv)
rf_gs_fit = rf_gs.fit(train_set, target)

Fri Feb 21 14:08:13 2020
CPU times: user 1h 7min 30s, sys: 6.65 s, total: 1h 7min 36s
Wall time: 1h 7min 55s


In [None]:
# Tabulate the per-candidate cross-validation results of the RandomForest search.
gs_df = pd.DataFrame(data=rf_gs_fit.cv_results_)

In [20]:
# Order the candidates by mean cross-validated score, highest first.
gs_df.sort_values('mean_test_score', ascending=False)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
1,451.115219,5.938734,4.599687,0.558624,30.0,100,"{'max_depth': 30, 'n_estimators': 100}",0.201089,0.377931,0.299883,0.292968,0.072361,1
3,507.30746,6.155405,5.227495,0.607408,,100,"{'max_depth': None, 'n_estimators': 100}",0.176958,0.363026,0.284496,0.274827,0.076269,2
2,75.421473,1.136062,0.87538,0.113301,,15,"{'max_depth': None, 'n_estimators': 15}",0.119271,0.336143,0.260641,0.238685,0.089889,3
0,69.247369,0.52277,0.751151,0.075398,30.0,15,"{'max_depth': 30, 'n_estimators': 15}",0.074356,0.350385,0.259376,0.228039,0.114846,4


#### Grid search results for RandomForestRegressor (ranked by mean R² over 3-fold CV):
* Best `max_depth`: 30
* Best `n_estimators`: 100

### 2. XGBoost model

In [None]:
# Default-configured XGBoost regressor used as the base estimator for the grid search.
xgr = XGBRegressor()

# Search space: 2 x 2 x 3 = 12 parameter combinations.
xgr_params = dict(
    n_estimators=[15, 100],
    max_depth=[5, 30],
    learning_rate=[0.1, 0.01, 0.001],
)

In [27]:
%%time
print(time.ctime())

xgr_gs = GridSearchCV(estimator=xgr, param_grid=xgr_params, scoring='neg_mean_absolute_error', cv=cv)
xgr_gs_fit = xgr_gs.fit(train_set, target)

Fri Feb 21 17:25:09 2020


  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


CPU times: user 2h 21min 33s, sys: 6.06 s, total: 2h 21min 39s
Wall time: 2h 22min 20s


In [None]:
# Tabulate the per-candidate cross-validation results of the XGBoost search.
xgr_gs_df = pd.DataFrame(data=xgr_gs_fit.cv_results_)

In [30]:
# Order the candidates by mean cross-validated score, highest (least negative) first.
xgr_gs_df.sort_values('mean_test_score', ascending=False)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
1,140.946804,0.790593,0.992214,0.011308,0.1,5,100,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti...",-0.014535,-0.014713,-0.014867,-0.014705,0.000135,1
3,1199.104502,10.814775,11.971092,1.195842,0.1,30,100,"{'learning_rate': 0.1, 'max_depth': 30, 'n_est...",-0.017435,-0.015329,-0.016468,-0.016411,0.000861,2
2,79.497362,1.16066,0.555015,0.011143,0.1,30,15,"{'learning_rate': 0.1, 'max_depth': 30, 'n_est...",-0.099329,-0.104325,-0.105763,-0.103139,0.002757,3
0,24.995006,0.786074,0.408407,0.001672,0.1,5,15,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti...",-0.100183,-0.104071,-0.105753,-0.103336,0.002332,4
7,455.528314,1.9238,1.884242,0.13511,0.01,30,100,"{'learning_rate': 0.01, 'max_depth': 30, 'n_es...",-0.180339,-0.184763,-0.186013,-0.183705,0.002434,5
5,135.817747,0.999372,0.906606,0.045527,0.01,5,100,"{'learning_rate': 0.01, 'max_depth': 5, 'n_est...",-0.181592,-0.18464,-0.186093,-0.184108,0.001876,6
6,64.189791,3.373042,0.475168,0.011581,0.01,30,15,"{'learning_rate': 0.01, 'max_depth': 30, 'n_es...",-0.431106,-0.433446,-0.43399,-0.432847,0.001251,7
4,22.793482,0.516258,0.415764,0.014538,0.01,5,15,"{'learning_rate': 0.01, 'max_depth': 5, 'n_est...",-0.431724,-0.433374,-0.433991,-0.43303,0.000957,8
11,415.357828,22.484486,1.448025,0.125051,0.001,30,100,"{'learning_rate': 0.001, 'max_depth': 30, 'n_e...",-0.453878,-0.455967,-0.456452,-0.455433,0.001117,9
9,131.906623,0.947448,0.743238,0.007703,0.001,5,100,"{'learning_rate': 0.001, 'max_depth': 5, 'n_es...",-0.454336,-0.455908,-0.456442,-0.455562,0.000894,10


#### Grid search results for XGBRegressor (ranked by mean negated MAE over 3-fold CV):
* Best `max_depth`: 5
* Best `n_estimators`: 100
* Best `learning_rate`: 0.1