In [1]:
# loading libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from tqdm import tqdm

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from transform import TargetTransform   # custom class for handle target value transformation

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# setting paths
data_dir = os.path.abspath('./_data')
train_data_fp = os.path.join(data_dir, 'train_clean.csv')
test_data_fp = os.path.join(data_dir, 'test_clean.csv')
data_fp = os.path.join(data_dir, 'data_clean.csv')
os.listdir(data_dir)

['data.csv',
 'data_clean.csv',
 'data_no_missing.csv',
 'test.csv',
 'test_clean.csv',
 'train_clean.csv']

In [3]:
train_set = pd.read_csv(train_data_fp)
train_set.head(1)

Unnamed: 0,ch_type,length,first_or_rerun,episodes_in_season,is_movie,overlaped_with_game,temperature,Is_month_end,Is_month_start,Is_quarter_end,...,Month_12,Dayofweek_0,Dayofweek_1,Dayofweek_2,Dayofweek_3,Dayofweek_4,Dayofweek_5,Dayofweek_6,market_share,market_share_transformed
0,0,0.906181,0,1,0,0,1.094244,0,0,0,...,0,1,0,0,0,0,0,0,0.9,-0.001165


In [4]:
target = train_set.loc[:, 'market_share_transformed']
target_src = train_set.loc[:, 'market_share']
targets_joint = train_set.loc[:, ['market_share_transformed', 'market_share']]

train_set.drop(columns=['market_share', 'market_share_transformed'], inplace=True)

In [5]:
targets_joint

Unnamed: 0,market_share_transformed,market_share
0,-0.001165,0.9
1,-0.007722,0.5
2,-0.013415,0.3
3,0.005935,1.7
4,0.008815,2.2
...,...,...
616651,-0.077182,0.0
616652,0.001075,1.1
616653,-0.013415,0.3
616654,-0.010210,0.4


In [6]:
rs = 7

X_train, X_test, y_train, y_test = train_test_split(train_set, targets_joint, test_size=0.2, random_state=rs)

In [7]:
X_train.shape, y_train.shape

((493324, 91), (493324, 2))

In [8]:
X_test.shape, y_test.shape

((123332, 91), (123332, 2))

In [15]:
rf_regressor = RandomForestRegressor(n_jobs=-1, random_state=rs)

In [16]:
rf_regressor.fit(X_train, y_train.iloc[:, 0])

preds = rf_regressor.predict(X_test)

In [21]:
sorted(zip(rf_regressor.feature_importances_, X_train.columns), reverse=True)[:15]

[(0.21284136986674948, 'temperature'),
 (0.18079331203616866, 'station_Unis TV+'),
 (0.11991037906745879, 'Elapsed'),
 (0.07327655757019758, 'ch_type'),
 (0.050017958404122424, 'running_time'),
 (0.02863249135980259, 'station_RDS2+'),
 (0.015928970845525126, 'station_MusiquePlus+ (retired Aug 25, 2019)'),
 (0.015736801983140328, 'station_Evasion+'),
 (0.014671906789228212, 'genre_Amateur Sports'),
 (0.012688061711648213, 'length'),
 (0.010702262413893123, 'station_RDI+'),
 (0.010004028521366958, 'station_ARTV+'),
 (0.009942313975206315, 'station_VRAK+'),
 (0.009834169413138269, 'station_TVA Total'),
 (0.009295721928494496, 'station_Series++')]

In [25]:
def evaluate(actual, prediction):
    return {
        'MAE': mean_absolute_error(actual, prediction),
        'MSE': mean_squared_error(actual, prediction),
        'R^2': r2_score(actual, prediction)
    }

In [27]:
tfms = TargetTransform(data=target_src)

In [28]:
preds_tf = tfms.inverse(preds)

In [31]:
evaluate(actual=y_test.iloc[:, 1], prediction=preds_tf)

{'MAE': 1.1977181947670723,
 'MSE': 4.865042261965713,
 'R^2': 0.7992668119278457}

In [33]:
xgb_regressor = XGBRegressor(n_jobs=-1, random_state=rs)

In [34]:
xgb_regressor.fit(X_train, y_train.iloc[:, 0])

xg_preds = xgb_regressor.predict(X_test)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




In [35]:
sorted(zip(xgb_regressor.feature_importances_, X_train.columns), reverse=True)[:15]

[(0.25840703, 'station_Unis TV+'),
 (0.12847042, 'ch_type'),
 (0.054695167, 'station_RDS2+'),
 (0.047006793, 'station_RDI+'),
 (0.039019894, 'genre_Amateur Sports'),
 (0.035412952, 'station_Evasion+'),
 (0.033464096, 'station_MusiquePlus+ (retired Aug 25, 2019)'),
 (0.03280076, 'station_Series++'),
 (0.032279097, 'station_TQ Total'),
 (0.030781277, 'station_LCN+'),
 (0.029672926, 'station_TVA Total'),
 (0.021927195, 'station_PYC Online Stream Fr CBC-8+'),
 (0.02172271, 'station_PYC Online Stream Fr CBC-9+'),
 (0.019327993, 'genre_Interstitial'),
 (0.018675428, 'station_ARTV+')]

In [36]:
xg_preds_tf = tfms.inverse(xg_preds)

In [37]:
evaluate(actual=y_test.iloc[:, 1], prediction=xg_preds_tf)

{'MAE': 1.7010638099602957,
 'MSE': 11.137765831718959,
 'R^2': 0.5404522462465275}

In [38]:
from sklearn.decomposition import PCA

In [39]:
pca = PCA(n_components=None)
pca.fit(train_set)
varinces = pca.explained_variance_ratio_

In [41]:
varinces[:10].sum()

0.6288830738803108

In [42]:
pca = PCA(n_components=10)

X_train_dr = pca.fit_transform(X_train)
X_test_dr = pca.transform(X_test)

X_train_dr.shape, X_test_dr.shape

((493324, 10), (123332, 10))

In [46]:
rf_regressor_dr = RandomForestRegressor(n_jobs=-1, random_state=rs)

Wall time: 0 ns


In [47]:
%%time
rf_regressor_dr.fit(X_train_dr, y_train.iloc[:, 0])

preds_dr = rf_regressor_dr.predict(X_test_dr)

Wall time: 6min 11s


In [48]:
preds_dr_tf = tfms.inverse(preds_dr)

In [49]:
evaluate(actual=y_test.iloc[:, 1], prediction=preds_dr_tf)

{'MAE': 1.4660773799195772,
 'MSE': 8.175037297697104,
 'R^2': 0.6626953660393353}

In [None]:
rf = RandomForestRegressor()
params = {
    'n_estimators': [10, 100, 200],
    'max_depth': [30, 60, 90, None],
}