In [42]:
import numpy as np
import pandas as pd
import time

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from scipy.stats import randint, uniform

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

import joblib

# Model finetuning

### Data importeren

In [28]:
kijkcijfers = pd.read_csv('./data/feature_eng/kijkcijfers_target_encoded.csv')

# Enkel numerieke kolommen
kijkcijfers = kijkcijfers.select_dtypes(include=[np.number])

X = kijkcijfers.drop(columns=['viewers'])
y = kijkcijfers['viewers']

# Pas standard scaler toe op de features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split data in training en test set
# De test set wordt pas later gebruikt om de beste modellen te evalueren
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

## XGBRegressor finetuning

Eerst snel zoeken met RandomizedSearchCV

In [32]:
# laat parameters random variëren binnen parameter grenzen
param_grid = {
    'n_estimators': randint(100, 1000),
    'learning_rate': uniform(0.01, 0.3),
    'max_depth': randint(3, 10),
    'min_child_weight': randint(1, 10),
    'subsample': uniform(0.6, 0.4),  # Ensuring the range is within [0, 1]
    'colsample_bytree': uniform(0.6, 0.4),  # Ensuring the range is within [0, 1]
    'gamma': uniform(0, 0.5),
    'reg_lambda': uniform(0.1, 10),
    'reg_alpha': uniform(0, 10)
}

xgb = XGBRegressor()

random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid,
    n_iter=100,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=2
)

start = time.time()

random_search.fit(X_train, y_train)

print(f'Random search took {time.time() - start} seconds\n')
print(f'Beste parameters: {random_search.best_params_}')
print(f'Beste score: {random_search.best_score_}')

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Random search took 891.7988214492798 seconds

Beste parameters: {'colsample_bytree': np.float64(0.7905971869094794), 'gamma': np.float64(0.035934907139148986), 'learning_rate': np.float64(0.026048677772942556), 'max_depth': 9, 'min_child_weight': 1, 'n_estimators': 782, 'reg_alpha': np.float64(5.253528767144523), 'reg_lambda': np.float64(3.0564542834344333), 'subsample': np.float64(0.9490934488850663)}
Beste score: -48478.08046875


In [None]:
# laat parameters random variëren binnen parameter grenzen
param_grid = {
    'n_estimators': range(500, 1000, 50),
    'learning_rate': [0.015, 0.02, 0.025, 0.03],
    'max_depth': randint(7, 10),
    'min_child_weight': randint(1, 3),
    'subsample': [1.0],
    'colsample_bytree': [0.8],
    'gamma': uniform(0.01, 0.05),
    'reg_lambda': [3],
    'reg_alpha': range(0, 5, 1)
}

xgb = XGBRegressor()

random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid,
    n_iter=100,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=2
)

start = time.time()

random_search.fit(X_train, y_train)

print(f'Random search took {time.time() - start} seconds\n')
print(f'Beste parameters: {random_search.best_params_}')
print(f'Beste score: {random_search.best_score_}')

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Random search took 1265.5441715717316 seconds

Beste parameters: {'colsample_bytree': 0.8, 'gamma': np.float64(0.04693476676031165), 'learning_rate': 0.03, 'max_depth': 9, 'min_child_weight': 2, 'n_estimators': 950, 'reg_alpha': 1.0, 'reg_lambda': 3, 'subsample': 1.0}
Beste score: -48499.4109375


Nu diepgaander zoeken met GridSearchCV

In [36]:
param_grid= {
    'n_estimators': [850],
    'learning_rate': [0.025, 0.03, 0.035],
    'max_depth': [8, 9],
    'min_child_weight': [1,2,3],
    'subsample': [1.0],
    'colsample_bytree': [0.8],
    'gamma': [0.035, 0.04, 0.045],
    'reg_lambda': [3],
    'reg_alpha': range(0, 5, 1)
}

grid_search= GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=2
)

start = time.time()

grid_search.fit(X_train, y_train)

print(f'Grid search took {time.time() - start} seconds\n')
print(f'Beste parameters: {grid_search.best_params_}')
print(f'Beste score: {grid_search.best_score_}')

Fitting 3 folds for each of 270 candidates, totalling 810 fits
Grid search took 2132.7597584724426 seconds

Beste parameters: {'colsample_bytree': 0.8, 'gamma': 0.035, 'learning_rate': 0.03, 'max_depth': 9, 'min_child_weight': 3, 'n_estimators': 850, 'reg_alpha': 1, 'reg_lambda': 3, 'subsample': 1.0}
Beste score: -49185.951822916664


Fitting 3 folds for each of 270 candidates, totalling 810 fits\
Grid search took 2132.7597584724426 seconds\

Beste parameters: {'colsample_bytree': 0.8, 'gamma': 0.035, 'learning_rate': 0.03, 'max_depth': 9, 'min_child_weight': 3, 'n_estimators': 850, 'reg_alpha': 1, 'reg_lambda': 3, 'subsample': 1.0}\
Beste score: -49185.951822916664

In [38]:
param_grid= {
    'n_estimators': [850],
    'learning_rate': [0.03],
    'max_depth': [9],
    'min_child_weight': [4],
    'subsample': [1.0],
    'colsample_bytree': [0.8],
    'gamma': [0.015, 0.02 ,0.025],
    'reg_lambda': [3],
    'reg_alpha': [1]
}

grid_search= GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=2
)

start = time.time()

grid_search.fit(X_train, y_train)

print(f'Grid search took {time.time() - start} seconds\n')
print(f'Beste parameters: {grid_search.best_params_}')
print(f'Beste score: {grid_search.best_score_}')

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Grid search took 61.28970527648926 seconds

Beste parameters: {'colsample_bytree': 0.8, 'gamma': 0.015, 'learning_rate': 0.03, 'max_depth': 9, 'min_child_weight': 4, 'n_estimators': 850, 'reg_alpha': 1, 'reg_lambda': 3, 'subsample': 1.0}
Beste score: -48588.90234375


Finaal getuned model extraheren en resultaten op de test set berekenen

In [40]:
best_params = grid_search.best_params_
tuned_model = grid_search.best_estimator_

y_pred = tuned_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
accuracy = 1 - mape

print(f'Mean absolute error: {mae}')
print(f'Mean absolute percentage error: {mape}')
print(f'Accuracy: {accuracy}')

Mean absolute error: 47205.0625
Mean absolute percentage error: 0.1372043341398239
Accuracy: 0.8627956658601761


Model opslaan in pkl file

In [43]:
# Sla het model op
joblib.dump(tuned_model, './models/tuned_xgb_model.pkl')

print("Model saved successfully.")

Model saved successfully.


## RandomForestRegressor finetuning

In [47]:
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [10, 20, 50, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}

rf = RandomForestRegressor()

random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=20,
    cv=3,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=2
)

random_search.fit(X_train, y_train)

print(f'Beste parameters: {random_search.best_params_}')
print(f'Beste score: {random_search.best_score_}')

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Beste parameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 50, 'bootstrap': False}
Beste score: -52034.94091757815


In [49]:
pd.DataFrame(y_train).describe()

Unnamed: 0,viewers
count,48406.0
mean,446206.5
std,278722.4
min,18475.0
25%,230043.0
50%,359936.5
75%,605797.8
max,2494114.0
