In [17]:
import numpy as np
import pandas as pd
import time

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

from xgboost import XGBRegressor

In [18]:
kijkcijfers = pd.read_csv('./data/feature_eng/kijkcijfers_target_encoded.csv')

# Enkel numerieke kolommen
kijkcijfers = kijkcijfers.select_dtypes(include=[np.number])

X = kijkcijfers.drop(columns=['viewers'])
y = kijkcijfers['viewers']

# Pas standard scaler toe op de features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split data in training en test set
# De test set wordt pas later gebruikt om de beste modellen te evalueren
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

# Model Selection

In [15]:
# Functie om scores van een model te berekenen
def get_scores(model, cv=5):
    print(f"Model: {model.__class__.__name__}")
    # Bereken MAE scores
    mae_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_absolute_error')
    mae = -np.mean(mae_scores)
    
    # Bereken MAPE scores
    mape_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_absolute_percentage_error')
    mape = -np.mean(mape_scores) * 100
    
    print(f"MAE: {mae:.2f}")
    print(f"MAPE: {mape:.2f}%")

# Functie om van lijst van modellen scores te berekenen
def evaluate_models(models):
    for model in models:
        start_time = time.time()
        get_scores(model)
        end_time = time.time()
        duration = end_time - start_time
        minutes, seconds = divmod(duration, 60)
        print(f"Duration: {int(minutes)} minutes {seconds:.2f} seconds\n")

## Get scores for multiple kinds of models

In [19]:
lin_reg = LinearRegression()
dt_reg = DecisionTreeRegressor()
rf_reg = RandomForestRegressor()
knn_reg = KNeighborsRegressor()
gb_reg = GradientBoostingRegressor()
xgb_reg = XGBRegressor()

evaluate_models([xgb_reg, lin_reg, dt_reg, rf_reg, knn_reg, gb_reg])

Model: XGBRegressor
MAE: 51303.74
MAPE: 14.97%
Duration: 0 minutes 6.51 seconds

Model: LinearRegression
MAE: 65334.08
MAPE: 19.66%
Duration: 0 minutes 2.11 seconds

Model: DecisionTreeRegressor
MAE: 75803.01
MAPE: 21.64%
Duration: 0 minutes 11.51 seconds

Model: RandomForestRegressor
MAE: 52549.61
MAPE: 15.73%
Duration: 11 minutes 24.63 seconds

Model: KNeighborsRegressor
MAE: 76876.50
MAPE: 23.19%
Duration: 0 minutes 9.42 seconds

Model: GradientBoostingRegressor
MAE: 56922.50
MAPE: 17.09%
Duration: 5 minutes 49.32 seconds



OUTPUT-----------------\
Model: XGBRegressor\
MAE: 51303.74\
MAPE: 14.97%\
Duration: 0 minutes 6.51 seconds

Model: LinearRegression\
MAE: 65334.08\
MAPE: 19.66%\
Duration: 0 minutes 2.11 seconds

Model: DecisionTreeRegressor\
MAE: 75803.01\
MAPE: 21.64%\
Duration: 0 minutes 11.51 seconds

Model: RandomForestRegressor\
MAE: 52549.61\
MAPE: 15.73%\
Duration: 11 minutes 24.63 seconds

Model: KNeighborsRegressor\
MAE: 76876.50\
MAPE: 23.19%\
Duration: 0 minutes 9.42 seconds

Model: GradientBoostingRegressor\
MAE: 56922.50\
MAPE: 17.09%\
Duration: 5 minutes 49.32 seconds

Model met beste MAE = `XGBRegressor`\
2de optie is `RandomForestRegressor`\
Enige probleem is dat deze lang duurt om te trainen.