In [19]:
import numpy as np
import pandas as pd

from sklearn.model_selection import (
    cross_val_predict,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

In [20]:
kijkcijfers = pd.read_csv('./data/feature_eng/kijkcijfers_target_encoded.csv')

# Keep the numeric columns only
kijkcijfers = kijkcijfers.select_dtypes(include=[np.number])

# 'viewers' is the regression target; every other numeric column is a feature
X = kijkcijfers.drop(columns=['viewers'])
y = kijkcijfers['viewers']

# Split the data into a training set and a test set.
# The test set is only used later on, to evaluate the best models.
# A fixed random_state keeps the shuffled split identical across kernel
# restarts, so the scores computed on it can be reproduced and compared.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

# Model Selection

In [None]:
# Function that computes and prints the cross-validated scores of one model
def get_scores(model, cv=5):
    """Print the cross-validated MAE and MAPE of ``model`` on the training data.

    Both metrics are collected in a single ``cross_validate`` run, so each
    fold is fitted only once and MAE and MAPE are measured on the same
    fitted models (computing them in two separate runs doubles the fitting
    cost and, for randomised estimators, scores two different sets of fits).

    Parameters
    ----------
    model : estimator
        Estimator handed to ``cross_validate``.
    cv : int or cross-validation splitter, default=5
        Forwarded unchanged to ``cross_validate``.

    Notes
    -----
    Reads the notebook-level ``X_train`` and ``y_train``. Nothing is
    returned; the model name, MAE and MAPE (in percent) are printed.
    """
    print(f"Model: {model.__class__.__name__}")

    cv_results = cross_validate(
        model,
        X_train,
        y_train,
        cv=cv,
        scoring={
            'mae': 'neg_mean_absolute_error',
            'mape': 'neg_mean_absolute_percentage_error',
        },
    )

    # The scorers are negated errors ("higher is better"), so flip the sign back
    mae = -np.mean(cv_results['test_mae'])
    # Multiply by 100 so the value matches the "%" suffix printed below
    mape = -np.mean(cv_results['test_mape']) * 100

    print(f"MAE: {mae:.2f}")
    print(f"MAPE: {mape:.2f}%")

# Function that reports the scores for every model in a list
def evaluate_models(models):
    """Call ``get_scores`` on each entry of ``models``, in order.

    A blank line is printed after every model's report so that consecutive
    reports stay visually separated in the cell output.
    """
    for estimator in models:
        get_scores(estimator)
        # Empty line between two reports
        print()

## Get scores for multiple kinds of models

In [29]:
# Candidate regressors, compared with their default hyper-parameters.
# The estimators that use randomness get a fixed random_state so that
# re-running this cell reproduces the same scores.
lin_reg = LinearRegression()
dt_reg = DecisionTreeRegressor(random_state=42)
rf_reg = RandomForestRegressor(random_state=42)
knn_reg = KNeighborsRegressor()
svr_reg = SVR()
gb_reg = GradientBoostingRegressor(random_state=42)

# NOTE(review): KNN and SVR are fitted on the features as loaded, without any
# scaling step in this cell — confirm whether scaling happens upstream before
# reading too much into their scores.
evaluate_models([lin_reg, dt_reg, rf_reg, knn_reg, svr_reg, gb_reg])

Model: LinearRegression
MAE: 100761.42
MAPE: 32.20%

Model: DecisionTreeRegressor
MAE: 79431.20
MAPE: 21.70%

Model: RandomForestRegressor
MAE: 58783.60
MAPE: 16.62%

Model: KNeighborsRegressor
MAE: 83482.60
MAPE: 24.92%

Model: SVR
MAE: 211115.74
MAPE: 54.71%

Model: GradientBoostingRegressor
MAE: 77949.08
MAPE: 23.05%



OUTPUT-----------------\
Model: LinearRegression\
MAE: 100,761.42\
MAPE: 32.20%

Model: DecisionTreeRegressor\
MAE: 79,431.20\
MAPE: 21.70%

Model: RandomForestRegressor\
MAE: 58,783.60\
MAPE: 16.62%

Model: KNeighborsRegressor\
MAE: 83,482.60\
MAPE: 24.92%

Model: SVR\
MAE: 211,115.74\
MAPE: 54.71%

Model: GradientBoostingRegressor\
MAE: 77,949.08\
MAPE: 23.05%

Model met de beste performantie (laagste MAE én laagste MAPE) = `RandomForestRegressor`