In [46]:
import pandas as pd
import numpy as np
import holidays
from sklearn.model_selection import cross_val_score

# Feature engineering

In [47]:
kijkcijfers = pd.read_csv('./data/processed/tv_kijkcijfers_weer.csv')

## Timestamp feature engineering

Maak nieuwe kolommen van de timestamp kolom

In [48]:
kijkcijfers['timestamp'] = pd.to_datetime(kijkcijfers['timestamp'])

# Functie om seizoen uit datum te halen
def get_season(date):
    if date.month in [3, 4, 5]:
        return 'lente'
    elif date.month in [6, 7, 8]:
        return 'zomer'
    elif date.month in [9, 10, 11]:
        return 'herfst'
    else:
        return 'winter'

kijkcijfers['season'] = kijkcijfers['timestamp'].apply(get_season)

# weekday toevoegen
kijkcijfers['weekday'] = kijkcijfers['timestamp'].dt.weekday

# uur toevoegen
kijkcijfers['hour'] = kijkcijfers['timestamp'].dt.hour

# dag toevoegen
kijkcijfers['day'] = kijkcijfers['timestamp'].dt.day

# maand toevoegen
kijkcijfers['month'] = kijkcijfers['timestamp'].dt.month

# isWeekend toevoegen
kijkcijfers['isWeekend'] = kijkcijfers['weekday'].apply(lambda x: 1 if x in [5, 6] else 0)

# isPrimeTime toevoegen
kijkcijfers['isPrimeTime'] = kijkcijfers['hour'].apply(lambda x: 1 if x >= 18 and x <= 23 else 0)

# isHoliday toevoegen
be_holidays = holidays.BE()
kijkcijfers['isHoliday'] = kijkcijfers['timestamp'].apply(lambda x: 1 if x in be_holidays else 0)

kijkcijfers

Unnamed: 0,timestamp,channel,program,duration_sec,live,viewers,weather_code,temperature,rain,windspeed,snowfall,precipitation,season,weekday,hour,day,month,isWeekend,isPrimeTime,isHoliday
0,2016-10-01 19:00:05,EEN,HET 7 UUR-JOURNAAL,1898.0,0,721850,53.0,13.9,0.5,13.8,0.0,0.5,herfst,5,19,1,10,1,1,0
1,2016-10-01 20:41:00,EEN,FC DE KAMPIOENEN,2319.0,0,709606,1.0,12.8,0.0,14.9,0.0,0.0,herfst,5,20,1,10,1,1,0
2,2016-10-01 20:13:36,EEN,WEG ZIJN WIJ,1484.0,0,548239,1.0,12.8,0.0,14.9,0.0,0.0,herfst,5,20,1,10,1,1,0
3,2016-10-01 19:38:10,EEN,IEDEREEN BEROEMD,1741.0,0,523610,53.0,13.9,0.5,13.8,0.0,0.5,herfst,5,19,1,10,1,1,0
4,2016-10-01 19:52:06,VTM,COMEDY TOPPERS,1480.0,0,496216,53.0,13.9,0.5,13.8,0.0,0.5,herfst,5,19,1,10,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60503,2025-02-24 17:12:53,VRT 1,DE RIDDER,2947.0,0,207717,51.0,11.4,0.1,16.1,0.0,0.1,winter,0,17,24,2,0,0,0
60504,2025-02-24 19:59:02,PLAY4,DE TAFEL VAN GERT,3959.0,0,197158,1.0,10.1,0.0,15.2,0.0,0.0,winter,0,19,24,2,0,1,0
60505,2025-02-24 18:24:13,VTM,MILO,1428.0,0,192692,0.0,11.0,0.0,16.2,0.0,0.0,winter,0,18,24,2,0,1,0
60506,2025-02-24 21:19:40,PLAY4,"DE EXPEDITIE, NAMIBIE",3340.0,0,188187,1.0,9.3,0.0,15.2,0.0,0.0,winter,0,21,24,2,0,1,0


## Lag Features toevoegen

In [49]:
def add_lag(df, n):
    for i in range(1, n+1):
        df[f'viewers_lag{i}'] = df.sort_values('timestamp').groupby('program')['viewers'].shift(i)
        df[f'viewers_lag{i}'] = df[f'viewers_lag{i}'].fillna(df.groupby('program')['viewers'].transform('mean'))
    return df

# Voeg lag features toe
kijkcijfers = add_lag(kijkcijfers, 3)
kijkcijfers[kijkcijfers['program'] == 'NIEUWS 19U VTM'][['timestamp', 'viewers', 'viewers_lag1']]

Unnamed: 0,timestamp,viewers,viewers_lag1
6,2016-10-01 18:59:49,424041,586307.972681
27,2016-10-02 18:59:49,505759,424041.000000
47,2016-10-03 18:59:49,611342,505759.000000
66,2016-10-04 18:59:49,626729,611342.000000
86,2016-10-05 18:59:49,642121,626729.000000
...,...,...,...
60412,2025-02-20 18:59:48,592316,569245.000000
60432,2025-02-21 18:59:48,537591,592316.000000
60449,2025-02-22 18:59:28,564260,537591.000000
60471,2025-02-23 18:59:45,574592,564260.000000


## Category feature engineering

Bekijk de cardinaliteit van categorical features om te beslissen wat er mee te doen

In [50]:
kijkcijfers_cat = kijkcijfers[['program', 'channel', 'live', 'weather_code', 'season']]

for column in kijkcijfers_cat.columns:
    print(f'Cardinaliteit van [{column}]: {kijkcijfers_cat[column].nunique()}')

Cardinaliteit van [program]: 5925
Cardinaliteit van [channel]: 32
Cardinaliteit van [live]: 4
Cardinaliteit van [weather_code]: 13
Cardinaliteit van [season]: 4


Categorical attributen live en channel hebben laag genoege cardinaliteit om **one hot encoding** toe te passen

In [51]:
from sklearn.preprocessing import OneHotEncoder

# One hot encoding voor 'live' en 'channel'
cat_encoder = OneHotEncoder()
kijkcijfers_cat = kijkcijfers[['live', 'channel', 'weather_code', 'season']]
kijkcijfers_cat_1hot = cat_encoder.fit_transform(kijkcijfers_cat)

# Maak een dataframe van de one hot encoding
one_hot_output = pd.DataFrame(kijkcijfers_cat_1hot.toarray(), columns=cat_encoder.get_feature_names_out(), index=kijkcijfers_cat.index)

# Voeg de one hot encoding toe aan de dataset
kijkcijfers = kijkcijfers.drop(columns=['live', 'channel', 'weather_code', 'season'])
kijkcijfers = pd.concat([kijkcijfers, one_hot_output], axis=1)
kijkcijfers.sample(5)

Unnamed: 0,timestamp,program,duration_sec,viewers,temperature,rain,windspeed,snowfall,precipitation,weekday,...,weather_code_61.0,weather_code_63.0,weather_code_65.0,weather_code_71.0,weather_code_73.0,weather_code_75.0,season_herfst,season_lente,season_winter,season_zomer
47769,2023-05-13 19:00:03,HET 7 UUR-JOURNAAL,2353.0,656428,17.2,0.0,14.1,0.0,0.0,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7279,2017-10-04 19:00:04,HET 7 UUR-JOURNAAL,2569.0,922206,13.8,0.0,22.6,0.0,0.0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
15903,2018-12-10 18:30:31,BLOKKEN,1632.0,695337,5.6,0.0,14.6,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
37496,2021-12-11 22:13:11,HIT THE ROAD JACQUES,2509.0,173855,4.1,0.0,18.0,0.0,0.0,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
16000,2018-12-15 18:59:54,NIEUWS 19U VTM,2609.0,573713,0.4,0.0,23.2,0.0,0.0,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


Aangezien de cardinaliteit van `program` bijna 6000 is, zou **one hot encoding** op deze kolom resulteren in een veel to hoge dimensionaliteit

Toch denk ik dat de programmanaam veel nuttige informatie kan bevatten om voorspellingen te maken.\
Er zijn 2 methoden die ik wil testen om de informatie binnen deze kolom te kunnen behouden:
- ***CountVectorizer***
  - Identificeert meest voorkomende woorden en maakt hier indicator kolommen van
- ***Target Encoding***
  - i.p.v. categorie zomaar om te zetten naar een getal, categorie vervangen door bv de gemiddelde viewers voor die categorie

### Count Vectorization

In [52]:
# CountVectorizer toepassen op 'program'
from sklearn.feature_extraction.text import CountVectorizer

# AI Gegenereerde lijst van stopwoorden voor titels van vlaamse en engelse tv-programma's
stopwords_vlaams_engels = [
    "het", "de", "een", "van", "in", "met", "en", "voor", "op", "door", "te", 
    "is", "zijn", "als", "uit", "aan", "om", "tot", "bij", "over", "onder", "niet",
    "this", "the", "of", "a", "an", "and", "to", "on", "in", "for", "with", 
    "at", "by", "from", "about", "as", "that", "it", "is", "was", "were", "be",
    "you", "your", "he", "she", "we", "they", "them", "him", "her", "its",
    "or", "but", "so", "if", "then", "which", "who", "what", "how", "where",
    "een", "het", "de", "van", "en", "voor", "op", "uit", "door", "om", "tot",
    "met", "aan", "niet", "te", "bij", "als", "over", "onder", "dat", "die",
    "wat", "wel", "kan", "zal", "zijn", "heb", "hebt", "heeft", "we", "wij",
    "jij", "je", "ik", "hij", "zij", "hun", "hen", "ons"
]

# Maak CountVectorizer aan
vectorizer = CountVectorizer(max_features=100, stop_words=stopwords_vlaams_engels)
program_features = vectorizer.fit_transform(kijkcijfers['program'])

# Maak een dataframe van de CountVectorizer
program_features_df = pd.DataFrame(program_features.toarray(), columns=vectorizer.get_feature_names_out(), index=kijkcijfers.index)

# Voeg de CountVectorizer toe aan de dataset
kijkcijfers_count_vect = kijkcijfers.drop(columns=['program'])
kijkcijfers_count_vect = pd.concat([kijkcijfers, program_features_df], axis=1)
kijkcijfers_count_vect.sample(5)

Unnamed: 0,timestamp,program,duration_sec,viewers,temperature,rain,windspeed,snowfall,precipitation,weekday,...,vtm,wereld,wielrennen,wild,witse,wk,zake,zevende,zomer,zoo
56221,2024-07-21 15:00:03,NATIONAAL DEFILE,9127.0,247415,22.4,0.1,12.1,0.0,0.1,6,...,0,0,0,0,0,0,0,0,0,0
2639,2017-02-13 19:00:03,HET 7 UUR-JOURNAAL,2391.0,1153014,5.0,0.0,20.9,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
19790,2019-06-22 15:38:25,WIELRENNEN. HEISTSE PIJL,6235.0,167873,22.2,0.0,11.2,0.0,0.0,5,...,0,0,1,0,0,0,0,0,0,0
15484,2018-11-19 20:39:22,HOE ZAL IK HET ZEGGEN?,2892.0,707088,3.1,0.0,25.9,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
9181,2018-01-08 19:43:16,IEDEREEN BEROEMD,1170.0,919333,3.4,0.0,14.8,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0


### Target Encoding

In [53]:
# Target encoding toepassen op 'program'
from category_encoders import TargetEncoder

# Maak TargetEncoder aan
target_encoder = TargetEncoder()

# Fit en transform de target encoder
kijkcijfers_target_encoded = kijkcijfers.copy()

kijkcijfers_target_encoded['program'] = target_encoder.fit_transform(kijkcijfers['program'], kijkcijfers['viewers'])

kijkcijfers_target_encoded.sample(10)

Unnamed: 0,timestamp,program,duration_sec,viewers,temperature,rain,windspeed,snowfall,precipitation,weekday,...,weather_code_61.0,weather_code_63.0,weather_code_65.0,weather_code_71.0,weather_code_73.0,weather_code_75.0,season_herfst,season_lente,season_winter,season_zomer
27434,2020-07-25 20:30:09,407497.3,5568.0,149703,19.5,0.9,13.6,0.0,0.9,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
44289,2022-11-18 19:00:02,892381.1,2523.0,784994,7.8,0.0,12.7,0.0,0.0,4,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
20517,2019-07-28 12:33:57,517556.2,1529.0,126102,19.7,0.0,19.8,0.0,0.0,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
23801,2020-01-11 19:54:53,491217.9,1486.0,635438,5.4,0.0,24.5,0.0,0.0,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
52370,2023-12-31 18:59:49,586308.0,2697.0,477206,8.6,0.0,34.8,0.0,0.0,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
17757,2019-03-12 12:59:55,242493.8,1625.0,204682,7.8,0.0,32.1,0.0,0.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
15014,2018-10-26 17:45:14,196082.6,1282.0,170724,9.3,1.0,18.7,0.0,1.0,4,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
45725,2023-01-30 23:11:34,485780.4,2356.0,197491,2.7,0.0,10.6,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
58608,2024-11-22 20:45:08,1056834.0,5651.0,1374164,2.4,0.0,13.5,0.0,0.0,4,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
49694,2023-08-18 13:00:04,409248.2,1946.0,313112,23.3,0.0,9.7,0.0,0.0,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## Correlaties

Bekijk de beste correlaties

In [54]:
data_te_num = kijkcijfers_target_encoded.select_dtypes(include=[np.number])
data_cv_num = kijkcijfers_count_vect.select_dtypes(include=[np.number])

corr_matrix_te = data_te_num.corr()
corr_matrix_cv = data_cv_num.corr()

corr_values_te = corr_matrix_te['viewers'].abs().sort_values(ascending=False)
corr_values_cv = corr_matrix_cv['viewers'].abs().sort_values(ascending=False)

print('TARGET ENCODING')
print('Correlatie met target:')
print(corr_values_te.head(25))

print('\nCOUNT VECTORIZER')
print('\nCorrelatie met target:')
print(corr_values_cv.head(25))

print(f"\nAantal kolommen COUNT VECTORIZER: {len(corr_values_cv)}")
print(f"Aantal kolommen TARGET ENCODING: {len(corr_values_te)}")

TARGET ENCODING
Correlatie met target:
viewers               1.000000
viewers_lag1          0.894596
viewers_lag2          0.893305
viewers_lag3          0.876292
program               0.826051
channel_EEN           0.336426
isPrimeTime           0.291918
temperature           0.187035
season_zomer          0.166814
channel_Canvas        0.154436
channel_CANVAS        0.143160
channel_VTM           0.131361
weekday               0.126778
hour                  0.125685
isWeekend             0.121478
season_winter         0.110575
channel_VRT CANVAS    0.095733
duration_sec          0.094682
channel_PLAY4         0.093206
channel_VIER          0.080846
channel_VTM2          0.077282
channel_Q2            0.072797
channel_VRT 1         0.066492
channel_VITAYA        0.064470
channel_PLAY5         0.057432
Name: viewers, dtype: float64

COUNT VECTORIZER

Correlatie met target:
viewers           1.000000
viewers_lag1      0.894596
viewers_lag2      0.893305
viewers_lag3      0.876292
thuis 

Beiden hebben mooie correlaties\
De dataframe met CountVectorizer heeft wel 100 kolommen meer, wat veel meer tijd nodig heeft om te trainen

## Dataframes wegschrijven

In [55]:
kijkcijfers_target_encoded.to_csv('./data/feature_eng/kijkcijfers_target_encoded.csv', index=False)
kijkcijfers_count_vect.to_csv('./data/feature_eng/kijkcijfers_count_vect.csv', index=False)

## Testen beide dataframes met simpele modellen

### Target Encoding

Ridge Regression: MAE = 65.970\
Random Forest Regressor: MAE = 53.475

In [56]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

def test_models(models, X, y, cv=10, scoring='neg_mean_absolute_error'):
    results = {}
    for model in models:
        model_name = type(model).__name__
        scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
        results[model_name] = {
            'scores': scores,
            'mean_score': scores.mean()
        }
        print(f"{model_name} - Cross-validation scores: {scores}")
        print(f"{model_name} - Mean cross-validation score: {scores.mean()}")
    return results

from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(data_te_num.drop(columns=['viewers']))

models = [Ridge(), RandomForestRegressor()]
y = data_te_num['viewers']

results = test_models(models, X_scaled, y)

Ridge - Cross-validation scores: [-72598.36720956 -67834.96485691 -66546.78924579 -63672.23913494
 -71184.91156143 -65674.94706248 -62265.62729573 -61739.75615252
 -63039.40523291 -65145.16656657]
Ridge - Mean cross-validation score: -65970.21743188429
RandomForestRegressor - Cross-validation scores: [-58360.99960998 -53617.51955049 -52401.10049248 -49893.32585523
 -61007.99304082 -53617.90526855 -50939.93741696 -50804.04293175
 -50990.75739669 -53117.28402149]
RandomForestRegressor - Mean cross-validation score: -53475.08655844353


### CountVectorizer

Ridge Regression: MAE = 65.970\
Random Forest Regressor: MAE = 53.441

In [59]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

def test_models(models, X, y, cv=10, scoring='neg_mean_absolute_error'):
    results = {}
    for model in models:
        model_name = type(model).__name__
        scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
        results[model_name] = {
            'scores': scores,
            'mean_score': scores.mean()
        }
        print(f"{model_name} - Cross-validation scores: {scores}")
        print(f"{model_name} - Mean cross-validation score: {scores.mean()}")
    return results

from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(data_te_num.drop(columns=['viewers']))

models = [Ridge(), RandomForestRegressor()]
y = data_te_num['viewers']

results = test_models(models, X_scaled, y)

Ridge - Cross-validation scores: [-72598.36720956 -67834.96485691 -66546.78924579 -63672.23913494
 -71184.91156143 -65674.94706248 -62265.62729573 -61739.75615252
 -63039.40523291 -65145.16656657]
Ridge - Mean cross-validation score: -65970.21743188429
RandomForestRegressor - Cross-validation scores: [-58093.16752438 -53701.40121633 -52388.55352008 -50068.92906958
 -61086.458549   -53352.32104446 -50939.52665675 -51101.82269873
 -50603.27967769 -53081.14554545]
RandomForestRegressor - Mean cross-validation score: -53441.660550243316


### Conclusie

De Target Encoded (TE) trainingset heeft bij trainen met zowel Ridge- als RandomForest Regression een lagere MAE dan dezelfde modellen getraind met de CountVectorized (CV) data.\
\
Daarnaast heeft de TE data een veel lagere dimensionaliteit dan de CV data en kost het trainen veel minder tijd.