In [7]:
import pandas as pd
import numpy as np
import holidays
from sklearn.model_selection import cross_val_score

# Feature engineering

In [8]:
kijkcijfers = pd.read_csv('./data/processed/tv_kijkcijfers_weer.csv')

## Timestamp feature engineering

Maak nieuwe kolommen van de timestamp kolom

In [9]:
kijkcijfers['timestamp'] = pd.to_datetime(kijkcijfers['timestamp'])

# Functie om seizoen uit datum te halen
def get_season(date):
    if date.month in [3, 4, 5]:
        return 'lente'
    elif date.month in [6, 7, 8]:
        return 'zomer'
    elif date.month in [9, 10, 11]:
        return 'herfst'
    else:
        return 'winter'

kijkcijfers['season'] = kijkcijfers['timestamp'].apply(get_season)

# weekday toevoegen
kijkcijfers['weekday'] = kijkcijfers['timestamp'].dt.weekday

# uur toevoegen
kijkcijfers['hour'] = kijkcijfers['timestamp'].dt.hour

# dag toevoegen
kijkcijfers['day'] = kijkcijfers['timestamp'].dt.day

# maand toevoegen
kijkcijfers['month'] = kijkcijfers['timestamp'].dt.month

# isWeekend toevoegen
kijkcijfers['isWeekend'] = kijkcijfers['weekday'].apply(lambda x: 1 if x in [5, 6] else 0)

# isPrimeTime toevoegen
kijkcijfers['isPrimeTime'] = kijkcijfers['hour'].apply(lambda x: 1 if x >= 18 and x <= 23 else 0)

# isHoliday toevoegen
be_holidays = holidays.BE()
kijkcijfers['isHoliday'] = kijkcijfers['timestamp'].apply(lambda x: 1 if x in be_holidays else 0)

kijkcijfers

Unnamed: 0,timestamp,channel,program,duration_sec,live,viewers,weather_code,temperature,rain,windspeed,snowfall,precipitation,season,weekday,hour,day,month,isWeekend,isPrimeTime,isHoliday
0,2016-10-01 19:00:05,EEN,HET 7 UUR-JOURNAAL,1898.0,0,721850,53.0,13.9,0.5,13.8,0.0,0.5,herfst,5,19,1,10,1,1,0
1,2016-10-01 20:41:00,EEN,FC DE KAMPIOENEN,2319.0,0,709606,1.0,12.8,0.0,14.9,0.0,0.0,herfst,5,20,1,10,1,1,0
2,2016-10-01 20:13:36,EEN,WEG ZIJN WIJ,1484.0,0,548239,1.0,12.8,0.0,14.9,0.0,0.0,herfst,5,20,1,10,1,1,0
3,2016-10-01 19:38:10,EEN,IEDEREEN BEROEMD,1741.0,0,523610,53.0,13.9,0.5,13.8,0.0,0.5,herfst,5,19,1,10,1,1,0
4,2016-10-01 19:52:06,VTM,COMEDY TOPPERS,1480.0,0,496216,53.0,13.9,0.5,13.8,0.0,0.5,herfst,5,19,1,10,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60503,2025-02-24 17:12:53,VRT 1,DE RIDDER,2947.0,0,207717,51.0,11.4,0.1,16.1,0.0,0.1,winter,0,17,24,2,0,0,0
60504,2025-02-24 19:59:02,PLAY4,DE TAFEL VAN GERT,3959.0,0,197158,1.0,10.1,0.0,15.2,0.0,0.0,winter,0,19,24,2,0,1,0
60505,2025-02-24 18:24:13,VTM,MILO,1428.0,0,192692,0.0,11.0,0.0,16.2,0.0,0.0,winter,0,18,24,2,0,1,0
60506,2025-02-24 21:19:40,PLAY4,"DE EXPEDITIE, NAMIBIE",3340.0,0,188187,1.0,9.3,0.0,15.2,0.0,0.0,winter,0,21,24,2,0,1,0


## Category feature engineering

Bekijk de cardinaliteit van categorical features om te beslissen wat er mee te doen

In [10]:
kijkcijfers_cat = kijkcijfers[['program', 'channel', 'live', 'weather_code', 'season']]

for column in kijkcijfers_cat.columns:
    print(f'Cardinaliteit van [{column}]: {kijkcijfers_cat[column].nunique()}')

Cardinaliteit van [program]: 5925
Cardinaliteit van [channel]: 32
Cardinaliteit van [live]: 4
Cardinaliteit van [weather_code]: 13
Cardinaliteit van [season]: 4


Categorical attributen live en channel hebben laag genoege cardinaliteit om **one hot encoding** toe te passen

In [11]:
from sklearn.preprocessing import OneHotEncoder

# One hot encoding voor 'live' en 'channel'
cat_encoder = OneHotEncoder()
kijkcijfers_cat = kijkcijfers[['live', 'channel', 'weather_code', 'season']]
kijkcijfers_cat_1hot = cat_encoder.fit_transform(kijkcijfers_cat)

# Maak een dataframe van de one hot encoding
one_hot_output = pd.DataFrame(kijkcijfers_cat_1hot.toarray(), columns=cat_encoder.get_feature_names_out(), index=kijkcijfers_cat.index)

# Voeg de one hot encoding toe aan de dataset
kijkcijfers = kijkcijfers.drop(columns=['live', 'channel', 'weather_code', 'season'])
kijkcijfers = pd.concat([kijkcijfers, one_hot_output], axis=1)
kijkcijfers.sample(5)

Unnamed: 0,timestamp,program,duration_sec,viewers,temperature,rain,windspeed,snowfall,precipitation,weekday,...,weather_code_61.0,weather_code_63.0,weather_code_65.0,weather_code_71.0,weather_code_73.0,weather_code_75.0,season_herfst,season_lente,season_winter,season_zomer
51795,2023-12-01 20:49:32,GRACE,5312.0,498851,-0.6,0.0,13.3,0.0,0.0,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8239,2017-11-21 19:00:06,HET 7 UUR-JOURNAAL,2640.0,1112336,9.6,0.0,22.9,0.0,0.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4937,2017-06-08 21:31:50,CONTROL PEDRO,2059.0,164671,20.9,0.0,11.6,0.0,0.0,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2051,2017-01-14 12:59:52,NIEUWS 13U VTM,1773.0,316750,2.6,0.2,19.9,0.0,0.2,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
21635,2019-09-24 22:39:57,FAROEK,2366.0,223984,15.1,0.0,17.2,0.0,0.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


Aangezien de cardinaliteit van `program` bijna 6000 is, zou **one hot encoding** op deze kolom resulteren in een veel to hoge dimensionaliteit

Toch denk ik dat de programmanaam veel nuttige informatie kan bevatten om voorspellingen te maken.\
Er zijn 2 methoden die ik wil testen om de informatie binnen deze kolom te kunnen behouden:
- ***CountVectorizer***
  - Identificeert meest voorkomende woorden en maakt hier indicator kolommen van
- ***Target Encoding***
  - i.p.v. categorie zomaar om te zetten naar een getal, categorie vervangen door bv de gemiddelde viewers voor die categorie

### Count Vectorization

In [12]:
# CountVectorizer toepassen op 'program'
from sklearn.feature_extraction.text import CountVectorizer

# AI Gegenereerde lijst van stopwoorden voor titels van vlaamse en engelse tv-programma's
stopwords_vlaams_engels = [
    "het", "de", "een", "van", "in", "met", "en", "voor", "op", "door", "te", 
    "is", "zijn", "als", "uit", "aan", "om", "tot", "bij", "over", "onder", "niet",
    "this", "the", "of", "a", "an", "and", "to", "on", "in", "for", "with", 
    "at", "by", "from", "about", "as", "that", "it", "is", "was", "were", "be",
    "you", "your", "he", "she", "we", "they", "them", "him", "her", "its",
    "or", "but", "so", "if", "then", "which", "who", "what", "how", "where",
    "een", "het", "de", "van", "en", "voor", "op", "uit", "door", "om", "tot",
    "met", "aan", "niet", "te", "bij", "als", "over", "onder", "dat", "die",
    "wat", "wel", "kan", "zal", "zijn", "heb", "hebt", "heeft", "we", "wij",
    "jij", "je", "ik", "hij", "zij", "hun", "hen", "ons"
]

# Maak CountVectorizer aan
vectorizer = CountVectorizer(max_features=100, stop_words=stopwords_vlaams_engels)
program_features = vectorizer.fit_transform(kijkcijfers['program'])

# Maak een dataframe van de CountVectorizer
program_features_df = pd.DataFrame(program_features.toarray(), columns=vectorizer.get_feature_names_out(), index=kijkcijfers.index)

# Voeg de CountVectorizer toe aan de dataset
kijkcijfers_count_vect = kijkcijfers.drop(columns=['program'])
kijkcijfers_count_vect = pd.concat([kijkcijfers, program_features_df], axis=1)
kijkcijfers_count_vect.sample(5)

Unnamed: 0,timestamp,program,duration_sec,viewers,temperature,rain,windspeed,snowfall,precipitation,weekday,...,vtm,wereld,wielrennen,wild,witse,wk,zake,zevende,zomer,zoo
54820,2024-05-05 11:00:04,DE ZEVENDE DAG,6980.0,235555,15.4,0.0,12.6,0.0,0.0,6,...,0,0,0,0,0,0,0,1,0,0
55546,2024-06-10 18:20:52,DE KOTMADAM,1454.0,154983,12.6,0.0,31.3,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
50210,2023-09-13 19:47:55,IEDEREEN BEROEMD,1298.0,636204,18.3,0.0,9.3,0.0,0.0,2,...,0,0,0,0,0,0,0,0,0,0
27150,2020-06-26 18:21:41,DE BUURTPOLITIE,1523.0,203571,23.7,0.3,10.1,0.0,0.3,4,...,0,0,0,0,0,0,0,0,0,0
13216,2018-07-28 13:38:05,THE GUARDIAN,2572.0,88126,23.7,0.3,26.1,0.0,0.3,5,...,0,0,0,0,0,0,0,0,0,0


### Target Encoding

In [13]:
# Target encoding toepassen op 'program'
from category_encoders import TargetEncoder

# Maak TargetEncoder aan
target_encoder = TargetEncoder()

# Fit en transform de target encoder
kijkcijfers_target_encoded = kijkcijfers.copy()

kijkcijfers_target_encoded['program'] = target_encoder.fit_transform(kijkcijfers['program'], kijkcijfers['viewers'])

kijkcijfers_target_encoded.sample(10)

Unnamed: 0,timestamp,program,duration_sec,viewers,temperature,rain,windspeed,snowfall,precipitation,weekday,...,weather_code_61.0,weather_code_63.0,weather_code_65.0,weather_code_71.0,weather_code_73.0,weather_code_75.0,season_herfst,season_lente,season_winter,season_zomer
55645,2024-06-18 13:41:38,1060872.0,1447.0,138906,17.5,2.2,4.7,0.0,2.2,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
34408,2021-07-10 18:08:46,427364.1,2708.0,302397,17.2,4.8,4.2,0.0,4.8,5,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
10464,2018-03-13 20:41:27,493232.2,2172.0,644211,4.5,0.0,6.5,0.0,0.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48778,2023-07-03 20:41:39,401587.0,1455.0,331791,17.9,0.0,27.8,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25275,2020-03-24 20:32:17,233464.3,2944.0,357702,5.7,0.0,12.9,0.0,0.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
56402,2024-07-30 21:15:03,205631.9,2474.0,223367,23.4,0.0,9.0,0.0,0.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
41961,2022-07-21 20:56:58,390798.0,10394.0,196276,18.3,0.0,14.3,0.0,0.0,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
13781,2018-08-26 19:56:53,483874.0,3005.0,694407,15.3,0.4,18.4,0.0,0.4,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
41450,2022-06-25 20:59:34,629580.9,2189.0,511474,15.6,1.4,8.6,0.0,1.4,5,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
53830,2024-03-13 20:55:51,590404.7,3852.0,901975,11.1,0.0,13.7,0.0,0.0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


## Correlaties

Bekijk de beste correlaties

In [14]:
data_te_num = kijkcijfers_target_encoded.select_dtypes(include=[np.number])
data_cv_num = kijkcijfers_count_vect.select_dtypes(include=[np.number])

corr_matrix_te = data_te_num.corr()
corr_matrix_cv = data_cv_num.corr()

corr_values_te = corr_matrix_te['viewers'].abs().sort_values(ascending=False)
corr_values_cv = corr_matrix_cv['viewers'].abs().sort_values(ascending=False)

print('TARGET ENCODING')
print('Correlatie met target:')
print(corr_values_te.head(25))

print('\nCOUNT VECTORIZER')
print('\nCorrelatie met target:')
print(corr_values_cv.head(25))

print(f"\nAantal kolommen COUNT VECTORIZER: {len(corr_values_cv)}")
print(f"Aantal kolommen TARGET ENCODING: {len(corr_values_te)}")

TARGET ENCODING
Correlatie met target:
viewers                           1.000000
program                           0.826051
channel_EEN                       0.336426
isPrimeTime                       0.291918
temperature                       0.187035
season_zomer                      0.166814
channel_Canvas                    0.154436
channel_CANVAS                    0.143160
channel_VTM                       0.131361
weekday                           0.126778
hour                              0.125685
isWeekend                         0.121478
season_winter                     0.110575
channel_VRT CANVAS                0.095733
duration_sec                      0.094682
channel_PLAY4                     0.093206
channel_VIER                      0.080846
channel_VTM2                      0.077282
channel_Q2                        0.072797
channel_VRT 1                     0.066492
channel_VITAYA                    0.064470
channel_PLAY5                     0.057432
channel_ELEVEN 

Beiden hebben mooie correlaties\
De dataframe met CountVectorizer heeft wel 100 kolommen meer, wat veel meer tijd nodig heeft om te trainen

## Dataframes wegschrijven

In [15]:
kijkcijfers_target_encoded.to_csv('./data/feature_eng/kijkcijfers_target_encoded.csv', index=False)
kijkcijfers_count_vect.to_csv('./data/feature_eng/kijkcijfers_count_vect.csv', index=False)

## Testen beide dataframes met simpele modellen

### Target Encoding

Ridge Regression: MAE = 102.813\
Random Forest Regressor: MAE = 71.491

In [16]:
from sklearn.model_selection import cross_val_score

def test_models(models, X, y, cv=10, scoring='neg_mean_absolute_error'):
    results = {}
    for model in models:
        model_name = type(model).__name__
        scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
        results[model_name] = {
            'scores': scores,
            'mean_score': scores.mean()
        }
        print(f"{model_name} - Cross-validation scores: {scores}")
        print(f"{model_name} - Mean cross-validation score: {scores.mean()}")
    return results

from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor

models = [Ridge(), RandomForestRegressor()]
X = data_te_num.drop(columns=['viewers'])
y = data_te_num['viewers']

results = test_models(models, X, y)

Ridge - Cross-validation scores: [-108600.28496379  -98300.26984898  -99281.89565507  -92713.99855419
 -121812.62437561 -103517.49063506  -96356.32145911 -107685.16745567
  -97685.50905503 -102181.68698241]
Ridge - Mean cross-validation score: -102813.5248984918
RandomForestRegressor - Cross-validation scores: [-74926.15046108 -65093.31820691 -65186.5337878  -59892.69414477
 -92471.05905305 -72678.72697405 -65453.10635928 -84123.35576103
 -65662.36347438 -69554.67848264]
RandomForestRegressor - Mean cross-validation score: -71504.19867050005


### CountVectorizer

Ridge Regression: MAE = 113.126\
Random Forest Regressor: MAE = 79.768

In [17]:
from sklearn.model_selection import cross_val_score

def test_models(models, X, y, cv=10, scoring='neg_mean_absolute_error'):
    results = {}
    for model in models:
        model_name = type(model).__name__
        scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
        results[model_name] = {
            'scores': scores,
            'mean_score': scores.mean()
        }
        print(f"{model_name} - Cross-validation scores: {scores}")
        print(f"{model_name} - Mean cross-validation score: {scores.mean()}")
    return results

from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor

models = [Ridge(), RandomForestRegressor()]
X = data_cv_num.drop(columns=['viewers'])
y = data_cv_num['viewers']

results = test_models(models, X, y)

Ridge - Cross-validation scores: [-112859.75209956 -109726.392665   -110969.64126099 -107074.21096346
 -133824.63317509 -112977.27891945 -108836.81566171 -120204.70282941
 -106709.80090811 -108233.59342663]
Ridge - Mean cross-validation score: -113141.6821909416
RandomForestRegressor - Cross-validation scores: [ -80696.45099006  -68693.45436154  -73086.28450752  -67365.20761466
 -102234.80552471  -81261.93015948  -74369.45028196  -95169.23747314
  -74061.88308099  -80137.06959504]
RandomForestRegressor - Mean cross-validation score: -79707.57735890974


### Conclusie

De Target Encoded (TE) trainingset heeft bij trainen met zowel Ridge- als RandomForest Regression een lagere MAE dan dezelfde modellen getraind met de CountVectorized (CV) data.\
\
Daarnaast heeft de TE data een veel lagere dimensionaliteit dan de CV data en kost het trainen veel minder tijd.