In [8]:
import pandas as pd
import numpy as np
import holidays

In [9]:
df = pd.read_csv('./data/processed/tv_kijkcijfers_weer.csv')
df.head()

Unnamed: 0,date,program,channel,startTime,duration,viewers,live,hour,temperature,weather_code,precipitation,rain,snowfall,cloudcover,windspeed
0,2016-10-01,HET 7 UUR-JOURNAAL,EEN,19:00:05,1898.0,721850,0,19,13.9,53.0,0.5,0.5,0.0,27.0,13.8
1,2016-10-01,FC DE KAMPIOENEN,EEN,20:41:00,2319.0,709606,0,20,12.8,1.0,0.0,0.0,0.0,24.0,14.9
2,2016-10-01,WEG ZIJN WIJ,EEN,20:13:36,1484.0,548239,0,20,12.8,1.0,0.0,0.0,0.0,24.0,14.9
3,2016-10-01,IEDEREEN BEROEMD,EEN,19:38:10,1741.0,523610,0,19,13.9,53.0,0.5,0.5,0.0,27.0,13.8
4,2016-10-01,COMEDY TOPPERS,VTM,19:52:06,1480.0,496216,0,19,13.9,53.0,0.5,0.5,0.0,27.0,13.8


In [11]:
# Date omzetten naar datetime
df['date'] = pd.to_datetime(df['date'])

# Date opsplitsen in verschillende kolommen
df['day_of_week'] = df['date'].dt.dayofweek
df['is_weekend'] = df['day_of_week'].apply(lambda x: 1 if x >= 5 else 0)
df['month'] = df['date'].dt.month
df['season'] = df['month'].apply(lambda x: 'Winter' if x in [12, 1, 2] else ('Spring' if x in [3, 4, 5] else ('Summer' if x in [6, 7, 8] else 'Fall')))

# Get Belgian holidays
be_holidays = holidays.BE(years=range(2015,2025))
holiday_dates = list(be_holidays.keys())

df['is_holiday'] = df['date'].isin(pd.to_datetime(holiday_dates)).astype(int)

df['startTime'] = pd.to_datetime(df['startTime'])

# startTime opsplitsen in verschillende kolommen
df['hour'] = df['startTime'].dt.hour

# prime_time flag
df['prime_time'] = df['hour'].apply(lambda x: 1 if 18 <= x <= 23 else 0)

# cyclical_time encoding
df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

# Display the updated dataframe
df.tail()

Unnamed: 0,date,program,channel,startTime,duration,viewers,live,hour,temperature,weather_code,...,cloudcover,windspeed,day_of_week,is_weekend,month,season,is_holiday,prime_time,hour_sin,hour_cos
60523,2025-02-25,MILO,VTM,2025-03-14 18:24:47,1346.0,194820,0,18,,,...,,,1,0,2,Winter,0,1,-1.0,-1.83697e-16
60524,2025-02-25,NIEUWS 13U VTM,VTM,2025-03-14 12:59:48,1361.0,194748,0,12,,,...,,,1,0,2,Winter,0,0,1.224647e-16,-1.0
60525,2025-02-25,TER ZAKE,VRT CANVAS,2025-03-14 20:00:03,2082.0,179528,0,20,,,...,,,1,0,2,Winter,0,1,-0.8660254,0.5
60526,2025-02-25,DOOD SPOOR,PLAY4,2025-03-14 21:22:41,2990.0,168685,0,21,,,...,,,1,0,2,Winter,0,1,-0.7071068,0.7071068
60527,2025-02-25,DE VUILSTE JOBS VAN VLAANDEREN,VTM,2025-03-14 21:59:07,2756.0,141432,0,21,,,...,,,1,0,2,Winter,0,1,-0.7071068,0.7071068


In [12]:
corr_matrix = df.corr(numeric_only=True)
corr_matrix

Unnamed: 0,duration,viewers,live,hour,temperature,weather_code,precipitation,rain,snowfall,cloudcover,windspeed,day_of_week,is_weekend,month,is_holiday,prime_time,hour_sin,hour_cos
duration,1.0,-0.094681,0.011955,-0.027832,0.038845,0.005166,0.003212,0.003111,0.001039,0.004361,-0.004014,0.166958,0.161593,0.013495,0.012499,-0.079177,0.05462,-0.01795
viewers,-0.094681,1.0,-0.042707,0.125677,-0.187035,0.015305,0.009152,0.007556,0.011399,0.021998,0.031959,-0.126727,-0.121445,-0.0357,0.007515,0.291901,-0.317873,0.130263
live,0.011955,-0.042707,1.0,-0.002037,0.013952,0.028477,0.049706,0.044032,0.042399,0.055248,-0.059321,-0.001323,0.002447,0.128167,-0.012244,0.006202,0.008009,0.003168
hour,-0.027832,0.125677,-0.002037,1.0,-0.154561,-0.111609,-0.035117,-0.035325,-0.002396,-0.083471,-0.135241,-0.121937,-0.131029,-0.007481,-0.024284,0.842725,-0.45821,0.968439
temperature,0.038845,-0.187035,0.013952,-0.154561,1.0,-0.057,-0.006953,0.004922,-0.078474,-0.072963,-0.094736,0.010804,0.01058,0.183707,0.02026,-0.129092,0.033152,-0.164306
weather_code,0.005166,0.015305,0.028477,-0.111609,-0.057,1.0,0.560771,0.545953,0.156733,0.34749,0.268816,0.014445,0.010982,-0.020396,-0.003493,-0.107599,0.046867,-0.118511
precipitation,0.003212,0.009152,0.049706,-0.035117,-0.006953,0.560771,1.0,0.988633,0.18084,0.181256,0.153865,0.015858,0.016498,0.012113,-0.004685,-0.03703,0.011015,-0.037825
rain,0.003111,0.007556,0.044032,-0.035325,0.004922,0.545953,0.988633,1.0,0.030918,0.178275,0.151472,0.015031,0.016322,0.015292,-0.003675,-0.037361,0.010957,-0.03819
snowfall,0.001039,0.011399,0.042399,-0.002396,-0.078474,0.156733,0.18084,0.030918,1.0,0.038831,0.032022,0.007133,0.002956,-0.019464,-0.007109,-0.001834,0.001528,-0.001668
cloudcover,0.004361,0.021998,0.055248,-0.083471,-0.072963,0.34749,0.181256,0.178275,0.038831,1.0,0.175221,0.01752,0.016396,0.021561,0.010037,-0.079336,0.011972,-0.090294


In [13]:
corr_matrix['viewers'].sort_values(ascending=False)

viewers          1.000000
prime_time       0.291901
hour_cos         0.130263
hour             0.125677
windspeed        0.031959
cloudcover       0.021998
weather_code     0.015305
snowfall         0.011399
precipitation    0.009152
rain             0.007556
is_holiday       0.007515
month           -0.035700
live            -0.042707
duration        -0.094681
is_weekend      -0.121445
day_of_week     -0.126727
temperature     -0.187035
hour_sin        -0.317873
Name: viewers, dtype: float64

In [20]:
df_num = df.select_dtypes(include=[np.number])
df_num.dropna(inplace=True)
df_num.head()

Unnamed: 0,duration,viewers,live,hour,temperature,weather_code,precipitation,rain,snowfall,cloudcover,windspeed,day_of_week,is_weekend,month,is_holiday,prime_time,hour_sin,hour_cos
0,1898.0,721850,0,19,13.9,53.0,0.5,0.5,0.0,27.0,13.8,5,1,10,0,1,-0.965926,0.258819
1,2319.0,709606,0,20,12.8,1.0,0.0,0.0,0.0,24.0,14.9,5,1,10,0,1,-0.866025,0.5
2,1484.0,548239,0,20,12.8,1.0,0.0,0.0,0.0,24.0,14.9,5,1,10,0,1,-0.866025,0.5
3,1741.0,523610,0,19,13.9,53.0,0.5,0.5,0.0,27.0,13.8,5,1,10,0,1,-0.965926,0.258819
4,1480.0,496216,0,19,13.9,53.0,0.5,0.5,0.0,27.0,13.8,5,1,10,0,1,-0.965926,0.258819


In [28]:
X = df_num.drop('viewers', axis=1)
y = df_num['viewers']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error

y_pred = lin_reg.predict(X_test)
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
print("Mean Absolute Percentage Error:", mean_absolute_percentage_error(y_test, y_pred))

Mean Absolute Error: 190616.47849954313
Mean Absolute Percentage Error: 0.5883095131934954


In [26]:
y.describe().apply(lambda x: format(x, 'f'))

count      60508.000000
mean      446055.258164
std       278183.653571
min        15887.000000
25%       229910.000000
50%       360262.000000
75%       606146.000000
max      2494114.000000
Name: viewers, dtype: object