In [46]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import numpy as np


In [47]:
data = pd.read_csv('../raw_data/hourly_final_dataset.csv')
df = data


In [48]:
print(data.head())


             timestamp  temperature_2m (°C)  relative_humidity_2m (%)  \
0  2021-01-01 00:00:00                 -1.4                        97   
1  2021-01-01 01:00:00                 -0.1                        97   
2  2021-01-01 02:00:00                 -0.9                        97   
3  2021-01-01 03:00:00                 -1.5                        97   
4  2021-01-01 04:00:00                 -1.4                        97   

   apparent_temperature (°C)  precipitation (mm)  rain (mm)  \
0                       -4.7                 0.0        0.0   
1                       -3.2                 0.0        0.0   
2                       -3.9                 0.0        0.0   
3                       -4.7                 0.0        0.0   
4                       -4.5                 0.0        0.0   

   wind_speed_10m (km/h)  wind_speed_100m (km/h)  day_of_week_sin  \
0                    6.1                    11.6        -0.433884   
1                    6.1                    

In [49]:
feature_cols = df[["temperature_2m (°C)", "relative_humidity_2m (%)", "apparent_temperature (°C)", "precipitation (mm)", "rain (mm)", "wind_speed_10m (km/h)", "wind_speed_100m (km/h)", "day_of_week_sin",	"day_of_week_cos", "month_sin",	"month_cos", "isHoliday"]]
target_col = df[["traditional_baguette", "croissant","coupe","pain_au_chocolat","baguette","banette","cereal_baguette"]]


In [50]:
X = feature_cols
y = target_col


In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [52]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [53]:
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.01,
    'feature_fraction': 0.9,
}


In [54]:
train_data = lgb.Dataset(X_train_scaled, label=y_train)
test_data = lgb.Dataset(X_test_scaled, label=y_test, reference=train_data)


In [55]:
num_round = 10000
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_rmse = cross_val_score(lgb.LGBMRegressor(**params, num_boost_round=num_round), X_train_scaled, y_train.values.ravel(), scoring='neg_root_mean_squared_error', cv=kf)
print(f'Cross-validated RMSE: {np.mean(-cv_rmse)}')


ValueError: Found input variables with inconsistent numbers of samples: [12249, 85743]

In [None]:
model = lgb.train(params, train_data, num_boost_round=num_round, valid_sets=[test_data])


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002382 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1225
[LightGBM] [Info] Number of data points in the train set: 12249, number of used features: 12
[LightGBM] [Info] Start training from score 7.764144


In [None]:
y_pred = model.predict(X_test_scaled, num_iteration=model.best_iteration)


In [None]:
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'Final Test RMSE: {rmse}')


Final Test RMSE: 14.039546600309533
