In [1]:
import pandas as pd
from helpers.data_loader import *
from helpers.metrics import train_and_evaluate_model, mean_squared_error
from model_composer.model_builders import *
from model_composer.prepare import *
from helpers.logger import *
import time
from sklearn.model_selection import train_test_split

In [2]:
target = 'temp'
model_names = [ 'xgb', 'gbm']
model_func = [ create_xgboost_model, create_lightgbm_model]
window_sizes = [7, 14, 30, 60, 180, 365]


In [3]:
def create_sliding_window_features(df, target_column, window_size):
    X, y = [], []
    for i in range(len(df) - window_size):
        X.append(df.drop(columns=[target_column]).iloc[i:i + window_size])
        y.append(df.iloc[i + window_size][target_column])
    return np.array(X), np.array(y)

In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score


def train_and_evaluate_model(model, X_train, y_train, X_test, y_test, model_name, window_size, hyperparams=None):
    # Record start time
    if hyperparams is not None:
        grid_search = GridSearchCV(estimator=model, param_grid=hyperparams,scoring='neg_mean_squared_error', cv=3, verbose=1)

        grid_search.fit(X_train, y_train)

        model = grid_search.best_estimator_

        start_time = time.time()
        model.fit(X_train, y_train)
        training_time = time.time() - start_time
    else:

        start_time = time.time()

        # Fit the model
        model.fit(X_train, y_train)

        # Calculate training time
        training_time = time.time() - start_time

    # Make predictions
    y_pred = model.predict(X_test)

    # Ensure y_pred is 1D
    y_pred = y_pred.flatten()
    y_test = y_test.flatten()

    # Calculate metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    smape = 100 * np.mean(2 * np.abs(y_pred - y_test) / (np.abs(y_pred) + np.abs(y_test)))
    r2 = r2_score(y_test, y_pred)
    forecast_bias = np.mean(y_pred - y_test)

    # Compile metrics dictionary
    metrics = {
        'model_name': model_name,
        'window_size': window_size,
        'rmse': rmse,
        'mae': mae,
        'smape': smape,
        'r2': r2,
        'forecast_bias': forecast_bias,
        'training_time': training_time
    }

    return model, metrics
        

In [5]:


# Load the data
data = pd.read_csv('../../data/Germany_20140101_20231231.csv')
df = preprocess_df(data)



for index, model_name in enumerate(model_names):
    for window_size in window_sizes:
        X, y = create_sliding_window_features(df, target, window_size)
        n_samples, window, n_features = X.shape
        X = X.reshape((n_samples, window * n_features))
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=False)
        model = model_func[index]()
        model, metrics = train_and_evaluate_model(model, X_train, y_train, X_test, y_test, model_name, window_size)
        log(metrics, 'noTuning')

Metrics have been logged to: ../results/noTuning/xgb/metrics.csv
Metrics have been logged to: ../results/noTuning/xgb/metrics.csv
Metrics have been logged to: ../results/noTuning/xgb/metrics.csv
Metrics have been logged to: ../results/noTuning/xgb/metrics.csv
Metrics have been logged to: ../results/noTuning/xgb/metrics.csv
Metrics have been logged to: ../results/noTuning/xgb/metrics.csv


python(31597) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003861 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 35336
[LightGBM] [Info] Number of data points in the train set: 10220, number of used features: 217
[LightGBM] [Info] Start training from score 10.654002
Metrics have been logged to: ../results/noTuning/gbm/metrics.csv
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007940 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 70672
[LightGBM] [Info] Number of data points in the train set: 10215, number of used features: 434
[LightGBM] [Info] Start training from score 10.657230
Metrics have been logged to: ../results/noTuning/gbm/metrics.csv
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017263 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] 

In [None]:
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'subsample': [0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2]
}

for index, model_name in enumerate(model_names):
    for window_size in window_sizes:
        X, y = create_sliding_window_features(df, target, window_size)
        n_samples, window, n_features = X.shape
        X = X.reshape((n_samples, window * n_features))
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=False)
        model = model_func[index]()
        model, metrics = train_and_evaluate_model(model, X_train, y_train, X_test, y_test, model_name, window_size, hyperparams=param_grid)
        log(metrics, 'tuning')

Fitting 3 folds for each of 324 candidates, totalling 972 fits
Metrics have been logged to: ../results/tuning/xgb/metrics.csv
Fitting 3 folds for each of 324 candidates, totalling 972 fits
Metrics have been logged to: ../results/tuning/xgb/metrics.csv
Fitting 3 folds for each of 324 candidates, totalling 972 fits
Metrics have been logged to: ../results/tuning/xgb/metrics.csv
Fitting 3 folds for each of 324 candidates, totalling 972 fits
