In [1]:
import pandas as pd
from helpers.data_loader import *
from helpers.metrics import train_and_evaluate_model
from model_composer.model_builders import *
from model_composer.prepare import *
from helpers.logger import *
from sklearn.model_selection import train_test_split

In [3]:
# Column the models are trained to forecast.
target = 'temp'

# Model labels and their builder callables; the two lists are index-aligned,
# so entry i of `model_names` labels the model built by entry i of `model_func`.
model_names = ['xgb', 'gbm']
model_func = [create_xgboost_model, create_lightgbm_model]

# Look-back window lengths (in rows) to sweep over.
window_sizes = [7, 14, 30, 60, 180, 365]


In [4]:
def create_sliding_window_features(df, target_column, window_size):
    """Build sliding-window samples for one-step-ahead forecasting.

    Sample ``i`` holds the non-target columns of rows
    ``i .. i + window_size - 1``; its label is ``target_column`` at row
    ``i + window_size``.

    Args:
        df: DataFrame in time order that contains ``target_column``.
        target_column: Name of the column to predict; it is excluded from the
            window features.
        window_size: Number of consecutive rows per window (non-negative).

    Returns:
        Tuple ``(X, y)``. ``X`` has shape
        ``(len(df) - window_size, window_size, n_features)`` and ``y`` has
        shape ``(len(df) - window_size,)``. When ``df`` has no more rows than
        ``window_size`` both arrays are empty but keep those ranks, so callers
        can still unpack ``X.shape`` into three values.

    Raises:
        ValueError: If ``window_size`` is negative.
    """
    if window_size < 0:
        raise ValueError(f"window_size must be non-negative, got {window_size}")

    # Drop the target once up front instead of once per window.
    feature_values = df.drop(columns=[target_column]).to_numpy()
    target_values = df[target_column].to_numpy()

    n_windows = len(df) - window_size
    if n_windows <= 0:
        # Not enough rows for a single (window, label) pair.
        empty_X = np.empty((0, window_size, feature_values.shape[1]), dtype=feature_values.dtype)
        empty_y = np.empty((0,), dtype=target_values.dtype)
        return empty_X, empty_y

    X = np.stack([feature_values[start:start + window_size] for start in range(n_windows)])
    # The label of window `start` is the target at row `start + window_size`;
    # copy so the result does not alias the frame's underlying data.
    y = target_values[window_size:].copy()
    return X, y

In [5]:
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test, model_name, window_size, hyperparams=None):
    """Fit ``model`` (optionally after a grid search) and score it on the test split.

    Args:
        model: Unfitted estimator exposing ``fit`` and ``predict``.
        X_train: Training features.
        y_train: Training targets.
        X_test: Held-out features used for the metrics.
        y_test: Held-out targets used for the metrics.
        model_name: Label copied into the returned metrics.
        window_size: Window length copied into the returned metrics.
        hyperparams: Optional parameter grid. When given, a 3-fold
            ``GridSearchCV`` scored by negative MSE selects the estimator.

    Returns:
        Tuple ``(model, metrics)``: the fitted estimator and a dict with keys
        ``model_name``, ``window_size``, ``rmse``, ``mae``, ``smape``, ``r2``,
        ``forecast_bias`` and ``training_time`` (seconds).
    """
    if hyperparams is not None:
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=hyperparams,
            scoring='neg_mean_squared_error',
            cv=3,
            verbose=1,
        )
        grid_search.fit(X_train, y_train)

        # With the default refit=True the search has already refit the best
        # configuration on the full training set and timed that refit, so a
        # second fit of the same estimator would only repeat the work.
        model = grid_search.best_estimator_
        training_time = grid_search.refit_time_
    else:
        start_time = time.time()
        model.fit(X_train, y_train)
        training_time = time.time() - start_time

    # Coerce both sides to flat float arrays so lists, Series and column
    # vectors are all accepted.
    y_pred = np.asarray(model.predict(X_test), dtype=float).ravel()
    y_true = np.asarray(y_test, dtype=float).ravel()

    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)

    # A zero denominator means prediction and truth are both exactly zero;
    # count that as a perfect match (0) instead of letting 0/0 produce NaN.
    denominator = np.abs(y_pred) + np.abs(y_true)
    smape_terms = np.divide(
        2 * np.abs(y_pred - y_true),
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    smape = 100 * np.mean(smape_terms)

    r2 = r2_score(y_true, y_pred)
    # Positive bias means the model over-predicts on average.
    forecast_bias = np.mean(y_pred - y_true)

    metrics = {
        'model_name': model_name,
        'window_size': window_size,
        'rmse': rmse,
        'mae': mae,
        'smape': smape,
        'r2': r2,
        'forecast_bias': forecast_bias,
        'training_time': training_time
    }

    return model, metrics
        

In [None]:


# Read the raw CSV and pass it through the project's preprocessing step.
data = pd.read_csv('../data/Germany_20140101_20231231.csv')
df = preprocess_df(data)

# Baseline sweep: every model builder against every window size, with the
# builders' default hyperparameters.
for index, model_name in enumerate(model_names):
    build_model = model_func[index]
    for window_size in window_sizes:
        X, y = create_sliding_window_features(df, target, window_size)

        # Flatten each (window, n_features) sample into one feature row.
        n_samples, window, n_features = X.shape
        X = X.reshape(n_samples, window * n_features)

        # shuffle=False keeps row order, so the last 30% of samples form the test set.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, shuffle=False, random_state=42
        )

        model = build_model()
        model, metrics = train_and_evaluate_model(
            model, X_train, y_train, X_test, y_test, model_name, window_size
        )
        log(metrics, 'noTuning')

In [None]:
# Grid explored by GridSearchCV for every model / window-size combination.
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [3, 5],
    'learning_rate': [0.1, 0.01],
    'subsample': [0.8, 1.0]
}

# Tuned sweep: same loop as the baseline run, but with a grid search per fit.
for index, model_name in enumerate(model_names):
    for window_size in window_sizes:
        X, y = create_sliding_window_features(df, target, window_size)

        # Flatten each (window, n_features) sample into one feature row.
        n_samples, window, n_features = X.shape
        X = X.reshape((n_samples, window * n_features))

        # shuffle=False keeps row order, so the last 30% of samples form the test set.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=False)
        model = model_func[index]()
        model, metrics = train_and_evaluate_model(model, X_train, y_train, X_test, y_test, model_name, window_size, hyperparams=param_grid)
        # Log under 'Tuning' so grid-searched results are not mixed into the
        # 'noTuning' baseline results.
        log(metrics, 'Tuning')

In [2]:
import pandas as pd
from helpers.logger import *
# Relative path of the source CSV.
file_path = '../data/Germany_20140101_20231231.csv'

# Read and preprocess in one expression; `df` ends up as the preprocessed frame.
df = preprocess_df(pd.read_csv(file_path))

In [3]:
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [4]:
# Number of consecutive rows per sliding window for the single-model cells
# that follow (passed to create_sliding_window_features).
window_size = 7

In [None]:
def create_sliding_window_features(df, target_column, window_size):
    """Build sliding-window samples for one-step-ahead forecasting.

    Sample ``i`` holds the non-target columns of rows
    ``i .. i + window_size - 1``; its label is ``target_column`` at row
    ``i + window_size``.

    Args:
        df: DataFrame in time order that contains ``target_column``.
        target_column: Name of the column to predict; it is excluded from the
            window features.
        window_size: Number of consecutive rows per window (non-negative).

    Returns:
        Tuple ``(X, y)``. ``X`` has shape
        ``(len(df) - window_size, window_size, n_features)`` and ``y`` has
        shape ``(len(df) - window_size,)``. When ``df`` has no more rows than
        ``window_size`` both arrays are empty but keep those ranks, so callers
        can still unpack ``X.shape`` into three values.

    Raises:
        ValueError: If ``window_size`` is negative.
    """
    if window_size < 0:
        raise ValueError(f"window_size must be non-negative, got {window_size}")

    # Drop the target once up front instead of once per window.
    feature_values = df.drop(columns=[target_column]).to_numpy()
    target_values = df[target_column].to_numpy()

    n_windows = len(df) - window_size
    if n_windows <= 0:
        # Not enough rows for a single (window, label) pair.
        empty_X = np.empty((0, window_size, feature_values.shape[1]), dtype=feature_values.dtype)
        empty_y = np.empty((0,), dtype=target_values.dtype)
        return empty_X, empty_y

    X = np.stack([feature_values[start:start + window_size] for start in range(n_windows)])
    # The label of window `start` is the target at row `start + window_size`;
    # copy so the result does not alias the frame's underlying data.
    y = target_values[window_size:].copy()
    return X, y

# Forecast target: the 'temp' column (taken here to be the average temperature).
target_column = 'temp'
X, y = create_sliding_window_features(df, target_column, window_size)

# Collapse each (window, n_features) sample into a single flat feature row.
n_samples, window, n_features = X.shape
X = X.reshape(n_samples, window * n_features)

# 70/30 hold-out without shuffling, so the test set is the tail of the series.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=False, random_state=42
)

# Instantiate an XGBoost regressor with default settings; it is not fitted in this cell.
model = XGBRegressor()

In [26]:
# Train/validation/test split via the project helper, using a 14-row window.
# NOTE(review): this rebinds y_train and y_test, which the earlier
# train_test_split cell also assigns, while X_train / X_test (upper-case) are
# left untouched. Cells that later pair X_train with y_train may then mix two
# different splits; confirm the intended variables.
# NOTE(review): the window is hard-coded to 14 here whereas `window_size` is 7
# elsewhere in the notebook; confirm this is deliberate.
x_train, y_train, x_val, y_val, x_test, y_test = split_time_series_data(df, "temp", window_size=14)

In [27]:
# Fresh, unfitted XGBoost regressor with default hyperparameters.
model = XGBRegressor()

In [16]:
# Display the flattened training matrix.
# NOTE(review): the recorded output reports dtype=object, which presumably
# means a non-numeric column survives preprocessing; confirm and cast to float
# before training if so.
X_train


array([[7.0, 2.0, 4.5, ..., 8.566666666666666, 16.3, 7.731944444444444],
       [9.0, 2.0, 6.0, ..., 8.55, 16.316666666666666, 7.764444444444444],
       [10.0, 4.0, 7.1, ..., 8.55, 16.35, 7.798611111111111],
       ...,
       [8.2, -7.4, 5.4, ..., 8.066666666666666, 16.45, 8.395277777777778],
       [9.2, 7.0, 6.3, ..., 8.066666666666666, 16.466666666666665,
        8.407777777777778],
       [9.5, 3.1, 7.8, ..., 8.066666666666666, 16.483333333333334,
        8.422222222222222]], dtype=object)

In [8]:
import time
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV

def train_and_evaluate_model(model, X_train, y_train, X_test, y_test, model_name, window_size, hyperparams=None):
    """Fit ``model`` (optionally after a grid search) and score it on the test split.

    Args:
        model: Unfitted estimator exposing ``fit`` and ``predict``.
        X_train: Training features.
        y_train: Training targets.
        X_test: Held-out features used for the metrics.
        y_test: Held-out targets used for the metrics.
        model_name: Label copied into the returned metrics.
        window_size: Window length copied into the returned metrics.
        hyperparams: Optional parameter grid. When given, a 3-fold
            ``GridSearchCV`` scored by negative MSE selects the estimator.

    Returns:
        Tuple ``(model, metrics)``: the fitted estimator and a dict with keys
        ``model_name``, ``window_size``, ``rmse``, ``mae``, ``smape``, ``r2``,
        ``forecast_bias`` and ``training_time`` (seconds).
    """
    if hyperparams is not None:
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=hyperparams,
            scoring='neg_mean_squared_error',
            cv=3,
            verbose=1,
        )
        grid_search.fit(X_train, y_train)

        # With the default refit=True the search has already refit the best
        # configuration on the full training set and timed that refit, so a
        # second fit of the same estimator would only repeat the work.
        model = grid_search.best_estimator_
        training_time = grid_search.refit_time_
    else:
        start_time = time.time()
        model.fit(X_train, y_train)
        training_time = time.time() - start_time

    # Coerce both sides to flat float arrays so lists, Series and column
    # vectors are all accepted.
    y_pred = np.asarray(model.predict(X_test), dtype=float).ravel()
    y_true = np.asarray(y_test, dtype=float).ravel()

    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)

    # A zero denominator means prediction and truth are both exactly zero;
    # count that as a perfect match (0) instead of letting 0/0 produce NaN.
    denominator = np.abs(y_pred) + np.abs(y_true)
    smape_terms = np.divide(
        2 * np.abs(y_pred - y_true),
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    smape = 100 * np.mean(smape_terms)

    r2 = r2_score(y_true, y_pred)
    # Positive bias means the model over-predicts on average.
    forecast_bias = np.mean(y_pred - y_true)

    metrics = {
        'model_name': model_name,
        'window_size': window_size,
        'rmse': rmse,
        'mae': mae,
        'smape': smape,
        'r2': r2,
        'forecast_bias': forecast_bias,
        'training_time': training_time
    }

    return model, metrics

In [9]:
# Baseline run with default hyperparameters. `model_name` and `window_size`
# are required by train_and_evaluate_model, so they are passed explicitly.
# NOTE(review): 'XGBoost' is chosen to match the directory in the recorded log
# output; confirm that log() derives the path from metrics['model_name'].
# NOTE(review): y_train / y_test are also assigned by the
# split_time_series_data cell; confirm they still pair with X_train / X_test.
_, metrics = train_and_evaluate_model(
    model, X_train, y_train, X_test, y_test,
    model_name='XGBoost', window_size=window_size,
)
log(metrics, 'noTuning')

# Grid explored by GridSearchCV for the tuned run (2^5 = 32 candidates).
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [3, 5],
    'learning_rate': [0.1, 0.01],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Tuned run: same data and label, with the grid search enabled.
_, metrics = train_and_evaluate_model(
    model, X_train, y_train, X_test, y_test,
    model_name='XGBoost', window_size=window_size,
    hyperparams=param_grid,
)
log(metrics, 'Tuning')



Metrics have been logged to: ..\results\noTuning\XGBoost\metrics.csv
Fitting 3 folds for each of 32 candidates, totalling 96 fits
Metrics have been logged to: ..\results\Tuning\XGBoost\metrics.csv
