# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import RFECV, RFE
import xgboost

from skopt import BayesSearchCV
from skopt.plots import plot_objective
from skopt.space import Real, Categorical, Integer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MaxAbsScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import PredefinedSplit


import optuna

import sklearn.metrics
from sklearn.model_selection import train_test_split
import xgboost as xgb
from helpers import *

from sklearn.model_selection import cross_val_score

# auto reloading library (mainly for altering helpers.py)
%load_ext autoreload
%autoreload 2


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)


In [3]:
drop_cols = ['time', 'date_forecast', 'snow_density:kgm3', 'date_calc', 'monthYear', 'dayMonthYear']


In [4]:
"""
Optuna example that optimizes a classifier configuration for cancer dataset
using XGBoost.

In this example, we optimize the validation accuracy of cancer detection
using XGBoost. We optimize both the choice of booster model and its
hyperparameters.

"""

def objective(trial):
    # Load data
    Xy_train, _ = get_splitted_data()

    # Add features
    Xy_train = add_features(Xy_train.copy())

    X_train, y_train = split_Xy_X_y(Xy_train)

    # shuffle data
    if trial.suggest_categorical('shuffle', [True, False]):
        X_train = X_train.sample(frac=1, random_state=42).reset_index(drop=True)

    # drop columns
    X_train = X_train.drop(columns=drop_cols,errors='ignore')

    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    impute_features = X_train.loc[:, X_train.isna().any()].columns.tolist()



    # set column transformer
    columnTransformer = ColumnTransformer(
        transformers=[
            ('imputer', SimpleImputer(strategy='constant'),impute_features),
            ('oneHotEncoder', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ],
        remainder='passthrough',  # Dont drop remaining columns
        n_jobs=-1
    )

    # build the pipeline
    pipeline = Pipeline(steps=[
        ('columnTransformer', columnTransformer),
        ('statusSaver', StatusSaver()),
        ('estimator', xgboost.XGBRegressor(
            random_state=42,
            learning_rate=0.1,
            max_depth=6,
            reg_alpha=8,
            reg_lambda=5,
            n_estimators=trial.suggest_int('n_estimators', 100, 1000, 100),
            colsample_bytree=1,
            min_child_weight=3,
            ))
    ])
    scores = cross_val_score(pipeline, X_train, y_train, cv=5, error_score='raise')
    return scores.mean()

if __name__ == "__main__":
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=100, timeout=60)

    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))


[I 2023-11-02 09:32:26,636] A new study created in memory with name: no-name-d7c72334-dbc1-4cb5-bdea-7a66762ce2a6


[I 2023-11-02 09:32:45,537] Trial 0 finished with value: -2.273872243769831 and parameters: {'shuffle': True, 'n_estimators': 600}. Best is trial 0 with value: -2.273872243769831.
[I 2023-11-02 09:33:00,432] Trial 1 finished with value: -2.1234206030627414 and parameters: {'shuffle': True, 'n_estimators': 500}. Best is trial 1 with value: -2.1234206030627414.
[I 2023-11-02 09:33:15,099] Trial 2 finished with value: 0.7295621726345344 and parameters: {'shuffle': False, 'n_estimators': 500}. Best is trial 2 with value: 0.7295621726345344.
[I 2023-11-02 09:33:33,819] Trial 3 finished with value: 0.706877614107545 and parameters: {'shuffle': False, 'n_estimators': 700}. Best is trial 2 with value: 0.7295621726345344.


Number of finished trials:  4
Best trial:
  Value: 0.7295621726345344
  Params: 
    shuffle: False
    n_estimators: 500


# Test model on test data

In [5]:
# predict on estimated data
m1_pred = pd.Series(m1.predict(X_test))
t=m1_pred.copy()
#m1_pred = pd.Series(full_scaler.inverse_transform(m1_pred.values.reshape(-1, 1)).flatten())
m1_pred = y_scaler.inverse_transform(m1_pred, X_test['building_id'])
Xy_test['m1_pred'] = m1_pred

# calculate abs diff
Xy_test['abs_diff'] = np.abs(Xy_test['pv_measurement'] - Xy_test['m1_pred'])
Xy_test['diff'] = (Xy_test['pv_measurement'] - Xy_test['m1_pred'])

# calculate mae
mae = Xy_test['abs_diff'].mean()
print('MAE:', mae)


NameError: name 'm1' is not defined

In [None]:
sns.lineplot(data=Xy_test, x='time', y='pv_measurement', hue='building_id', legend=False)
plt.xticks(rotation=90);


In [None]:
sns.lineplot(data=Xy_test, x='time', y='diff', hue='building_id', legend=False)
plt.xticks(rotation=90);


In [None]:
# Creating the submission file
m1.fit(X, y)

# prepare dataframes
y_test_pred = pd.Series(m1.predict(X_submission))
# y_test_pred = pd.Series(full_scaler.inverse_transform(
#     y_test_pred.values.reshape(-1, 1)).flatten())
#y_test_pred = y_scaler.inverse_transform(y_test_pred, X_t['building_id']).copy()

# remove negative predictions
y_test_pred.iloc[y_test_pred < 0] = 0

# rename columns etc.
y_test_pred = y_test_pred.reset_index().rename(
    columns={'pv_measurement': 'prediction', 'index': 'id'})

# save submission file
y_test_pred.to_csv(
    'feature_extraction.csv', index=False, header=True)


#