In [5]:
import pandas as pd
import matplotlib.pylab as plt
# import torch 
from sklearn.metrics import mean_absolute_error
from preprocessing.preprocess_data import DataSet, ReLU, pred_to_delivery, make_categorical, remap
from autogluon.tabular import TabularPredictor

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Raise pandas' display limits so frames up to 200 rows / 200 columns are
# shown in full instead of being truncated.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)



In [6]:
# Columns handed to DataSet.select_features(). Order is kept exactly as
# originally written, one entry per line so additions/removals diff cleanly.
selected_features = [
    'date_forecast',
    'absolute_humidity_2m:gm3',
    'clear_sky_energy_1h:J',
    'clear_sky_rad:W',
    'cloud_base_agl:m',
    'dew_or_rime:idx',
    'dew_point_2m:K',
    'diffuse_rad:W',
    'diffuse_rad_1h:J',
    'direct_rad:W',
    'direct_rad_1h:J',
    'effective_cloud_cover:p',
    'elevation:m',
    'fresh_snow_12h:cm',
    'fresh_snow_1h:cm',
    'fresh_snow_24h:cm',
    'fresh_snow_3h:cm',
    'fresh_snow_6h:cm',
    'is_in_shadow:idx',
    'is_day:idx',
    'msl_pressure:hPa',
    'precip_5min:mm',
    'precip_type_5min:idx',
    'pressure_100m:hPa',
    'pressure_50m:hPa',
    'prob_rime:p',
    'rain_water:kgm2',
    'relative_humidity_1000hPa:p',
    'sfc_pressure:hPa',
    'snow_depth:cm',
    'snow_drift:idx',
    'snow_melt_10min:mm',
    'snow_water:kgm2',
    'sun_azimuth:d',
    'sun_elevation:d',
    'super_cooled_liquid_water:kgm2',
    't_1000hPa:K',
    'total_cloud_cover:p',
    'visibility:m',
    'wind_speed_10m:ms',
    'wind_speed_u_10m:ms',
    'wind_speed_v_10m:ms',
    'wind_speed_w_1000hPa:ms',
]

# Columns later passed to make_categorical(); every entry except 'location'
# and 'type' is also run through remap before that call.
made_features = [
    'location',
    'type',
    'is_day:idx',
    'is_in_shadow:idx',
    'dew_or_rime:idx',
]

# Column name passed to DataSet.remove_nans().
drop_feature = 'diffuse_rad:W'

In [7]:
# Build the modelling dataset with the project's DataSet helper. The steps run
# in exactly this order; their implementations live in
# preprocessing.preprocess_data and are not visible from this notebook, so the
# per-step notes below are inferred from the method names — TODO confirm.
data_collection = DataSet()
# Keep only the columns listed in selected_features.
data_collection.select_features(selected_features)
# Presumably aggregates the weather rows to one row per hour.
data_collection.resample_to_hourly()
# Called with drop_feature ('diffuse_rad:W'); presumably drops rows where that column is NaN.
data_collection.remove_nans(drop_feature)
# Presumably add the 'location' and 'type' columns that later cells rely on.
data_collection.add_location()
data_collection.add_type()

# Presumably merges observed and estimated weather into one training frame per site.
data_collection.combine_obs_est()
data_collection.drop_bad_data()
data_collection.cyclic_time_encoding()

In [8]:
# Per-site training features and targets, read from the DataSet under keys 'a', 'b', 'c'.
X_a, X_b, X_c = (data_collection.X_train[site] for site in ('a', 'b', 'c'))
y_a, y_b, y_c = (data_collection.Y_train[site] for site in ('a', 'b', 'c'))

# Every made feature except 'location' and 'type' is passed through remap first.
# NOTE(review): these assignments write into the frames held by
# data_collection.X_train, so re-running this cell applies remap a second time.
remapped_cols = [col for col in made_features if col not in ('location', 'type')]
for col in remapped_cols:
    for frame in (X_a, X_b, X_c):
        frame[col] = frame[col].map(remap)

# Then hand all made features to make_categorical, one site frame at a time.
for frame in (X_a, X_b, X_c):
    make_categorical(frame, made_features)


In [9]:

# Columns removed from every combined training frame.
drop_cols = ['location', 'time']

# For each site: place features and target side by side (column-wise concat),
# then drop the columns listed above.
df_a, df_b, df_c = (
    pd.concat([features, target], axis=1).drop(columns=drop_cols)
    for features, target in ((X_a, y_a), (X_b, y_b), (X_c, y_c))
)


In [10]:
seed = 246


def _split_off_tuning(frame):
    """Split one site's frame into ``[train, tune]``.

    ``tune`` is a seeded 50% sample of the rows where ``type == 0``; ``train``
    is ``frame`` with those index labels dropped.

    NOTE(review): ``drop`` removes by index label, so this assumes the frame's
    index is unique — confirm, otherwise extra rows leave the training set.
    """
    tune = frame[frame['type'] == 0].sample(frac=0.5, random_state=seed)
    return [frame.drop(tune.index), tune]


df_a_train, df_a_tune = _split_off_tuning(df_a)
df_b_train, df_b_tune = _split_off_tuning(df_b)
df_c_train, df_c_tune = _split_off_tuning(df_c)

# Per-site [train, tune] pairs consumed by the fit cells below.
data = {
    'a': [df_a_train, df_a_tune],
    'b': [df_b_train, df_b_tune],
    'c': [df_c_train, df_c_tune],
}

# Model

In [11]:
# Time budget per predictor fit, in seconds: 60*60*2 = 7200 s, i.e. 2 hours
# (not 3). The fit logs recorded below show limits of 60s / 3600s, so they
# come from runs made with a different value of this setting.
time_in_sek = 60*60*2



In [12]:
label = 'pv_measurement'

# Site A: bagged (8 folds), non-stacked AutoGluon fit scored on MAE. The tune
# split is supplied as tuning_data together with use_bag_holdout=True.
train_a, tune_a = data['a']
predictor_a = TabularPredictor(label=label, eval_metric='mae').fit(
    train_data=train_a,
    tuning_data=tune_a,
    use_bag_holdout=True,
    presets='best_quality',
    num_bag_folds=8,
    num_stack_levels=0,
    time_limit=time_in_sek,
)

No path specified. Models will be saved in: "AutogluonModels/ag-20231109_161927/"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=0, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 60s
AutoGluon will save models to "AutogluonModels/ag-20231109_161927/"
AutoGluon Version:  0.8.2
Python Version:     3.10.7
Operating System:   Darwin
Platform Machine:   x86_64
Platform Version:   Darwin Kernel Version 19.6.0: Thu Jan 13 01:26:33 PST 2022; root:xnu-6153.141.51~3/RELEASE_X86_64
Disk Space Avail:   246.98 GB / 500.07 GB (49.4%)
Train Data Rows:    31864
Train Data Columns: 47
Tuning Data Rows:    2197
Tuning Data Columns: 47
Label Column: pv_measurement
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (5733.42, 0.0, 650.65332, 1179.83452)
	If 'regression' is not the corr

[1000]	valid_set's l1: 170.555
[2000]	valid_set's l1: 163.36


		_format_eval_result() missing 1 required positional argument: 'show_stdv'
Detailed Traceback:
Traceback (most recent call last):
  File "/Users/johanvikmathisen/Desktop/Fag/Matematikk/Solar-Energy-Prediction/myenv/lib/python3.10/site-packages/autogluon/core/trainer/abstract_trainer.py", line 1733, in _train_and_save
    model = self._train_single(X, y, model, X_val, y_val, total_resources=total_resources, **model_fit_kwargs)
  File "/Users/johanvikmathisen/Desktop/Fag/Matematikk/Solar-Energy-Prediction/myenv/lib/python3.10/site-packages/autogluon/core/trainer/abstract_trainer.py", line 1684, in _train_single
    model = model.fit(X=X, y=y, X_val=X_val, y_val=y_val, total_resources=total_resources, **model_fit_kwargs)
  File "/Users/johanvikmathisen/Desktop/Fag/Matematikk/Solar-Energy-Prediction/myenv/lib/python3.10/site-packages/autogluon/core/models/abstract/abstract_model.py", line 829, in fit
    out = self._fit(**kwargs)
  File "/Users/johanvikmathisen/Desktop/Fag/Matematikk/Sola

[1000]	valid_set's l1: 172


		_format_eval_result() missing 1 required positional argument: 'show_stdv'
Detailed Traceback:
Traceback (most recent call last):
  File "/Users/johanvikmathisen/Desktop/Fag/Matematikk/Solar-Energy-Prediction/myenv/lib/python3.10/site-packages/autogluon/core/trainer/abstract_trainer.py", line 1733, in _train_and_save
    model = self._train_single(X, y, model, X_val, y_val, total_resources=total_resources, **model_fit_kwargs)
  File "/Users/johanvikmathisen/Desktop/Fag/Matematikk/Solar-Energy-Prediction/myenv/lib/python3.10/site-packages/autogluon/core/trainer/abstract_trainer.py", line 1684, in _train_single
    model = model.fit(X=X, y=y, X_val=X_val, y_val=y_val, total_resources=total_resources, **model_fit_kwargs)
  File "/Users/johanvikmathisen/Desktop/Fag/Matematikk/Solar-Energy-Prediction/myenv/lib/python3.10/site-packages/autogluon/core/models/abstract/abstract_model.py", line 829, in fit
    out = self._fit(**kwargs)
  File "/Users/johanvikmathisen/Desktop/Fag/Matematikk/Sola

In [19]:
# Site B: same fit configuration as the other sites (8-fold bagging, no
# stacking, MAE), with the tune split supplied as tuning_data.
train_b, tune_b = data['b']
predictor_b = TabularPredictor(label=label, eval_metric='mae').fit(
    train_data=train_b,
    tuning_data=tune_b,
    use_bag_holdout=True,
    presets='best_quality',
    num_bag_folds=8,
    num_stack_levels=0,
    time_limit=time_in_sek,
)

No path specified. Models will be saved in: "AutogluonModels/ag-20231108_154110/"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=0, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "AutogluonModels/ag-20231108_154110/"
AutoGluon Version:  0.8.2
Python Version:     3.10.7
Operating System:   Darwin
Platform Machine:   x86_64
Platform Version:   Darwin Kernel Version 19.6.0: Thu Jan 13 01:26:33 PST 2022; root:xnu-6153.141.51~3/RELEASE_X86_64
Disk Space Avail:   250.56 GB / 500.07 GB (50.1%)
Train Data Rows:    31019
Train Data Columns: 39
Tuning Data Rows:    1800
Tuning Data Columns: 39
Label Column: pv_measurement
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (1152.3, -0.0, 99.69624, 196.54802)
	If 'regression' is not the corr

In [20]:
# Site C: same fit configuration as the other sites (8-fold bagging, no
# stacking, MAE), with the tune split supplied as tuning_data.
train_c, tune_c = data['c']
predictor_c = TabularPredictor(label=label, eval_metric='mae').fit(
    train_data=train_c,
    tuning_data=tune_c,
    use_bag_holdout=True,
    presets='best_quality',
    num_bag_folds=8,
    num_stack_levels=0,
    time_limit=time_in_sek,
)

No path specified. Models will be saved in: "AutogluonModels/ag-20231108_161942/"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=0, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "AutogluonModels/ag-20231108_161942/"
AutoGluon Version:  0.8.2
Python Version:     3.10.7
Operating System:   Darwin
Platform Machine:   x86_64
Platform Version:   Darwin Kernel Version 19.6.0: Thu Jan 13 01:26:33 PST 2022; root:xnu-6153.141.51~3/RELEASE_X86_64
Disk Space Avail:   250.17 GB / 500.07 GB (50.0%)
Train Data Rows:    24606
Train Data Columns: 39
Tuning Data Rows:    1465
Tuning Data Columns: 39
Label Column: pv_measurement
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and label-values can't be converted to int).
	Label info (max, min, mean, stddev): (999.6, 0.0, 79.70535, 168.37633)
	If 'regression' is not the c

In [21]:
# Refit every predictor via refit_full() in a, b, c order and keep the mapping
# each call returns (original model name -> "_FULL" model name).
refit_name_maps = {
    site: predictor.refit_full()
    for site, predictor in (('a', predictor_a), ('b', predictor_b), ('c', predictor_c))
}
# As before, the cell displays the mapping returned for site C.
refit_name_maps['c']

Refitting models via `predictor.refit_full` using all of the data (combined train and validation)...
	Models trained in this way will have the suffix "_FULL" and have NaN validation score.
	This process is not bound by time_limit, but should take less time than the original `predictor.fit` call.
	To learn more, refer to the `.refit_full` method docstring which explains how "_FULL" models differ from normal models.
Fitting 1 L1 models ...
Fitting model: KNeighborsUnif_BAG_L1_FULL ...
	0.04s	 = Training   runtime
Fitting 1 L1 models ...
Fitting model: KNeighborsDist_BAG_L1_FULL ...
	0.04s	 = Training   runtime
Fitting 1 L1 models ...
Fitting model: RandomForestMSE_BAG_L1_FULL ...
	12.6s	 = Training   runtime
Fitting 1 L1 models ...
Fitting model: CatBoost_BAG_L1_FULL ...
	196.32s	 = Training   runtime
Fitting 1 L1 models ...
Fitting model: ExtraTreesMSE_BAG_L1_FULL ...
	3.22s	 = Training   runtime
Fitting model: WeightedEnsemble_L2_FULL | Skipping fit via cloning parent ...
	0.26s	 = Tra

{'KNeighborsUnif_BAG_L1': 'KNeighborsUnif_BAG_L1_FULL',
 'KNeighborsDist_BAG_L1': 'KNeighborsDist_BAG_L1_FULL',
 'RandomForestMSE_BAG_L1': 'RandomForestMSE_BAG_L1_FULL',
 'CatBoost_BAG_L1': 'CatBoost_BAG_L1_FULL',
 'ExtraTreesMSE_BAG_L1': 'ExtraTreesMSE_BAG_L1_FULL',
 'WeightedEnsemble_L2': 'WeightedEnsemble_L2_FULL'}

In [13]:
# Model leaderboard for site A. With eval_metric='mae' the score_val column
# shows negative values, so entries closer to zero rank higher.
predictor_a.leaderboard(silent=True)

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,RandomForestMSE_BAG_L1,-112.622077,1.237438,38.85455,1.237438,38.85455,1,True,3
1,WeightedEnsemble_L2,-112.622077,1.238633,39.043427,0.001195,0.188877,2,True,4
2,KNeighborsDist_BAG_L1,-170.236486,2.462112,0.072379,2.462112,0.072379,1,True,2
3,KNeighborsUnif_BAG_L1,-170.539814,2.645159,0.056876,2.645159,0.056876,1,True,1


In [23]:
# Model leaderboard for site B. "_FULL" rows are the refit models and carry no
# validation score.
predictor_b.leaderboard(silent=True)

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-15.877613,1.386984,2216.911359,0.004326,0.24333,2,True,6
1,ExtraTreesMSE_BAG_L1,-16.557575,0.5106,2.931665,0.5106,2.931665,1,True,5
2,CatBoost_BAG_L1,-16.641778,0.260628,2198.271318,0.260628,2198.271318,1,True,4
3,RandomForestMSE_BAG_L1,-16.789961,0.61143,15.465046,0.61143,15.465046,1,True,3
4,KNeighborsUnif_BAG_L1,-29.486076,2.252394,0.049953,2.252394,0.049953,1,True,1
5,KNeighborsDist_BAG_L1,-29.645831,2.303939,0.074051,2.303939,0.074051,1,True,2
6,WeightedEnsemble_L2_FULL,,,212.836344,,0.24333,2,True,12
7,RandomForestMSE_BAG_L1_FULL,,,16.027623,,16.027623,1,True,9
8,KNeighborsUnif_BAG_L1_FULL,,,0.046014,,0.046014,1,True,7
9,KNeighborsDist_BAG_L1_FULL,,,0.030418,,0.030418,1,True,8


In [24]:
# Model leaderboard for site C. "_FULL" rows are the refit models and carry no
# validation score.
predictor_c.leaderboard(silent=True)


Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-14.043377,2.309985,1909.175653,0.000759,0.229273,2,True,6
1,CatBoost_BAG_L1,-14.083426,0.15897,1905.577559,0.15897,1905.577559,1,True,4
2,ExtraTreesMSE_BAG_L1,-15.582089,0.66573,3.315713,0.66573,3.315713,1,True,5
3,RandomForestMSE_BAG_L1,-16.913265,0.753975,17.336973,0.753975,17.336973,1,True,3
4,KNeighborsDist_BAG_L1,-20.7875,1.484526,0.053108,1.484526,0.053108,1,True,2
5,KNeighborsUnif_BAG_L1,-20.882629,1.43478,0.033141,1.43478,0.033141,1,True,1
6,WeightedEnsemble_L2_FULL,,,182.296379,,0.229273,2,True,12
7,RandomForestMSE_BAG_L1_FULL,,,17.596197,,17.596197,1,True,9
8,KNeighborsUnif_BAG_L1_FULL,,,0.03413,,0.03413,1,True,7
9,KNeighborsDist_BAG_L1_FULL,,,0.026928,,0.026928,1,True,8


# Predictions

In [25]:
# Per-site test features, minus the columns that are not model inputs.
# NOTE(review): the training frames had remap + make_categorical applied to
# made_features in this notebook, and dropped 'time' rather than
# 'date_forecast'. Nothing equivalent is applied to the test frames here —
# confirm DataSet already delivers them in the same encoding, otherwise the
# categorical columns may not line up at predict time.
test_drop_cols = ['location', 'date_forecast']
test_a, test_b, test_c = (
    data_collection.X_test_estimated[site].drop(columns=test_drop_cols)
    for site in ('a', 'b', 'c')
)


In [26]:
# Predict each site's test frame with that site's own predictor (a, b, c order).
y_pred_a, y_pred_b, y_pred_c = (
    predictor.predict(test_frame)
    for predictor, test_frame in zip(
        (predictor_a, predictor_b, predictor_c),
        (test_a, test_b, test_c),
    )
)

In [27]:
# Stack the three sites' predictions in a, b, c order under a fresh 0..n-1
# index, then pass them through the project's ReLU helper (presumably clamps
# negative predictions to zero — defined in preprocessing.preprocess_data).
stacked_pred = pd.concat([y_pred_a, y_pred_b, y_pred_c], ignore_index=True)
final_pred = ReLU(stacked_pred)

mean of final pred:  540.3197


In [30]:
save = True
model_name = "AutoGluon_3"

# Write the submission file only when `save` is switched on.
# NOTE(review): final_pred has already been passed through ReLU when it was
# built, so the ReLU here looks redundant; it is kept to preserve behaviour.
if save:
    delivery_path = f'Delivered_preds/{model_name}.csv'
    pred_to_delivery(ReLU(final_pred), delivery_path)