In [1]:
!git clone https://github.com/Cajeux1999/AEMO-Solar-Energy-Forecasting.git -q

In [2]:
!pip install tsfresh
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tsfresh.utilities.dataframe_functions import roll_time_series
from tsfresh.feature_extraction import EfficientFCParameters
from tsfresh import extract_features

REPO_NAME = "AEMO-Solar-Energy-Forecasting"
%cd {REPO_NAME}

/content/AEMO-Solar-Energy-Forecasting


In [3]:
# Project-local modules: `config` supplies TRAIN_SIZE, `utils` supplies
# add_cyclical_features (used further down).
from src import config, utils

# Directory that holds the raw CSV inputs read by the loading cells.
filepath = '/content/AEMO-Solar-Energy-Forecasting/data/raw/'

# Training-set size as defined in config.py.
# NOTE(review): `train_size` is not referenced by any cell visible in this notebook —
# confirm whether it is still needed here.
train_size = config.TRAIN_SIZE

In [4]:
# Load the New South Wales power output series: discard the index column stored in the
# CSV ('Unnamed: 0') and parse `ds` into timezone-naive timestamps.
nsw1_solar = (
    pd.read_csv(filepath + 'nsw_solar.csv')
    .drop(columns='Unnamed: 0')
    .assign(ds=lambda frame: pd.to_datetime(frame["ds"]).dt.tz_localize(None))
)

## TSFresh

In [5]:
# Create a rolling dataset from which TSFresh features are extracted afterwards.
# Each window is labelled by a new `id` column; in the preview printed by the next cell
# the id is a (unique_id, timestamp) pair and the windows grow by one row per timestamp,
# because no window-length limit is passed here.
df_rolled = roll_time_series(nsw1_solar, column_id="unique_id", column_sort="ds")

Rolling: 100%|██████████| 20/20 [00:32<00:00,  1.62s/it]


In [6]:
# Preview the first rolled windows: rows sharing the same `id` belong to one window.
df_rolled.head(10)

Unnamed: 0,ds,y,unique_id,id
0,2024-07-01 00:00:00,0.041056,NSW1,"(NSW1, 2024-07-01 00:00:00)"
1,2024-07-01 00:00:00,0.041056,NSW1,"(NSW1, 2024-07-01 00:30:00)"
2,2024-07-01 00:30:00,0.041056,NSW1,"(NSW1, 2024-07-01 00:30:00)"
3,2024-07-01 00:00:00,0.041056,NSW1,"(NSW1, 2024-07-01 01:00:00)"
4,2024-07-01 00:30:00,0.041056,NSW1,"(NSW1, 2024-07-01 01:00:00)"
5,2024-07-01 01:00:00,0.041056,NSW1,"(NSW1, 2024-07-01 01:00:00)"
6,2024-07-01 00:00:00,0.041056,NSW1,"(NSW1, 2024-07-01 01:30:00)"
7,2024-07-01 00:30:00,0.041056,NSW1,"(NSW1, 2024-07-01 01:30:00)"
8,2024-07-01 01:00:00,0.041056,NSW1,"(NSW1, 2024-07-01 01:30:00)"
9,2024-07-01 01:30:00,0.041056,NSW1,"(NSW1, 2024-07-01 01:30:00)"


In [None]:
# `unique_id` must not reach TSFresh as a value column; the window id already carries it.
# Rebinding (instead of an in-place drop) with errors='ignore' keeps this cell re-runnable:
# a second execution no longer raises a KeyError for the already-removed column.
df_rolled = df_rolled.drop(columns='unique_id', errors='ignore')

# Actual feature generation with TSFresh for a rolled time-series format.
settings = EfficientFCParameters()
df_features = extract_features(
    df_rolled,
    column_id="id",
    column_sort="ds",
    default_fc_parameters=settings,
)

# extract_features indexes its result by the window id, i.e. the (unique_id, timestamp)
# pairs created by roll_time_series. The following cells select and merge on a `ds` column,
# so turn both parts of the id into regular columns and fall back to a plain row index.
df_features.index = pd.MultiIndex.from_tuples(df_features.index, names=["unique_id", "ds"])
df_features = df_features.reset_index()

In [20]:
# extract_features generated more than 700 feature columns, all prefixed with `y__`.
# Show the last five windows as a sanity check.
df_features.tail(5)

Unnamed: 0,unique_id,ds,y__variance_larger_than_standard_deviation,y__has_duplicate_max,y__has_duplicate_min,y__has_duplicate,y__sum_values,y__abs_energy,y__mean_abs_change,y__mean_change,...,y__fourier_entropy__bins_5,y__fourier_entropy__bins_10,y__fourier_entropy__bins_100,y__permutation_entropy__dimension_3__tau_1,y__permutation_entropy__dimension_4__tau_1,y__permutation_entropy__dimension_5__tau_1,y__permutation_entropy__dimension_6__tau_1,y__permutation_entropy__dimension_7__tau_1,y__query_similarity_count__query_None__threshold_0.0,y__mean_n_absolute_max__number_of_maxima_7
8780,NSW1,2024-12-30 22:00:00,1.0,0.0,1.0,1.0,7266493.0,15217490000.0,119.941864,2e-06,...,0.090729,0.090729,0.6134,1.151429,1.691774,2.335228,2.905612,3.427962,,3387.983149
8781,NSW1,2024-12-30 22:30:00,1.0,0.0,1.0,1.0,7266493.0,15217490000.0,119.928205,2e-06,...,0.090729,0.090729,0.6134,1.151354,1.691647,2.335036,2.905713,3.42801,,3387.983149
8782,NSW1,2024-12-30 23:00:00,1.0,0.0,1.0,1.0,7266493.0,15217490000.0,119.914549,2e-06,...,0.090729,0.090729,0.6134,1.151278,1.691519,2.334843,2.905462,3.428056,,3387.983149
8783,NSW1,2024-12-30 23:30:00,1.0,0.0,1.0,1.0,7266493.0,15217490000.0,119.900896,2e-06,...,0.090729,0.090729,0.6134,1.151202,1.691392,2.33465,2.905211,3.427751,,3387.983149
8784,NSW1,2024-12-31 00:00:00,1.0,0.0,1.0,1.0,7266493.0,15217490000.0,119.887246,2e-06,...,0.090729,0.090729,0.6134,1.151127,1.691265,2.334457,2.90496,3.427447,,3387.983149


In [21]:
# Load the exogenous variables for New South Wales. The timestamp used for joining is
# derived from `period_end` (made timezone-naive); the original period columns are dropped.
exog = (
    pd.read_csv(filepath + 'solar_exogenous.csv')
    .assign(ds=lambda frame: pd.to_datetime(frame["period_end"]).dt.tz_localize(None))
    .drop(columns=['period', 'period_end'])
)

In [23]:
# Attach the exogenous features (air_temp, dni, ghi) to the TSFresh features.
# Both join keys are coerced to datetime first so the merge compares like with like.
# NOTE(review): both sources had their timezone removed with tz_localize(None), which keeps
# the wall-clock time — confirm the two files use the same timezone before trusting the join.
df_features['ds'] = pd.to_datetime(df_features['ds'])
exog['ds'] = pd.to_datetime(exog['ds'])

# Left join: every feature row is kept, exogenous values are matched on `ds`.
df_features = df_features.merge(exog, how='left', on='ds')

In [27]:
# Add cyclical features to the dataset to represent time, using the project helper.
# In the recorded output below this appends halfhour_sin/cos, day_sin/cos and month_sin/cos.
# NOTE(review): the cell reassigns df_features from itself; whether running it twice is
# harmless depends on the helper's implementation — confirm.
df_features = utils.add_cyclical_features(df_features)

In [29]:
# Final dataset: TSFresh features plus the exogenous and cyclical columns added above.
df_features.tail(5)

Unnamed: 0,unique_id,ds,y__variance_larger_than_standard_deviation,y__has_duplicate_max,y__has_duplicate_min,y__has_duplicate,y__sum_values,y__abs_energy,y__mean_abs_change,y__mean_change,...,y__mean_n_absolute_max__number_of_maxima_7,air_temp,dni,ghi,halfhour_sin,halfhour_cos,day_sin,day_cos,month_sin,month_cos
8780,NSW1,2024-12-30 22:00:00,1.0,0.0,1.0,1.0,7266493.0,15217490000.0,119.941864,2e-06,...,3387.983149,24,0,0,-0.5,0.866025,-0.2012985,0.97953,-2.449294e-16,1.0
8781,NSW1,2024-12-30 22:30:00,1.0,0.0,1.0,1.0,7266493.0,15217490000.0,119.928205,2e-06,...,3387.983149,24,0,0,-0.382683,0.92388,-0.2012985,0.97953,-2.449294e-16,1.0
8782,NSW1,2024-12-30 23:00:00,1.0,0.0,1.0,1.0,7266493.0,15217490000.0,119.914549,2e-06,...,3387.983149,23,0,0,-0.258819,0.965926,-0.2012985,0.97953,-2.449294e-16,1.0
8783,NSW1,2024-12-30 23:30:00,1.0,0.0,1.0,1.0,7266493.0,15217490000.0,119.900896,2e-06,...,3387.983149,23,0,0,-0.130526,0.991445,-0.2012985,0.97953,-2.449294e-16,1.0
8784,NSW1,2024-12-31 00:00:00,1.0,0.0,1.0,1.0,7266493.0,15217490000.0,119.887246,2e-06,...,3387.983149,23,0,0,0.0,1.0,-2.449294e-16,1.0,-2.449294e-16,1.0


In [30]:
# Count columns per dtype to see which non-numeric columns are handed to feature selection
# (the recorded output lists one datetime column and one object column — presumably `ds`
# and `unique_id`).
df_features.dtypes.value_counts()

Unnamed: 0,count
float64,783
int64,3
datetime64[ns],1
object,1


## Preprocessing: Basic Feature Selection

In [31]:
!pip install arfs
import arfs
import arfs.preprocessing as arfspp
from arfs.preprocessing import dtype_column_selector, OrdinalEncoderPandas
import arfs.feature_selection as arfsfs
from sklearn.pipeline import Pipeline

Collecting arfs
  Downloading arfs-3.0.0-py3-none-any.whl.metadata (13 kB)
Collecting pandas>=2.2.3 (from arfs)
  Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Downloading arfs-3.0.0-py3-none-any.whl (776 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.9/776.9 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (12.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pandas, arfs
  Attempting uninstall: pandas
    Found existing installation: pandas 2.2.2
    Uninstalling pandas-2.2.2:
      Successfully uninstalled pandas-2.2.2
[31mERROR: pip's dependency resolver does not currently take into accou

In [32]:
# Basic feature selection. The filters run in the order listed:
#   1. "missing"      - drops columns with more than 5% missing values
#   2. "unique"       - drops zero-variance columns (a single unique value)
#   3. "cardinality"  - drops high-cardinality columns
#   4. "collinearity" - drops highly collinear columns
basic_fs_pipeline = Pipeline(
    steps=[
        ("missing", arfsfs.MissingValueThreshold(threshold=0.05)),
        ("unique", arfsfs.UniqueValuesThreshold(threshold=1)),
        ("cardinality", arfsfs.CardinalityThreshold(threshold=10)),
        ("collinearity", arfsfs.CollinearityThreshold(threshold=0.75)),
    ]
)

In [33]:
# Fit the basic selection pipeline on the feature frame and keep only the surviving columns.
# NOTE(review): the fit uses all rows of df_features and is not restricted via `train_size` —
# confirm this is intended.
# The recorded run emitted a warning from a `wcov(x, y, w) / np.sqrt(...)` expression during
# this step — presumably a division involving a zero weighted variance; TODO confirm.
X = basic_fs_pipeline.fit_transform(df_features)

  return wcov(x, y, w) / np.sqrt(wcov(x, x, w) * wcov(y, y, w))


In [39]:
# The preprocessing pipeline reduced the frame from 788 columns to 520 features.
# Show the last five rows of the filtered result.
X.tail(5)

Unnamed: 0,ds,y__variance_larger_than_standard_deviation,y__has_duplicate_max,y__has_duplicate_min,y__has_duplicate,y__mean_second_derivative_central,y__last_location_of_maximum,y__last_location_of_minimum,y__minimum,y__symmetry_looking__r_0.1,...,y__ratio_beyond_r_sigma__r_1,y__ratio_beyond_r_sigma__r_2,y__ratio_beyond_r_sigma__r_3,y__fourier_entropy__bins_2,y__fourier_entropy__bins_3,air_temp,halfhour_sin,halfhour_cos,day_sin,day_cos
8780,2024-12-30 22:00:00,1.0,0.0,1.0,1.0,0.0,0.943059,0.913905,0.003056,0.0,...,0.227537,0.034734,0.0,0.079983,0.090729,24,-0.5,0.866025,-0.2012985,0.97953
8781,2024-12-30 22:30:00,1.0,0.0,1.0,1.0,0.0,0.942951,0.913801,0.003056,0.0,...,0.227511,0.03473,0.0,0.079983,0.090729,24,-0.382683,0.92388,-0.2012985,0.97953
8782,2024-12-30 23:00:00,1.0,0.0,1.0,1.0,0.0,0.942844,0.913697,0.003056,0.0,...,0.227485,0.034726,0.0,0.079983,0.090729,23,-0.258819,0.965926,-0.2012985,0.97953
8783,2024-12-30 23:30:00,1.0,0.0,1.0,1.0,0.0,0.942737,0.913593,0.003056,0.0,...,0.227459,0.034722,0.0,0.079983,0.090729,23,-0.130526,0.991445,-0.2012985,0.97953
8784,2024-12-31 00:00:00,1.0,0.0,1.0,1.0,0.0,0.942629,0.913489,0.003056,0.0,...,0.227433,0.034718,0.0,0.079983,0.090729,23,0.0,1.0,-2.449294e-16,1.0


In [37]:
# Persist the filtered feature set for later stages. index=True is the pandas default,
# spelled out here: the row index is written as the first, unnamed CSV column.
X.to_csv(
    '/content/AEMO-Solar-Energy-Forecasting/data/df_features_preprocessed.csv',
    index=True,
)