In [67]:
import pandas as pd
from pandas import DataFrame
from typing import Literal, List
import numpy as np

In [68]:
# Path (relative to the notebook's working directory) of the CSV that is read
# into `past_knowledge` in the following cell.
cleaned_dataset_address = "../dataset/interim/past_dataset.csv"

In [69]:
# Load the CSV; the "datetime" column is parsed into timestamps at read time.
past_knowledge = pd.read_csv(cleaned_dataset_address, parse_dates=["datetime"])

In [70]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
past_knowledge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5939 entries, 0 to 5938
Data columns (total 33 columns):
 #   Column                                         Non-Null Count  Dtype         
---  ------                                         --------------  -----         
 0   general_dam_occupancy_rate                     5939 non-null   float64       
 1   datetime                                       5939 non-null   datetime64[ns]
 2   temperature_2m_max                             5939 non-null   float64       
 3   temperature_2m_min                             5939 non-null   float64       
 4   temperature_2m_mean                            5939 non-null   float64       
 5   apparent_temperature_max                       5939 non-null   float64       
 6   apparent_temperature_min                       5939 non-null   float64       
 7   apparent_temperature_mean                      5939 non-null   float64       
 8   daylight_duration                              5939 non-nu

In [71]:
class FeatureExtractor:
    def __init__(
        self,
        past_knowledge: DataFrame,
        cyclical_feature_names: List[str],
        lag_size: int = 30,
        window_size: int = 30,
    ):
        self.PAST_KNOWLEDGE = past_knowledge.sort_values(by="datetime")
        self.cyclical_feature_names = cyclical_feature_names
        self.lag_size = lag_size
        self.window_size = window_size

    def transform(self, df: DataFrame) -> DataFrame:
        return (
            df.sort_values("datetime")
            .pipe(self._add_lag_features)
            .pipe(self._add_rolling_window_features)
            .pipe(self._add_exponential_moving_features)
            .pipe(self._drop_columns_with_same_values)
            .pipe(self._expand_datetime)
            .pipe(self._add_fourier_features)
            .pipe(
                lambda df: df.astype(
                    {col: "float32" for col in df.select_dtypes("number").columns}
                )
            )
            .bfill()
        )

    def _add_lag_features(
        self,
        df: DataFrame,
        fillna_with: Literal["ffill", "bfill"] | None = "bfill",
    ) -> DataFrame:
        df["datetime"] = pd.to_datetime(df["datetime"])

        df = df.sort_values("datetime")

        full_date_range = pd.date_range(
            start=self.PAST_KNOWLEDGE["datetime"].min(),
            end=df["datetime"].max(),
            freq="D",
        )
        full_df = pd.DataFrame({"datetime": full_date_range})
        full_df = full_df.merge(self.PAST_KNOWLEDGE, on="datetime", how="left")

        columns_to_use = self.PAST_KNOWLEDGE.select_dtypes(
            include="number"
        ).columns.tolist()

        created_features = []
        for col in columns_to_use:
            for i in range(1, self.lag_size + 1):
                created_col_name = f"{col}_lag_{i}"
                created_features.append(full_df[col].shift(i).rename(created_col_name))

        lags_df = pd.concat([full_df["datetime"], *created_features], axis=1)

        df = df.merge(
            lags_df,
            on="datetime",
            how="left",
        )

        if fillna_with == "ffill":
            df = df.ffill()
        elif fillna_with == "bfill":
            df = df.bfill()

        return df

    def _add_rolling_window_features(
        self,
        df: DataFrame,
        fillna_with: Literal["ffill", "bfill"] | None = "ffill",
    ) -> DataFrame:
        df["datetime"] = pd.to_datetime(df["datetime"])

        df = df.sort_values("datetime")

        full_date_range = pd.date_range(
            start=self.PAST_KNOWLEDGE["datetime"].min(),
            end=df["datetime"].max(),
            freq="D",
        )
        full_df = pd.DataFrame({"datetime": full_date_range}).merge(
            self.PAST_KNOWLEDGE, on="datetime", how="left"
        )

        columns_to_use = self.PAST_KNOWLEDGE.select_dtypes(
            include=["number"]
        ).columns.tolist()

        metrics = ["mean", "std", "min", "max", "median", "var"]

        created_features = []
        for col in columns_to_use:
            for size in range(2, self.window_size + 1):
                rolling_window_feature = (
                    full_df[col]
                    .rolling(window=size, min_periods=1)
                    .agg(metrics)
                    .rename(columns=lambda metric: f"{col}_rw{size}_{metric}")
                )
                created_features.append(rolling_window_feature)

        window_df = pd.concat([full_df["datetime"], *created_features], axis=1)

        df = df.merge(
            window_df,
            on="datetime",
            how="left",
        )

        if fillna_with == "ffill":
            df = df.ffill()
        elif fillna_with == "bfill":
            df = df.bfill()

        return df

    def _drop_columns_with_same_values(self, df: DataFrame, threshold=0.9) -> DataFrame:
        to_drop = [
            col
            for col in df.columns
            if df[col].value_counts(normalize=True, dropna=False).values[0] >= threshold
        ]
        return df.drop(columns=to_drop)

    def _add_exponential_moving_features(
        self, df: pd.DataFrame, up_to: int = 30
    ) -> pd.DataFrame:
        df["datetime"] = pd.to_datetime(df["datetime"])

        df = df.sort_values("datetime")

        full_date_range = pd.date_range(
            start=self.PAST_KNOWLEDGE["datetime"].min(),
            end=df["datetime"].max(),
            freq="D",
        )
        full_df = pd.DataFrame({"datetime": full_date_range}).merge(
            self.PAST_KNOWLEDGE, on="datetime", how="left"
        )

        columns_to_use = self.PAST_KNOWLEDGE.select_dtypes(
            include=["number"]
        ).columns.tolist()

        columns_to_use = self.PAST_KNOWLEDGE.select_dtypes(include="number").columns
        metrics = ["mean"]
        created_features = []

        for col in columns_to_use:
            for span in range(2, up_to + 1):
                feature = (
                    full_df[col]
                    .ewm(span=span, adjust=False)
                    .agg(metrics)
                    .rename(columns=lambda metric: f"{col}_em_{span}_{metric}")
                )
                created_features.append(feature)

        pd.concat([df, *created_features], axis=1)

        exponential_moving_df = pd.concat(
            [full_df["datetime"], *created_features], axis=1
        )

        df = df.merge(
            exponential_moving_df,
            on="datetime",
            how="left",
        )
        return df

    def _expand_datetime(self, df: DataFrame, column: str = "datetime") -> DataFrame:
        return df.assign(
            **{
                "year": lambda a_df: a_df[column].dt.year,
                "month": lambda a_df: a_df[column].dt.month,
                "day": lambda a_df: a_df[column].dt.day,
                "hour": lambda a_df: a_df[column].dt.hour,
                "day_of_year": lambda a_df: a_df[column].dt.dayofyear,
                "week_of_year": lambda a_df: a_df[column].dt.isocalendar().week,
                "quarter": lambda a_df: a_df[column].dt.quarter,
                # "season": lambda a_df: a_df[column].dt.month % 12 // 3 + 1,
                "is_weekend": lambda a_df: (a_df[column].dt.weekday >= 5).map(
                    {True: 1, False: 0}
                ),
            }
        )

    def _add_fourier_features(self, df: pd.DataFrame, num_terms: int = 7) -> DataFrame:
        for col, max_val in self.cyclical_feature_names.items():
            source = self._get_column_source(df, col)

            for i in range(1, num_terms + 1):
                operation = 2 * np.pi * i * source[col] / max_val

                df[f"fourier_sin_{col}_{i}"] = np.sin(operation)
                df[f"fourier_cos_{col}_{i}"] = np.cos(operation)

        return df

    def _get_column_source(self, df: DataFrame, col: str) -> List[str]:
        if col in df.columns:
            source = df
        elif col in self.PAST_KNOWLEDGE.columns:
            source = self.PAST_KNOWLEDGE
        else:
            raise KeyError(f"{col} not found both in df and past knowledge.")
        return source


In [None]:
# Constructor arguments for FeatureExtractor. The nested mapping gives, for
# each cyclical column, the period used when building its Fourier terms.
parameters = {
    "past_knowledge": past_knowledge,
    "cyclical_feature_names": {
        "month": 12,
        "day": 31,
        "day_of_year": 365,
        "week_of_year": 52,
        "quarter": 4,
        # "season": 4,
        "is_weekend": 2,
        "precipitation_hours": 24,
    },
    "lag_size": 30,
    "window_size": 30,
}

# One row per day in the prediction window, held in a single "datetime" column.
prediction_df = DataFrame(
    {"datetime": pd.date_range("2021-04-07", "2021-05-07", freq="D")}
)

In [73]:
# Preview the first rows of the frame of prediction dates.
prediction_df.head()

Unnamed: 0,datetime
0,2021-04-07
1,2021-04-08
2,2021-04-09
3,2021-04-10
4,2021-04-11


In [74]:
# Build the extractor from the `parameters` dict defined earlier.
feature_extractor = FeatureExtractor(**parameters)

In [75]:
# Inspect the precipitation_hours column of the history.
past_knowledge["precipitation_hours"]

0       17.0
1        1.0
2       16.0
3        1.0
4        0.0
        ... 
5934     4.0
5935     9.0
5936    15.0
5937     8.0
5938     6.0
Name: precipitation_hours, Length: 5939, dtype: float64

In [76]:
# NOTE(review): this cell repeats the preceding one verbatim and shows the
# same output; one of the two can likely be removed.
past_knowledge["precipitation_hours"]

0       17.0
1        1.0
2       16.0
3        1.0
4        0.0
        ... 
5934     4.0
5935     9.0
5936    15.0
5937     8.0
5938     6.0
Name: precipitation_hours, Length: 5939, dtype: float64

In [77]:
# Pass only the raw "datetime" column so this cell can be re-run safely:
# `prediction_df` is overwritten with the transformed frame, and feeding that
# frame back into transform() would merge the generated features a second time.
prediction_df = feature_extractor.transform(prediction_df[["datetime"]])

In [78]:
# Summarise the transformed frame: row count, column count, dtypes and memory.
prediction_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31 entries, 0 to 30
Columns: 5393 entries, datetime to fourier_cos_precipitation_hours_7
dtypes: datetime64[ns](1), float32(5392)
memory usage: 653.3 KB


In [79]:
# Count missing values per column, then show only the columns that have any.
prediction_df_null_counts = prediction_df.isna().sum()
prediction_df_null_counts.loc[prediction_df_null_counts.gt(0)]

Series([], dtype: int64)

In [80]:
# Preview the first five rows, limited to the first 40 columns.
prediction_df.head(5).iloc[:, :40]

Unnamed: 0,datetime,general_dam_occupancy_rate_lag_5,general_dam_occupancy_rate_lag_6,general_dam_occupancy_rate_lag_7,general_dam_occupancy_rate_lag_8,general_dam_occupancy_rate_lag_9,general_dam_occupancy_rate_lag_10,general_dam_occupancy_rate_lag_11,general_dam_occupancy_rate_lag_12,general_dam_occupancy_rate_lag_13,...,temperature_2m_max_lag_8,temperature_2m_max_lag_9,temperature_2m_max_lag_10,temperature_2m_max_lag_11,temperature_2m_max_lag_12,temperature_2m_max_lag_13,temperature_2m_max_lag_14,temperature_2m_max_lag_15,temperature_2m_max_lag_16,temperature_2m_max_lag_17
0,2021-04-07,73.459999,73.18,72.830002,72.239998,71.959999,71.559998,71.139999,70.629997,69.830002,...,10.195499,9.3455,10.7955,13.1455,10.9955,9.0455,6.7455,5.3455,7.3955,12.7455
1,2021-04-08,73.660004,73.459999,73.18,72.830002,72.239998,71.959999,71.559998,71.139999,70.629997,...,9.945499,10.195499,9.3455,10.7955,13.1455,10.9955,9.0455,6.7455,5.3455,7.3955
2,2021-04-09,73.760002,73.660004,73.459999,73.18,72.830002,72.239998,71.959999,71.559998,71.139999,...,11.445499,9.945499,10.195499,9.3455,10.7955,13.1455,10.9955,9.0455,6.7455,5.3455
3,2021-04-10,74.699997,73.760002,73.660004,73.459999,73.18,72.830002,72.239998,71.959999,71.559998,...,17.495501,11.445499,9.945499,10.195499,9.3455,10.7955,13.1455,10.9955,9.0455,6.7455
4,2021-04-11,75.879997,74.699997,73.760002,73.660004,73.459999,73.18,72.830002,72.239998,71.959999,...,14.8955,17.495501,11.445499,9.945499,10.195499,9.3455,10.7955,13.1455,10.9955,9.0455
