In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from feature_engine.timeseries.forecasting.lag_features import LagFeatures

from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline


#### Data 

Air Quality Dataset from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Air+Quality).

In [2]:
# Load the prepared Air Quality data, indexed by its timestamp column.
df = pd.read_csv('../../Datasets/AirQualityUCI_ready.csv', parse_dates=['Date_Time'], index_col=['Date_Time'])

# Keep only the data between April 2004 and April 2005.
df = df.query("index >= '2004-04-01' and index <='2005-04-30' ")

# Put the index on a regular hourly grid (freq=1H).
df = df.asfreq(freq='1H')

# Remove measurements from fixed stations (columns containing '_true');
# we'll only be using sensor data.
remove = [f for f in df.columns if '_true' in f]
# Remove adjusted humidity as well.
remove.append('AH')

df = df.drop(columns=remove)

# Negative readings are treated as missing values.
df[df<0]=np.nan


# Fill missing data with the last valid observation.
# DataFrame.ffill() is used because fillna(method="ffill") is deprecated
# in recent pandas releases.
df = df.ffill()

In [3]:
# Variable we want to forecast.
target = "NO2_sensor"

# Remaining sensor / weather columns used as candidate predictors.
raw_features = [
    "CO_sensor",
    "NMHC_sensor",
    "NOX_sensor",
    "O3_sensor",
    "T",
    "RH",
]

### How to create Lag Features

In [5]:
# Hourly lag offsets '1H', '2H', ..., '167H' (the range stops before 7 * 24 = 168).
freq = [f'{n_hours}H' for n_hours in range(1, 7 * 24)]

In [6]:
# Build the list of variables to lag: the raw features plus the target.
# NOTE: list.append() mutates in place and returns None, so
# `raw_features.append(target)` would leave features_to_lag as None and
# silently add the target to raw_features. We build a new list instead,
# following the column order of df so the lag columns keep the same layout.
features_to_lag = [col for col in df.columns if col in (*raw_features, target)]
lag_transform = LagFeatures(variables=features_to_lag, freq=freq)
lag_feat = lag_transform.fit_transform(df)
lag_feat.head()

Unnamed: 0_level_0,CO_sensor,NMHC_sensor,NOX_sensor,NO2_sensor,O3_sensor,T,RH,CO_sensor_lag_1H,NMHC_sensor_lag_1H,NOX_sensor_lag_1H,...,O3_sensor_lag_166H,T_lag_166H,RH_lag_166H,CO_sensor_lag_167H,NMHC_sensor_lag_167H,NOX_sensor_lag_167H,NO2_sensor_lag_167H,O3_sensor_lag_167H,T_lag_167H,RH_lag_167H
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2004-04-04 00:00:00,1224.0,892.0,884.0,1580.0,923.0,16.7,56.5,,,,...,,,,,,,,,,
2004-04-04 01:00:00,1215.0,843.0,929.0,1551.0,862.0,15.9,59.2,1224.0,892.0,884.0,...,,,,,,,,,,
2004-04-04 02:00:00,1115.0,782.0,980.0,1500.0,752.0,15.2,62.4,1215.0,843.0,929.0,...,,,,,,,,,,
2004-04-04 03:00:00,1124.0,793.0,965.0,1521.0,791.0,14.7,65.0,1115.0,782.0,980.0,...,,,,,,,,,,
2004-04-04 04:00:00,1028.0,682.0,1090.0,1448.0,697.0,14.3,65.3,1124.0,793.0,965.0,...,,,,,,,,,,


In [7]:
# Number of lag columns added on top of the original columns.
print('Features created :', len(lag_feat.columns) - len(df.columns))

Features created : 1169


In [8]:
# Drop the rows that contain NaN introduced by the lags.
lag_feat = lag_feat.dropna()

## Feature Selection and Modeling
* We can use LASSO to determine a small set of features from all the lag features that may be helpful for forecasting.

In [11]:
# Lag transformer from feature-engine.
lag_feat = LagFeatures(variables=features_to_lag, freq=freq)

# Scale the data with StandardScaler, keeping the original column labels
# and datetime index.
df_scaled = pd.DataFrame(
    data=StandardScaler().fit_transform(df),
    columns=df.columns,
    index=df.index,
)

# Create the lag features from the scaled data, then drop the rows
# that contain NaN introduced by the lags.
df_with_lags = lag_feat.fit_transform(df_scaled).dropna()

In [12]:
# Only the lagged columns are used as predictors, to avoid data leakage.
feat_cols = [col for col in df_with_lags.columns if 'lag' in col]

In [13]:
# The features and the target were standardised beforehand so that
# LASSO can be used for feature selection.

def get_lasso_coef(alpha=1):
    """Fit a LASSO model on the lag features and return its coefficients.

    Uses the notebook-level ``df_with_lags``, ``feat_cols`` and ``target``.

    Parameters
    ----------
    alpha : float, default=1
        Regularisation strength passed to ``Lasso``.

    Returns
    -------
    pandas.DataFrame
        One row per column in ``feat_cols`` with a single ``'importance'``
        column holding the fitted coefficient.
    """
    model = Lasso(alpha=alpha).fit(df_with_lags[feat_cols], df_with_lags[target])
    return pd.DataFrame({'importance': model.coef_}, index=feat_cols)

#### Control the regularization using the alpha parameter

In [14]:
# Default regularisation (alpha=1): show the 10 largest absolute coefficients.
score = get_lasso_coef()
score.abs().nlargest(10, 'importance')

Unnamed: 0,importance
CO_sensor_lag_1H,0.0
NMHC_sensor_lag_1H,0.0
NOX_sensor_lag_1H,0.0
NO2_sensor_lag_1H,0.0
O3_sensor_lag_1H,0.0
T_lag_1H,0.0
RH_lag_1H,0.0
CO_sensor_lag_2H,0.0
NMHC_sensor_lag_2H,0.0
NOX_sensor_lag_2H,0.0


#### At `alpha=0.1` we see that LASSO picks up lag values of the target as being important. 

* The previous hour and the seasonal lags (daily and weekly) are identified as important:
    *  23 hours ago (approximately one day)
    *  167 hours ago (approximately one week)


In [15]:
# Weaker regularisation (alpha=0.1): show the 10 largest absolute coefficients.
score = get_lasso_coef(alpha=0.1)
score.abs().nlargest(10, 'importance')

Unnamed: 0,importance
NO2_sensor_lag_1H,0.75661
NO2_sensor_lag_23H,0.094512
NO2_sensor_lag_167H,0.021078
NO2_sensor_lag_71H,0.00182
NO2_sensor_lag_143H,0.000282
CO_sensor_lag_1H,0.0
NMHC_sensor_lag_1H,0.0
NOX_sensor_lag_1H,0.0
O3_sensor_lag_1H,0.0
T_lag_1H,0.0


In [16]:
# Even weaker regularisation (alpha=0.01): show the 10 largest absolute coefficients.
score = get_lasso_coef(alpha=0.01)
score.abs().nlargest(10, 'importance')

Unnamed: 0,importance
NO2_sensor_lag_1H,0.786742
NO2_sensor_lag_23H,0.069281
NMHC_sensor_lag_2H,0.068111
NOX_sensor_lag_1H,0.044283
NMHC_sensor_lag_24H,0.034408
NO2_sensor_lag_5H,0.034331
CO_sensor_lag_1H,0.031557
NMHC_sensor_lag_143H,0.028786
NO2_sensor_lag_9H,0.027287
NMHC_sensor_lag_71H,0.027024
