## FEATURE ENGINEERING with FeatureEngine

In [1]:
import pandas as pd

## Load Data

In [5]:
df = pd.read_csv('../../Datasets/AirQualityUCI_ready.csv',
 parse_dates=['Date_Time'], usecols=["Date_Time", "CO_sensor", "RH"], index_col=['Date_Time'])

df = df.sort_index()

df = df.loc["2004-04-01":"2005-04-30"]

df.head()

Unnamed: 0_level_0,CO_sensor,RH
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1
2004-04-04 00:00:00,1224.0,56.5
2004-04-04 01:00:00,1215.0,59.2
2004-04-04 02:00:00,1115.0,62.4
2004-04-04 03:00:00,1124.0,65.0
2004-04-04 04:00:00,1028.0,65.3


In [6]:
# remove outliers

df = df.loc[df['CO_sensor']>0]

### Datetime features

In [10]:
from feature_engine.datetime import DatetimeFeatures

datefeat=DatetimeFeatures(variables='index', 
 features_to_extract=['month', 'week', 'day_of_week', 'day_of_month', 'weekend', 'hour'])

data = datefeat.fit_transform(df)

data.head()

Unnamed: 0_level_0,CO_sensor,RH,month,week,day_of_week,day_of_month,weekend,hour
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2004-04-04 00:00:00,1224.0,56.5,4,14,6,4,1,0
2004-04-04 01:00:00,1215.0,59.2,4,14,6,4,1,1
2004-04-04 02:00:00,1115.0,62.4,4,14,6,4,1,2
2004-04-04 03:00:00,1124.0,65.0,4,14,6,4,1,3
2004-04-04 04:00:00,1028.0,65.3,4,14,6,4,1,4


### Lag features

We create the following lagged features:

- The pollutant concentration for the previous hour (t-1).

- The pollutant concentration for the same hour on the previous day (t-24).

In [14]:
from feature_engine.timeseries.forecasting import LagFeatures, WindowFeatures

lag_feat = LagFeatures(variables=['CO_sensor','RH'], freq=['1H', '24H'], missing_values='ignore')

data = lag_feat.fit_transform(df)

data.head()

Unnamed: 0_level_0,CO_sensor,RH,CO_sensor_lag_1H,RH_lag_1H,CO_sensor_lag_24H,RH_lag_24H
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2004-04-04 00:00:00,1224.0,56.5,,,,
2004-04-04 01:00:00,1215.0,59.2,1224.0,56.5,,
2004-04-04 02:00:00,1115.0,62.4,1215.0,59.2,,
2004-04-04 03:00:00,1124.0,65.0,1115.0,62.4,,
2004-04-04 04:00:00,1028.0,65.3,1124.0,65.0,,


### Window Feature

we take advantage of past 3Hour of data to predict current Hour

In [16]:
from feature_engine.timeseries.forecasting import WindowFeatures

window_feat = WindowFeatures(variables=['CO_sensor','RH'], window='3H', freq='1H', functions=['mean'])
data = window_feat.fit_transform(df)
data.head()

Unnamed: 0_level_0,CO_sensor,RH,CO_sensor_window_3H_mean,RH_window_3H_mean
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2004-04-04 00:00:00,1224.0,56.5,,
2004-04-04 01:00:00,1215.0,59.2,1224.0,56.5
2004-04-04 02:00:00,1115.0,62.4,1219.5,57.85
2004-04-04 03:00:00,1124.0,65.0,1184.666667,59.366667
2004-04-04 04:00:00,1028.0,65.3,1151.333333,62.2


## Periodic features

We transform the month and the hour with the sine and cosine to have a periodic representation of the features.

In [22]:
from feature_engine.creation import CyclicalFeatures
from sklearn.pipeline import make_pipeline

date_feat = DatetimeFeatures(variables='index', features_to_extract=['hour', 'month'])
cyclic_feat = CyclicalFeatures(variables=['month', 'hour'])

pipe = make_pipeline(date_feat,cyclic_feat)

data = pipe.fit_transform(df)

data.head()

Unnamed: 0_level_0,CO_sensor,RH,hour,month,month_sin,month_cos,hour_sin,hour_cos
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2004-04-04 00:00:00,1224.0,56.5,0,4,0.866025,-0.5,0.0,1.0
2004-04-04 01:00:00,1215.0,59.2,1,4,0.866025,-0.5,0.269797,0.962917
2004-04-04 02:00:00,1115.0,62.4,2,4,0.866025,-0.5,0.519584,0.854419
2004-04-04 03:00:00,1124.0,65.0,3,4,0.866025,-0.5,0.730836,0.682553
2004-04-04 04:00:00,1028.0,65.3,4,4,0.866025,-0.5,0.887885,0.460065


## Missing Data

when we create lag and window feature we create missing data, lets drop them

In [25]:
from feature_engine.imputation import DropMissingData

lag_feat = LagFeatures(variables=['CO_sensor','RH'], missing_values='ignore', freq=['1H','24H'] )
window_feat = WindowFeatures(variables=['CO_sensor', 'RH'], window='3H', freq='1H', functions=['mean'])
drop = DropMissingData()

pipe = make_pipeline(lag_feat, window_feat)
data = pipe.fit_transform(df)

data.isna().sum()

CO_sensor                     0
RH                            0
CO_sensor_lag_1H             27
RH_lag_1H                    27
CO_sensor_lag_24H           461
RH_lag_24H                  461
CO_sensor_window_3H_mean     27
RH_window_3H_mean            27
dtype: int64

In [26]:
pipe = make_pipeline(lag_feat, window_feat, drop)
data = pipe.fit_transform(df)


data.isna().sum()

CO_sensor                   0
RH                          0
CO_sensor_lag_1H            0
RH_lag_1H                   0
CO_sensor_lag_24H           0
RH_lag_24H                  0
CO_sensor_window_3H_mean    0
RH_window_3H_mean           0
dtype: int64

## Avoid Look-ahead bias

Drop original data from timeseries

Sometimes, we create new variables combining other variables in the dataset, for example, we obtain the variable age by subtracting date_of_application from date_of_birth. After we obtained our new variable, we do not need the date variables in the dataset any more. Thus, we can add DropFeatures() in the Pipeline to have these removed.

In [28]:
from feature_engine.selection import DropFeatures


drop_feat = DropFeatures(features_to_drop=['CO_sensor','RH'])

pipe = make_pipeline(lag_feat, window_feat, drop, drop_feat)
data = pipe.fit_transform(df)


data.isna().sum()

CO_sensor_lag_1H            0
RH_lag_1H                   0
CO_sensor_lag_24H           0
RH_lag_24H                  0
CO_sensor_window_3H_mean    0
RH_window_3H_mean           0
dtype: int64

## FEATURE ENGINEERING with FeatureEngine

In [31]:
from sklearn.pipeline import Pipeline


pipe = Pipeline([ 
('datetime', date_feat),
('lag', lag_feat),
('window', window_feat),
('cyclic', cyclic_feat),
('drop-missing', drop),
('avoid-lookahead-bias', drop_feat)
])


data = pipe.fit_transform(df)

data.head()

Unnamed: 0_level_0,hour,month,CO_sensor_lag_1H,RH_lag_1H,CO_sensor_lag_24H,RH_lag_24H,CO_sensor_window_3H_mean,RH_window_3H_mean,month_sin,month_cos,hour_sin,hour_cos
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2004-04-05 00:00:00,0,4,1188.0,60.8,1224.0,56.5,1165.666667,58.566667,0.866025,-0.5,0.0,1.0
2004-04-05 01:00:00,1,4,1065.0,65.8,1215.0,59.2,1149.666667,61.8,0.866025,-0.5,0.269797,0.962917
2004-04-05 02:00:00,2,4,999.0,79.2,1115.0,62.4,1084.0,68.6,0.866025,-0.5,0.519584,0.854419
2004-04-05 03:00:00,3,4,911.0,80.0,1124.0,65.0,991.666667,75.0,0.866025,-0.5,0.730836,0.682553
2004-04-05 04:00:00,4,4,873.0,81.0,1028.0,65.3,927.666667,80.066667,0.866025,-0.5,0.887885,0.460065


In [32]:
data.isna().sum()

hour                        0
month                       0
CO_sensor_lag_1H            0
RH_lag_1H                   0
CO_sensor_lag_24H           0
RH_lag_24H                  0
CO_sensor_window_3H_mean    0
RH_window_3H_mean           0
month_sin                   0
month_cos                   0
hour_sin                    0
hour_cos                    0
dtype: int64