Colors

In [1]:
#13599B  # Blue
#2DCCCD  # Aqua
#EE3A6A  # Pink
#F35E61  # Coral

Imports

In [2]:
# Official imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_regression

# Inhouse imports
from utils import fake_target, plot_dual_y

# Display settings: lift pandas' truncation of both columns and rows
for _option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_option, None)

Read data

In [3]:
# Load the dataset from ../data/data.csv (path is relative to the working directory)
_data_path = os.path.join('..', 'data', 'data.csv')
df = pd.read_csv(_data_path)

# Every column except the first one is treated as an original feature
# (presumably the first column is the date, 'fecha' -- verify against the CSV)
cols_og_feats = df.columns[1:].tolist()

Set column types

In [4]:
# Parse fecha once. pd.to_datetime also accepts a column that is already
# datetime, so re-running this cell no longer fails on the .str accessor.
_fecha = pd.to_datetime(df['fecha'], format='%Y-%m-%d')

# x-ticks for plots: 'YYYY-MM' label
# (same text as the first 7 characters of a zero-padded 'YYYY-MM-DD' string)
df['fecha_str'] = _fecha.dt.strftime('%Y-%m')

# Cast fecha to date type
df['fecha'] = _fecha

## Feature engineering
1. Time features

In [5]:
# Calendar building blocks: running index t plus month / quarter / year of fecha
_dates = df['fecha'].dt
df = df.assign(
    t=np.arange(len(df)),
    month=_dates.month,
    quarter=_dates.quarter,
    year=_dates.year,
)

# Fourier terms of the 12-month cycle for harmonics k = 1, 2, 3.
# Column order is all sines first, then all cosines: sin_month_1k ... cos_month_3k.
_fourier = {
    f'{_name}_month_{_k}k': _fn(2 * np.pi * _k * df['month'] / 12)
    for _name, _fn in (('sin', np.sin), ('cos', np.cos))
    for _k in (1, 2, 3)
}

# Features derived from the base ones
df = df.assign(
    # Long-term trend: polynomial and log transforms of t
    t2=df['t'] ** 2,
    t3=df['t'] ** 3,
    logt=np.log(df['t'] + 1),  # +1 keeps the log finite at t = 0
    # Seasonality
    **_fourier,
    # Crisis regime flags: 1 from the given date onwards, 0 before
    is_post_covid=(df['fecha'] >= '2020-04-01').astype(int),
    is_post_gfc=(df['fecha'] >= '2008-09-01').astype(int),
)

# One-hot encode month and quarter; the dummies are appended and the
# original month / quarter columns stay in df
cols_season = ['month', 'quarter']
_season_dummies = pd.get_dummies(df[cols_season], columns=cols_season, dtype=int)
df = pd.concat([df, _season_dummies], axis=1)

# Placeholder target built by the in-house helper (see utils.fake_target)
df['y'] = fake_target(df)

2. Lags

In [6]:
# Lag every original feature by each horizon in `lags`.
# Columns are named '<feature>_lag<k>' and ordered lag by lag
# (all features at lag 1, then all at lag 2, ...).
lags = [1, 2, 3, 6, 12]

# One shifted frame per lag, suffixed directly. This avoids parsing the source
# column back out of the generated name (which picks the wrong column when a
# feature name itself contains '_lag') and does not need list-valued `periods`.
# NaNs (the first k rows opened up by the shift, plus any shifted-in NaN) are
# filled with the same-row value of the unlagged feature.
_temp = pd.concat(
    [
        df[cols_og_feats]
        .shift(lag)
        .fillna(df[cols_og_feats])
        .add_suffix(f'_lag{lag}')
        for lag in lags
    ],
    axis=1,
)
df = pd.concat([df, _temp], axis=1)
del _temp

3. Rolling functions

In [7]:
windows = [3, 6, 9, 12]

# Rolling summary statistics of every original feature, one pass per window
funs = ['mean', 'min', 'max', 'std']
for window in windows:
    # agg with a list of functions yields (feature, function) column pairs;
    # flatten them to '<feature>_<function>_w<window>'
    _stats = (
        df[cols_og_feats]
        .rolling(window, min_periods=1, center=False)
        .agg(funs)
    )
    _stats.columns = [f"{'_'.join(col)}_w{window}" for col in _stats.columns]

    # min/max ratio per feature; a max of exactly 0 is swapped for 0.1 so the
    # division never hits zero. Built as one frame instead of inserting the
    # columns one at a time, which fragments the DataFrame.
    _ratios = pd.DataFrame(
        {
            f'{col}_minmax_w{window}': _stats[f'{col}_min_w{window}'].div(
                _stats[f'{col}_max_w{window}'].replace(0, 0.1)
            )
            for col in cols_og_feats
        },
        index=_stats.index,
    )

    # Keep mean, std and the ratio; the raw min / max columns are dropped
    _temp = pd.concat([_stats, _ratios], axis=1)
    _temp = _temp.drop(
        columns=[col for col in _temp.columns if ('_min_w' in col) or ('_max_w' in col)]
    )

    # Append to df.
    # NOTE(review): fillna(0) runs on the whole frame, not only on the new
    # columns, so any NaN already present in df is zero-filled here and the
    # later windows / EMAs are computed on the zero-filled data. Kept as-is
    # because downstream cells may rely on df being NaN-free -- confirm intent.
    df = pd.concat([df, _temp], axis=1).fillna(0)

# EMAs for the first two windows, appended in a single concat
# (same columns and order as assigning '<feature>_ema<span>' one by one)
_emas = [
    df[cols_og_feats].ewm(span=span, adjust=False).mean().add_suffix(f'_ema{span}')
    for span in windows[:2]
]
df = pd.concat([df, *_emas], axis=1)

## Bivariate Selection

Split data into features and target

In [None]:
# Split data into features and target.
# Candidate features = original features + derived columns recognised by name
# (lags, rolling stats, EMAs, Fourier terms). The Fourier columns are named
# 'sin_month_<k>k' / 'cos_month_<k>k', so they are matched by prefix: the
# '_sin' / '_cos' substrings alone never match them. Original features are
# skipped in the scan so a name containing a marker is not listed twice.
_og_feats = set(cols_og_feats)
_markers = ('_lag', '_w', '_ema', '_sin', '_cos')
feats = cols_og_feats + [
    col for col in df.columns
    if col not in _og_feats
    and (col.startswith(('sin_', 'cos_')) or any(m in col for m in _markers))
]
X, y = df[feats], df['y']

# Mutual info scores. The estimator is randomised, so it is seeded to make
# the scores (and therefore the drop list) reproducible across runs.
MI_RANDOM_STATE = 42
mi = mutual_info_regression(
    X, y, discrete_features='auto', random_state=MI_RANDOM_STATE
)
mi = dict(zip(feats, mi))

# Correlations
corr = X.corr()

# Correlation cutoff
CORR_CUTOFF = 0.85

# For every pair of features with |corr| > CORR_CUTOFF, drop the one with the
# lowest mutual info score (on a tie the second feature of the pair is dropped).
# Pairs are visited in the same (i < j) row-major order as a nested loop; NaN
# correlations compare False and are ignored.
# NOTE: a feature already marked for dropping still takes part in later
# comparisons, so it can cause a correlated partner to be dropped as well.
biv_drop = set()
_is_high = np.abs(corr.to_numpy()) > CORR_CUTOFF
for i, j in np.argwhere(np.triu(_is_high, k=1)):
    feat_i, feat_j = feats[i], feats[j]
    if mi[feat_i] < mi[feat_j]:
        biv_drop.add(feat_i)
    else:
        biv_drop.add(feat_j)

