Imports

In [None]:
# Standard library and third-party imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# In-house imports
from utils import fake_target, plot_dual_y

# Display configuration: never truncate columns when rendering wide frames
pd.options.display.max_columns = None

Read data

In [None]:
# Load the raw dataset; the path is relative to the notebook's working directory
data_path = os.path.join('..', 'data', 'data.csv')
df = pd.read_csv(data_path)

# Original ("OG") feature names: every column except the first one.
# NOTE(review): presumably the first column is `fecha` -- confirm against the CSV.
cols_og_feats = df.columns[1:]

# Optional visual check of each OG feature against the target (kept disabled).
# NOTE: `fecha_str` is only created in the "Set column types" cell, so this
# loop has to run after that cell if it is ever re-enabled.
# for col in cols_og_feats:
#     plot_dual_y(df, col, 'y', 'fecha_str')

# Core columns (original note: lags, moving averages, emas, etc).
# NOTE(review): not referenced by the cells shown here -- presumably consumed
# further down the notebook.
cols_core = [
    'cetes_12m', 'cetes_1m',
    'exchange_rate_usd',
    'exportaciones_no_petroleras',
    'inpc',
    'official_interest_rate_usa',
    'pib',
    'snp500',
    'stock_market',
    'tasa_desempleo',
    'vix', 'vix_bmv',
]
# Candidates currently left out of `cols_core`:
#   cetes_3m, cetes_6m, imss,
#   sovereign_10y, sovereign_3y, sovereign_5y,
#   tasa_fondeo_1d,
#   treasury_10y, treasury_1m, treasury_1y, treasury_3m,
#   treasury_3y, treasury_5y, treasury_6m

Set column types

In [None]:
# Cast fecha to date type.
# Parsing happens first so the label below is derived from the datetime values
# instead of slicing the raw strings: that keeps this cell re-runnable (the
# `.str` accessor raises once `fecha` has already been converted).
df['fecha'] = pd.to_datetime(df['fecha'], format='%Y-%m-%d')

# 'YYYY-MM' labels used as x-ticks for plots
df['fecha_str'] = df['fecha'].dt.strftime('%Y-%m')

## Feature engineering
1. Time features

In [None]:
# Calendar and trend primitives taken from the date column
fecha = df['fecha']
df = df.assign(
    t=np.arange(len(df)),  # 0-based row counter, in the frame's current row order
    month=fecha.dt.month,
    quarter=fecha.dt.quarter,
    year=fecha.dt.year,
)

# Long-term trend terms built on the row counter
trend_feats = {
    't2': df['t'].pow(2),
    't3': df['t'].pow(3),
    'logt1': np.log(df['t'] + 1),  # +1 keeps log defined at t = 0
}

# Seasonality: first three sine/cosine harmonics of the 12-month cycle
season_feats = {
    f'{name}_month_{k}': fn(2 * np.pi * k * df['month'] / 12)
    for name, fn in (('sin', np.sin), ('cos', np.cos))
    for k in (1, 2, 3)
}

# Crisis regime flags: 1 from the cutoff date onwards, 0 before it
crisis_feats = {
    'is_post_covid': df['fecha'].ge('2020-04-01').astype(int),
    'is_post_gfc': df['fecha'].ge('2008-09-01').astype(int),
}

df = df.assign(**trend_feats, **season_feats, **crisis_feats)

# One-hot indicators for month and quarter (the numeric columns are kept too)
cols_season = ['month', 'quarter']
season_dummies = pd.get_dummies(df[cols_season], columns=cols_season, dtype=int)
df = pd.concat([df, season_dummies], axis=1)

# Fake target (helper imported from the in-house `utils` module)
df = fake_target(df)

2. Lags

In [None]:
# Lag all original feature columns: one block of `<feature>_lag<k>` columns per lag.
# Each lag frame is built and named explicitly, so no column name has to be
# parsed back into its source feature (which breaks for names containing
# '_lag') and the code does not rely on list-valued `shift` suffixes.
lags = [1, 2, 3, 6, 12]
_base = df[cols_og_feats]
_temp = pd.concat(
    [
        # Shifting leaves NaN where there is no history yet (the first `lag`
        # rows). Those gaps are filled with the same-row, unlagged value of the
        # feature; `fillna` aligns on index and column, so it runs before the
        # suffix is added.
        _base.shift(lag).fillna(_base).add_suffix(f'_lag{lag}')
        for lag in lags
    ],
    axis=1
)

# Append all lag features at once
df = pd.concat([df, _temp], axis=1)
del _base, _temp

3. Rolling functions

In [None]:
windows = [3, 6, 9, 12]

# Rolling summary statistics.
# Every new block of features is collected in a list and appended to `df` in a
# single concat at the end; inserting columns one by one into an already wide
# frame fragments it and triggers pandas' PerformanceWarning.
funs = ['mean', 'min', 'max', 'std']
_new_feats = []
for window in windows:
    # Result columns form a (feature, function) MultiIndex
    _stats = (
        df[cols_og_feats]
        .rolling(window, min_periods=1, center=False)
        .agg(funs)
    )
    # NOTE(review): with min_periods=1 the first row holds a single observation,
    # so the sample std there is NaN -- confirm downstream steps handle it.

    # min/max ratio per feature; a max of exactly 0 is replaced by 0.1 so the
    # division never hits zero
    _minmax = (
        _stats.xs('min', axis=1, level=1)
        .div(_stats.xs('max', axis=1, level=1).replace(0, 0.1))
        .add_suffix(f'_minmax_w{window}')
    )

    # Keep everything except raw min/max (they only feed the ratio). Selecting
    # on the function level is exact, unlike substring matching on flattened
    # names, which would also hit features whose own name contains '_min_w'
    # or '_max_w'.
    _kept = _stats.drop(columns=['min', 'max'], level=1)
    _kept.columns = [f"{'_'.join(col)}_w{window}" for col in _kept.columns]

    _new_feats += [_kept, _minmax]

# EMAs over the two shortest windows, computed for all features at once
for span in windows[:2]:
    _new_feats.append(
        df[cols_og_feats]
        .ewm(span=span, adjust=False)
        .mean()
        .add_suffix(f'_ema{span}')
    )

# Append to df
df = pd.concat([df, *_new_feats], axis=1)
del _new_feats, _stats, _minmax, _kept

## Bivariate Selection