# Feature Engineering Notebook

This notebook is where you transform the clean, raw data into structured dataset suitable for modeling.

## Key Activities:

1. Calculate Realized Volatility

2. Create Lagged Features

3. Generate Rolling Statistics

4. Prepare Final Dataset

## 1. Calculate Realized Volatility

In [6]:
import sys
sys.path.append('..')
from src.config import portfolio, data_paths
import pandas as pd
import numpy as np

tickers = portfolio['tickers']
processed_data_path = data_paths['processed_data']

# Load combined data
combined_data = pd.read_csv(f'{processed_data_path}/combined_processed_data.csv', parse_dates=['Date'])
combined_data['Close'] = pd.to_numeric(combined_data['Close'], errors='coerce')
combined_data.set_index(['Date', 'Ticker'], inplace=True)

# Calculate realized volatility (annualized)
combined_data['Returns'] = combined_data.groupby('Ticker')['Close'].pct_change()
combined_data['Realized_Volatility'] = (
    combined_data.groupby('Ticker')['Returns']
    .transform(lambda x: x.rolling(window=252).std() * np.sqrt(252))
)
print(combined_data.head())

                       Close               High                 Low  \
Date       Ticker                                                     
2010-03-16 AAPL     6.738016  6.753926278206962   6.679776890812714   
           ADBE    35.020000  35.68000030517578   34.93000030517578   
           AMZN     6.589500  6.614500045776367   6.525000095367432   
           BAC     13.285204  13.31640739580477  13.183789334075428   
           CMCSA    6.273617  6.363291131435733   6.234159861947719   

                                 Open     Volume     SMA_25     SMA_50  \
Date       Ticker                                                        
2010-03-16 AAPL     6.729910434825474  446908000   6.280004   6.212885   
           ADBE    35.459999084472656    5818800  34.083600  34.471400   
           AMZN     6.561999797821045   82650000   6.174620   6.214600   
           BAC     13.238396508681525  107411800  12.590500  12.445456   
           CMCSA    6.356117019983971   39739600   5.93371

  combined_data['Returns'] = combined_data.groupby('Ticker')['Close'].pct_change()


## 2. Create Lagged Features

In [7]:
# Create lagged features for Close, Returns, and Volatility
lags = [1, 2, 3, 5, 10]
for lag in lags:
    combined_data[f'Close_Lag_{lag}'] = combined_data.groupby('Ticker')['Close'].shift(lag)
    combined_data[f'Returns_Lag_{lag}'] = combined_data.groupby('Ticker')['Returns'].shift(lag)
    combined_data[f'Volatility_Lag_{lag}'] = combined_data.groupby('Ticker')['Realized_Volatility'].shift(lag)

print(combined_data.head())

                       Close               High                 Low  \
Date       Ticker                                                     
2010-03-16 AAPL     6.738016  6.753926278206962   6.679776890812714   
           ADBE    35.020000  35.68000030517578   34.93000030517578   
           AMZN     6.589500  6.614500045776367   6.525000095367432   
           BAC     13.285204  13.31640739580477  13.183789334075428   
           CMCSA    6.273617  6.363291131435733   6.234159861947719   

                                 Open     Volume     SMA_25     SMA_50  \
Date       Ticker                                                        
2010-03-16 AAPL     6.729910434825474  446908000   6.280004   6.212885   
           ADBE    35.459999084472656    5818800  34.083600  34.471400   
           AMZN     6.561999797821045   82650000   6.174620   6.214600   
           BAC     13.238396508681525  107411800  12.590500  12.445456   
           CMCSA    6.356117019983971   39739600   5.93371

## 3. Generate Rolling Statistics

In [8]:
# Generate rolling statistics
windows = [5, 10, 20, 50]
for window in windows:
    combined_data[f'Rolling_Mean_{window}'] = (
        combined_data.groupby('Ticker')['Close']
        .transform(lambda x: x.rolling(window=window).mean())
    )
    combined_data[f'Rolling_Std_{window}'] = (
        combined_data.groupby('Ticker')['Close']
        .transform(lambda x: x.rolling(window=window).std())
    )
    combined_data[f'Rolling_Skew_{window}'] = (
        combined_data.groupby('Ticker')['Returns']
        .transform(lambda x: x.rolling(window=window).skew())
    )
    combined_data[f'Rolling_Kurt_{window}'] = (
        combined_data.groupby('Ticker')['Returns']
        .transform(lambda x: x.rolling(window=window).kurt())
    )

print(combined_data.head())

                       Close               High                 Low  \
Date       Ticker                                                     
2010-03-16 AAPL     6.738016  6.753926278206962   6.679776890812714   
           ADBE    35.020000  35.68000030517578   34.93000030517578   
           AMZN     6.589500  6.614500045776367   6.525000095367432   
           BAC     13.285204  13.31640739580477  13.183789334075428   
           CMCSA    6.273617  6.363291131435733   6.234159861947719   

                                 Open     Volume     SMA_25     SMA_50  \
Date       Ticker                                                        
2010-03-16 AAPL     6.729910434825474  446908000   6.280004   6.212885   
           ADBE    35.459999084472656    5818800  34.083600  34.471400   
           AMZN     6.561999797821045   82650000   6.174620   6.214600   
           BAC     13.238396508681525  107411800  12.590500  12.445456   
           CMCSA    6.356117019983971   39739600   5.93371

## 4. Prepare Final Dataset

In [9]:
# Drop rows with NaN values due to lagging and rolling
final_data = combined_data.dropna()

# Reset index
final_data = final_data.reset_index()

# Save final dataset
final_file_path = f'{processed_data_path}/final_feature_dataset.csv'
final_data.to_csv(final_file_path, index=False)
print(f'Final dataset saved to {final_file_path}')
print(final_data.head())
print(final_data.shape)

Final dataset saved to c:\Users\kenne\Python_Quant_Projects\Forecasting_Realized_Volatility_with_GARCH_&_Deep_Learning_Models\data\processed/final_feature_dataset.csv
         Date Ticker      Close                High                 Low  \
0  2011-03-29   AAPL  10.535863  10.535862922668457  10.388764951541924   
1  2011-03-29   ADBE  32.619999  32.880001068115234   32.27000045776367   
2  2011-03-29   AMZN   8.731000   8.741999626159668   8.503499984741211   
3  2011-03-29    BAC  10.446569  10.493519897399569   10.29789126173958   
4  2011-03-29  CMCSA   8.973815   9.028711734126231   8.867680604309363   

                 Open     Volume     SMA_25     SMA_50     EMA_25  ...  \
0  10.436796647639797  352900800  10.425410  10.415684  10.395068  ...   
1   32.45000076293945    8032300  33.582400  33.654400  33.173122  ...   
2   8.536499977111816   97692000   8.455800   8.755850   8.509611  ...   
3  10.493519897399569  117737500  10.945792  11.086535  10.887379  ...   
4   8.918918