# Imports

In [20]:
import numpy as np
import pandas as pd

from data.features.feature_generator import FeatureGenerator
from data.features.feature_preparator import FeaturePreparator

# Creating dummy/test dataset

In [21]:
def _make_sample_ohlcv(index, start_price=100.0, daily_vol=0.01, wick_scale=0.005):
    """Build an internally consistent dummy OHLCV frame for ``index``.

    Close follows a geometric random walk, each Open equals the previous
    Close (the first Open is ``start_price``), and High/Low are pushed
    outward from the Open/Close range by non-negative random wicks. This
    guarantees ``Low <= min(Open, Close) <= max(Open, Close) <= High`` and
    strictly positive prices on every row.

    Draws from NumPy's global random state, so seed it before calling for
    reproducible output.

    Args:
        index: Date-like sequence; one row is produced per element.
        start_price: Price the walk starts from (also the first Open).
        daily_vol: Standard deviation of the per-row log return.
        wick_scale: Standard deviation of the (absolute) log wick sizes.

    Returns:
        DataFrame with columns Date, Open, High, Low, Close, Volume.
    """
    n = len(index)

    # Geometric walk: exp() keeps every price positive and lets Close move
    # both up and down instead of ramping upward on every row.
    close = start_price * np.exp(np.cumsum(np.random.normal(0, daily_vol, n)))

    # Each bar opens at the previous bar's close; [:n] keeps an empty index
    # from producing a stray first row.
    open_ = np.concatenate(([start_price], close[:-1]))[:n]

    # Wicks are >= 0 in log space, so High/Low can only move away from the
    # Open/Close range, never inside it.
    upper_wick = np.abs(np.random.normal(0, wick_scale, n))
    lower_wick = np.abs(np.random.normal(0, wick_scale, n))

    return pd.DataFrame({
        'Date': index,
        'Open': open_,
        'High': np.maximum(open_, close) * np.exp(upper_wick),
        'Low': np.minimum(open_, close) * np.exp(-lower_wick),
        'Close': close,
        'Volume': np.random.randint(1000, 5000, n),
    })


dates = pd.date_range(start='2023-01-01', periods=250, freq='1D')

np.random.seed(42)  # For reproducibility

# Create base price and volume data
sample_data = _make_sample_ohlcv(dates)

# Add technical indicators derived from Close; rolling-window columns stay
# NaN until their window has filled.
# Short window features
# Columns are added in the order sma_*, ema_*, roc_* so the frame's column
# order is the same as writing each assignment out by hand.
for short_span in (5, 10):
    sample_data[f'sma_{short_span}'] = sample_data['Close'].rolling(short_span).mean()
for short_span in (5, 10):
    sample_data[f'ema_{short_span}'] = sample_data['Close'].ewm(span=short_span, adjust=False).mean()
for lag in (1, 5):
    # Rate of change expressed in percent.
    sample_data[f'roc_{lag}'] = sample_data['Close'].pct_change(periods=lag).mul(100)

# Medium window features
sample_data['sma_20'] = sample_data['Close'].rolling(20).mean()
sample_data['ema_20'] = sample_data['Close'].ewm(span=20, adjust=False).mean()

# Calculate RSI (medium window)
# 14-period RSI built from simple rolling means of the gains and losses in
# Close.
delta = sample_data['Close'].diff()

# clip() keeps the leading NaN produced by diff(), so the undefined first
# change is not counted as an observed "no move"; the first RSI value
# therefore appears only once 14 real changes are available.
gain = delta.clip(lower=0)
loss = (-delta).clip(lower=0)

avg_gain = gain.rolling(window=14).mean()
avg_loss = loss.rolling(window=14).mean()

# With pandas float division a window containing gains but no losses gives
# rs = inf, which maps to RSI = 100; a completely flat window gives NaN.
rs = avg_gain / avg_loss
sample_data['rsi_14'] = 100 - (100 / (1 + rs))

# Long window features
# Simple moving averages are added before the exponential ones so the column
# order is sma_50, sma_200, ema_50, ema_200.
for long_span in (50, 200):
    sample_data[f'sma_{long_span}'] = sample_data['Close'].rolling(long_span).mean()
for long_span in (50, 200):
    sample_data[f'ema_{long_span}'] = sample_data['Close'].ewm(span=long_span, adjust=False).mean()

# Volatility features
# Rolling standard deviation of one-period Close returns, scaled by
# sqrt(252) (presumably the usual trading-days-per-year annualisation).
# The return series does not depend on the window, so it is computed once
# rather than on every loop iteration.
returns = sample_data['Close'].pct_change()
for window in [5, 10, 20, 30]:
    sample_data[f'volatility_{window}'] = returns.rolling(window=window).std() * np.sqrt(252)

# Categorical features
# Calendar fields are read from the frame's own Date column through the .dt
# accessor; the frame still has its default row index here, so the values
# line up row for row.
sample_data['day_of_week'] = sample_data['Date'].dt.dayofweek
sample_data['month'] = sample_data['Date'].dt.month
sample_data['quarter'] = sample_data['Date'].dt.quarter


In [23]:
# Build a preparator and fit it on the dummy frame; `preparator` ends up
# bound to whatever fit() returns.
preparator = FeaturePreparator().fit(sample_data)

# Check that feature categories were properly created
# NOTE(review): this reads a private attribute; presumably a dict keyed by
# window group — confirm against FeaturePreparator.
# NOTE(review): the recorded output lists sma_50/ema_50 under short_window
# and sma_200/ema_200 under medium_window — confirm that grouping is intended.
categories = preparator._feature_categories

for group_name in ('short_window', 'medium_window', 'long_window'):
    print(categories.get(group_name, []))

['sma_5', 'sma_10', 'ema_5', 'ema_10', 'roc_1', 'roc_5', 'sma_50', 'ema_50']
['sma_20', 'ema_20', 'rsi_14', 'sma_200', 'ema_200', 'volatility_5', 'volatility_10']
['volatility_20', 'volatility_30']
