# Objective: 
- To transform the pre-processed data into meaningful features (alpha factors) that can be fed into machine learning models. 
- This phase also involves extensive Exploratory Data Analysis (EDA) to understand the characteristics of, and relationships among, the engineered features.

# Load Pre-processed Data

In [None]:
import os
from src.data_ingestion.data_loader import load_csv_file

def load_processed_data(processed_dir="data/processed"):
    """
    Load every ``*.csv`` file found directly inside ``processed_dir``.

    Each file is read with ``load_csv_file`` and stored under its file name
    without the trailing ``.csv`` extension, e.g. ``etf_flows.csv`` is stored
    under ``"etf_flows"``. Typical keys (depending on what the directory
    holds) would be ``"AAPL_ohlcv"``, ``"MSCI_constituents"``,
    ``"etf_flows"`` or ``"corporate_actions"``.

    Args:
        processed_dir: Directory to scan (not recursive).

    Returns:
        dict mapping file stem -> whatever ``load_csv_file`` returns for that
        file (presumably a pandas DataFrame - confirm in data_loader).

    Raises:
        OSError: propagated from ``os.listdir`` if the directory is missing
            or unreadable.
    """
    extension = ".csv"
    data = {}
    # Sorted so the dict's insertion order is reproducible; os.listdir()
    # returns entries in arbitrary order.
    for fname in sorted(os.listdir(processed_dir)):
        if not fname.endswith(extension):
            continue
        # Strip only the trailing extension. str.replace() would also remove
        # ".csv" from the middle of a name such as "prices.csv.bak.csv".
        key = fname[:-len(extension)]
        data[key] = load_csv_file(os.path.join(processed_dir, fname))
    return data

if __name__ == "__main__":
    # Smoke check: load everything from the default directory and print one
    # "<name>:<shape>" line per loaded table.
    data = load_processed_data()
    for k in data:
        v = data[k]
        print(f"{k}:{v.shape}")

# Traditional Alpha Factor Generation

In [None]:
import os
from src.data_ingestion.data_loader import load_csv_file
import pandas as pd
from src.features.feature_engineering import (
    calculate_momentum,
    calculate_volatility,
    calculate_on_balance_volume,
    apply_scaling
)

def apply_features_to_market_data(processed_dir="data/processed", output_dir="data/features"):
    """
    Build and save a feature table for every OHLCV CSV in ``processed_dir``.

    A file is processed when its name contains ``"ohlcv"`` and ends with
    ``".csv"``. For each such file the momentum, volatility and on-balance
    volume helpers are applied to the loaded data, their outputs are
    concatenated column-wise, passed through ``apply_scaling`` with
    ``method='standard'``, and written to
    ``<output_dir>/<original stem>_features.csv``.

    Args:
        processed_dir: Directory holding the pre-processed market data CSVs.
        output_dir: Directory for the feature CSVs; created if missing.

    Returns:
        None. Output is the files written plus one printed line per file.

    Raises:
        OSError: propagated from ``os.listdir`` / ``os.makedirs``.
    """
    extension = ".csv"
    os.makedirs(output_dir, exist_ok=True)
    # Sorted so files are processed (and logged) in a reproducible order.
    for fname in sorted(os.listdir(processed_dir)):
        if "ohlcv" not in fname or not fname.endswith(extension):
            continue
        df = load_csv_file(os.path.join(processed_dir, fname))
        # Start from an empty frame carrying df's index so every feature
        # block is aligned to the source rows; price-based blocks first,
        # then the volume-based one.
        features = pd.concat(
            [
                pd.DataFrame(index=df.index),
                calculate_momentum(df),
                calculate_volatility(df),
                calculate_on_balance_volume(df),
            ],
            axis=1,
        )
        # NOTE(review): scaling is applied over the whole file at once. If
        # apply_scaling fits its statistics on all rows, later rows influence
        # earlier ones (look-ahead) - confirm this is acceptable.
        features = apply_scaling(features, method='standard')
        # Replace only the trailing extension; str.replace() would also
        # rewrite a ".csv" occurring in the middle of the file name.
        out_name = fname[:-len(extension)] + "_features.csv"
        out_path = os.path.join(output_dir, out_name)
        # NOTE(review): index=False discards the index the features were
        # aligned on. If load_csv_file returns a date-indexed frame, the
        # dates are not saved - confirm this is intended.
        features.to_csv(out_path, index=False)
        print(f"Saved features to {out_path}")

# Entry point: generate and save feature files using the function's default
# input ("data/processed") and output ("data/features") directories.
if __name__ == "__main__":
    apply_features_to_market_data()

# Event-Driven Feature Engineering

In [None]:
import pandas as pd
import numpy as np

def create_index_rebalance_features(market_df, index_constituents_df, rebalance_dates, ticker_col='ticker', date_col='date'):
    """
    Build index-rebalance features for every row of ``market_df``.

    For each (ticker, date) row the constituent history of that ticker is
    split into the latest record dated on or before the row date ("current")
    and the earliest record dated strictly after it ("next"):

    - is_upcoming_inclusion: 1 when there is a next record but no current one.
    - is_upcoming_exclusion: 1 when there is a current record but no next one.
    - expected_weight_change: next weight, minus current weight, or their
      difference, depending on which of the two records exist (0.0 if neither).
    - days_until_rebalance: whole days to the first rebalance date on or
      after the row date; NaN when no such date exists.

    NOTE(review): the exclusion flag fires for every row dated on or after a
    ticker's last constituent record, including when the constituent data
    simply ends there - confirm the constituent file extends past the market
    data.

    Args:
        market_df: Frame with ``ticker_col`` and ``date_col`` columns.
        index_constituents_df: Frame with ``ticker_col``, ``date_col`` and
            ``'weight'`` columns. It is not modified.
        rebalance_dates: Iterable of rebalance dates (anything accepted by
            ``pd.DatetimeIndex``); it does not need to be pre-sorted.
        ticker_col: Name of the ticker column in both frames.
        date_col: Name of the date column in both frames.

    Returns:
        DataFrame indexed like ``market_df`` with the four columns above.
        ``days_until_rebalance`` is always float so it can hold NaN.
    """
    n_rows = len(market_df)
    market_dates = pd.to_datetime(market_df[date_col])
    market_tickers = market_df[ticker_col]

    # Private copy: converting the date column must not alter the caller's frame.
    constituents = index_constituents_df.copy()
    constituents[date_col] = pd.to_datetime(constituents[date_col])
    # Rows without a date can never match a comparison; a stable sort keeps
    # the result deterministic when several records share a date.
    constituents = constituents.dropna(subset=[date_col]).sort_values(date_col, kind="stable")
    # ticker -> (ascending record dates, matching weights)
    history_by_ticker = {
        ticker: (group[date_col].values, group['weight'].to_numpy())
        for ticker, group in constituents.groupby(ticker_col, sort=False)
    }

    rebalance_values = pd.DatetimeIndex(rebalance_dates).dropna().sort_values().values
    one_day = np.timedelta64(1, 'D')

    days_until = np.full(n_rows, np.nan)
    inclusion = np.zeros(n_rows, dtype=int)
    exclusion = np.zeros(n_rows, dtype=int)
    weight_change = np.zeros(n_rows, dtype=float)

    # Results are written by position, so a non-unique market_df index is fine.
    for i, (ticker, date) in enumerate(zip(market_tickers, market_dates)):
        if pd.isna(date):
            continue
        stamp = date.to_datetime64()

        # First rebalance on or after the row date (side='left' keeps a
        # same-day rebalance, giving 0 days).
        nxt = np.searchsorted(rebalance_values, stamp)
        if nxt < len(rebalance_values):
            days_until[i] = (rebalance_values[nxt] - stamp) // one_day

        history = history_by_ticker.get(ticker)
        if history is None:
            continue
        record_dates, weights = history
        # split = number of records dated <= row date, so split - 1 is the
        # current record and split is the next one.
        split = np.searchsorted(record_dates, stamp, side='right')
        has_current = split > 0
        has_next = split < len(record_dates)
        if has_current and has_next:
            weight_change[i] = weights[split] - weights[split - 1]
        elif has_next:
            inclusion[i] = 1
            weight_change[i] = weights[split]
        else:
            # A ticker's history is never empty, so only "current" exists here.
            exclusion[i] = 1
            weight_change[i] = -weights[split - 1]

    features = pd.DataFrame(index=market_df.index)
    features['days_until_rebalance'] = days_until
    features['is_upcoming_inclusion'] = inclusion
    features['is_upcoming_exclusion'] = exclusion
    features['expected_weight_change'] = weight_change
    return features

def create_etf_flow_features(market_df, etf_flows_df, etf_constituents_df, ticker_col='ticker', date_col='date', lag_days=1):
    """
    Estimate passive buying/selling pressure on each stock from ETF flows.

    For every row of ``market_df`` the flow date is the row date minus
    ``lag_days`` calendar days. For each ETF that reported a flow on exactly
    that date, the flow (``'flow_usd'``) is multiplied by the stock's most
    recent weight in that ETF dated on or before the flow date, and the
    products are summed into ``etf_flow_pressure``. Rows with no matching
    flow or weight get 0.0.

    NOTE(review): the lag is in calendar days, so a Monday row with
    ``lag_days=1`` looks for Sunday flows - confirm whether a business-day
    lag was intended.

    Args:
        market_df: Frame with ``ticker_col`` and ``date_col`` columns.
        etf_flows_df: Frame with ``'etf'``, ``date_col`` and ``'flow_usd'``
            columns. It is not modified.
        etf_constituents_df: Frame with ``'etf'``, ``ticker_col``,
            ``date_col`` and ``'weight'`` columns. It is not modified.
        ticker_col: Name of the ticker column.
        date_col: Name of the date column in all three frames.
        lag_days: Calendar days between the flow date and the row date.

    Returns:
        DataFrame indexed like ``market_df`` with one float column,
        ``etf_flow_pressure``.
    """
    flow_dates = pd.to_datetime(market_df[date_col]) - pd.Timedelta(days=lag_days)
    tickers = market_df[ticker_col]

    # Private copies: converting the date columns must not alter the caller's frames.
    flows = etf_flows_df.copy()
    flows[date_col] = pd.to_datetime(flows[date_col])
    flows = flows.dropna(subset=['etf', date_col])

    holdings = etf_constituents_df.copy()
    holdings[date_col] = pd.to_datetime(holdings[date_col])
    holdings = holdings.dropna(subset=[date_col]).sort_values(date_col, kind="stable")

    # ETFs in order of first appearance: fixes the summation order per row.
    etfs = list(flows['etf'].unique())
    # (etf, date) -> flow; when an ETF has several rows for one date, the
    # first one is used.
    flow_lookup = {}
    for etf, date, flow in zip(flows['etf'], flows[date_col], flows['flow_usd']):
        flow_lookup.setdefault((etf, date), flow)

    # (etf, ticker) -> (ascending weight dates, matching weights)
    weight_history = {
        key: (group[date_col].values, group['weight'].to_numpy())
        for key, group in holdings.groupby(['etf', ticker_col], sort=False)
    }

    pressures = np.zeros(len(market_df), dtype=float)
    # Results are written by position, so a non-unique market_df index is fine.
    for i, (ticker, date) in enumerate(zip(tickers, flow_dates)):
        if pd.isna(date):
            continue
        stamp = date.to_datetime64()
        total = 0.0
        for etf in etfs:
            flow_key = (etf, date)
            if flow_key not in flow_lookup:
                continue
            history = weight_history.get((etf, ticker))
            if history is None:
                continue
            weight_dates, weights = history
            # Number of weight records dated <= flow date; the last of them
            # is the weight in force on that date.
            known = np.searchsorted(weight_dates, stamp, side='right')
            if known == 0:
                continue
            total += flow_lookup[flow_key] * weights[known - 1]
        pressures[i] = total

    features = pd.DataFrame(index=market_df.index)
    features['etf_flow_pressure'] = pressures
    return features

def create_corporate_action_features(market_df, corp_actions_df, ticker_col='ticker', date_col='date', event_types=None, max_days_ahead=30):
    """
    Flag upcoming corporate actions and the time remaining until them.

    For every row of ``market_df`` and every event type, the function looks
    for that ticker's actions of that type dated from the row date up to
    ``max_days_ahead`` days later (both ends inclusive). When at least one
    exists:

    - ``upcoming_<event>`` is 1 (otherwise 0).
    - ``days_until_<event>`` is the whole number of days to the soonest such
      action (otherwise NaN).

    Args:
        market_df: Frame with ``ticker_col`` and ``date_col`` columns.
        corp_actions_df: Frame with ``ticker_col``, ``date_col`` and
            ``'action'`` columns. It is not modified.
        ticker_col: Name of the ticker column in both frames.
        date_col: Name of the date column in both frames.
        event_types: Values of ``'action'`` to build features for. Defaults
            to ``['split', 'dividend', 'merger', 'spinoff']``.
        max_days_ahead: Look-ahead window in days.

    Returns:
        DataFrame indexed like ``market_df`` with two columns per event type,
        in the order ``upcoming_<event>``, ``days_until_<event>``.
    """
    if event_types is None:
        event_types = ['split', 'dividend', 'merger', 'spinoff']

    n_rows = len(market_df)
    market_dates = pd.to_datetime(market_df[date_col])
    tickers = market_df[ticker_col]
    window = pd.Timedelta(days=max_days_ahead).to_timedelta64()
    one_day = np.timedelta64(1, 'D')

    # Private copy: converting the date column must not alter the caller's frame.
    actions = corp_actions_df.copy()
    actions[date_col] = pd.to_datetime(actions[date_col])
    actions = actions.dropna(subset=[date_col])
    # (ticker, action) -> ascending action dates
    event_dates = {
        key: np.sort(group[date_col].values)
        for key, group in actions.groupby([ticker_col, 'action'], sort=False)
    }

    features = pd.DataFrame(index=market_df.index)
    for event in event_types:
        flags = np.zeros(n_rows, dtype=int)
        days_until = np.full(n_rows, np.nan)
        # Results are written by position, so a non-unique market_df index is fine.
        for i, (ticker, date) in enumerate(zip(tickers, market_dates)):
            if pd.isna(date):
                continue
            upcoming = event_dates.get((ticker, event))
            if upcoming is None:
                continue
            stamp = date.to_datetime64()
            # First action on or after the row date. If it lies outside the
            # window, every later one does too.
            first = np.searchsorted(upcoming, stamp)
            if first == len(upcoming):
                continue
            gap = upcoming[first] - stamp
            if gap <= window:
                flags[i] = 1
                days_until[i] = gap // one_day
        features[f'upcoming_{event}'] = flags
        features[f'days_until_{event}'] = days_until

    return features

# Advanced Features using tf-quant-finance & QuantLib

In [None]:
import pandas as pd
import numpy as np

# Option-implied features using tfqf_pricing_models.py
from src.models.tfqf_pricing_models import calculate_implied_volatility

def add_option_implied_features(options_df, strikes, expiries, spots, rates):
    """
    Compute implied volatility per option plus per-date dispersion features.

    Implied volatility comes from ``calculate_implied_volatility`` (project
    wrapper; its result is converted with ``.numpy()``). Two statistics of
    the implied volatilities sharing the same ``options_df['date']`` are then
    broadcast back to every row of that date:

    - ``iv_skew``: 90th minus 10th percentile of the date's implied
      volatilities. This is a simple stand-in for a put/call delta skew, not
      a 25-delta risk reversal.
    - ``iv_kurtosis``: pandas ``Series.kurt()`` of the date's implied
      volatilities.

    Args:
        options_df: Frame with ``'option_price'`` and ``'date'`` columns and
            optionally ``'is_call'`` (all options are treated as calls when
            the column is absent).
        strikes, expiries, spots, rates: Passed through unchanged to
            ``calculate_implied_volatility``; presumably one value per row of
            ``options_df`` - confirm against that function.

    Returns:
        DataFrame with a fresh RangeIndex and columns ``implied_volatility``,
        ``iv_skew`` and ``iv_kurtosis``; row *i* corresponds to row *i* of
        ``options_df``.
    """
    # Implied volatility
    iv = calculate_implied_volatility(
        option_prices=options_df['option_price'],
        strikes=strikes,
        expiries=expiries,
        spots=spots,
        rates=rates,
        is_call=options_df.get('is_call', True)
    ).numpy()
    features = pd.DataFrame({'implied_volatility': iv})

    # Group the freshly computed IVs by date. options_df has no
    # 'implied_volatility' column of its own, and grouping by a plain array
    # pairs rows by position, so the result lines up with `features` whatever
    # index options_df carries.
    iv_by_date = features['implied_volatility'].groupby(options_df['date'].to_numpy())

    # Skew proxy: spread between the upper and lower tails of the day's IVs.
    features['iv_skew'] = iv_by_date.transform(
        lambda x: np.percentile(x, 90) - np.percentile(x, 10)
    )

    # Kurtosis of the day's IV distribution.
    features['iv_kurtosis'] = iv_by_date.transform(lambda x: x.kurt())

    return features

# Yield curve features using QuantLib via instrument_pricer.py
from src.quant_instruments.instrument_pricer import build_yield_curve

def add_yield_curve_features(deposits, swaps):
    """
    Build a yield curve and derive slope and curvature features from it.

    The curve is built by ``build_yield_curve(deposits, swaps)`` (project
    helper; presumably returns a QuantLib yield term structure - confirm).
    Zero rates are read at 2, 5 and 10 years from the curve's reference date
    using Actual/365 (Fixed) day count and compounded rates.

    Args:
        deposits: Deposit quotes, passed through to ``build_yield_curve``.
        swaps: Swap quotes, passed through to ``build_yield_curve``.

    Returns:
        dict with keys:
            "yield_curve": the object returned by ``build_yield_curve``.
            "slope": 10Y zero rate minus 2Y zero rate.
            "curvature": 2 * 5Y zero rate - 2Y zero rate - 10Y zero rate.
    """
    import QuantLib as ql

    yield_curve = build_yield_curve(deposits, swaps)
    day_counter = ql.Actual365Fixed()
    # zeroRate() takes a Date (with a day counter) or a year fraction, not a
    # Period, so anchor each tenor at the curve's own reference date.
    reference_date = yield_curve.referenceDate()

    def zero_rate(years):
        """Return the compounded zero rate `years` years past the reference date."""
        maturity = reference_date + ql.Period(years, ql.Years)
        return yield_curve.zeroRate(maturity, day_counter, ql.Compounded).rate()

    r_2y = zero_rate(2)
    r_5y = zero_rate(5)
    r_10y = zero_rate(10)
    # Slope: 10Y - 2Y
    slope = r_10y - r_2y
    # Curvature: (2*5Y - 2Y - 10Y)
    curvature = 2 * r_5y - r_2y - r_10y
    return {
        "yield_curve": yield_curve,
        "slope": slope,
        "curvature": curvature
    }

# Alternative Data Proxy Features (Conceptual with PyTorch)

In [None]:
import pandas as pd
import numpy as np

def integrate_sentiment_features(market_df, sentiment_df, lookback=3):
    """
    Left-join a per-ticker rolling mean of sentiment scores onto market data.

    Sentiment rows are ordered by ticker and date, and for each ticker the
    mean of ``'sentiment_score'`` over the last ``lookback`` rows is computed
    (the window counts rows, not calendar days; shorter windows are used at
    the start of each ticker's history). The result is merged onto
    ``market_df`` on ``['date', 'ticker']``.

    NOTE(review): if ``sentiment_df`` holds several rows for the same
    (date, ticker), the left merge repeats the matching market row once per
    sentiment row - confirm the sentiment data is unique per day.

    Args:
        market_df: Frame with ``'date'`` and ``'ticker'`` columns. It is not
            modified.
        sentiment_df: Frame with ``'date'``, ``'ticker'`` and
            ``'sentiment_score'`` columns. It is not modified.
        lookback: Rolling window length in rows.

    Returns:
        A new DataFrame: ``market_df`` (with ``'date'`` as datetime) plus a
        ``sentiment_rolling_mean`` column, NaN where no sentiment row matches.
    """
    # Private copy: converting and sorting must not alter the caller's frame.
    scores = sentiment_df.copy()
    scores['date'] = pd.to_datetime(scores['date'])
    scores = scores.sort_values(['ticker', 'date'])
    scores['sentiment_rolling_mean'] = (
        scores.groupby('ticker')['sentiment_score']
        .transform(lambda x: x.rolling(lookback, min_periods=1).mean())
    )

    # Both merge keys must share a dtype; string dates on the market side
    # would otherwise fail to merge against datetime64 sentiment dates.
    prices = market_df.copy()
    prices['date'] = pd.to_datetime(prices['date'])

    merged = pd.merge(
        prices,
        scores[['date', 'ticker', 'sentiment_rolling_mean']],
        on=['date', 'ticker'],
        how='left'
    )
    return merged

# Exploratory Data Analysis (EDA)

# Target Variable Definition

# Data Splitting & Feature Storage