In [4]:
nifty_top_tickers = [
    "RELIANCE",
    "HDFCBANK",
    "TCS",
    "BHARTIARTL",
    "ICICIBANK",
    "SBIN",
    "INFY",
    "KOTAKBANK",
    "BAJFINANCE",
    "HINDUNILVR"
]

In [6]:
import pandas as pd
import numpy as np
import os

# Create output directory if it doesn't exist
output_dir = 'stock_data_daily_processed'
os.makedirs(output_dir, exist_ok=True)

# Define epsilon threshold for neutral movement
# Movements within [-epsilon, +epsilon] are considered neutral
epsilon = 0.001  # 0.1% threshold (adjust as needed)

# Process each stock
for ticker in nifty_top_tickers:
    # Read the daily stock data
    input_file = f'stock_data_daily/{ticker}_daily_data.csv'
    
    # Read the CSV file
    df = pd.read_csv(input_file, header=[0, 1], index_col=0, parse_dates=True)
    
    # Get the price columns
    close_col = ('Close', f'{ticker}.NS')
    high_col = ('High', f'{ticker}.NS')
    low_col = ('Low', f'{ticker}.NS')
    
    # === Temporal Features ===
    # Extract day of week and month from the date index
    df[('day_of_week', f'{ticker}.NS')] = df.index.dayofweek  # 0=Monday, 6=Sunday
    df[('month', f'{ticker}.NS')] = df.index.month  # 1=January, 12=December
    
    # Cyclic encoding for day of week (captures weekly periodicity)
    df[('day_of_week_sin', f'{ticker}.NS')] = np.sin(2 * np.pi * df.index.dayofweek / 7)
    df[('day_of_week_cos', f'{ticker}.NS')] = np.cos(2 * np.pi * df.index.dayofweek / 7)
    
    # Cyclic encoding for month (captures yearly seasonality)
    df[('month_sin', f'{ticker}.NS')] = np.sin(2 * np.pi * df.index.month / 12)
    df[('month_cos', f'{ticker}.NS')] = np.cos(2 * np.pi * df.index.month / 12)
    
    # === Price-based Features ===
    # Calculate log returns
    # Log return = ln(Price_t / Price_t-1)
    df[('log_return', f'{ticker}.NS')] = np.log(df[close_col] / df[close_col].shift(1))
    
    # === Technical Indicators ===
    # Calculate SMA (Simple Moving Average)
    df[('sma_5', f'{ticker}.NS')] = df[close_col].rolling(window=5).mean()
    df[('sma_10', f'{ticker}.NS')] = df[close_col].rolling(window=10).mean()
    
    # Calculate EMA (Exponential Moving Average)
    df[('ema_10', f'{ticker}.NS')] = df[close_col].ewm(span=10, adjust=False).mean()
    
    # Calculate RSI (Relative Strength Index)
    delta = df[close_col].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    df[('rsi_14', f'{ticker}.NS')] = 100 - (100 / (1 + rs))
    
    # Calculate MACD (Moving Average Convergence Divergence)
    ema_12 = df[close_col].ewm(span=12, adjust=False).mean()
    ema_26 = df[close_col].ewm(span=26, adjust=False).mean()
    df[('macd', f'{ticker}.NS')] = ema_12 - ema_26
    df[('macd_signal', f'{ticker}.NS')] = df[('macd', f'{ticker}.NS')].ewm(span=9, adjust=False).mean()
    df[('macd_hist', f'{ticker}.NS')] = df[('macd', f'{ticker}.NS')] - df[('macd_signal', f'{ticker}.NS')]
    
    # Calculate Bollinger Bands
    sma_20 = df[close_col].rolling(window=20).mean()
    std_20 = df[close_col].rolling(window=20).std()
    df[('bollinger_upper', f'{ticker}.NS')] = sma_20 + (std_20 * 2)
    df[('bollinger_lower', f'{ticker}.NS')] = sma_20 - (std_20 * 2)
    df[('bollinger_mid', f'{ticker}.NS')] = sma_20
    
    # === Target Variable ===
    # Calculate target: Next day price movement class label
    # Based on next day's log_return (shift(-1) to get next day's value)
    next_day_log_return = df[('log_return', f'{ticker}.NS')].shift(-1)
    
    # Create target column with epsilon threshold: Up=1, Neutral=0, Down=-1
    df[('target', f'{ticker}.NS')] = np.where(next_day_log_return > epsilon, 1,
                                               np.where(next_day_log_return < -epsilon, -1, 0))
    
    # Save the processed data
    output_file = f'{output_dir}/{ticker}_daily_data.csv'
    df.to_csv(output_file)
    
    print(f'Processed {ticker}:')
    print(f'  - Temporal features: day_of_week, month (ordinal + cyclic encoding)')
    print(f'  - Technical indicators: SMA_5, SMA_10, EMA_10, RSI_14, MACD, Bollinger Bands')
    print(f'  - Target distribution (epsilon={epsilon}): Up={sum(df[("target", f"{ticker}.NS")] == 1)}, '
          f'Neutral={sum(df[("target", f"{ticker}.NS")] == 0)}, '
          f'Down={sum(df[("target", f"{ticker}.NS")] == -1)}')
    print(f'  - Total features: {len(df.columns)}')
    print(f'  - Saved to {output_file}\n')

print('All stocks processed successfully!')


Processed RELIANCE:
  - Temporal features: day_of_week, month (ordinal + cyclic encoding)
  - Technical indicators: SMA_5, SMA_10, EMA_10, RSI_14, MACD, Bollinger Bands
  - Target distribution (epsilon=0.001): Up=736, Neutral=86, Down=659
  - Total features: 23
  - Saved to stock_data_daily_processed/RELIANCE_daily_data.csv

Processed HDFCBANK:
  - Temporal features: day_of_week, month (ordinal + cyclic encoding)
  - Technical indicators: SMA_5, SMA_10, EMA_10, RSI_14, MACD, Bollinger Bands
  - Target distribution (epsilon=0.001): Up=716, Neutral=94, Down=671
  - Total features: 23
  - Saved to stock_data_daily_processed/HDFCBANK_daily_data.csv

Processed TCS:
  - Temporal features: day_of_week, month (ordinal + cyclic encoding)
  - Technical indicators: SMA_5, SMA_10, EMA_10, RSI_14, MACD, Bollinger Bands
  - Target distribution (epsilon=0.001): Up=722, Neutral=104, Down=655
  - Total features: 23
  - Saved to stock_data_daily_processed/TCS_daily_data.csv

Processed BHARTIARTL:
  - Te