In [1]:
nifty_top_tickers = [
    "RELIANCE",
    "HDFCBANK",
    "TCS",
    "BHARTIARTL",
    "ICICIBANK",
    "SBIN",
    "INFY",
    "KOTAKBANK",
    "BAJFINANCE",
    "HINDUNILVR"
]

In [2]:
import pandas as pd
import numpy as np
import os

# Create output directory if it doesn't exist
output_dir = 'stock_data_daily_processed'
os.makedirs(output_dir, exist_ok=True)

# Process each stock
for ticker in nifty_top_tickers:
    # Read the daily stock data
    input_file = f'stock_data_daily/{ticker}_daily_data.csv'
    
    # Read the CSV file
    df = pd.read_csv(input_file, header=[0, 1], index_col=0, parse_dates=True)
    
    # Get the price and volume columns
    close_col = ('Close', f'{ticker}.NS')
    high_col = ('High', f'{ticker}.NS')
    low_col = ('Low', f'{ticker}.NS')
    open_col = ('Open', f'{ticker}.NS')
    volume_col = ('Volume', f'{ticker}.NS')
    
    # ==========================================
    # === PRICE-BASED FEATURES ===
    # ==========================================
    
    # Calculate log returns (current day)
    df[('log_return', f'{ticker}.NS')] = np.log(df[close_col] / df[close_col].shift(1))
    
    # Lagged returns (historical momentum indicators)
    df[('return_lag_1', f'{ticker}.NS')] = df[('log_return', f'{ticker}.NS')].shift(1)
    df[('return_lag_2', f'{ticker}.NS')] = df[('log_return', f'{ticker}.NS')].shift(2)
    df[('return_lag_3', f'{ticker}.NS')] = df[('log_return', f'{ticker}.NS')].shift(3)
    df[('return_lag_5', f'{ticker}.NS')] = df[('log_return', f'{ticker}.NS')].shift(5)
    
    # Price momentum features
    df[('price_change_5d', f'{ticker}.NS')] = (df[close_col] / df[close_col].shift(5)) - 1
    df[('price_change_10d', f'{ticker}.NS')] = (df[close_col] / df[close_col].shift(10)) - 1
    df[('price_change_20d', f'{ticker}.NS')] = (df[close_col] / df[close_col].shift(20)) - 1
    
    # Intraday price range (volatility proxy)
    df[('high_low_ratio', f'{ticker}.NS')] = (df[high_col] - df[low_col]) / df[close_col]
    df[('open_close_ratio', f'{ticker}.NS')] = (df[close_col] - df[open_col]) / df[open_col]
    
    # ==========================================
    # === VOLATILITY FEATURES ===
    # ==========================================
    
    # Historical volatility (rolling standard deviation of returns)
    df[('volatility_5d', f'{ticker}.NS')] = df[('log_return', f'{ticker}.NS')].rolling(window=5).std()
    df[('volatility_10d', f'{ticker}.NS')] = df[('log_return', f'{ticker}.NS')].rolling(window=10).std()
    df[('volatility_20d', f'{ticker}.NS')] = df[('log_return', f'{ticker}.NS')].rolling(window=20).std()
    
    # ==========================================
    # === VOLUME-BASED FEATURES ===
    # ==========================================
    
    # Volume changes (important for detecting unusual trading activity)
    df[('volume_change', f'{ticker}.NS')] = df[volume_col].pct_change()
    df[('volume_ma_5', f'{ticker}.NS')] = df[volume_col].rolling(window=5).mean()
    df[('volume_ma_20', f'{ticker}.NS')] = df[volume_col].rolling(window=20).mean()
    
    # Volume ratio (current vs average)
    df[('volume_ratio_5d', f'{ticker}.NS')] = df[volume_col] / df[('volume_ma_5', f'{ticker}.NS')]
    df[('volume_ratio_20d', f'{ticker}.NS')] = df[volume_col] / df[('volume_ma_20', f'{ticker}.NS')]
    
    # Price-Volume correlation (OBV-like indicator)
    df[('price_volume_trend', f'{ticker}.NS')] = df[('log_return', f'{ticker}.NS')] * df[volume_col]
    
    # ==========================================
    # === TECHNICAL INDICATORS ===
    # ==========================================
    
    # Moving Averages
    df[('sma_5', f'{ticker}.NS')] = df[close_col].rolling(window=5).mean()
    df[('sma_10', f'{ticker}.NS')] = df[close_col].rolling(window=10).mean()
    df[('sma_20', f'{ticker}.NS')] = df[close_col].rolling(window=20).mean()
    df[('ema_10', f'{ticker}.NS')] = df[close_col].ewm(span=10, adjust=False).mean()
    df[('ema_20', f'{ticker}.NS')] = df[close_col].ewm(span=20, adjust=False).mean()
    
    # Price position relative to moving averages
    df[('price_to_sma_5', f'{ticker}.NS')] = df[close_col] / df[('sma_5', f'{ticker}.NS')] - 1
    df[('price_to_sma_20', f'{ticker}.NS')] = df[close_col] / df[('sma_20', f'{ticker}.NS')] - 1
    
    # RSI (Relative Strength Index)
    delta = df[close_col].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    df[('rsi_14', f'{ticker}.NS')] = 100 - (100 / (1 + rs))
    
    # MACD (Moving Average Convergence Divergence)
    ema_12 = df[close_col].ewm(span=12, adjust=False).mean()
    ema_26 = df[close_col].ewm(span=26, adjust=False).mean()
    df[('macd', f'{ticker}.NS')] = ema_12 - ema_26
    df[('macd_signal', f'{ticker}.NS')] = df[('macd', f'{ticker}.NS')].ewm(span=9, adjust=False).mean()
    df[('macd_hist', f'{ticker}.NS')] = df[('macd', f'{ticker}.NS')] - df[('macd_signal', f'{ticker}.NS')]
    
    # Bollinger Bands
    sma_20 = df[close_col].rolling(window=20).mean()
    std_20 = df[close_col].rolling(window=20).std()
    df[('bollinger_upper', f'{ticker}.NS')] = sma_20 + (std_20 * 2)
    df[('bollinger_lower', f'{ticker}.NS')] = sma_20 - (std_20 * 2)
    df[('bollinger_mid', f'{ticker}.NS')] = sma_20
    df[('bollinger_width', f'{ticker}.NS')] = (df[('bollinger_upper', f'{ticker}.NS')] - df[('bollinger_lower', f'{ticker}.NS')]) / df[('bollinger_mid', f'{ticker}.NS')]
    
    # Bollinger Band position (where price is relative to bands)
    df[('bollinger_position', f'{ticker}.NS')] = (df[close_col] - df[('bollinger_lower', f'{ticker}.NS')]) / (df[('bollinger_upper', f'{ticker}.NS')] - df[('bollinger_lower', f'{ticker}.NS')])
    
    # ==========================================
    # === TARGET VARIABLE ===
    # ==========================================
    
    # Calculate target: Next day price movement (binary classification)
    # Based on next day's log_return (shift(-1) to get next day's value)
    next_day_log_return = df[('log_return', f'{ticker}.NS')].shift(-1)
    
    # Create binary target column: Up (positive return) = 1, Down/Neutral (non-positive return) = 0
    df[('target', f'{ticker}.NS')] = (next_day_log_return > 0).astype(int)
    
    # ==========================================
    # === DATA QUALITY ===
    # ==========================================
    
    # Drop rows with NaN values (from rolling calculations and shifts)
    # Keep track of data before/after cleaning
    rows_before = len(df)
    df_clean = df.dropna()
    rows_after = len(df_clean)
    
    # Save the processed data
    output_file = f'{output_dir}/{ticker}_daily_data.csv'
    df_clean.to_csv(output_file)
    
    # Calculate class distribution
    target_col = ('target', f'{ticker}.NS')
    class_1_count = sum(df_clean[target_col] == 1)
    class_0_count = sum(df_clean[target_col] == 0)
    class_balance = class_1_count / (class_1_count + class_0_count) * 100
    
    print(f'Processed {ticker}:')
    print(f'  - Price features: log_return, lagged returns (1,2,3,5), momentum (5d,10d,20d), price ratios')
    print(f'  - Volatility features: 5d, 10d, 20d rolling volatility')
    print(f'  - Volume features: volume changes, MA ratios, price-volume trend')
    print(f'  - Technical indicators: SMA, EMA, RSI, MACD, Bollinger Bands (with position)')
    print(f'  - Data: {rows_before} rows -> {rows_after} rows (removed {rows_before - rows_after} rows with NaN)')
    print(f'  - Target distribution: Up={class_1_count} ({class_balance:.1f}%), Down/Neutral={class_0_count} ({100-class_balance:.1f}%)')
    print(f'  - Total features: {len(df_clean.columns) - 1}')  # -1 for target
    print(f'  - Saved to {output_file}\n')

print('All stocks processed successfully!')
print('\n=== IMPORTANT NOTES ===')
print('✅ Features use only PAST data (no look-ahead bias)')
print('✅ Volume-based features included')
print('✅ Volatility measures added')
print('✅ Removed temporal features (day/month) - not predictive for stocks')
print('✅ Added momentum and lagged returns')
print('⚠️  Remember to normalize/scale features before training!')


Processed RELIANCE:
  - Price features: log_return, lagged returns (1,2,3,5), momentum (5d,10d,20d), price ratios
  - Volatility features: 5d, 10d, 20d rolling volatility
  - Volume features: volume changes, MA ratios, price-volume trend
  - Technical indicators: SMA, EMA, RSI, MACD, Bollinger Bands (with position)
  - Data: 1481 rows -> 1461 rows (removed 20 rows with NaN)
  - Target distribution: Up=760 (52.0%), Down/Neutral=701 (48.0%)
  - Total features: 40
  - Saved to stock_data_daily_processed/RELIANCE_daily_data.csv

Processed HDFCBANK:
  - Price features: log_return, lagged returns (1,2,3,5), momentum (5d,10d,20d), price ratios
  - Volatility features: 5d, 10d, 20d rolling volatility
  - Volume features: volume changes, MA ratios, price-volume trend
  - Technical indicators: SMA, EMA, RSI, MACD, Bollinger Bands (with position)
  - Data: 1481 rows -> 1461 rows (removed 20 rows with NaN)
  - Target distribution: Up=750 (51.3%), Down/Neutral=711 (48.7%)
  - Total features: 40
  -