In [None]:
# %% [markdown]
"""
# Stock Range Prediction Feature Engineering
Now with percentage-based range and body range calculations
"""

# %% [markdown]
## 1. Initial Setup
# %%
import pandas as pd
import numpy as np
from tqdm import tqdm

pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 1000)

# %% [markdown]
## 2. Data Loading
# %%
def load_data(filepath):
    df = pd.read_json(filepath)
    df['date'] = pd.to_datetime(df['date'])
    return df.sort_values(['symbol', 'date']).reset_index(drop=True)

df = load_data("C:/Users/Dileep Sathya/OneDrive/Desktop/Stock_AI_2.0/artifacts/hist_data.json")
df.head()

# %% [markdown]
## 3. Percentage-Based Feature Engineering
# %%
def add_percentage_features(df):
    """Add all percentage-based range calculations"""
    # Daily range % (High-Low relative to previous close)
    df['daily_range_pct'] = ((df['high'] - df['low']) / df['high'].shift()) * 100
    
    # Daily body range % (Close-Open relative to previous close)
    df['daily_body_pct'] = ((df['close'] - df['open']) / df['close'].shift()) * 100
    
    # Previous day versions
    df['prev_day_range_pct'] = df.groupby('symbol')['daily_range_pct'].shift(1)
    df['prev_day_body_pct'] = df.groupby('symbol')['daily_body_pct'].shift(1)
    
    return df

def add_cyclic_features(df, col, period):
    df[f'{col}_sin'] = np.sin(2 * np.pi * df[col]/period)
    df[f'{col}_cos'] = np.cos(2 * np.pi * df[col]/period)
    return df

def calculate_atr(df, window=14):
    hl = df['prev_day_high'] - df['prev_day_low']
    hc = abs(df['prev_day_high'] - df['prev_day_close'].shift())
    lc = abs(df['prev_day_low'] - df['prev_day_close'].shift())
    tr = pd.concat([hl, hc, lc], axis=1).max(axis=1)
    return tr.rolling(window).mean()

def generate_features(df):
    # ===== Time Features =====
    df = add_percentage_features(df)  # Added first for correct shifting
    df['day_of_week'] = df['date'].dt.dayofweek
    df['month'] = df['date'].dt.month
    df = add_cyclic_features(df, 'day_of_week', 7)
    df = add_cyclic_features(df, 'month', 12)
    
    # ===== Lagged Features =====
    for col in ['open', 'high', 'low', 'close', 'volume']:
        df[f'prev_day_{col}'] = df.groupby('symbol')[col].shift(1)
    
    # ===== Volatility Features =====
    df['atr_14'] = df.groupby('symbol', group_keys=False).apply(calculate_atr)
    df['atr_pct'] = (df['atr_14'] / df['prev_day_close']) * 100  # ATR as % of price
    df['gap_pct'] = ((df['open'] - df['prev_day_close']) / df['prev_day_close']) * 100
    
    # ===== EMA Features =====
    for span in [5, 20, 50]:
        df[f'ema_{span}'] = df.groupby('symbol')['close'].transform(
            lambda x: x.ewm(span=span, adjust=False).mean()
        )
        df[f'open_ema_{span}_dist_pct'] = ((df['open'] - df[f'ema_{span}']) / df[f'ema_{span}']) * 100
    
    # ===== Volume Features =====
    df['avg_volume_10'] = df.groupby('symbol')['prev_day_volume'].transform(lambda x: x.rolling(10).mean())
    df['volume_pct_change'] = (df['prev_day_volume'] - df['avg_volume_10']) / df['avg_volume_10'] * 100
    
    # ===== k-value Calculation =====
    df['k'] = (df['prev_day_high'] - df['prev_day_open']) / (df['prev_day_high'] - df['prev_day_low'])
    df['k'] = df['k'].clip(-1,1)
    df['confidence'] = df.groupby('symbol')['k'].transform(lambda x: x.rolling(5).mean())
    
    # ===== Target Variables =====
    df['TG_range_pct'] = df['daily_range_pct']  # Target 1: Daily range %
    df['TG_body_pct'] = df['daily_body_pct']   # Target 2: Daily body range %
    
    return df.dropna()

# %% [markdown]
## 4. Generate Final DataFrame
# %%
final_df = generate_features(df)

"""feature_columns = [
    'symbol', 'date', 'open', 'high', 'low', 'close',
    # Percentage features
    'prev_day_range_pct', 'prev_day_body_pct', 'atr_pct', 'gap_pct',
    # Cyclic features
    'day_of_week_sin', 'day_of_week_cos', 'month_sin', 'month_cos',
    # EMA features
    'open_ema_5_dist_pct', 'open_ema_20_dist_pct', 'open_ema_50_dist_pct',
    # Other features
    'confidence', 'volume_pct_change',
    # Targets
     'TG_body_pct'
]"""



# %% [markdown]
## 5. Data Validation
# %%


# Show sample
final_df.head()

# %% [markdown]
## 6. Modeling Prep Example
# %%


  df['atr_14'] = df.groupby('symbol', group_keys=False).apply(calculate_atr)


Unnamed: 0,date,open,high,low,close,volume,symbol,daily_range_pct,daily_body_pct,prev_day_range_pct,prev_day_body_pct,day_of_week,month,day_of_week_sin,day_of_week_cos,month_sin,month_cos,prev_day_open,prev_day_high,prev_day_low,prev_day_close,prev_day_volume,atr_14,atr_pct,gap_pct,ema_5,open_ema_5_dist_pct,ema_20,open_ema_20_dist_pct,ema_50,open_ema_50_dist_pct,avg_volume_10,volume_pct_change,k,confidence,TG_range_pct,TG_body_pct
14,2000-01-21,58.0,60.0,58.0,58.82,30834,ABB,3.215434,1.392663,7.243877,0.657212,4,1,-0.433884,-0.900969,0.5,0.866025,58.5,62.2,58.0,58.88,116402.0,2.834286,4.813665,-1.494565,56.93816,1.8649,54.308946,6.796401,53.314463,8.788491,40516.7,187.293881,0.880952,0.674374,3.215434,1.392663
15,2000-01-24,59.0,60.2,56.0,56.79,29389,ABB,7.0,-3.757225,3.215434,1.392663,0,1,0.0,1.0,0.5,0.866025,58.0,60.0,58.0,58.82,30834.0,2.837857,4.824647,0.306018,56.888773,3.711148,54.545237,8.167098,53.450759,10.381969,39807.5,-22.542235,1.0,0.752499,7.0,-3.757225
16,2000-01-25,55.2,56.4,55.01,55.81,30284,ABB,2.30897,1.074133,7.0,-3.757225,1,1,0.781831,0.62349,0.5,0.866025,59.0,60.2,56.0,56.79,29389.0,2.923571,5.148039,-2.799789,56.529182,-2.351321,54.665691,0.977413,53.543278,3.094174,40034.3,-26.590449,0.285714,0.709642,2.30897,1.074133
17,2000-01-27,58.8,58.8,55.2,55.44,13570,ABB,6.382979,-6.020426,2.30897,1.074133,3,1,0.433884,-0.900969,0.5,0.866025,55.2,56.4,55.01,55.81,30284.0,2.765,4.954309,5.357463,56.166122,4.689443,54.739434,7.41799,53.617659,9.665362,40509.1,-25.241489,0.863309,0.752304,6.382979,-6.020426
18,2000-01-28,55.5,55.5,53.65,54.02,16667,ABB,3.146259,-2.669553,6.382979,-6.020426,4,1,-0.433884,-0.900969,0.5,0.866025,58.8,58.8,55.2,55.44,13570.0,2.709286,4.886879,0.108225,55.450748,0.088822,54.670917,1.516498,53.633437,3.480222,38274.7,-64.54577,0.0,0.605995,3.146259,-2.669553


In [20]:
final_df.tail(30)

Unnamed: 0,date,open,high,low,close,volume,symbol,daily_range_pct,daily_body_pct,prev_day_range_pct,prev_day_body_pct,day_of_week,month,day_of_week_sin,day_of_week_cos,month_sin,month_cos,prev_day_open,prev_day_high,prev_day_low,prev_day_close,prev_day_volume,atr_14,atr_pct,gap_pct,ema_5,open_ema_5_dist_pct,ema_20,open_ema_20_dist_pct,ema_50,open_ema_50_dist_pct,avg_volume_10,volume_pct_change,k,confidence,TG_range_pct,TG_body_pct
285138,2025-05-05,244.0,246.87,242.7,243.57,12170446,WIPRO,1.696501,-0.177049,2.074774,0.343685,0,5,0.0,1.0,0.5,-0.866025,242.04,245.8,240.75,242.87,14411726.0,7.014286,2.888082,0.465269,242.363511,0.675221,246.466676,-1.000815,261.737328,-6.776767,22054481.6,-34.65398,0.744554,0.488619,1.696501,-0.177049
285139,2025-05-06,244.5,245.9,240.61,241.19,8869045,WIPRO,2.142828,-1.358952,1.696501,-0.177049,1,5,0.781831,0.62349,0.5,-0.866025,244.0,246.87,242.7,243.57,12170446.0,6.365714,2.613505,0.38182,241.972341,1.044607,245.964135,-0.595264,260.93155,-6.297265,17783984.1,-31.565132,0.688249,0.571948,2.142828,-1.358952
285140,2025-05-07,236.01,244.45,236.01,244.04,9130056,WIPRO,3.43229,3.329325,2.142828,-1.358952,2,5,0.974928,-0.222521,0.5,-0.866025,244.5,245.9,240.61,241.19,8869045.0,6.147143,2.548672,-2.147684,242.66156,-2.741085,245.780884,-3.975445,260.269136,-9.320789,17041052.2,-47.954828,0.26465,0.534253,3.43229,3.329325
285141,2025-05-08,243.11,245.83,239.31,241.57,11434833,WIPRO,2.667212,-0.631044,3.43229,3.329325,3,5,0.433884,-0.900969,0.5,-0.866025,236.01,244.45,236.01,244.04,9130056.0,6.371429,2.610813,-0.381085,242.297707,0.335246,245.379848,-0.925034,259.535837,-6.328928,16117286.5,-43.3524,1.0,0.627586,2.667212,-0.631044
285142,2025-05-09,236.6,242.95,236.5,242.01,10582438,WIPRO,2.623764,2.239516,2.667212,-0.631044,4,5,-0.433884,-0.900969,0.5,-0.866025,243.11,245.83,239.31,241.57,11434833.0,6.319286,2.615923,-2.057375,242.201805,-2.312867,245.05891,-3.451786,258.848549,-8.5952,14070965.6,-18.734554,0.417178,0.622926,2.623764,2.239516
285143,2025-05-12,245.09,257.94,244.88,257.28,19791711,WIPRO,5.375592,5.036982,2.623764,2.239516,0,5,0.0,1.0,0.5,-0.866025,236.6,242.95,236.5,242.01,10582438.0,5.672857,2.344059,1.272675,247.22787,-0.864737,246.222823,-0.46008,258.787038,-5.292783,12829127.4,-17.51241,0.984496,0.670915,5.375592,5.036982
285144,2025-05-13,256.95,256.95,250.62,251.57,11714159,WIPRO,2.454059,-2.091107,5.375592,5.036982,1,5,0.781831,0.62349,0.5,-0.866025,245.09,257.94,244.88,257.28,19791711.0,6.532143,2.538924,-0.128265,248.675247,3.327534,246.732078,4.141303,258.504016,-0.601158,13146255.5,50.550178,0.98392,0.730049,2.454059,-2.091107
285145,2025-05-14,251.57,253.69,250.75,252.94,8839105,WIPRO,1.144191,0.54458,2.454059,-2.091107,2,5,0.974928,-0.222521,0.5,-0.866025,256.95,256.95,250.62,251.57,11714159.0,6.615,2.629487,0.0,250.096831,0.589039,247.323309,1.717061,258.28582,-2.60015,12895830.0,-9.163202,0.0,0.677119,1.144191,0.54458
285146,2025-05-15,253.2,257.1,250.62,256.57,12083645,WIPRO,2.554299,1.332332,1.144191,0.54458,3,5,0.433884,-0.900969,0.5,-0.866025,251.57,253.69,250.75,252.94,8839105.0,6.025,2.381988,0.102791,252.254554,0.374798,248.203946,2.012883,258.218533,-1.943521,12115015.9,-27.040088,0.721088,0.621337,2.554299,1.332332
285147,2025-05-16,257.0,257.0,253.53,254.31,6640387,WIPRO,1.349669,-1.048447,2.554299,1.332332,4,5,-0.433884,-0.900969,0.5,-0.866025,253.2,257.1,250.62,256.57,12083645.0,6.227143,2.427074,0.167596,252.939703,1.605243,248.785475,3.301851,258.065257,-0.412786,11902716.4,1.520061,0.601852,0.658271,1.349669,-1.048447


In [None]:

# %% [markdown]
## 6. Modeling Prep (Example)
# %%
from sklearn.model_selection import train_test_split

# Example usage for modeling
def prepare_model_data(df, symbol):
    """Prepare train/test data for a specific symbol"""
    symbol_data = df[df['symbol'] == symbol]
    
    features = [
        'prev_day_range', 'atr_14', 'gap_pct',
        'day_of_week_sin', 'day_of_week_cos',
        'month_sin', 'month_cos',
        'open_ema_5_dist', 'open_ema_20_dist', 'open_ema_50_dist',
        'rolling_k', 'percent_change_in_avg_volume',
        'prev_day_chg%', 'prev_day_body_range'
    ]
    
    X = symbol_data[features]
    y = symbol_data['TG_range']
    
    return train_test_split(X, y, test_size=0.2, shuffle=False)

# Example for AAPL
X_train, X_test, y_train, y_test = prepare_model_data(final_df, 'AAPL')
print(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")

# %% [markdown]
## 7. Feature Importance (Example)
# %%
from sklearn.ensemble import RandomForestRegressor

# Train example model
model = RandomForestRegressor(n_estimators=100)
model.fit(X_train, y_train)

# Get feature importance
importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

importance.plot.bar(x='feature', y='importance')