In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler

# Load and parse data
df = pd.read_json("C:/Users/Dileep Sathya/OneDrive/Desktop/Stock_AI_2.0/artifacts/hist_data.json")
df['date'] = pd.to_datetime(df['date'])
df['day_of_week'] = df['date'].dt.dayofweek
df['month'] = df['date'].dt.month
df['TG_body_pct'] = ((df['close'] - df['open']) / df['close']) * 100

# Add previous day features
for col in ['open', 'high', 'low', 'close', 'volume']:
    df[f'prev_day_{col}'] = df.groupby('symbol')[col].shift(1)

df = df.dropna()
df = df.drop(columns=['high', 'low', 'close', 'volume'])

# Add range and body % features
df['prev_range_pct'] = ((df['prev_day_high'] - df['prev_day_low']) / df['prev_day_high']) * 100
df['prev_body_pct'] = ((df['prev_day_close'] - df['prev_day_open']) / df['prev_day_close']) * 100

# ATR Calculation
def calculate_atr(df, window=14):
    hl = df['prev_day_high'] - df['prev_day_low']
    hc = abs(df['prev_day_high'] - df['prev_day_close'])
    lc = abs(df['prev_day_low'] - df['prev_day_close'])
    tr = pd.concat([hl, hc, lc], axis=1).max(axis=1)
    return tr.rolling(window).mean()

df['atr_14'] = df.groupby('symbol', group_keys=False).apply(calculate_atr)
df['atr_pct'] = (df['atr_14'] / df['prev_day_close']) * 100

# Gap %
df['gap_pct'] = ((df['open'] - df['prev_day_close']) / df['prev_day_close']) * 100

# EMA features
def compute_ema_features(df, spans):
    for span in spans:
        ema_col = f'ema_{span}'
        df[ema_col] = df.groupby('symbol')['prev_day_close'].transform(lambda x: x.ewm(span=span, adjust=False).mean())
        df[f'open_ema_{span}_dist_pct'] = ((df['open'] - df[ema_col]) / df[ema_col]) * 100
    return df

df = compute_ema_features(df, [5, 20, 50])

# Volume features
df['avg_volume_10'] = df.groupby('symbol')['prev_day_volume'].transform(lambda x: x.rolling(10).mean())
df['volume_pct_change'] = np.where(
    df['avg_volume_10'] != 0,
    (df['prev_day_volume'] - df['avg_volume_10']) / df['avg_volume_10'] * 100,
    0
)
df['final_tg'] = 'no_trade'  # default
df.loc[df['TG_body_pct'] > 2, 'final_tg'] = 'buy'
df.loc[df['TG_body_pct'] < -2, 'final_tg'] = 'sell'
df=df.drop(columns=['TG_body_pct'])
df = df.dropna()

# Final cleanup
df = df.drop(columns=[
    'symbol', 'open', 'atr_14', 'date',
    'prev_day_open', 'prev_day_high', 'prev_day_low',
    'prev_day_close', 'prev_day_volume',
    'ema_5', 'ema_20', 'ema_50', 'avg_volume_10'
])

df


  df['atr_14'] = df.groupby('symbol', group_keys=False).apply(calculate_atr)


Unnamed: 0,day_of_week,month,prev_range_pct,prev_body_pct,atr_pct,gap_pct,open_ema_5_dist_pct,open_ema_20_dist_pct,open_ema_50_dist_pct,volume_pct_change,final_tg
14,4,1,4.200000,-4.094379,5.863983,-0.069396,-0.110164,-0.909209,-2.142423,-53.377616,no_trade
15,0,1,1.659751,-0.558659,5.771149,0.349162,-0.096780,-0.977987,-2.243320,-20.623582,sell
16,1,1,4.652778,-3.829480,5.868084,-0.794798,-3.327654,-4.969098,-6.382405,-32.866477,no_trade
17,3,1,2.813853,-1.478197,5.611868,2.439024,-0.847086,-3.485321,-5.208056,0.337223,sell
18,4,1,5.599426,-4.210526,5.601504,-1.729323,-4.961164,-8.341865,-10.293109,1.842590,sell
...,...,...,...,...,...,...,...,...,...,...,...
285163,0,6,1.935277,-0.837603,2.507580,0.349679,-1.126645,-1.956685,-2.122246,31.807805,no_trade
285164,1,6,1.903018,1.445571,2.383424,1.437585,1.656187,0.961385,0.768247,-34.611600,no_trade
285165,2,6,4.123711,-1.275815,2.498662,0.470457,0.721707,0.188151,-0.013424,362.581518,no_trade
285166,3,6,1.490880,-0.856480,2.444226,0.056031,-0.033144,-0.544226,-0.773406,-19.841768,no_trade


In [3]:
df.final_tg.value_counts()

final_tg
no_trade    201838
sell         45303
buy          37215
Name: count, dtype: int64

In [11]:
sc=StandardScaler()
from sklearn.model_selection import train_test_split
x=df.drop(columns=['final_tg'])
y=df['final_tg']
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

x_train_scaled=sc.fit_transform(x_train)
x_test_trans=sc.transform(x_test)

