In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np

# CONFIG
IN_CSV  = "/content/drive/MyDrive/MRP/new_final_dataset.csv"
OUT_CSV = "/content/drive/MyDrive/MRP/test_dataset.csv"
OPEN_COL = "none"


# LOAD IMMUTABLE COLUMNS
# (everything else will be rebuilt)
raw_cols = [
    "date", "symbol",
    "adj close", 
    "log_volume"
]
df = pd.read_csv(IN_CSV, usecols=raw_cols, parse_dates=["date"])
df.sort_values(["symbol", "date"], inplace=True, ignore_index=True)

#   create group-by objects
close_g = df.groupby("symbol")["adj close"]
vol_g   = df.groupby("symbol")["log_volume"]

# PRICE-BASED FEATURES  (trailing, causal)
df["return_1d"] = np.log(
    df["adj close"] /
    df.groupby("symbol")["adj close"].shift(1)
)

for lag in (1, 3, 5):
    df[f"return_1d_lag{lag}"] = df.groupby("symbol")["return_1d"].shift(lag)

df["return_7d_mean"] = (
    df.groupby("symbol")["return_1d"]
      .transform(lambda s: s.rolling(7, min_periods=1).mean())
)
df["return_7d_std"]  = (
    df.groupby("symbol")["return_1d"]
      .transform(lambda s: s.rolling(7, min_periods=1).std())
)

df["ma_10"] = (
    df.groupby("symbol")["adj close"]
      .transform(lambda s: s.shift(1).rolling(10, min_periods=1).mean())
)
df["vol_30"] = (
    df.groupby("symbol")["log_volume"]
      .transform(lambda s: s.shift(1).rolling(30, min_periods=1).std())
)

# safe trailing RSI14
def _rsi14(series: pd.Series, window: int = 14) -> pd.Series:
    diff = series.diff()
    gain = diff.clip(lower=0)
    loss = -diff.clip(upper=0)

    avg_gain = gain.shift(1).rolling(window, min_periods=1).mean()
    avg_loss = loss.shift(1).rolling(window, min_periods=1).mean()

    rs = avg_gain / avg_loss.replace(0, np.nan)
    return 100 - 100 / (1 + rs)

df["rsi_14"] = (
    df.groupby("symbol")["adj close"]
      .transform(_rsi14)
)

# TARGET (tomorrow’s direction)
df["target"] = (
    (df.groupby("symbol")["adj close"].shift(-1) > df["adj close"])
    .astype("int8")
)



# CLEAN-UP & SAVE

df.dropna(inplace=True)              # drop first n rows per symbol
df.reset_index(drop=True, inplace=True)

df.to_csv(OUT_CSV, index=False)
print(f"Saved dataset: {OUT_CSV}")

In [3]:
df.head()

Unnamed: 0,date,symbol,adj close,log_volume,return_1d,return_1d_lag1,return_1d_lag3,return_1d_lag5,return_7d_mean,return_7d_std,ma_10,vol_30,rsi_14,target
0,2016-01-22,A,3.623964,14.422283,0.00537,-0.002737,-0.009548,-0.002824,-0.001183,0.005735,3.622068,0.292457,23.216507,0
1,2016-01-26,A,3.611846,14.377923,-0.003349,0.00537,0.005436,-0.002791,-0.001492,0.005299,3.622339,0.325307,37.571274,0
2,2016-01-27,A,3.599842,14.207148,-0.003329,-0.003349,-0.002737,-0.009548,-0.001564,0.005323,3.621027,0.343275,33.643947,0
3,2016-01-28,A,3.574855,15.062371,-0.006965,-0.003329,0.00537,0.005436,-0.002161,0.005704,3.618673,0.374544,30.487081,1
4,2016-02-01,A,3.616504,14.796165,0.011583,-0.006965,-0.003349,-0.002737,0.000858,0.006655,3.614292,0.369227,25.505347,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6233757 entries, 0 to 6233756
Data columns (total 14 columns):
 #   Column          Dtype         
---  ------          -----         
 0   date            datetime64[ns]
 1   symbol          object        
 2   adj close       float64       
 3   log_volume      float64       
 4   return_1d       float64       
 5   return_1d_lag1  float64       
 6   return_1d_lag3  float64       
 7   return_1d_lag5  float64       
 8   return_7d_mean  float64       
 9   return_7d_std   float64       
 10  ma_10           float64       
 11  vol_30          float64       
 12  rsi_14          float64       
 13  target          int8          
dtypes: datetime64[ns](1), float64(11), int8(1), object(1)
memory usage: 624.2+ MB


In [None]:
df.isna().sum().sum()

np.int64(0)

In [7]:
df['target'].value_counts(normalize=True).round(3)

Unnamed: 0_level_0,proportion
target,Unnamed: 1_level_1
1,0.506
0,0.494
