In [1]:
from google.colab import drive
# Attach Google Drive to the Colab filesystem; the CSV paths used in this
# notebook all live under this mount point.
drive.mount(mountpoint='/content/drive/')

Mounted at /content/drive/


In [2]:
import pandas as pd
import numpy as np

# Load the preprocessed price table, parsing `date` into datetime64 on read.
price_df = pd.read_csv('/content/drive/MyDrive/MRP/price_preprocessed.csv', parse_dates=['date'])

# Order rows chronologically within each symbol and renumber the index; the
# per-symbol diff/rolling computations in the following cells depend on it.
price_df = price_df.sort_values(['symbol', 'date']).reset_index(drop=True)

# info() prints its summary and returns None, so it must not be packed into a
# tuple with head(): that displays "(<frame repr>, None)" as plain text.
# Call it on its own and leave head() as the cell's last expression so the
# frame is rendered as a table.
price_df.info()
price_df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11083232 entries, 0 to 11083231
Data columns (total 10 columns):
 #   Column         Dtype         
---  ------         -----         
 0   date           datetime64[ns]
 1   volume         float64       
 2   open           float64       
 3   high           float64       
 4   low            float64       
 5   close          float64       
 6   adj close      float64       
 7   symbol         object        
 8   volume_capped  float64       
 9   log_volume     float64       
dtypes: datetime64[ns](1), float64(8), object(1)
memory usage: 845.6+ MB


(        date     volume       open       high        low      close  \
 0 2016-01-04  3287300.0  41.060001  41.189999  40.340000  40.689999   
 1 2016-01-05  2587200.0  40.730000  40.950001  40.340000  40.549999   
 2 2016-01-06  2103600.0  40.240002  40.990002  40.049999  40.730000   
 3 2016-01-07  3504300.0  40.139999  40.150002  38.810001  39.000000   
 4 2016-01-08  3736700.0  39.220001  39.709999  38.470001  38.590000   
 
    adj close symbol  volume_capped  log_volume  
 0  39.089256      A      3287300.0   15.005577  
 1  38.954754      A      2587200.0   14.766087  
 2  39.127674      A      2103600.0   14.559161  
 3  37.465733      A      3504300.0   15.069502  
 4  37.071869      A      3736700.0   15.133714  ,
 None)

In [3]:
# One-day simple return of the adjusted close, computed inside each symbol so
# that the first row of a symbol never looks back into the previous symbol.
# NaN results (such as each symbol's first row) are replaced by 0.0.
price_df['return_1d'] = (
    price_df
    .groupby('symbol')['adj close']
    .pct_change()
    .fillna(0.0)
)

In [4]:
# 10-day simple moving average of the adjusted close, per symbol.
# min_periods=10 keeps the value NaN until a full 10-row window is available.
price_df['ma_10'] = (
    price_df
    .groupby('symbol')['adj close']
    .transform(lambda prices: prices.rolling(window=10, min_periods=10).mean())
)

In [5]:
# 30-day rolling volatility: standard deviation of return_1d, per symbol.
# min_periods=30 keeps the value NaN until a full 30-row window is available.
price_df['vol_30'] = (
    price_df
    .groupby('symbol')['return_1d']
    .transform(lambda returns: returns.rolling(window=30, min_periods=30).std())
)

In [6]:
# RSI (14-day)
def compute_rsi(close_series, window=14):
    """Relative Strength Index from simple rolling means of gains and losses.

    Parameters
    ----------
    close_series : pandas.Series
        Prices of a single instrument in chronological order.
    window : int, default 14
        Number of consecutive price changes averaged for the gain and loss
        means.

    Returns
    -------
    pandas.Series
        RSI on a 0-100 scale, indexed like ``close_series``. Entries without
        a full window of price changes are NaN. A window with gains but no
        losses yields 100, one with losses but no gains yields 0, and a window
        with no price movement at all yields the neutral value 50.
    """
    delta = close_series.diff()
    # Split each change into its gain part and its (positive) loss part.
    up = delta.clip(lower=0)
    down = -delta.clip(upper=0)
    ma_up = up.rolling(window, min_periods=window).mean()
    ma_down = down.rolling(window, min_periods=window).mean()
    # A zero average loss gives rs = inf, which the formula maps to RSI = 100.
    rs = ma_up / ma_down
    rsi = 100 - (100 / (1 + rs))
    # When the price did not move over the whole window both means are 0 and
    # rs is 0/0 = NaN. Left as NaN, such rows are indistinguishable from
    # warm-up rows and get removed by a later dropna; report a neutral 50.
    # Warm-up rows stay NaN because comparisons against NaN are False.
    flat = (ma_up == 0) & (ma_down == 0)
    return rsi.mask(flat, 50.0)

# 14-day RSI of the adjusted close, computed separately for every symbol.
# transform() returns a result already aligned to price_df's index, so no
# index manipulation is needed afterwards (apply() + reset_index(level=0)
# depends on whether pandas prepends the group key to the result index).
# This also matches how ma_10 and vol_30 are built.
price_df['rsi_14'] = price_df.groupby('symbol')['adj close'] \
    .transform(lambda x: compute_rsi(x, 14))

In [7]:
# Columns carried into the engineered feature table, in output order.
cols_to_keep = [
    'date', 'symbol',
    'open', 'high', 'low', 'adj close',
    'log_volume',
    'ma_10', 'vol_30', 'rsi_14',
]

# Discard every row where any of the three engineered features is NaN
# (e.g. rows still inside a rolling warm-up window), renumber the index,
# then keep only the selected columns.
price_feats = (
    price_df
    .dropna(subset=['ma_10', 'vol_30', 'rsi_14'])
    .reset_index(drop=True)
    [cols_to_keep]
)

In [8]:
# Print the row count, column names and dtypes of the engineered feature table.
price_feats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10851560 entries, 0 to 10851559
Data columns (total 10 columns):
 #   Column      Dtype         
---  ------      -----         
 0   date        datetime64[ns]
 1   symbol      object        
 2   open        float64       
 3   high        float64       
 4   low         float64       
 5   adj close   float64       
 6   log_volume  float64       
 7   ma_10       float64       
 8   vol_30      float64       
 9   rsi_14      float64       
dtypes: datetime64[ns](1), float64(8), object(1)
memory usage: 827.9+ MB


In [9]:
# Spot-check the RSI values on the first 20 rows of the feature table.
price_feats.head(20)[['symbol', 'date', 'rsi_14']]

Unnamed: 0,symbol,date,rsi_14
0,A,2016-02-16,47.826113
1,A,2016-02-17,54.011715
2,A,2016-02-18,55.421703
3,A,2016-02-19,48.788891
4,A,2016-02-22,51.843788
5,A,2016-02-23,50.528541
6,A,2016-02-24,51.502575
7,A,2016-02-25,51.097212
8,A,2016-02-26,59.416867
9,A,2016-02-29,67.634884


In [10]:
# Write the engineered features to Drive; index=False omits the positional
# index from the CSV.
price_feats.to_csv(
    '/content/drive/MyDrive/MRP/price_features_engineered.csv',
    index=False,
)

# Show the schema summary, then preview the first rows of what was written.
price_feats.info()
price_feats.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10851560 entries, 0 to 10851559
Data columns (total 10 columns):
 #   Column      Dtype         
---  ------      -----         
 0   date        datetime64[ns]
 1   symbol      object        
 2   open        float64       
 3   high        float64       
 4   low         float64       
 5   adj close   float64       
 6   log_volume  float64       
 7   ma_10       float64       
 8   vol_30      float64       
 9   rsi_14      float64       
dtypes: datetime64[ns](1), float64(8), object(1)
memory usage: 827.9+ MB


Unnamed: 0,date,symbol,open,high,low,adj close,log_volume,ma_10,vol_30,rsi_14
0,2016-02-16,A,36.720001,37.169998,36.400002,35.621273,14.881462,34.811432,0.020069,47.826113
1,2016-02-17,A,35.220001,38.130001,34.75,36.380188,15.498627,34.888284,0.020547,54.011715
2,2016-02-18,A,37.73,37.959999,37.09,35.726944,14.618185,34.888284,0.020747,55.421703
3,2016-02-19,A,36.98,37.599998,36.849998,35.967102,14.665405,34.890205,0.020778,48.788891
4,2016-02-22,A,37.880001,38.189999,37.779999,36.533894,14.399122,35.081377,0.019611,51.843788
