In [1]:
import numpy as np
import pandas_datareader.data as web
import pandas as pd
import pandas_ta as ta
import pandas_market_calendars as mcal
from dotenv import load_dotenv
from datetime import datetime, timedelta
import matplotlib
import matplotlib.pyplot as plt
import yfinance as yf
import os

In [2]:
import requests
from bs4 import BeautifulSoup

def add_symbols_from_URL(URL='https://www.tradingview.com/markets/stocks-usa/market-movers-most-volatile/',companies=None):
    """Append ticker symbols scraped from a market-movers page to ``companies``.

    The page at ``URL`` is fetched and parsed; for every ``tbody tr`` row the
    text of the first ``<a>`` inside the row's first ``<td>`` is taken as the
    symbol. Symbols already present in ``companies`` are not added again.

    Parameters
    ----------
    URL : str
        Page to scrape.
    companies : list[str] | None
        List extended in place. When omitted, a fresh list is created for this
        call (a shared ``[]`` default would silently accumulate symbols across
        calls).

    Returns
    -------
    list[str]
        The same list object that was passed in (or the newly created one).
    """
    if companies is None:
        companies = []

    # A timeout keeps an unresponsive server from hanging the notebook forever.
    page = requests.get(URL, timeout=30)
    soup = BeautifulSoup(page.content, "html.parser")

    # Set mirror of the list gives O(1) duplicate checks while the list keeps order.
    known = set(companies)
    for row in soup.select('tbody tr'):
        first_cell = row.find("td")
        link = first_cell.find("a") if first_cell is not None else None
        if link is None:
            continue
        symbol = link.get_text()
        if symbol not in known:
            known.add(symbol)
            companies.append(symbol)

    return companies

# Seed the ticker universe from the local list (one symbol per line).
# The context manager closes the file; blank lines are skipped so they do not
# turn into empty ticker strings.
with open("S&P_Top50.txt", "r") as symbols_file:
    companies = [line.strip() for line in symbols_file if line.strip()]

# Extend the universe in place with the symbols listed on each market-movers page.
for movers_url in (
    "https://www.tradingview.com/markets/stocks-usa/market-movers-most-volatile/",
    "https://www.tradingview.com/markets/stocks-usa/market-movers-gainers/",
    "https://www.tradingview.com/markets/stocks-usa/market-movers-losers/",
):
    add_symbols_from_URL(movers_url, companies)

In [3]:
# Prediction horizon, in years.
NUM_OF_YEARS_PRED = 1
# Amount of historical data to use, in years.
NUM_OF_YEARS_DATA = 5

In [4]:
# Load the feature names to use, one per line.
# The context manager closes the file; blank lines are skipped so they do not
# become empty feature names.
with open('selected_features.txt', 'r') as features_file:
    selected_features = [line.strip() for line in features_file if line.strip()]

In [5]:
# Sort the ticker list in place (lexicographic order) so symbols are processed in a stable order.
companies.sort()

In [6]:
def create_outcome(df:pd.DataFrame,year_pred:int,date:datetime):
    """Build a frame of forward-shifted copies of every value column of ``df``.

    For each ``i`` in ``1..days_shift`` the non-key columns of ``df`` are
    shifted ``i`` rows towards the start and suffixed with ``_f<i>``, where
    ``days_shift`` is the number of calendar days between the last index entry
    of ``df`` and the same date ``year_pred`` years earlier.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a ``date`` column; ``date`` and ``symbol`` are treated as
        key columns and are not shifted. ``df.index[-1]`` must support
        ``.replace(year=...)`` and subtraction (e.g. a ``Timestamp``).
    year_pred : int
        Number of years spanned by the shifts.
    date : datetime
        Not used by the function; kept for signature compatibility.

    Returns
    -------
    pd.DataFrame
        The key columns of ``df`` (sorted by index, re-indexed by ``date``)
        followed by all shifted columns.
    """
    outcome_colums = df.columns.difference(['date','symbol'])

    # Keep only the key columns and index them by their own date values.
    outcome_out = df.sort_index().drop(columns=outcome_colums)
    outcome_out = outcome_out.set_index(pd.DatetimeIndex(outcome_out["date"]))

    end_date = df.index[-1]
    try:
        start_date = end_date.replace(year=end_date.year-year_pred)
    except ValueError:
        # Feb 29 has no counterpart in a non-leap target year; fall back to Feb 28.
        start_date = end_date.replace(year=end_date.year-year_pred, day=28)
    days_shift = (end_date - start_date).days

    # Collect every shifted frame first and concatenate once: concatenating
    # inside the loop re-copies the growing result on every iteration.
    shifted = [
        df[outcome_colums].shift(-i).add_suffix('_f'+str(i))
        for i in range(1, days_shift+1)
    ]
    return pd.concat([outcome_out, *shifted], axis=1)

def create_indicators(df:pd.DataFrame):
    """Compute pandas_ta indicator columns for one symbol's price frame.

    Parameters
    ----------
    df : pd.DataFrame
        Price rows with a ``date`` column plus ``open``, ``high``, ``low``,
        ``close`` and ``volume`` (those five are dropped from the result).
        ``df`` itself is not modified.

    Returns
    -------
    pd.DataFrame
        Frame indexed by ``date`` holding the remaining input columns and every
        indicator column pandas_ta appended, with infinities turned into NaN
        and gaps filled by time interpolation followed by a forward fill.
    """
    features = df.sort_index().copy()
    features = features.set_index(features["date"])

    # The pandas_ta setting is `cores` and it only accepts an int; use half of
    # the CPUs, but never fewer than one (os.cpu_count() may return None).
    features.ta.cores = max(1, (os.cpu_count() or 1) // 2)

    # Both calls append their result columns to `features`.
    features.ta.strategy(ta.AllStrategy,exclude=["mcgd"],lookahead=False)
    features.ta.cdl_pattern(ta.ALL_PATTERNS, lookahead=False)

    features = features.drop(['open','high','low','close','volume'],axis=1)
    features = features.replace([np.inf, -np.inf], np.nan)

    for col in features.columns.difference(['date','symbol']): # fill nans
        # Assign the result back: an in-place call on `features[col]` acts on a
        # temporary and is not written to the frame under pandas Copy-on-Write.
        features[col] = features[col].interpolate(method="time",limit_direction="both")

    return features.ffill()

def get_data(symbols,features,year_pred=NUM_OF_YEARS_PRED, year_data=NUM_OF_YEARS_DATA, data_source=None, begin_date=None,end_date=None):
    """Download daily price history per symbol and compute its indicator columns.

    The download window ends today. Its start is first set
    ``2*year_pred + year_data`` calendar years back and then moved further back
    until the NYSE calendar holds at least as many trading days as the initial
    window had calendar days. A symbol is skipped when it contains a ".", when
    its ticker info is ``None``, or when its history is empty or does not begin
    exactly on the first trading day of the window.

    Parameters
    ----------
    symbols : iterable of str
        Ticker symbols to process.
    features, data_source, begin_date, end_date
        NOTE(review): accepted for signature compatibility but not used; the
        window always ends today. Confirm whether callers expect otherwise.
    year_pred, year_data : int
        Years that determine the window length (see above).

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        ``(indicator_features, prices)``, each indexed by ``(date, symbol)`` and
        sorted by index. Both are empty frames when no symbol qualified.
    """
    def _years_back(day, years):
        """Return `day` moved back `years` calendar years (Feb 29 falls back to Feb 28)."""
        try:
            return day.replace(year=day.year - years)
        except ValueError:
            return day.replace(year=day.year - years, day=28)

    today = datetime.now().date() # to make sure we get todays data
    start_date = _years_back(today, (2*year_pred) + year_data)
    data_days = (today - start_date).days

    nyse = mcal.get_calendar('NYSE')
    dates = nyse.valid_days(start_date, today)

    # Trading days are sparser than calendar days, so push the start back by
    # the shortfall until enough trading days are covered.
    while len(dates) < data_days:
        start_date = start_date - timedelta(days=data_days - len(dates))
        dates = nyse.valid_days(start_date, today)

    start_date = dates[0].date()

    # Collect per-symbol frames and concatenate once at the end; growing a
    # frame with pd.concat inside the loop re-copies all rows every iteration.
    feature_frames = []
    price_frames = []
    for symbol in symbols:
        if "." in symbol:
            continue

        ticker  = yf.Ticker(symbol)

        if ticker.info is None:
            continue

        df = ticker.history(start=start_date,interval='1d', auto_adjust=True)[['Open','High','Low','Close','Volume']].reset_index()

        df.columns = ['date','open','high','low','close','volume']
        df['symbol'] = symbol

        # An empty history has no first row to inspect; treat it like a
        # history that starts too late.
        if df.empty or df['date'].iloc[0].date() != start_date:
            print("Not Enough Data For: ", symbol)
            continue

        df = df.set_index(df['date'])
        symbol_features = create_indicators(df)

        price_frames.append(df.set_index(['date','symbol']))
        feature_frames.append(symbol_features.set_index(['date','symbol']))

        print("Done Processing : ", symbol)

    features_out = pd.concat(feature_frames,axis=0) if feature_frames else pd.DataFrame()
    prices_out = pd.concat(price_frames,axis=0) if price_frames else pd.DataFrame()

    return features_out.sort_index(),prices_out.sort_index()

In [7]:
# For a quick trial run, restrict the universe first, e.g. companies = ["MSFT","AAPL","AMZN"]
# Download the price history and compute the indicator columns for every symbol.
indicators_features,prices= get_data(companies,selected_features,year_pred=NUM_OF_YEARS_PRED,year_data=NUM_OF_YEARS_DATA)

130it [00:02, 49.65it/s]


Done Processing :  AA
Not Enough Data For:  AAMC


130it [00:02, 54.38it/s]


Done Processing :  AAME


130it [00:02, 53.17it/s]


Done Processing :  AAPL


130it [00:02, 56.21it/s]


Done Processing :  ABBV
Not Enough Data For:  ABSI


130it [00:02, 51.62it/s]


Done Processing :  ABT


130it [00:02, 52.63it/s]


Done Processing :  ACN
Not Enough Data For:  ACON


130it [00:02, 48.01it/s]


Done Processing :  ACOR


130it [00:02, 54.19it/s]


Done Processing :  ADBE
Not Enough Data For:  ADTX
Not Enough Data For:  AGL
Not Enough Data For:  AINC


130it [00:02, 55.20it/s]


Done Processing :  AIRI


130it [00:02, 50.14it/s]


Done Processing :  AIRT
Not Enough Data For:  AISP
Not Enough Data For:  AKAN
Not Enough Data For:  ALDX
Not Enough Data For:  ALLG
Not Enough Data For:  ALT
Not Enough Data For:  ALTI
Not Enough Data For:  ALVO


130it [00:02, 52.28it/s]


Done Processing :  AMD
Not Enough Data For:  AMPG
Not Enough Data For:  AMPX


130it [00:02, 58.00it/s]


Done Processing :  AMPY


130it [00:02, 48.37it/s]


Done Processing :  AMZN
Not Enough Data For:  ANGH


130it [00:02, 56.77it/s]


Done Processing :  ANGO
Not Enough Data For:  ANTX
Not Enough Data For:  APAC


130it [00:02, 51.00it/s]


Done Processing :  AREN


130it [00:02, 53.41it/s]


Done Processing :  ARQ
Not Enough Data For:  ARRW
Not Enough Data For:  ASTS
Not Enough Data For:  ATGL
Not Enough Data For:  AUST
Not Enough Data For:  AUVI


130it [00:02, 59.17it/s]


Done Processing :  AVGO
Not Enough Data For:  AVTX


130it [00:02, 56.97it/s]


Done Processing :  AVXL


130it [00:02, 58.19it/s]


Done Processing :  AWH
Not Enough Data For:  AWIN
Not Enough Data For:  AZ


130it [00:02, 55.96it/s]


Done Processing :  BAC
Not Enough Data For:  BACK
Not Enough Data For:  BCG
Not Enough Data For:  BFRI
Not Enough Data For:  BMEA
Not Enough Data For:  BNAI
Not Enough Data For:  BOWL
Not Enough Data For:  BRLS
Not Enough Data For:  BTBT
Not Enough Data For:  BTOG
Not Enough Data For:  BYSI


130it [00:02, 59.29it/s]


Done Processing :  CAMP
Not Enough Data For:  CDIO
Not Enough Data For:  CERO
Not Enough Data For:  CHX
Not Enough Data For:  CIFR
Not Enough Data For:  CINT
Not Enough Data For:  CISS


130it [00:02, 59.93it/s]


Done Processing :  CLRO
Not Enough Data For:  CLSD
Not Enough Data For:  CLSK


130it [00:02, 54.21it/s]


Done Processing :  CMCSA
Not Enough Data For:  CNFR
Not Enough Data For:  CNSP
Not Enough Data For:  COOL


130it [00:02, 59.20it/s]


Done Processing :  COST


130it [00:02, 55.26it/s]


Done Processing :  CRM
Not Enough Data For:  CRML


130it [00:02, 55.71it/s]


Done Processing :  CSCO


130it [00:02, 50.59it/s]


Done Processing :  CVV


130it [00:02, 51.90it/s]


Done Processing :  CVX
Not Enough Data For:  CXAI


130it [00:02, 56.77it/s]


Done Processing :  CYTH
Not Enough Data For:  CZOO
Not Enough Data For:  DCFC
Not Enough Data For:  DECA


130it [00:02, 51.33it/s]


Done Processing :  DHR


130it [00:02, 52.14it/s]


Done Processing :  DHX


130it [00:02, 50.81it/s]


Done Processing :  DIS
Not Enough Data For:  DJT
Not Enough Data For:  DNTH
Not Enough Data For:  DRCT
Not Enough Data For:  DTI
Not Enough Data For:  DTSS


130it [00:02, 53.95it/s]


Done Processing :  DTST
Not Enough Data For:  DUOT
Not Enough Data For:  DXYZ
Not Enough Data For:  EFSH
Not Enough Data For:  EGOX


130it [00:02, 56.59it/s]


Done Processing :  EIGR
Not Enough Data For:  ENVB


130it [00:02, 54.92it/s]


Done Processing :  EP
Not Enough Data For:  EVGO


130it [00:02, 53.22it/s]


Done Processing :  EXAS
Not Enough Data For:  EZFL
Not Enough Data For:  FGEN
Not Enough Data For:  FLGC
Not Enough Data For:  FLNC


130it [00:02, 59.50it/s]


Done Processing :  FLXS
Not Enough Data For:  FNGR


130it [00:02, 49.49it/s]


Done Processing :  GAIA
Not Enough Data For:  GCT
Not Enough Data For:  GCTS


130it [00:02, 59.53it/s]


Done Processing :  GE
Not Enough Data For:  GLSI
Not Enough Data For:  GMDA
Not Enough Data For:  GOEV
Not Enough Data For:  GOOG


130it [00:02, 58.26it/s]


Done Processing :  GOOGL
Not Enough Data For:  GRI


130it [00:02, 54.64it/s]


Done Processing :  GROW


130it [00:02, 59.15it/s]


Done Processing :  GRPN
Not Enough Data For:  GRTS


130it [00:02, 54.53it/s]


Done Processing :  GSIT


130it [00:02, 55.96it/s]


Done Processing :  GTE
Not Enough Data For:  GUTS
Not Enough Data For:  GV


130it [00:02, 54.79it/s]


Done Processing :  GVP


130it [00:02, 56.60it/s]


Done Processing :  GYRE


130it [00:02, 55.20it/s]


Done Processing :  HBI


130it [00:02, 56.50it/s]


Done Processing :  HD


130it [00:02, 54.85it/s]


Done Processing :  HLF
Not Enough Data For:  HOLO


130it [00:02, 56.93it/s]


Done Processing :  HUM
Not Enough Data For:  HWH
Not Enough Data For:  HYMC


130it [00:02, 55.20it/s]


Done Processing :  IBIO


130it [00:02, 56.18it/s]


Done Processing :  IBM


130it [00:02, 54.13it/s]


Done Processing :  IDN


130it [00:02, 46.63it/s]


Done Processing :  IGC
Not Enough Data For:  IHS


130it [00:02, 51.60it/s]


Done Processing :  IMNN
Not Enough Data For:  IMPP
Not Enough Data For:  IMRX
Not Enough Data For:  INDI


130it [00:02, 49.21it/s]


Done Processing :  INFN


130it [00:03, 40.47it/s]


Done Processing :  INLX


130it [00:03, 40.41it/s]


Done Processing :  INTC


130it [00:02, 47.22it/s]


Done Processing :  INTU


130it [00:02, 46.31it/s]


Done Processing :  INTZ
Not Enough Data For:  IREN
Not Enough Data For:  IVVD


130it [00:02, 47.38it/s]


Done Processing :  IZEA
Not Enough Data For:  IZM


130it [00:02, 50.14it/s]


Done Processing :  JNJ


130it [00:02, 55.18it/s]


Done Processing :  JPM
Not Enough Data For:  KNSL


130it [00:02, 51.95it/s]


Done Processing :  KO
Not Enough Data For:  KULR
Not Enough Data For:  KYCH
Not Enough Data For:  LE
Not Enough Data For:  LGCB
Not Enough Data For:  LGVC
Not Enough Data For:  LIFW


130it [00:02, 48.77it/s]


Done Processing :  LIND


130it [00:02, 54.02it/s]


Done Processing :  LIXT


130it [00:02, 53.34it/s]


Done Processing :  LLY
Not Enough Data For:  LOBO
Not Enough Data For:  LOOP
Not Enough Data For:  LPA
Not Enough Data For:  LYRA
Not Enough Data For:  LYT


130it [00:03, 41.96it/s]


Done Processing :  MA


130it [00:02, 44.41it/s]


Done Processing :  MARA


130it [00:02, 48.67it/s]


Done Processing :  MARPS
Not Enough Data For:  MAXN


130it [00:02, 52.03it/s]


Done Processing :  MCD
Not Enough Data For:  MDAI
Not Enough Data For:  MDXH
Not Enough Data For:  MEG
Not Enough Data For:  MGOL


130it [00:02, 50.89it/s]


Done Processing :  MLAB
Not Enough Data For:  MQ
Not Enough Data For:  MRAI


130it [00:02, 51.99it/s]


Done Processing :  MRK
Not Enough Data For:  MRNO


130it [00:02, 47.83it/s]


Done Processing :  MSFT


130it [00:02, 49.66it/s]


Done Processing :  MTEX
Not Enough Data For:  MVST


130it [00:02, 48.26it/s]


Done Processing :  MXC


130it [00:02, 47.50it/s]


Done Processing :  MXL


130it [00:02, 48.51it/s]


Done Processing :  NAII
Not Enough Data For:  NAUT


130it [00:02, 43.55it/s]


Done Processing :  NCLH
Not Enough Data For:  NEOV


130it [00:02, 51.80it/s]


Done Processing :  NFLX


130it [00:02, 50.53it/s]


Done Processing :  NGS
Not Enough Data For:  NINE
Not Enough Data For:  NISN
Not Enough Data For:  NKTX
Not Enough Data For:  NOVA


130it [00:02, 50.10it/s]


Done Processing :  NSPR


130it [00:02, 49.82it/s]


Done Processing :  NVDA


130it [00:02, 48.53it/s]


Done Processing :  NWFL
Not Enough Data For:  NXGL
Not Enough Data For:  NXL
Not Enough Data For:  NXTC
Not Enough Data For:  OCEA


130it [00:02, 44.42it/s]


Done Processing :  OII


130it [00:02, 45.24it/s]


Done Processing :  OMQS


130it [00:02, 47.13it/s]


Done Processing :  ONTX


130it [00:02, 50.94it/s]


Done Processing :  ORCL


130it [00:02, 51.36it/s]


Done Processing :  ORGS


130it [00:02, 52.73it/s]


Done Processing :  OTRK
Not Enough Data For:  PCSA
Not Enough Data For:  PEGR


130it [00:02, 50.62it/s]


Done Processing :  PEP
Not Enough Data For:  PEPG


130it [00:02, 48.96it/s]


Done Processing :  PG
Not Enough Data For:  PIK


130it [00:02, 47.05it/s]


Done Processing :  PLUG
Not Enough Data For:  PMEC
Not Enough Data For:  PMN


130it [00:02, 51.09it/s]


Done Processing :  PNBK


130it [00:02, 51.45it/s]


Done Processing :  POAI


130it [00:02, 52.27it/s]


Done Processing :  PPSI
Not Enough Data For:  PRE
Not Enough Data For:  PROP


130it [00:02, 51.74it/s]


Done Processing :  PRTA


130it [00:02, 51.64it/s]


Done Processing :  PRTG
Not Enough Data For:  PTIX
Not Enough Data For:  PTPI


130it [00:02, 46.44it/s]


Done Processing :  PVH
Not Enough Data For:  PYXS


130it [00:02, 48.07it/s]


Done Processing :  QCOM


130it [00:02, 50.14it/s]


Done Processing :  QDEL
Not Enough Data For:  QTRX
Not Enough Data For:  RDDT
Not Enough Data For:  ROIV
Not Enough Data For:  RWOD
Not Enough Data For:  RYDE
Not Enough Data For:  SABR
Not Enough Data For:  SABS
Not Enough Data For:  SANA
Not Enough Data For:  SBFM
Not Enough Data For:  SDA


130it [00:02, 51.95it/s]


Done Processing :  SIGA
Not Enough Data For:  SINT
Not Enough Data For:  SKYX
Not Enough Data For:  SLNA


130it [00:02, 51.27it/s]


Done Processing :  SLNH
Not Enough Data For:  SMR
Not Enough Data For:  SNAX


130it [00:02, 51.52it/s]


Done Processing :  SNBR
Not Enough Data For:  SRTS


130it [00:02, 51.36it/s]


Done Processing :  SSKN


130it [00:02, 50.58it/s]


Done Processing :  SSP
Not Enough Data For:  SST
Not Enough Data For:  STI


130it [00:02, 51.09it/s]


Done Processing :  STXS
Not Enough Data For:  SWI
Not Enough Data For:  SWIN
Not Enough Data For:  SWVL
Not Enough Data For:  SYRS
Not Enough Data For:  TCRX
Not Enough Data For:  TFFP
Not Enough Data For:  TGL
Not Enough Data For:  TKNO
Not Enough Data For:  TKO


130it [00:02, 51.88it/s]


Done Processing :  TMO


130it [00:02, 47.12it/s]


Done Processing :  TMUS
Not Enough Data For:  TNYA
Not Enough Data For:  TPET
Not Enough Data For:  TPST


130it [00:02, 47.37it/s]


Done Processing :  TRVN


130it [00:02, 49.34it/s]


Done Processing :  TSLA
Not Enough Data For:  TSVT
Not Enough Data For:  TVGN
Not Enough Data For:  UBXG


130it [00:02, 50.76it/s]


Done Processing :  UNH
Not Enough Data For:  UP
Not Enough Data For:  UPC
Not Enough Data For:  USGO
Not Enough Data For:  UWMC


130it [00:02, 53.29it/s]


Done Processing :  V
Not Enough Data For:  VERI
Not Enough Data For:  VERV
Not Enough Data For:  VHAI
Not Enough Data For:  VINC
Not Enough Data For:  VLCN
Not Enough Data For:  VLD
Not Enough Data For:  VSTE


130it [00:02, 49.75it/s]


Done Processing :  VTSI
Not Enough Data For:  VVPR


130it [00:02, 51.41it/s]


Done Processing :  VZ
Not Enough Data For:  WALD


130it [00:02, 50.72it/s]


Done Processing :  WFC
Not Enough Data For:  WINV


130it [00:02, 50.04it/s]


Done Processing :  WMT
Not Enough Data For:  WORX
Not Enough Data For:  XBIT


130it [00:02, 45.09it/s]


Done Processing :  XOM
Not Enough Data For:  XOS
Not Enough Data For:  XTIA


130it [00:02, 44.93it/s]


Done Processing :  XXII
Not Enough Data For:  YMAB


130it [00:02, 50.05it/s]


Done Processing :  YTEN
Not Enough Data For:  ZNTL
Not Enough Data For:  ZPTA


In [10]:
# NOTE: the following cell rebuilds `features` from scratch, so these two
# columns only matter when this cell's result is inspected directly.
features = pd.DataFrame(index=prices.index).sort_index()

# `prices` is ordered by (date, symbol), so neighbouring rows belong to
# different symbols; every row-to-row comparison must be done per symbol.
by_symbol = prices.groupby(level='symbol', group_keys=False)

features['close_pct_chnage'] = by_symbol['close'].pct_change()
# 1 when the same symbol's NEXT row has a higher volume than this row.
# NOTE(review): shift(-1) looks one row ahead - confirm this is meant as a label.
features['volume_increase'] = (by_symbol['volume'].shift(-1) > prices.volume).astype(int)

In [11]:
# Hand-built price/volume features, one column per feature, indexed like `prices`.
# `prices` is ordered by (date, symbol), so every rolling, shifted or differenced
# quantity is computed per symbol.
features = pd.DataFrame(index=prices.index).sort_index()
by_symbol = prices.groupby(level='symbol', group_keys=False)


def _rolling_mean(window):
    """Return a function computing the `window`-row rolling mean of a Series."""
    return lambda x: x.rolling(window).mean()


def pct_chg_fxn(x):
    """Row-over-row fractional change."""
    return x.pct_change()


def ema_10(x):
    """Exponentially weighted mean with span 10."""
    return x.ewm(span=10).mean()


def zscore_fun(x):
    """Z-score against the trailing 200-row window (at least 20 rows)."""
    window = x.rolling(window=200, min_periods=20)
    return (x - window.mean()) / window.std()


def rollrank_fxn(x):
    """Percentile rank, within each trailing 200-row window, of the window's first value."""
    # .iloc[0] is explicitly positional; plain [0] on a non-integer index relies
    # on a deprecated positional fallback.
    # NOTE(review): this ranks the OLDEST value of each window; confirm that
    # .iloc[-1] (the current row) was not intended.
    return x.rolling(200,min_periods=20).apply(lambda w: pd.Series(w).rank(pct=True).iloc[0])


def plus_minus_fxn(x):
    """Sum over the trailing 20 rows."""
    return x.rolling(20).sum()


ma_5, ma_20, ma_40, ma_80, ma_160, ma_320 = (
    _rolling_mean(w) for w in (5, 20, 40, 80, 160, 320)
)

features['f01'] = prices.close/prices.open-1 # close vs. open of the same day
features['f02'] = prices.open/by_symbol.close.shift(1)-1 # open vs. prior close
features['f03'] = np.log(prices.volume) # log of daily volume
features['f04'] = by_symbol.volume.diff() # change since prior day
features['f05'] = by_symbol.volume.diff(50) # change since 50 days prior
features['f06'] = by_symbol.volume.apply(pct_chg_fxn) # rate of change
# log of 5 day moving average of volume
features['f07'] = np.log(by_symbol.volume.apply(ma_5))
# daily closing price vs. 10 day exponential moving average
features['f08'] = prices.close/ by_symbol.close.apply(ema_10)-1

# daily volume vs. its 20/40/80/160/320 day moving average.
# The 160- and 320-day ratios used to be written to the same column, which
# discarded the 160-day one. 'f12' keeps the value it always ended up with
# (320-day); the 160-day ratio is exposed as 'f12_160'.
for name, rolling_mean in (
    ('f09', ma_20),
    ('f10', ma_40),
    ('f11', ma_80),
    ('f12_160', ma_160),
    ('f12', ma_320),
):
    features[name] = prices.volume/ by_symbol.volume.apply(rolling_mean)-1

features['f13'] = by_symbol.close.apply(zscore_fun)
features['f14'] = by_symbol.volume.apply(rollrank_fxn)

# cross-sectional percentile rank (across symbols on the same date)
features['f15'] = features['f11'].dropna().groupby(level='date').rank(pct=True)
features['f16'] = features['f12'].dropna().groupby(level='date').rank(pct=True)

# money flow index (14 day), computed separately for each symbol.
# pandas_ta names the window `length`; `window=` was silently ignored and
# `fillna=False` made pandas_ta fill the warm-up NaNs with False.
mfi_by_symbol = [
    ta.volume.mfi(grp.high, grp.low, grp.close, grp.volume, length=14)
    for _, grp in prices.groupby(level='symbol')
]
# presumably mfi returns None for a series shorter than the window; drop those
mfi_by_symbol = [mfi for mfi in mfi_by_symbol if mfi is not None]
features['f17'] = pd.concat(mfi_by_symbol).astype(float) if mfi_by_symbol else np.nan

# f13 minus its own trailing 200-row mean, per symbol.
# NOTE(review): this was labelled "mean-centered money flow index" but has always
# been built from f13 (the close z-score), not f17 - confirm which is intended.
f13_by_symbol = features['f13'].groupby(level='symbol', group_keys=False)
features['f18'] = features['f13'] - f13_by_symbol.apply(lambda x: x.rolling(200,min_periods=20).mean())

# 'f19' (per-symbol volume quantile bins via pd.qcut) is currently disabled.

features['f20'] = np.sign(features['f06'])
features['f21'] = features['f20'].groupby(level='symbol', group_keys=False).apply(plus_minus_fxn)

In [12]:
# One-hot encode the calendar month of every row as columns month_1 .. month_12.
month_of_year = prices.index.get_level_values(level='date').month
feat_names = ['month_'+str(num) for num in range(1, 1+12)]

# get_dummies only creates columns for months that actually occur; reindexing
# to 1..12 guarantees all twelve columns exist (all-zero when a month is
# absent) before they are renamed, so names can never be mis-assigned.
one_hot_frame = (
    pd.get_dummies(month_of_year, dtype=int)
    .reindex(columns=range(1, 1+12), fill_value=0)
)
one_hot_frame.columns = feat_names
one_hot_frame.index = prices.index

# Dropping month columns left over from a previous run keeps the cell re-runnable
# (join raises on overlapping column names).
features = features.drop(columns=feat_names, errors='ignore').join(one_hot_frame)

features = features.replace([np.inf, -np.inf], np.nan)

features

Unnamed: 0_level_0,Unnamed: 1_level_0,f01,f02,f03,f04,f05,f06,f07,f08,f09,f10,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
date,symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2014-02-04 00:00:00-05:00,AA,0.016874,,15.929367,,,,,0.000000,,,...,0,0,0,0,0,0,0,0,0,0
2014-02-04 00:00:00-05:00,AAME,0.000000,,8.594154,,,,,0.000000,,,...,0,0,0,0,0,0,0,0,0,0
2014-02-04 00:00:00-05:00,AAPL,0.005812,,19.746910,,,,,0.000000,,,...,0,0,0,0,0,0,0,0,0,0
2014-02-04 00:00:00-05:00,ABBV,0.016320,,16.077909,,,,,0.000000,,,...,0,0,0,0,0,0,0,0,0,0
2014-02-04 00:00:00-05:00,ABT,0.009749,,15.924195,,,,,0.000000,,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-04-02 00:00:00-04:00,WFC,-0.000174,-0.004340,16.498715,3592400.0,-10970500.0,0.325413,16.388109,0.000092,-0.214695,-0.228185,...,0,1,0,0,0,0,0,0,0,0
2024-04-02 00:00:00-04:00,WMT,-0.012354,-0.001667,16.786279,5669300.0,331500.0,0.409703,16.575284,-0.018246,0.155908,-0.085626,...,0,1,0,0,0,0,0,0,0,0
2024-04-02 00:00:00-04:00,XOM,0.011533,0.007949,16.806813,6094600.0,-167700.0,0.441094,16.559940,0.036726,0.134338,0.125970,...,0,1,0,0,0,0,0,0,0,0
2024-04-02 00:00:00-04:00,XXII,-0.044335,0.112451,12.196083,107180.0,132404.0,1.179981,11.874178,-0.040442,1.193814,1.592689,...,0,1,0,0,0,0,0,0,0,0


In [13]:
# Put the hand-built features and the indicator columns side by side, then move
# the `symbol` index level into a regular column so only `date` remains as index.
features = (
    pd.concat([features, indicators_features], axis=1)
    .reset_index(level='symbol')
)
def interp_func(x):
    """Fill missing values in every value column of ``x`` and return ``x``.

    Each column other than ``date``, ``symbol`` and ``f19`` is interpolated
    with ``method="time"`` in both directions and then forward-filled. The
    index of ``x`` must be a DatetimeIndex for time interpolation to work.
    ``x`` is modified and also returned.
    """
    for col in x.columns.difference(['date','symbol','f19']):
        # Write the result back explicitly: the in-place variants called on
        # `x[col]` act on a temporary under pandas Copy-on-Write and leave `x`
        # unchanged.
        x[col] = x[col].interpolate(method="time",limit_direction="both").ffill()
    return x

# Fill gaps one symbol at a time, then restore the (date, symbol) index.
features = features.groupby('symbol', group_keys=False).apply(interp_func)
features = features.set_index([features.index, 'symbol'])

In [14]:
# Display the combined feature table (the rendered rows and columns are truncated).
features

Unnamed: 0_level_0,Unnamed: 1_level_0,f01,f02,f03,f04,f05,f06,f07,f08,f09,f10,...,VIDYA_14,VTXP_14,VTXM_14,VWAP_D,VWMA_10,WCP,WILLR_14,WMA_10,ZL_EMA_10,ZS_30
date,symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2014-02-04 00:00:00-05:00,AA,0.016874,-0.000876,15.929367,6917354.0,7052934.0,0.835418,16.083051,0.000000,-0.093025,0.061188,...,0.391431,1.069953,0.911178,25.838484,25.502111,25.870613,-19.658258,25.678018,26.651419,0.889937
2014-02-04 00:00:00-05:00,AAME,0.000000,-0.002532,8.594154,100.0,800.0,0.018519,8.416267,0.000000,1.699690,0.269009,...,0.049315,0.928571,1.000000,3.723672,3.716326,3.722104,-85.714313,3.713634,3.706105,-2.527300
2014-02-04 00:00:00-05:00,AAPL,0.005812,-0.004383,19.746910,-48336400.0,-161915600.0,-0.128322,19.631806,0.000000,-0.132850,-0.248186,...,0.511884,1.112036,0.816118,15.854253,16.585526,15.868221,-45.973160,16.852452,17.082871,0.269548
2014-02-04 00:00:00-05:00,ABBV,0.016320,-0.003962,16.077909,-2619000.0,-2455100.0,-0.272639,15.801685,0.000000,0.004563,-0.282135,...,1.924995,1.249115,0.772887,31.564819,32.687138,31.618948,-24.418401,33.216730,34.445149,1.862278
2014-02-04 00:00:00-05:00,ABT,0.009749,-0.003586,15.924195,-640100.0,2715900.0,-0.077707,16.098960,0.000000,0.081223,-0.334580,...,3.558886,1.370416,0.619804,29.853205,30.974900,29.867645,-13.055529,31.464306,32.544776,0.829844
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-04-02 00:00:00-04:00,WFC,-0.000174,-0.004340,16.498715,3592400.0,-10970500.0,0.325413,16.388109,0.000092,-0.214695,-0.228185,...,55.758652,0.988263,0.989828,57.513332,57.339015,57.472499,-50.462974,57.388182,57.660983,0.650365
2024-04-02 00:00:00-04:00,WMT,-0.012354,-0.001667,16.786279,5669300.0,331500.0,0.409703,16.575284,-0.018246,0.155908,-0.085626,...,59.133032,0.834972,1.133047,59.380000,60.484668,59.325000,-89.928097,60.274727,59.805278,-0.750436
2024-04-02 00:00:00-04:00,XOM,0.011533,0.007949,16.806813,6094600.0,-167700.0,0.441094,16.559940,0.036726,0.134338,0.125970,...,111.881339,1.320778,0.509945,118.736666,115.058868,118.872499,-3.050522,115.816545,117.714625,2.128368
2024-04-02 00:00:00-04:00,XXII,-0.044335,0.112451,12.196083,107180.0,132404.0,1.179981,11.874178,-0.040442,1.193814,1.592689,...,2.738130,0.892307,1.014902,1.955367,2.016870,1.951525,-62.527463,2.008131,1.913841,-1.297085


In [15]:
# Display the open/high/low/close/volume price table.
prices

Unnamed: 0_level_0,Unnamed: 1_level_0,open,high,low,close,volume
date,symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-02-04 00:00:00-05:00,AA,25.536102,26.103067,25.445389,25.966997,8280108
2014-02-04 00:00:00-05:00,AAME,3.717398,3.745632,3.707987,3.717398,5400
2014-02-04 00:00:00-05:00,AAPL,15.818188,15.931075,15.721562,15.910123,376681200
2014-02-04 00:00:00-05:00,ABBV,31.270976,31.794590,31.118532,31.781334,9606100
2014-02-04 00:00:00-05:00,ABT,29.622171,30.067739,29.580912,29.910965,8237400
...,...,...,...,...,...,...
2024-04-02 00:00:00-04:00,WFC,57.360001,58.000000,57.189999,57.349998,14631900
2024-04-02 00:00:00-04:00,WMT,59.900002,60.099998,58.880001,59.160000,19506900
2024-04-02 00:00:00-04:00,XOM,117.919998,119.599998,117.330002,119.279999,19911600
2024-04-02 00:00:00-04:00,XXII,2.030000,2.216100,1.710000,1.940000,198012


In [16]:
# Write both tables to CSV, each sorted by its index.
for frame, csv_path in (
    (features, 'RoboStockFeatures.csv'),
    (prices, 'RoboStockPrices.csv'),
):
    frame.sort_index().to_csv(csv_path)