# Temporal Data

In [107]:
import pandas as pd

from os import listdir

In [108]:
# Set the stock market (NASDAQ or NYSE). The value is used to build the
# input file names, to select the NASDAQ-specific clean-up in process_data()
# and to name the output CSV.
market = "NYSE"
# When True, generate_df() passes every stock's frame through process_data()
# before it is added to the final DataFrame.
preprocess = True

## Get list of stocks

In [109]:
path = "../../Temporal_Relational_Stock_Ranking/data/2013-01-01"
prefix = "{}_".format(market)
suffix = "_1.csv"
# Keep only files named exactly <market>_<ticker>_1.csv. Testing the same
# prefix/suffix that is stripped below guarantees every entry is a bare
# ticker; a looser substring test would let through files that merely
# contain the market name or ".csv", and generate_df() would then fail to
# find them.
stock_list = sorted(
    f.removeprefix(prefix).removesuffix(suffix)
    for f in listdir(path)
    if f.startswith(prefix) and f.endswith(suffix)
)
print(len(stock_list))
print(stock_list[:5])

1769
['A', 'AAN', 'AAP', 'AAT', 'AB']


## Generate DataFrame

In [110]:
# deal with invalid data
def process_data(df, market_name=None):
    """Return a copy of ``df`` with invalid (negative) values filled in.

    A value is treated as invalid when it is below zero. Within each of the
    ``open``, ``high``, ``low``, ``close`` and ``volume`` columns an invalid
    value is replaced by the value of the closest preceding row that is not
    invalid; invalid values at the very start of a column take the first
    following value that is not invalid. NaN is not below zero, so it counts
    as a regular value. Rows are addressed by position, so the frame does not
    need a 0..n-1 index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the five columns listed above. It is not modified.
    market_name : str, optional
        Market the frame belongs to. Defaults to the notebook-level
        ``market`` setting. For ``"NASDAQ"`` the last row is dropped before
        the values are filled.

    Returns
    -------
    pandas.DataFrame
        A new frame. An empty frame, or a column that has no valid value to
        fill from, is returned unchanged rather than raising ``KeyError``.
    """
    if market_name is None:
        market_name = market
    # if NASDAQ, remove last row. NASDAQ has 1 extra row
    # with multiple invalid data
    if market_name == "NASDAQ":
        df = df.iloc[:-1]
    df = df.copy()
    for column in ["open", "high", "low", "close", "volume"]:
        values = df[column].to_numpy()
        invalid = values < 0
        # nothing to repair, or no valid value to repair from
        if not invalid.any() or invalid.all():
            continue
        # For every row, find the position of the row whose value it should
        # carry: itself when valid, otherwise the last valid row before it
        # (ffill), or the first valid row after it for a leading run (bfill).
        positions = pd.Series(range(len(values)), dtype="float64")
        source = positions.mask(invalid).ffill().bfill().astype(int).to_numpy()
        # positional take keeps the column dtype and copies values verbatim
        df[column] = values[source]
    return df

In [111]:
# generate dataframe function
def generate_df(stock):
    """Load one stock's CSV into a DataFrame tagged with its ticker.

    Reads ``<path>/<market>_<stock>_1.csv`` (no header row), naming the
    columns day, open, high, low, close and volume, and casts ``day`` to int.
    When the notebook-level ``preprocess`` flag is set, the frame is passed
    through ``process_data``. A ``tic`` column holding ``stock`` is added
    last.
    """
    csv_file = "{}/{}_{}_1.csv".format(path, market, stock)
    frame = pd.read_csv(
        csv_file,
        header=None,
        names=["day", "open", "high", "low", "close", "volume"],
    )
    frame = frame.astype({"day": int})
    if preprocess:  # process data
        frame = process_data(frame)
    # set tic list
    return frame.assign(tic=stock)

In [112]:
# First subdataframe: load the first ticker in stock_list and display it to
# check the parsed columns.
final_df = generate_df(stock_list[0])
final_df

Unnamed: 0,day,open,high,low,close,volume,tic
0,0,0.578496,0.580309,0.567871,0.555112,0.593285,A
1,1,0.581725,0.581215,0.570633,0.557246,0.595410,A
2,2,0.588015,0.584431,0.573608,0.560353,0.607168,A
3,3,0.595722,0.586117,0.576604,0.563522,0.602777,A
4,4,0.599320,0.587760,0.579721,0.566039,0.597960,A
...,...,...,...,...,...,...,...
1240,1240,0.973084,0.978439,0.969663,0.966341,0.938377,A
1241,1241,0.959598,0.972588,0.967970,0.965779,0.932568,A
1242,1242,0.951665,0.966893,0.966298,0.965104,0.933418,A
1243,1243,0.944496,0.964088,0.965307,0.964783,0.945035,A


In [113]:
# Build the frames of all stocks and concatenate them once. Concatenating
# inside a loop re-copies the accumulated frame on every iteration, and
# rebuilding from stock_list (rather than appending to the existing
# final_df) keeps this cell safe to re-run.
final_df = pd.concat([generate_df(stock) for stock in stock_list])
# Order rows by day, then ticker; each stock's own row index is kept.
final_df = final_df.sort_values(by=["day", "tic"])
final_df

Unnamed: 0,day,open,high,low,close,volume,tic
0,0,0.578496,0.580309,0.567871,0.555112,0.593285,A
0,0,0.586496,0.589924,0.593711,0.598892,0.603702,AAN
0,0,0.357611,0.357296,0.359589,0.367068,0.361763,AAP
0,0,0.602742,0.597432,0.592057,0.594226,0.620892,AAT
0,0,0.535432,0.529228,0.526605,0.533735,0.555247,AB
...,...,...,...,...,...,...,...
1244,1244,0.961218,0.962850,0.965538,0.967666,0.959063,ZB-A
1244,1244,0.862950,0.864858,0.853509,0.855318,0.851905,ZBH
1244,1244,0.797489,0.798619,0.790712,0.792157,0.795982,ZF
1244,1244,0.712403,0.711281,0.688027,0.658353,0.734713,ZNH


## Save DataFrame

In [114]:
# Write the combined frame to ../temporal_data/<market>_temporal_data.csv.
# index=False leaves the row index out of the file.
final_df.to_csv("../temporal_data/{}_temporal_data.csv".format(market), index=False)