# Importing Libraries

In [1]:
import warnings
warnings.simplefilter("ignore", UserWarning)

import os
import glob
import pickle
import numpy as np
import pandas as pd

# Loading Data

In [2]:
# Locate every raw per-company CSV file of the selected dataset.
dataname = 'NASDAQ'
# NOTE: `dir` shadows the builtin dir(); the name is kept because later cells read it.
dir = 'D:/Research/LIEST/Data/'
# glob returns matches in arbitrary, OS-dependent order. Sorting makes the column
# order of the merged frame and the saved company list reproducible across runs.
# The dataset folder is derived from `dataname` instead of being hard-coded twice.
csv_files = sorted(glob.glob(dir + 'RawData/' + dataname + '/*.csv'))

# Load each file into its own DataFrame, in the same order as csv_files.
dfs = [pd.read_csv(csv_file) for csv_file in csv_files]

In [3]:
# Company name for each CSV: the file name without its directory and extension.
# Entries are kept in the same order as csv_files.
path = [
    os.path.splitext(os.path.basename(os.path.normpath(csv_file)))[0]
    for csv_file in csv_files
]

# Company-Sector tagging

In [4]:
# One list of company tickers per sector.
Technology = [
    'AMAT', 'ADSK', 'ADBE', 'AAPL', 'ENPH', 'CTSH', 'DDOG', 'CRWD',
    'CDNS', 'AVGO', 'ASML', 'AUB', 'ANSS', 'AMD', 'MCHP', 'INTU',
    'INTC', 'FTNT', 'FISV', 'MRVL', 'MSFT', 'ZM', 'TXN', 'CSCO',
]

Healthcare = [
    'ALGN', 'DXCM', 'BIIB', 'AZN', 'AMGN', 'KALA', 'ISRG', 'KPTI',
    'IDXX', 'ILMN', 'GILD', 'GERN', 'FREQ', 'MRNA', 'NCNA', 'WBA',
    'VRTX', 'SGEN', 'REGN',
]

Consumer_Cyclical = [
    'EBAY', 'CPRT', 'AMZN', 'BKNG', 'BBBY', 'MELI', 'MAR', 'LULU',
    'JD', 'PDD', 'ORLY', 'SBUX', 'ROST',
]

Consumer_Defensive = [
    'DLTR', 'COST', 'MNST', 'MDLZ', 'KDP', 'KHC', 'PEP', 'STKL',
]

Utilities = ['AEP', 'EXC', 'XEL']

Industrial = [
    'ADP', 'CSX', 'CTAS', 'AXON', 'HON', 'FAST', 'PCAR', 'PAYX',
    'ODFL', 'NKLA', 'VRSK',
]

Energy = ['BKR', 'FANG', 'GLNG']

Real_Estate = [
    'AGNC', 'CSGP', 'LAMR', 'GOOD', 'HST', 'GLPI', 'EQIX', 'SBRA',
    'ROIC',
]

Financial = [
    'ACGL', 'ABCB', 'BANR', 'CATY', 'CACC', 'BPRN', 'BPOP', 'BOKF',
    'BGCP', 'BHF', 'TIGR', 'PFG', 'PYPL',
]

Communication_services = [
    'EA', 'CMCSA', 'CHTR', 'ATVI', 'META', 'LSXMK', 'LSXMA', 'GOOGL',
    'NFLX', 'PARA', 'WBD', 'TMUS', 'SIRI',
]

Automobile = ['CVGI', 'MPAA', 'LKQ', 'GT', 'GNTX', 'WPRT', 'TSLA', 'RIDE']

Basic_Material = ['MERC', 'UFPI']

# Sector labels and their ticker lists, position-aligned with each other.
sectorname = [
    'Technology',
    'Healthcare',
    'Consumer_Cyclical',
    'Consumer_Defensive',
    'Utilities',
    'Industrial',
    'Energy',
    'Real_Estate',
    'Financial',
    'Communication_services',
    'Automobile',
    'Basic_Material',
]

sector = [
    Technology,
    Healthcare,
    Consumer_Cyclical,
    Consumer_Defensive,
    Utilities,
    Industrial,
    Energy,
    Real_Estate,
    Financial,
    Communication_services,
    Automobile,
    Basic_Material,
]

# Mapping: sector label -> list of tickers in that sector.
sectordict = dict(zip(sectorname, sector))

# Preprocessing

In [5]:
# Re-index every raw frame by its "Date" column, formatted as 'YYYY-MM-DD' strings.
# The frame is rebound instead of mutated with inplace=True, and the set_index step
# is skipped when the index is already "Date", so re-running this cell no longer
# raises KeyError on the consumed column. A file with no "Date" column still
# raises KeyError, as before.
for i, frame in enumerate(dfs):
    if frame.index.name != "Date":
        frame = frame.set_index("Date")
    frame.index = pd.to_datetime(frame.index).strftime('%Y-%m-%d')
    dfs[i] = frame

# Keep only the closing price of each stock, as single-column frames.
dfs_copy = [pd.DataFrame(frame.loc[:, 'Close']) for frame in dfs]

# Merge all stocks side by side: company name as column name, date as index.
# Rows are sorted by date, then gaps are filled with the previous available value.
# .ffill() replaces fillna(method='ffill'), which is deprecated in recent pandas.
dfs_main = (
    pd.concat(dfs_copy, axis=1)
    .set_axis(path, axis=1)
    .sort_index()
    .ffill()
)

# Saving the preprocessed data and company sector labels 

In [6]:
# to_csv does not create missing folders, so make sure the output directory exists.
os.makedirs(dir + 'PreprocessedData/', exist_ok=True)

# In-covid data: rows labelled 2019-12-01 through 2020-08-31 (both bounds inclusive).
dfs_main.loc['2019-12-01':'2020-08-31', :].to_csv(dir + 'PreprocessedData/' + dataname + '_in_covid.csv')
# Post-covid data: rows labelled 2020-09-01 through 2021-06-30.
# The end bound used to be '2021-06-31', which is not a calendar date. It only worked
# because the index holds strings; the selected rows are unchanged by this fix.
dfs_main.loc['2020-09-01':'2021-06-30', :].to_csv(dir + 'PreprocessedData/' + dataname + '_post_covid.csv')

In [7]:
# Saving csLabels: the sector-name -> company-list mapping, pickled to disk.
# open(..., "wb") fails if the folder is missing, so create it first.
os.makedirs(dir + "CSLabels/", exist_ok=True)
with open(dir + "CSLabels/" + dataname + "Sectors.pkl", "wb") as fp:
    pickle.dump(sectordict, fp)

In [8]:
# Saving company names: the list of names derived from the CSV file names, pickled to disk.
# open(..., "wb") fails if the folder is missing, so create it first.
os.makedirs(dir + "CSLabels/", exist_ok=True)
with open(dir + "CSLabels/" + dataname + "Companies.pkl", "wb") as fp:
    pickle.dump(path, fp)