# Importing Libraries

In [1]:
import warnings
warnings.simplefilter("ignore", UserWarning)

import os
import glob
import pickle
import numpy as np
import pandas as pd

# Loading Data

In [2]:
# Locate every raw per-company CSV file for this dataset.
dataname = 'UK_STOCK'
dir = 'D:/Research/LIEST/Data/'
# glob returns matches in arbitrary, platform-dependent order; sort them so
# that the order of `csv_files`/`dfs` (and everything derived from it) is
# the same on every run and every machine.
csv_files = sorted(glob.glob(dir + 'RawData/' + dataname + '/*.csv'))

# Load every file into its own DataFrame, in the same order as `csv_files`.
dfs = [pd.read_csv(csv_path) for csv_path in csv_files]

In [3]:
# File name of each CSV without directory or extension, in the same order
# as `csv_files`.
path = [
    os.path.splitext(os.path.basename(os.path.normpath(csv_path)))[0]
    for csv_path in csv_files
]

# Company-Sector tagging

In [4]:
# Ticker symbols grouped by sector; each list below is one sector.
CC = [
    'AO.L', 'CARD.L', 'FSTA.L', 'HFD.L', 'GYM.L', 'HEAD.L',
    'BOWL.L', 'CPG.L', 'CCL.L', 'FRAS.L', 'HWDN.L', 'TW.L',
    'PSN.L', 'BDEV.L', 'BKG.L', 'SMDS.L', 'WTB.L', 'COA.L',
]

CD = [
    'AEP.L', 'GNC.L', 'ULVR.L', 'DGE.L', 'BATS.L', 'RKT.L',
    'TSCO.L', 'ABF.L', 'BNZL.L', 'SBRY.L', 'PZC.L', 'PFD.L',
]

Financial = [
    'ADIG.L', 'ANII.L', 'ASIT.L', 'AAIF.L', 'ACIC.L', 'ASLI.L',
    'AUSC.L', 'ATS.L', 'AIE.L', 'DGN.L', 'AUGM.L', 'BGCG.L',
    'BGS.L', 'CSN.L', 'FCH.L', 'HSBA.L', 'LSEG.L', 'PRU.L',
    'LLOY.L', 'BARC.L', 'NWG.L', 'III.L', 'ABDN.L', 'LIO.L',
]

Automobile = ['ABDP.L', 'TIFS.L', 'AML.L', 'SCE.L', 'TRT.L', 'AUTG.L']

Real_estate = [
    'AEWU.L', 'BOOT.L', 'HWG.L', 'HLCL.L', 'IHR.L',
    'MLI.L', 'SHED.L', 'HMSO.L', 'THRL.L',
]

Technology = ['ALFA.L', 'APTD.L', 'AVV.L', 'SXS.L', 'OXIG.L']

HC = [
    'AZN.L', 'GSK.L', 'SN.L', 'MDC.L', 'HIK.L', 'CTEC.L',
    'GDR.L', 'YGEN.L', 'NCYT.L', 'MXCT.L', 'RENX.L',
]

Industrial = [
    'AVON.L', 'DLAR.L', 'HYVE.L', 'EXPN.L', 'BA.L', 'RTO.L', 'DPLM.L',
    'IMI.L', 'RS1.L', 'WEIR.L', 'SMIN.L', 'SPX.L', 'BOY.L',
]

Communication = ['BMY.L', 'REL.L', 'WPP.L', 'ITV.L', 'AAF.L', 'AUTO.L', 'RMV.L']

BM = [
    'FORT.L', 'HOC.L', 'RIO.L', 'GLEN.L',
    'AAL.L', 'ANTO.L', 'CRDA.L', 'JMAT.L',
]

Energy = ['HTG.L', 'SHEL.L', 'BP.L', 'POS.L', 'WTE.L', 'TOM.L']

Utilities = ['NG.L', 'SSE.L', 'CNA.L']

# Single (label, tickers) table from which the parallel lists and the
# label -> tickers mapping are derived, so the three can never drift apart.
sector_table = [
    ('CC', CC),
    ('CD', CD),
    ('Financial', Financial),
    ('Automobile', Automobile),
    ('Real_estate', Real_estate),
    ('Technology', Technology),
    ('HC', HC),
    ('Industrial', Industrial),
    ('Communication', Communication),
    ('BM', BM),
    ('Energy', Energy),
    ('Utilities', Utilities),
]

sector = [tickers for _, tickers in sector_table]

sectorname = [label for label, _ in sector_table]

# Maps each sector label to its list of tickers (the same list objects as above).
sectordict = dict(zip(sectorname, sector))

# Preprocessing

In [5]:
# Re-index every per-company frame by its "Date" column, with the dates
# normalised to 'YYYY-MM-DD' strings. The frames in `dfs` are modified in place.
for df in dfs:
    df.set_index("Date", inplace=True)
    df.index = pd.to_datetime(df.index).strftime('%Y-%m-%d')

# Keep only the closing price of each stock (one single-column frame per file).
dfs_copy = [pd.DataFrame(df.loc[:, 'Close']) for df in dfs]

# Merge all stocks side by side: dates as index, one column per stock.
# The columns are labelled with the names in `path`.
dfs_main = pd.concat(dfs_copy, axis=1)
dfs_main = dfs_main.set_axis(path, axis=1)
# Sort the date strings; 'YYYY-MM-DD' sorts chronologically as text.
dfs_main = dfs_main.sort_index()

# Fill missing prices with the previous available value of the same stock.
# `fillna(method='ffill')` is deprecated since pandas 2.1; `ffill()` is the
# supported equivalent. Gaps before a stock's first value remain NaN.
dfs_main = dfs_main.ffill()

# Saving the preprocessed data and company sector labels 

In [6]:
# Write the two study periods to CSV. `.loc` label slices include both ends.
# In covid data
dfs_main.loc['2019-12-01':'2020-08-31', :].to_csv(
    dir + 'PreprocessedData/' + dataname + '_in_covid.csv'
)
# post covid data
# The upper bound used to be '2021-06-31', which is not a real date (June
# has 30 days). It only worked because the bounds are compared against the
# index labels; '2021-06-30' selects the same rows and is a valid date.
dfs_main.loc['2020-09-01':'2021-06-30', :].to_csv(
    dir + 'PreprocessedData/' + dataname + '_post_covid.csv'
)

In [7]:
# Persist the sector -> tickers mapping as a pickle file.
sectors_pkl = dir + "CSLabels/" + dataname + "Sectors.pkl"
with open(sectors_pkl, "wb") as sectors_file:
    pickle.dump(sectordict, sectors_file)

In [8]:
# Persist the list of company names (`path`) as a pickle file.
companies_pkl = dir + "CSLabels/" + dataname + "Companies.pkl"
with open(companies_pkl, "wb") as companies_file:
    pickle.dump(path, companies_file)