# Importing Libraries

In [1]:
import warnings
warnings.simplefilter("ignore", UserWarning)

import os
import glob
import pickle
import numpy as np
import pandas as pd

# Loading Data

In [2]:
# Dataset identifier; it selects the raw-data sub-folder and prefixes output file names.
dataname = 'BOVESPA'
# Root data folder (machine-specific absolute path; adjust when running elsewhere).
# NOTE: the name `dir` shadows the Python builtin, but other cells refer to it, so it is kept.
dir = 'D:/Research/LIEST/Data/'

# Collect every raw CSV of this dataset. glob returns matches in arbitrary,
# OS-dependent order, so sort them to keep the file (and later column) order
# reproducible between runs and machines.
csv_files = sorted(glob.glob(dir + 'RawData/' + dataname + '/*.csv'))

# Load each CSV into its own DataFrame, in the same order as csv_files.
dfs = [pd.read_csv(csv_path) for csv_path in csv_files]

In [3]:
# One label per CSV file: the file name stripped of its folder and extension.
# The labels follow the order of csv_files.
path = [
    os.path.splitext(os.path.basename(os.path.normpath(csv_file)))[0]
    for csv_file in csv_files
]

# Company-Sector tagging

In [4]:
# Ticker lists, one per sector. The short names are abbreviations; presumably
# CC = consumer cyclical, CD = consumer defensive, BM = basic materials and
# HC = health care (TODO confirm against the data source).
CC = [
    'ALPA4.SA', 'AMER3.SA', 'CVCB3.SA', 'CYRE3.SA', 'PCAR3.SA',
    'LREN3.SA', 'MGLU3.SA', 'MRVE3.SA', 'VBBR3.SA', 'VIIA3.SA',
]

CD = [
    'ABEV3.SA', 'BRFS3.SA', 'CRFB3.SA', 'COGN3.SA',
    'JBSS3.SA', 'MRFG3.SA', 'BEEF3.SA', 'YDUQ3.SA',
]

Industrial = [
    'AZUL4.SA', 'CCRO3.SA', 'ECOR3.SA', 'EMBR3.SA',
    'GOLL4.SA', 'RENT3.SA', 'RAIL3.SA', 'WEGE3.SA',
]

Financial = [
    'B3SA3.SA', 'BBAS3.SA', 'BBSE3.SA', 'BBDC4.SA', 'BBDC3.SA',
    'BPAN4.SA', 'BRAP4.SA', 'BPAC11.SA', 'HAPV3.SA', 'IRBR3.SA',
    'ITUB4.SA', 'ITSA4.SA', 'SANB11.SA',
]

Real_estate = ['BRML3.SA', 'EZTC3.SA', 'JHSF3.SA', 'MULT3.SA']

BM = [
    'BRKM5.SA', 'BRKM3.SA', 'BRKM6.SA', 'CSNA3.SA', 'DXCO3.SA',
    'GGBR4.SA', 'KLBN11.SA', 'KLBN3.SA', 'SUZB3.SA', 'USIM5.SA',
    'USIM3.SA', 'USIM6.SA', 'VALE3.SA',
]

Utilities = [
    'CMIG4.SA', 'CPLE6.SA', 'CPLE3.SA', 'CPFE3.SA', 'ENBR3.SA',
    'ELET6.SA', 'ENGI11.SA', 'ENEV3.SA', 'EGIE3.SA', 'EQTL3.SA',
    'SBSP3.SA', 'TAEE11.SA', 'TAEE4.SA',
]

Technology = ['CIEL3.SA', 'TOTS3.SA', 'POSI3.SA']

Energy = ['CSAN3.SA', 'PETR4.SA', 'PRIO3.SA', 'UGPA3.SA']

HC = [
    'FLRY3.SA', 'HYPE3.SA', 'QUAL3.SA', 'RADL3.SA',
    'PFRM3.SA', 'AALR3.SA', 'DASA3.SA',
]

Communication = ['VIVT3.SA', 'TIMS3.SA']

# Sector labels and their ticker lists, kept in the same order so that
# position k of `sectorname` names position k of `sector`.
sectorname = [
    'CC', 'CD', 'Industrial', 'Financial', 'Real_estate', 'BM',
    'Utilities', 'Technology', 'Energy', 'HC', 'Communication',
]

sector = [
    CC, CD, Industrial, Financial, Real_estate, BM,
    Utilities, Technology, Energy, HC, Communication,
]

# Mapping from sector label to its list of tickers.
sectordict = dict(zip(sectorname, sector))

# Preprocessing

In [5]:
# Build one single-column ('Close') frame per stock, indexed by the 'Date'
# column rendered as 'YYYY-MM-DD' strings.
# The raw frames in `dfs` are deliberately NOT modified (no inplace set_index),
# so this cell can be re-run without a KeyError on the already-consumed 'Date' column.
dfs_copy = []
for raw in dfs:
    indexed = raw.set_index("Date")
    indexed.index = pd.to_datetime(indexed.index).strftime('%Y-%m-%d')
    dfs_copy.append(pd.DataFrame(indexed.loc[:, 'Close']))

# Merge all stocks side by side: one column per stock (named after its file,
# taken from `path`) and the date strings as the row index, sorted ascending.
dfs_main = pd.concat(dfs_copy, axis=1)
dfs_main = dfs_main.set_axis(path, axis=1)
dfs_main = dfs_main.sort_index()

# Fill missing prices with the previous available value of the same column.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
# Leading gaps (no earlier value to carry forward) stay NaN.
dfs_main = dfs_main.ffill()

# Saving the preprocessed data and company sector labels 

In [6]:
# Folder that receives the preprocessed CSVs; create it if it does not exist yet
# so that to_csv does not fail on a fresh machine.
preprocessed_dir = dir + 'PreprocessedData/'
os.makedirs(preprocessed_dir, exist_ok=True)

# .loc label slices include both end points.
# In-covid period: 2019-12-01 .. 2020-08-31
dfs_main.loc['2019-12-01':'2020-08-31', :].to_csv(preprocessed_dir + dataname + '_in_covid.csv')
# Post-covid period: 2020-09-01 .. 2021-06-30
# (June has 30 days; the previous bound '2021-06-31' was not a real date and
# would fail to parse if the index were ever a DatetimeIndex.)
dfs_main.loc['2020-09-01':'2021-06-30', :].to_csv(preprocessed_dir + dataname + '_post_covid.csv')

In [7]:
# Save the sector -> tickers mapping as a pickle.
# Create the target folder first so open(..., "wb") does not raise
# FileNotFoundError when it is missing.
labels_dir = dir + "CSLabels/"
os.makedirs(labels_dir, exist_ok=True)
with open(labels_dir + dataname + "Sectors.pkl", "wb") as fp:
    pickle.dump(sectordict, fp)

In [8]:
# Persist the list of company labels (`path`) as a pickle next to the sector labels.
companies_file = f"{dir}CSLabels/{dataname}Companies.pkl"
with open(companies_file, "wb") as out:
    pickle.dump(path, out)