# Importing Libraries

In [1]:
import warnings
# Ignore every warning of category UserWarning from this point on;
# other warning categories (e.g. FutureWarning) are still shown.
warnings.simplefilter("ignore", UserWarning)

import os
import glob
import pickle
import numpy as np
import pandas as pd

# Loading Data

In [2]:
# Dataset identifier; it selects the raw-data sub-folder here and is reused by
# later cells to build the output file names.
dataname = 'NIKKEI'
# Root data directory (machine-specific absolute path -- adjust for your setup).
# NOTE: this name shadows the builtin `dir`; it is kept because later cells read it.
dir = 'D:/Research/LIEST/Data/'

# Get a list of all the csv files of this dataset.
# glob returns matches in arbitrary (filesystem-dependent) order, so the list is
# sorted to make the file order -- and everything derived from it -- reproducible.
csv_files = sorted(glob.glob(dir + 'RawData/' + dataname + '/*.csv'))

# Load every file into its own DataFrame, in the same order as csv_files.
dfs = [pd.read_csv(csv_path) for csv_path in csv_files]

In [3]:
# One entry per CSV file, in the same order as csv_files: the file's base name
# with its extension stripped. Later cells use these as column names.
path = [
    os.path.splitext(os.path.basename(os.path.normpath(csv_file)))[0]
    for csv_file in csv_files
]

# Company-Sector tagging

In [4]:
# Ticker symbols grouped by sector: one list per sector.
Technology = [
    '6706.T', '6817.T', '6810.T', '4427.T', '4425.T', '6836.T', '3803.T', '2323.T',
    '6977.T', '3913.T', '9425.T', '4444.T', '4335.T', '6942.T', '6550.T',
]

Healthcare = [
    '4563.T', '6190.T', '4583.T', '4586.T', '4570.T', '6090.T', '4584.T', '4556.T',
    '7681.T', '4355.T', '6545.T', '6029.T', '7813.T', '7037.T',
]

Consumer_cyclical = [
    '7683.T', '7682.T', '2753.T', '2752.T', '6191.T', '8181.T', '3358.T', '3953.T',
    '3571.T',
]

Consumer_Defensive = [
    '1333.T', '1382.T', '2901.T', '2926.T', '5341.T', '4918.T', '7073.T', '4920.T',
    '7515.T', '2683.T', '2107.T', '2112.T', '7413.T',
]

Utilities = [
    '2743.T', '1711.T', '9514.T', '9517.T', '9513.T', '9511.T', '9519.T', '9532.T',
    '9531.T', '9533.T',
]

Energy = [
    '7462.T', '5010.T', '5009.T', '7486.T', '9386.T', '5015.T', '5019.T', '5017.T',
    '5020.T', '5021.T',
]

Basic_material = [
    '3103.T', '5707.T', '5541.T', '3101.T', '5486.T', '5632.T', '3945.T', '3892.T',
    '1971.T',
]

Industrial = [
    '7004.T', '6472.T', '6103.T', '5932.T', '6440.T', '4657.T', '1716.T', '6403.T',
    '4651.T', '2749.T', '7314.T', '7018.T', '5603.T',
]

Financial = [
    '8558.T', '8624.T', '2134.T', '8742.T', '7162.T', '8746.T', '8617.T', '7175.T',
    '8518.T', '8747.T', '9318.T', '6196.T', '8783.T', '8789.T', '7192.T', '6178.T',
]

Communication = [
    '2440.T', '4760.T', '8072.T', '6177.T', '2459.T', '3929.T', '3137.T', '6026.T',
    '3842.T',
]

Automobile = [
    '5202.T', '7294.T', '7623.T', '5104.T', '5101.T', '5105.T', '5108.T', '2754.T',
    '7255.T', '7259.T', '7256.T', '7254.T', '7273.T', '5162.T',
]

Real_Estate = [
    '6192.T', '8905.T', '8908.T', '8995.T', '1400.T', '3261.T', '3494.T', '7837.T',
]

# Sector labels and the matching ticker lists; the two sequences are parallel,
# so position k in `sectorname` labels position k in `sector`.
sectorname = [
    'Technology', 'Healthcare', 'Consumer_cyclical', 'Consumer_Defensive',
    'Utilities', 'Energy', 'Basic_material', 'Industrial', 'Financial',
    'Communication', 'Automobile', 'Real_Estate',
]

sector = [
    Technology, Healthcare, Consumer_cyclical, Consumer_Defensive,
    Utilities, Energy, Basic_material, Industrial, Financial,
    Communication, Automobile, Real_Estate,
]

# Mapping: sector label -> list of ticker symbols in that sector.
sectordict = dict(zip(sectorname, sector))

# Preprocessing

In [5]:
# Re-index every per-company frame by its "Date" column, normalised to
# 'YYYY-MM-DD' strings.
# The index-name guard keeps the cell re-runnable: an unconditional
# set_index("Date") raises KeyError on a second execution because the column
# has already been moved into the index.
for i, frame in enumerate(dfs):
    if frame.index.name != "Date":
        frame = frame.set_index("Date")
    frame.index = pd.to_datetime(frame.index).strftime('%Y-%m-%d')
    dfs[i] = frame

# Considering Closing price for each stock (one single-column frame per company)
dfs_copy = [frame[['Close']] for frame in dfs]

# Merging all parts (Stock name as column name and date as index).
# Rows are aligned on the date strings; sorting them lexicographically is
# chronological because of the zero-padded 'YYYY-MM-DD' format.
dfs_main = (
    pd.concat(dfs_copy, axis=1)
    .set_axis(path, axis=1)
    .sort_index()
)

# Fill NaN values with the previous available value in each column.
# DataFrame.ffill() replaces fillna(method='ffill'): the `method` argument of
# fillna is deprecated since pandas 2.1. Leading NaNs (no earlier value) remain.
dfs_main = dfs_main.ffill()

# Saving the preprocessed data and company sector labels 

In [6]:
# Make sure the output folder exists so the export does not fail with
# FileNotFoundError on a fresh data directory.
preprocessed_dir = dir + 'PreprocessedData/'
os.makedirs(preprocessed_dir, exist_ok=True)

# In covid data
dfs_main.loc['2019-12-01':'2020-08-31', :].to_csv(preprocessed_dir + dataname + '_in_covid.csv')
# post covid data
# The end label used to be '2021-06-31', which is not a calendar date (June has
# 30 days). It only worked because the index holds 'YYYY-MM-DD' strings that are
# compared lexicographically; '2021-06-30' selects exactly the same rows and
# stays valid if the index is ever kept as real datetimes.
dfs_main.loc['2020-09-01':'2021-06-30', :].to_csv(preprocessed_dir + dataname + '_post_covid.csv')

In [7]:
# Saving csLabels: pickle the sector-name -> ticker-list mapping.
# The target folder is created first so the write does not fail with
# FileNotFoundError when it does not exist yet.
os.makedirs(dir + "CSLabels/", exist_ok=True)
with open(dir + "CSLabels/" + dataname + "Sectors.pkl", "wb") as fp:
    pickle.dump(sectordict, fp)

In [8]:
# Saving Company names: pickle the list of file-derived names in `path`
# (the same names used as the columns of dfs_main).
# The target folder is created first so the write does not fail with
# FileNotFoundError when it does not exist yet.
os.makedirs(dir + "CSLabels/", exist_ok=True)
with open(dir + "CSLabels/" + dataname + "Companies.pkl", "wb") as fp:
    pickle.dump(path, fp)