# Importing Libraries

In [1]:
import warnings
warnings.simplefilter("ignore", UserWarning)

import os
import glob
import pickle
import numpy as np
import pandas as pd

# Loading Data

In [2]:
# Dataset label and root data directory used by this cell and by the saving cells.
# NOTE: `dir` shadows the Python builtin of the same name; the name is kept
# because later cells build their output paths from it.
dataname = 'NSE'
dir = 'D:/Research/LIEST/Data/'

# Collect every raw CSV of the dataset. glob.glob() returns matches in arbitrary,
# OS-dependent order, so the list is sorted to keep the company/column order
# reproducible across runs and machines.
csv_files = sorted(glob.glob(dir+'RawData/'+dataname+'/*.csv'))

# Load each file into its own DataFrame, in the same order as csv_files.
dfs = [pd.read_csv(csv_path) for csv_path in csv_files]

In [3]:
# Company names: every CSV path reduced to its file name without the final extension
# (same order as csv_files).
path = [
    os.path.splitext(os.path.basename(os.path.normpath(csv_file)))[0]
    for csv_file in csv_files
]

# Company-Sector tagging

In [4]:
# Sector membership lists: one list of company identifiers per sector.
# NOTE(review): identifiers presumably match the raw CSV file stems (some carry a
# '.NS' suffix) -- verify against the files in RawData.
Industrial = [
    'ESCORTS', 'ASHOKLEY', 'BEML', 'RIIL', 'FINPIPE', 'VESUVIUS', 'INGERRAND', 'ELGIEQUIP',
    'KSB', 'VOLTAS', 'KEC', 'BHEL', 'THERMAX', 'HEG', 'ESABINDIA', 'CENTENKA', 'HGS', 'CARBORUNIV',
    'SUPREMEIND', 'GESHIP', 'FINCABLES', 'BLUEDART', 'ABB', 'SIEMENS', 'LT.NS', 'HAVELLS.NS',
]

Healthcare = [
    'GLAXO', 'DRREDDY', 'CIPLA', 'SUNPHARMA', 'IPCALAB', 'PFIZER', 'AUROPHARMA', 'NATCOPHARM',
    'APOLLOHOSP', 'DIVISLAB.NS', 'TORNTPHARM.NS', 'CADILAHC.NS', 'BIOCON.NS', 'ABBOTINDIA',
    'LUPIN.NS',
]

BasicMaterials = [
    'TNPETRO', 'SUPPETRO', 'DCW', 'NOCIL', 'TIRUMALCHM', 'TATACHEM', 'GHCL',
    'GUJALKALI', 'PIDILITIND', 'FOSECOIND', 'BASF', 'HINDALCO', 'SAIL', 'TATAMETALI',
    'MAHSEAMLES', 'SURYAROSNI', 'TNPL', 'KAKATCEM', 'ASIANPAINT', 'BERGEPAINT', 'GNFC',
    'DEEPAKFERT', 'GSFC', 'SRF', 'GRASIM', 'INDIACEM', 'KESORAMIND', 'RAMANEWS', 'POLYPLEX',
    'SPIC', 'NAGAFERT', 'ACC', 'CENTURYTEX', 'JSWSTEEL.NS', 'SHREECEM.NS', 'ULTRACEMCO.NS',
    'VEDL.NS',
]

Energy = ['HINDPETRO', 'ONGC', 'RELIANCE', 'BPCL', 'ADANIENT', 'NTPC.NS']

ConsumerCyclical = [
    'MIRCELECTR', 'BATAINDIA', 'ICIL', 'ARVIND', 'RAYMOND', 'HIMATSEIDE', 'BOMDYEING',
    'NAHARSPING', 'MARALOVER', 'SIYSIL', 'INDHOTEL', 'EIHOTEL', 'ASIANHOTNR', 'COSMOFILMS',
    'THOMASCOOK', 'TITAN', 'NAHARINDUS', 'JCHAC.NS',
]

Automobile = [
    'LUMAXIND', 'HEROMOTOCO', 'SHANTIGEAR', 'MAHSCOOTER', 'BAJAJ-AUTO', 'EICHERMOT',
    'HINDMOTORS', 'SWARAJENG', 'APOLLOTYRE', 'FMGOETZE', 'MRF', 'UCALFUEL', 'BHARATFORG',
    'M&M', 'SUNDRMFAST', 'MARUTI.NS', 'TATAMOTORS.NS', 'MOTHERSUMI.NS', 'BOSCHLTD.NS',
]

Financial = [
    'IDBI', 'HDFCBANK', 'SBIN', 'KARURVYSYA', 'IFCI', 'RELCAPITAL', 'CHOLAFIN',
    'BAJFINANCE', 'HDFC', 'LICHSGFIN', 'CANFINHOME', 'GICHSGFIN', 'TFCILTD', 'CRISIL',
    'BAJAJFINSV.NS', 'KOTAKBANK.NS',
]

ConsumerDefensive = [
    'ITC', 'VSTIND', 'GODFRYPHLP', 'HARRMALAYA', 'BALRAMCHIN', 'RAJSREESUG',
    'SAKHTISUG', 'DHAMPURSUG', 'BRITANNIA', 'RUCHI', 'DABUR', 'COLPAL', 'HINDUNILVR',
    'EIDPARRY', 'ZEEL', 'NESTLEIND.NS', 'TATACONSUM.NS', 'RADICO.NS', 'MCDOWELL-N.NS',
    'MARICO.NS',
]

Technology = [
    'TATAELXSI', 'ROLTA', 'INFY', 'MASTEK', 'WIPRO', 'SHYAMTEL', 'BIRLACABLE', 'TCS', 'ITI',
    'HCLTECH.NS', 'MPHASIS.NS',
]

Utilities = ['GIPCL', 'CESC', 'TATAPOWER']

RealEstate = ['UNITECH']

Telecom = ['MTNL', 'BHARTIARTL.NS', 'TATACOMM.NS']

# Sector name -> member list. Written once as a mapping; the parallel
# `sectorname` / `sector` lists are derived from it so the three stay in sync.
sectordict = {
    'Industrial': Industrial,
    'Healthcare': Healthcare,
    'BasicMaterials': BasicMaterials,
    'Energy': Energy,
    'ConsumerCyclical': ConsumerCyclical,
    'Automobile': Automobile,
    'Financial': Financial,
    'ConsumerDefensive': ConsumerDefensive,
    'Technology': Technology,
    'Utilities': Utilities,
    'RealEstate': RealEstate,
    'Telecom': Telecom,
}

# Dicts preserve insertion order, so both lists follow the order written above.
sectorname = list(sectordict.keys())
sector = list(sectordict.values())

# Preprocessing

In [5]:
# Re-index every raw frame by its "Date" column, normalised to 'YYYY-MM-DD' strings.
# The "Date" check keeps this cell re-runnable: after a first run "Date" is already
# the index, and calling set_index("Date") again would raise a KeyError.
for pos, frame in enumerate(dfs):
    if "Date" in frame.columns:
        frame = frame.set_index("Date")
    frame.index = pd.to_datetime(frame.index).strftime('%Y-%m-%d')
    dfs[pos] = frame

# Keep only the closing price of each stock (one single-column frame per file).
dfs_copy = [frame.loc[:, ['Close']] for frame in dfs]

# Merge side by side: date strings as the index, one column per entry of `path`
# (the names derived from the CSV file names), rows ordered by date.
dfs_main = pd.concat(dfs_copy, axis=1).set_axis(path, axis=1).sort_index()

# Fill missing prices with the previous available value. DataFrame.ffill() replaces
# fillna(method='ffill'), whose `method` argument is deprecated since pandas 2.1.
dfs_main = dfs_main.ffill()

# Saving the preprocessed data and company sector labels 

In [6]:
# In-covid window: rows labelled 2019-12-01 through 2020-08-31 (label slices include both ends).
dfs_main.loc['2019-12-01':'2020-08-31', :].to_csv(dir+'PreprocessedData/'+dataname+'_in_covid.csv')
# Post-covid window: rows labelled 2020-09-01 through 2021-06-30.
# The upper bound was previously '2021-06-31', which is not a calendar date (June has
# 30 days). It only worked because the index holds date strings; the valid bound
# selects the same rows and stays correct if the index is ever kept as datetimes.
dfs_main.loc['2020-09-01':'2021-06-30', :].to_csv(dir+'PreprocessedData/'+dataname+'_post_covid.csv')

In [7]:
# Persist the sector -> companies mapping (sectordict) as a pickle file
sectors_file = f"{dir}CSLabels/{dataname}Sectors.pkl"
with open(sectors_file, "wb") as out:
    pickle.dump(sectordict, out)

In [8]:
# Persist the list of company names (path) as a pickle file
companies_file = f"{dir}CSLabels/{dataname}Companies.pkl"
with open(companies_file, "wb") as out:
    pickle.dump(path, out)