# Import libraries

In [1]:
import pandas as pd
import numpy as np
import xlwings as xw
import matplotlib.pyplot as plt
import os
import datetime as dt

# Define Globals

In [2]:
# True when the notebook runs on the machine whose Windows host name is
# "SURFACEBOOK". Using .get() means a missing COMPUTERNAME variable (e.g. on a
# non-Windows system) is treated as "not the laptop" instead of raising KeyError.
laptop = os.environ.get("COMPUTERNAME") == "SURFACEBOOK"

# Name of the open workbook. The leading backslash is stripped with
# xlDataName[1:] wherever the name is handed to xlwings. Raw strings are used
# so that "\D" is not parsed as an (invalid) escape sequence.
if not laptop:
    xlDataName = r'\DissData [03].xlsx'

else:
    xlDataName = r'\DissData [03]'

# Folder the exported .npy files are written to.
dataFolder = r"C:\Users\rfg\OneDrive\Desktop\Dissertation ES30029\data"
# Range on the "all_companies" sheet that holds the company names.
companyNamesRange = "A1:A2009"
# Range read from every data sheet.
cellRng = "A1:BYH288"
# Handle to the workbook; xlwings looks it up among the books that are open.
xlBook = xw.books(xlDataName[1:])
# Sheets that are SKIPPED when loading data (despite the name, this acts as an
# exclusion list: the loader only reads sheets that are not listed here).
sheetWhitelist = ['all_companies']
# Names of the sector-classification sheets.
sectors = ['dsindustry6', 'icbindustry2']
# Filled by the loading cell with one DataFrame per data sheet.
fields = []

# Read data from Excel

In [3]:
# Load every data sheet into its own DataFrame. Each frame is
#   * tagged with a `.name` attribute equal to the sheet name,
#   * stamped with the sheet name in its top-left cell (later used as the label
#     of the first column),
#   * published as a notebook-level variable named after the sheet, and
#   * collected in `fields`.
fields = []
for sheet in xlBook.sheets:
    if sheet.name in sheetWhitelist:
        # Listed sheets (e.g. the company-name sheet) are not data fields.
        continue

    frame = pd.DataFrame(sheet.range(cellRng).value)
    frame.name = sheet.name
    # Single-step positional assignment: the chained form `iloc[0][0] = ...`
    # can end up writing to a temporary copy of the row.
    frame.iloc[0, 0] = sheet.name

    # Direct namespace assignment replaces the exec()-built statements, so the
    # sheet name is never evaluated as source code.
    globals()[sheet.name] = frame
    fields.append(frame)



# Clean up the dataframes to have correct cols and rows
#
# For every field: label the columns with the company names, move the first
# column into the index, drop the header row and turn "NA" markers into NaN.

# The company names are identical for every field, so they are read from Excel
# once instead of once per loop iteration.
companyNames = [str(i) for i in xlBook.sheets("all_companies").range(companyNamesRange).value]

for field in fields:
    print(field.name)
    # The top-left cell holds the sheet name; it labels the first column,
    # which becomes the index on the next lines.
    field.columns = [field.iloc[0, 0]] + companyNames
    field.index = field[field.columns[0]]
    del field[field.columns[0]]
    # The first row is the header row read from the sheet, not data.
    field.drop(field.index[0], inplace=True)
    field.replace("NA", np.nan, inplace=True)
    # NOTE: the original loop ended with
    #     field = field.apply(pd.to_numeric, errors='coerce')
    # which only rebound the loop variable: the converted copy was thrown away
    # and the frames in `fields` were left untouched. That discarded work is
    # omitted here. A real numeric conversion must NOT be applied at this
    # point, because later cells compare the first row against strings
    # ("Investment Trusts", the '$$ER' error prefix) and coercion would turn
    # those into NaN.


p
so
eps
mvtbv
dy
mv
dsindustry6
icbindustry2
fcf
opmarg
roe
roic
debtpct


# Useful Functions

In [4]:
def drop_from_fields(fieldName):
    """Rebind the global `fields` list without the entries named `fieldName`.

    Every element whose `.name` attribute equals `fieldName` is left out; the
    relative order of the remaining elements is kept. A new list is assigned
    to the global, the previous list object is not modified.
    """
    global fields
    remaining = []
    for candidate in fields:
        if candidate.name != fieldName:
            remaining.append(candidate)
    fields = remaining


# Data Cleaning

### Reshape Sectors

In [5]:
# The sector frames are aligned to the price frame's index and their first row
# is copied into every following row, so each row carries the same sector
# labels.
sectorFields = [dsindustry6, icbindustry2]

dsindustry6.index = p.index
dsindustry6[1:] = dsindustry6.iloc[0].values
print('dsindustry6 complete')

icbindustry2.index = p.index
try:
    icbindustry2[1:] = icbindustry2.iloc[0].values
except Exception:
    # Only ordinary errors trigger the fallback; a bare `except:` would also
    # swallow KeyboardInterrupt / SystemExit and make the cell uninterruptible.
    print('Running slow method for indgroup')
    # Take the first row once (as a copy, because row 0 is itself overwritten
    # inside the loop) instead of re-reading it on every iteration.
    sectorRow = icbindustry2.iloc[0].values.copy()
    for row in icbindustry2.index:
        icbindustry2.loc[row] = sectorRow
print('icbindustry2 complete')

dsindustry6 complete
indgroup complete


### Drop Last 10 Months (not much data)

In [6]:
# Remove every row from 2018-12-31 onwards (label-based slice, so the cutoff
# row itself is dropped too) from each field, in place.
# `pd.datetime` was only an alias of `datetime.datetime`; it is deprecated and
# no longer available in current pandas, so the stdlib class is used directly.
cutoff = dt.datetime(2018, 12, 31)
for field in fields:
    field.drop(list(field.loc[cutoff:].index), axis=0, inplace=True)

### Get Rid of Investment Trusts

In [8]:
# Columns whose first-row entry in dsindustry6 is "Investment Trusts".
sectorLabels = dsindustry6.iloc[0]
trusts = []
for stock in dsindustry6.columns:
    if sectorLabels[stock] == "Investment Trusts":
        trusts.append(stock)
print('There were {} investment trusts in this dataset'.format(len(trusts)))

# Drop trusts from the dataset as they are not what we are looking for
for frame in fields:
    frame.drop(trusts, axis=1, inplace=True)

There were 341 investment trusts in this dataset


### Remove series which don't have data for a field

In [9]:
# Collect every column whose first value in ANY field is a string starting
# with '$$ER', then remove those columns from ALL fields so the frames keep
# identical shapes.
errorCols = []
for field in fields:
    if field.empty:
        # Nothing to inspect; the original silently skipped this case as well.
        continue
    for col in field:
        firstValue = field[col].iloc[0]
        # Explicit type check instead of a bare `except: pass`: numbers and
        # NaN are simply not error markers, while genuine failures are no
        # longer hidden.
        if isinstance(firstValue, str) and firstValue[:4] == '$$ER':
            errorCols.append(col)

# De-duplicate while keeping first-seen order.
errorCols = list(dict.fromkeys(errorCols))

for field in fields:
    field.drop(errorCols, axis=1, inplace=True)

# Stacking only succeeds when every field has the same shape, so this print
# doubles as a sanity check.
print(np.stack(fields, axis=-1).shape)

(276, 1549, 13)


### Manipulating Fields

In [10]:
# Perform manipulations on some of the fields
# Replace mvtbv by its reciprocal, bvtmv (presumably market-to-book turned into
# book-to-market - confirm against the data source).
bvtmv = 1 / mvtbv
bvtmv.name = 'bvtmv'

# The original only removed mvtbv and never registered the reciprocal, so the
# ratio vanished from `fields` (and from the export) altogether. Like every
# other derived field in this notebook, bvtmv is now named and appended.
# Dropping 'bvtmv' first keeps the cell safe to re-run without duplicates.
drop_from_fields('mvtbv')
drop_from_fields('bvtmv')
fields.append(bvtmv)

# Data Calculation

### Returns

In [11]:
# Simple one-row return: each price divided by the price one row earlier,
# minus one. The first row has no predecessor and is therefore NaN.
r = (p / p.shift(periods=1)) - 1
r.name = 'r'

In [12]:
# Compounded return over trailing windows of 3 to 36 rows: the product of
# (1 + r) inside the window, minus one.
grossReturns = 1 + r

rollingReturns = []
for months in (3, 6, 9, 12, 18, 24, 36):
    compounded = grossReturns.rolling(window=months).apply(np.prod, raw=True) - 1
    compounded.name = 'ret_{}m'.format(months)
    rollingReturns.append(compounded)

# Keep the individual notebook-level names available.
ret_3m, ret_6m, ret_9m, ret_12m, ret_18m, ret_24m, ret_36m = rollingReturns

for frame in rollingReturns:
    print(frame.values.shape)

fields.extend(rollingReturns)

(276, 1549)
(276, 1549)
(276, 1549)
(276, 1549)
(276, 1549)
(276, 1549)
(276, 1549)


### Moving averages

In [13]:
# Calculate moving averages of the price over trailing windows of 3 to 36 rows.
movingAverages = []
for months in (3, 6, 9, 12, 18, 24, 36):
    average = p.rolling(window=months).mean()
    average.name = 'map_{}m'.format(months)
    movingAverages.append(average)

# Keep the individual notebook-level names available.
map_3m, map_6m, map_9m, map_12m, map_18m, map_24m, map_36m = movingAverages

# Technical indicators (crosses of MAVs for example)

for frame in movingAverages:
    print(frame.values.shape)

fields.extend(movingAverages)

(276, 1549)
(276, 1549)
(276, 1549)
(276, 1549)
(276, 1549)
(276, 1549)
(276, 1549)


### Volatilities

In [14]:
# Rolling standard deviation of the returns over trailing windows of 3 to 36
# rows, scaled by sqrt(12) (presumably annualising monthly data - confirm).
scale = 12 ** 0.5

volatilities = []
for months in (3, 6, 9, 12, 18, 24, 36):
    deviation = r.rolling(window=months).std() * scale
    deviation.name = 'std_{}m'.format(months)
    volatilities.append(deviation)

# Keep the individual notebook-level names available.
std_3m, std_6m, std_9m, std_12m, std_18m, std_24m, std_36m = volatilities

for frame in volatilities:
    print(frame.values.shape)

fields.extend(volatilities)

(276, 1549)
(276, 1549)
(276, 1549)
(276, 1549)
(276, 1549)
(276, 1549)
(276, 1549)


### Sector Dummy Variables

In [57]:
# Create list of sector dummies
#
# One 0/1 frame per distinct sector label found in the first row of
# icbindustry2. The last three distinct values are skipped.
# NOTE(review): the reason for `[:-3]` is not visible here (possibly
# missing/placeholder labels) - confirm before changing the data.
sectorDummies = []
for sector in list(icbindustry2.iloc[0].drop_duplicates())[:-3]:
    # Build an identifier-safe name: lower-case and keep only letters, digits
    # and underscores. For labels made of words and spaces this matches the
    # previous `lower().replace(' ', '')` exactly; labels containing other
    # characters such as '&' or '/' used to make the exec()-built assignment
    # fail with a SyntaxError.
    sectorname = "ind_" + "".join(ch for ch in sector.lower() if ch.isalnum() or ch == "_")

    dummy = (icbindustry2 == sector) * 1
    dummy.name = sectorname
    dummy.index.name = sectorname

    # Publish under the generated name without evaluating data as source code.
    globals()[sectorname] = dummy
    sectorDummies.append(dummy)

fields.extend(sectorDummies)


# Export Data

In [15]:
# Save rows and columns
def save_data():
    """Write the dataset to `dataFolder` as .npy files.

    Files written:
      * index.npy   - the index of the price frame `p`
      * columns.npy - the columns of the price frame `p`
      * fields.npy  - the `.name` of every entry in `fields`, in order
      * <name>.npy  - one file per entry in `fields`

    Reads the notebook-level names `dataFolder`, `p` and `fields`; returns
    nothing.
    """
    def _target(stem):
        # Every path goes through os.path.join, so the three metadata files no
        # longer rely on a hard-coded backslash separator while the per-field
        # files use os.path.join.
        return os.path.join(dataFolder, '{}.npy'.format(stem))

    np.save(file=_target('index'), arr=p.index)
    np.save(file=_target('columns'), arr=p.columns)
    np.save(file=_target('fields'), arr=[field.name for field in fields])

    for field in fields:
        np.save(file=_target(field.name), arr=field)

# Misc

In [None]:
'''
TO DO
    - Create dummy variables for each ICB industry
    - Now you should have all you need to create your first regression model
'''