In [1]:
import pandas as pd
import numpy as np

In [2]:
def check_null(df):
  """Count the missing (null/NaN) cells in every column of ``df``.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to inspect.

  Returns
  -------
  pandas.Series
      Indexed by the column labels of ``df``; each value is that column's null count.
  """
  null_mask = df.isnull()
  return null_mask.sum(axis = 0)
def check_zero(df):
  """Count the cells equal to zero in every column of ``df``.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to inspect.

  Returns
  -------
  pandas.Series
      Indexed by the column labels of ``df``; each value is that column's zero count.
  """
  zero_mask = df.eq(0)
  return zero_mask.sum(axis = 0)

In [3]:
stock_price = pd.read_csv('2.18 stock price.csv')

In [4]:
stock_price.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Ticker,Exchange,ISIN
0,2020-11-12,20.205654,21.898428,19.81426,20.450274,20.450274,2124310.0,RRRP3.SA,BOVESPA,BRRRRPACNOR5
1,2020-11-13,20.548122,20.890591,20.352427,20.352427,20.352427,480335.0,RRRP3.SA,BOVESPA,BRRRRPACNOR5
2,2020-11-16,20.694895,20.792744,20.518768,20.548122,20.548122,179666.0,RRRP3.SA,BOVESPA,BRRRRPACNOR5
3,2020-11-17,20.489412,20.538338,20.205654,20.538338,20.538338,262242.0,RRRP3.SA,BOVESPA,BRRRRPACNOR5
4,2020-11-18,20.538338,20.616615,20.450274,20.450274,20.450274,98724.0,RRRP3.SA,BOVESPA,BRRRRPACNOR5


In [5]:
stock_price.shape

(2022145, 10)

In [6]:
check_null(stock_price)

Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
Ticker       0
Exchange     0
ISIN         0
dtype: int64

In [7]:
check_zero(stock_price)

Date              0
Open            665
High            504
Low             507
Close           506
Adj Close       506
Volume       542959
Ticker            0
Exchange          0
ISIN              0
dtype: int64

In [8]:
stock_price[stock_price['Open']==0].head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Ticker,Exchange,ISIN
49723,2011-03-24,0.0,13.793078,13.476069,13.718165,13.099489,5758858.0,ALFAA.MX,BMV,MXP000511016
49724,2011-03-25,0.0,14.068976,13.721819,13.952952,13.323687,5757632.0,ALFAA.MX,BMV,MXP000511016
49731,2011-04-05,0.0,15.576366,14.982546,15.256617,14.568557,8592664.0,ALFAA.MX,BMV,MXP000511016
49753,2011-05-09,0.0,15.497799,15.201803,15.363504,14.845777,2923126.0,ALFAA.MX,BMV,MXP000511016
49758,2011-05-16,0.0,15.072989,14.692944,14.903979,14.401737,10245072.0,ALFAA.MX,BMV,MXP000511016


In [9]:
# Zeros in the price/volume fields are converted to NaN so that the null checks
# (and the later interpolation) see them as missing values.
numeric_fields = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
zero_to_nan = {
    field: np.where(stock_price[field] == 0, np.nan, stock_price[field])
    for field in numeric_fields
}
stock_price = stock_price.assign(**zero_to_nan)

In [10]:
check_null(stock_price)

Date              0
Open            665
High            504
Low             507
Close           506
Adj Close       506
Volume       542959
Ticker            0
Exchange          0
ISIN              0
dtype: int64

In [11]:
# 55837 is hard-coded here but only derived further down: it is the number of
# (ISIN, date) rows missing from the full date grid
# (1253565 expected rows - 1197728 rows in stock_price_2006).
# Adding it to each per-column null count estimates the nulls after those rows are added.
# NOTE(review): the null counts come from the full `stock_price` frame, while 55837 is
# derived from the 2006-ISIN subset -- confirm that mixing the two is intended.
check_null(stock_price) + 55837

Date          55837
Open          56502
High          56341
Low           56344
Close         56343
Adj Close     56343
Volume       598796
Ticker        55837
Exchange      55837
ISIN          55837
dtype: int64

In [12]:
stock_price['Date'] = pd.to_datetime(stock_price['Date'])

In [13]:
stock_price['Year'] = stock_price['Date'].dt.strftime('%Y')

In [14]:
stock_price = stock_price[stock_price['Year'] != '2024']

In [15]:
stock_price.shape[0]

2002896

In [16]:
# Distinct ISINs that have at least one price row dated in 2006
# ('Year' holds the year as a string, hence the string comparison).
has_2006_row = stock_price['Year'] == '2006'
isin_2006 = stock_price.loc[has_2006_row, 'ISIN'].unique().tolist()
len(isin_2006)

267

In [17]:
stock_price_2006 = stock_price[stock_price['ISIN'].isin(isin_2006)]

In [18]:
stock_price_2006.shape[0]

1197728

In [19]:
check_null(stock_price_2006)

Date              0
Open            502
High            504
Low             507
Close           506
Adj Close       506
Volume       367841
Ticker            0
Exchange          0
ISIN              0
Year              0
dtype: int64

# Try to get all the dates

In [20]:
correct_datetime = pd.DataFrame({'datadate': list(np.sort(stock_price['Date'].unique()))})

In [21]:
correct_datetime.head()

Unnamed: 0,datadate
0,2006-01-02
1,2006-01-03
2,2006-01-04
3,2006-01-05
4,2006-01-06


In [22]:
correct_datetime.shape[0]

4695

In [23]:
len(isin_2006)

267

In [24]:
# find the expected length of the dataframe
expected_length = correct_datetime.shape[0] * len(isin_2006)
expected_length

1253565

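Expected 1,253,565 rows (4,695 dates × 267 ISINs) versus 1,197,728 rows actually present: 1,253,565 − 1,197,728 = 55,837 missing (ISIN, date) rows, which have to be filled by interpolation.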

# Time interpolation

In [25]:
stock_price_2006.head(1)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Ticker,Exchange,ISIN,Year
809,2006-01-02,1.867705,1.867705,1.867705,1.867705,1.867705,,ACCELSAB.MX,BMV,MXP000171316,2006


In [26]:
# Move the dates into the index so the calendar can be outer-joined onto each
# company's Date-indexed rows. The guard makes re-running this cell a no-op;
# an unconditional set_index raises KeyError once 'datadate' is already the index.
if 'datadate' in correct_datetime.columns:
    correct_datetime = correct_datetime.set_index('datadate')

In [27]:
# Daily rows of the first 2006 company, indexed by date.
is_first_company = stock_price_2006['ISIN'] == isin_2006[0]
first_company_info = stock_price_2006.loc[is_first_company].set_index('Date')

# Outer join against the date-indexed calendar: every calendar date the company
# lacks becomes a row of NaNs. reset_index() exposes the dates as column 'index'.
stock_price_adjusted = first_company_info.join(correct_datetime, how = 'outer').reset_index()

# Broadcast the company's identifiers to the newly added rows.
# .item() requires exactly one distinct non-null value per column.
for column in ['Ticker', 'Exchange', 'ISIN']:
    known_values = stock_price_adjusted[column].dropna().unique()
    stock_price_adjusted[column] = known_values.item()

# interpolation: method='time' works on the date index; limit_direction='both'
# also fills gaps at the start and the end of the series.
stock_price_adjusted = stock_price_adjusted.set_index('index')
for column in ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']:
    filled = stock_price_adjusted[column].interpolate(method='time', limit_direction='both')
    stock_price_adjusted[column] = filled
stock_price_adjusted = stock_price_adjusted.reset_index(drop = False)

In [28]:
stock_price_adjusted.shape[0]

4695

In [None]:
def _fill_company_calendar(company_isin):
    """Return one company's daily rows expanded to every date in ``correct_datetime``.

    Reads the notebook-level frames ``stock_price_2006`` and ``correct_datetime``
    (the latter must already be indexed by date).

    Parameters
    ----------
    company_isin : str
        ISIN of the company to process.

    Returns
    -------
    pandas.DataFrame
        One row per calendar date; the dates are in column 'index'. Price and
        volume gaps are filled by time-based interpolation in both directions.

    Raises
    ------
    ValueError
        If Ticker, Exchange or ISIN does not hold exactly one distinct non-null
        value for the company (raised by ``.item()``).
    """
    company = stock_price_2006[stock_price_2006['ISIN'] == company_isin].set_index('Date')
    # The outer join adds a row of NaNs for every calendar date the company lacks.
    company = company.join(correct_datetime, how = 'outer').reset_index()

    for column in ['Ticker', 'Exchange', 'ISIN']:
        company[column] = company[column].dropna().unique().item()

    # interpolation: method='time' needs the dates as the index
    company = company.set_index('index')
    for column in ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']:
        company[column] = company[column].interpolate(method='time', limit_direction='both')
    return company.reset_index(drop = False)


# Build every company's frame first and concatenate once: concatenating inside the
# loop re-copies the accumulated frame on each iteration. Iterating over all of
# isin_2006 (first company included) rebuilds the same result from scratch, so
# re-running this cell no longer appends the companies a second time.
stock_price_adjusted = pd.concat(
    [_fill_company_calendar(isin) for isin in isin_2006], axis = 0
)

print(stock_price_adjusted.shape[0])

1253565


In [None]:
# After reset_index() the date column is called 'index'; rename just that column.
# Renaming by label (rather than assigning a full positional list of names) cannot
# mislabel data if the column order ever changes, and is a no-op when re-run.
stock_price_adjusted = stock_price_adjusted.rename(columns = {'index': 'Date'})

In [None]:
# Rows added by the outer join against the calendar have no 'Year' value (only the
# identifier and price/volume columns were filled), so recompute 'Year' for every
# row from 'Date'. The year is stored as a 4-digit string.
stock_price_adjusted['Year'] = stock_price_adjusted['Date'].dt.strftime('%Y')

In [None]:
check_null(stock_price_adjusted)

Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
Ticker       0
Exchange     0
ISIN         0
Year         0
dtype: int64

# All prices to USD

In [None]:
stock_price_adjusted.head(1)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Ticker,Exchange,ISIN,Year
0,2006-01-02,1.867705,1.867705,1.867705,1.867705,1.867705,10172.0,ACCELSAB.MX,BMV,MXP000171316,2006


In [None]:
stock_price_adjusted.shape

(1253565, 11)

## Load exchange rate dataset

In [None]:
exchange_rate = pd.read_excel('exchange rates.xlsx')

In [None]:
exchange_rate.head(2)

Unnamed: 0,Country,Country Code,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,United States,USA,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,United Kingdom,GBR,0.543349,0.49977,0.545602,0.640837,0.647392,0.62366,0.630878,0.639537,0.607349,0.65404,0.740701,0.77623,0.749078,0.783585,0.779495,0.726696,0.811149,0.806456


In [None]:
# Row position in the exchange-rate table used for each stock exchange.
# Row 0 is the United States and row 1 the United Kingdom in the loaded table;
# NOTE(review): the remaining positions depend on the row order of
# 'exchange rates.xlsx' -- verify them against the file.
_EXCHANGE_RATE_ROW = {
    'NYSE': 0, 'CCSE': 0,
    'AIM': 1, 'LSE': 1, 'OFEX': 1,
    'TWSE': 2, 'TPEX': 2,
    'KOSDAQ': 3,
    'SNSE': 4,
    'BMV': 5,
    'BOVESPA': 6,
    'BVC': 7,
    'BASE': 8,
    'SEHK': 9,
    'ASX': 10,
    'TTSE': 11,
    'JMSE': 12,
    'ENXTAM': 13,
}


def get_exchange_rate(df, rates=None):
    """Return the yearly exchange rate that applies to one row of price data.

    Parameters
    ----------
    df : mapping or pandas.Series
        A single row; must provide 'Exchange' (exchange code) and 'Year'
        (anything ``float()`` accepts, e.g. the string '2006').
    rates : pandas.DataFrame, optional
        Exchange-rate table with one row per currency and one column per year.
        Defaults to the notebook-level ``exchange_rate`` frame.

    Returns
    -------
    The value stored in ``rates`` at the row mapped to the exchange and the
    column labelled with the year.

    Raises
    ------
    ValueError
        If the exchange code has no mapping. Previously such rows silently fell
        back to row 0 and were treated as if they were already in USD.
    KeyError
        If the year is not a column of ``rates``.
    """
    if rates is None:
        rates = exchange_rate

    exchange = df['Exchange']
    if exchange not in _EXCHANGE_RATE_ROW:
        raise ValueError(f"No exchange-rate row is mapped for exchange {exchange!r}")

    # The year is looked up as a float label, exactly as before.
    return rates.loc[_EXCHANGE_RATE_ROW[exchange], float(df['Year'])]

In [None]:
# Row-wise lookup (axis = 1): get_exchange_rate is called once per row and uses that
# row's 'Exchange' and 'Year' to pick the rate stored in the new 'exchange_rate' column.
stock_price_adjusted['exchange_rate'] = stock_price_adjusted.apply(get_exchange_rate, axis = 1)

In [None]:
# Divide every price column by the row's exchange rate (Volume is left untouched).
# NOTE: the price columns are overwritten in place, so running this cell a second
# time divides the already-converted prices again.
price_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close']
stock_price_adjusted[price_columns] = stock_price_adjusted[price_columns].div(
    stock_price_adjusted['exchange_rate'], axis = 0
)

In [None]:
stock_price_adjusted.shape

(1253565, 12)

In [None]:
stock_price_adjusted.head(1)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Ticker,Exchange,ISIN,Year,exchange_rate
0,2006-01-02,0.171269,0.171269,0.171269,0.171269,0.171269,10172.0,ACCELSAB.MX,BMV,MXP000171316,2006,10.905087


# Check how many of the 2006 companies are in financial industry

In [None]:
balance_sheet_df = pd.read_csv('2.18 interpolated balance sheet.csv')
balance_sheet_df.head(1)

Unnamed: 0,datadate,key,fiscal_quarter,fiscal_year,total_assets,total_liabilities,isin,company_name,gic_industries,iso_country_code,industry_name,is_finance
0,2006-03-31,18075.0,1,2006,1223.955,1044.091,BRAZULACNPR4,AZUL SA,203020.0,BRA,Passenger Airlines,False


In [None]:
temp = balance_sheet_df[balance_sheet_df['isin'].isin(isin_2006)]

In [None]:
def get_num_finance(df):
  """Count the distinct (isin, is_finance) pairs in ``df`` whose flag is True.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the columns 'isin' and 'is_finance'.

  Returns
  -------
  Sum of 'is_finance' over the de-duplicated (isin, is_finance) pairs.
  """
  unique_pairs = df.loc[:, ['isin', 'is_finance']].drop_duplicates()
  finance_flags = unique_pairs['is_finance']
  return finance_flags.sum()

get_num_finance(temp)

43

# Get quarterly stock price data

In [None]:
q_stock_price = stock_price_adjusted[stock_price_adjusted['ISIN']==isin_2006[0]]

In [None]:
q_stock_price.head(1)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Ticker,Exchange,ISIN,Year,exchange_rate
0,2006-01-02,0.171269,0.171269,0.171269,0.171269,0.171269,10172.0,ACCELSAB.MX,BMV,MXP000171316,2006,10.905087


In [None]:
# How each daily column is collapsed into one quarterly bar.
logic = {'Open'  : 'first',
         'High'  : 'max',
         'Low'   : 'min',
         'Close' : 'last',
         'Volume': 'sum'}

# Read the daily rows straight from stock_price_adjusted (instead of overwriting the
# cell's own input) so the cell can be re-run, and select the needed columns by name
# rather than by position.
# NOTE(review): newer pandas releases deprecate the 'Q' alias in favour of 'QE' --
# confirm against the installed version before changing it.
first_isin = isin_2006[0]
first_company_daily = stock_price_adjusted.loc[
    stock_price_adjusted['ISIN'] == first_isin, ['Date'] + list(logic)
]
q_stock_price = first_company_daily.resample('Q', on='Date').agg(logic).reset_index()
q_stock_price['ISIN'] = first_isin

In [None]:
q_stock_price.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,ISIN
0,2006-03-31,0.171269,0.234368,0.171269,0.234368,1416258.0,MXP000171316
1,2006-06-30,0.234368,0.252397,0.234368,0.252397,1444249.0,MXP000171316
2,2006-09-30,0.252397,0.252397,0.224903,0.224903,200327.4,MXP000171316
3,2006-12-31,0.224903,0.224903,0.220847,0.220847,47449.78,MXP000171316
4,2007-03-31,0.22035,0.26532,0.206859,0.26532,2747720.0,MXP000171316


In [None]:
q_stock_price.shape

(72, 7)

In [None]:
def _quarterly_bars(company_isin):
  """Aggregate one company's daily rows into quarterly bars.

  Reads the notebook-level ``stock_price_adjusted`` frame and the ``logic``
  aggregation mapping.

  Parameters
  ----------
  company_isin : str
      ISIN of the company to aggregate.

  Returns
  -------
  pandas.DataFrame
      Columns 'Date' (quarter end), the keys of ``logic`` and 'ISIN'.
  """
  daily = stock_price_adjusted.loc[
      stock_price_adjusted['ISIN'] == company_isin, ['Date'] + list(logic)
  ]
  quarterly = daily.resample('Q', on='Date').agg(logic).reset_index()
  quarterly['ISIN'] = company_isin
  return quarterly


# Aggregate every company (first one included) and concatenate once. This gives the
# same frame as appending company by company, without re-copying the accumulated
# frame on each iteration and without duplicating rows when the cell is re-run.
q_stock_price = pd.concat([_quarterly_bars(isin) for isin in isin_2006], axis = 0)

In [None]:
q_stock_price.shape

(19224, 7)

In [None]:
q_stock_price.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,ISIN
0,2006-03-31,0.171269,0.234368,0.171269,0.234368,1416258.0,MXP000171316
1,2006-06-30,0.234368,0.252397,0.234368,0.252397,1444249.0,MXP000171316
2,2006-09-30,0.252397,0.252397,0.224903,0.224903,200327.4,MXP000171316
3,2006-12-31,0.224903,0.224903,0.220847,0.220847,47449.78,MXP000171316
4,2007-03-31,0.22035,0.26532,0.206859,0.26532,2747720.0,MXP000171316


# Export the stock data

In [None]:
#q_stock_price.to_csv('2.18 quarterly stock price.csv', index = False)