In [None]:
import numpy as np
import pandas as pd

# Load the two datasets

In [None]:
# Read the two input CSVs; both paths are relative, so the files must sit in the
# notebook's working directory.
stock_price = pd.read_csv('2.18 quarterly stock price.csv')
balance_sheet = pd.read_csv('2.18 interpolated balance sheet.csv')

In [None]:
# Show just the first row to see which columns the stock-price data has.
stock_price.head(1)

Unnamed: 0,Date,Open,High,Low,Close,Volume,ISIN
0,2006-03-31,0.171269,0.234368,0.171269,0.234368,1416258.0,MXP000171316


In [None]:
# Show just the first row to see which columns the balance-sheet data has.
balance_sheet.head(1)

Unnamed: 0,datadate,key,fiscal_quarter,fiscal_year,total_assets,total_liabilities,isin,company_name,gic_industries,iso_country_code,industry_name,is_finance
0,2006-03-31,18075.0,1,2006,1223.955,1044.091,BRAZULACNPR4,AZUL SA,203020.0,BRA,Passenger Airlines,False


In [None]:
# (rows, columns) of each raw frame, stock prices first, one per line.
print(stock_price.shape, balance_sheet.shape, sep='\n')

(19224, 7)
(48960, 12)


# Keep only the companies in stock_price

In [None]:
# Distinct ISINs that appear in the stock-price data, as a plain Python list.
isin_list = stock_price['ISIN'].unique().tolist()

In [None]:
# Number of distinct ISINs found in stock_price.
len(isin_list)

267

In [None]:
# Keep only balance-sheet rows whose ISIN also occurs in the stock-price data.
balance_sheet = balance_sheet.loc[lambda frame: frame['isin'].isin(isin_list)]

In [None]:
# (rows, columns) after filtering; the row count can be compared with stock_price's.
balance_sheet.shape

(19224, 12)

# The row indices of the two frames do not correspond, so we cannot merge on the index. Instead, sort both by (ISIN, date) so the rows line up:

In [None]:
# Order stock prices by company, then by date, and renumber rows 0..n-1.
sorted_stock_price = stock_price.sort_values(['ISIN', 'Date'], ignore_index=True)
sorted_stock_price.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,ISIN
0,2006-03-31,0.185858,0.202586,0.170246,0.185858,8522223.0,ARAGRO010015
1,2006-06-30,0.185858,0.187717,0.131959,0.141996,8754235.25,ARAGRO010015
2,2006-09-30,0.142368,0.148687,0.133818,0.146456,6738297.0,ARAGRO010015
3,2006-12-31,0.147943,0.174707,0.14497,0.16244,8879853.35,ARAGRO010015
4,2007-03-31,0.160571,0.179738,0.152226,0.158462,8679271.4,ARAGRO010015


In [None]:
# Order the balance sheet by company, then by date, and renumber rows 0..n-1.
sorted_balance_sheet = balance_sheet.sort_values(['isin', 'datadate'], ignore_index=True)
sorted_balance_sheet.head()

Unnamed: 0,datadate,key,fiscal_quarter,fiscal_year,total_assets,total_liabilities,isin,company_name,gic_industries,iso_country_code,industry_name,is_finance
0,2006-03-31,212935.0,1,2006,74.4419,27.491,ARAGRO010015,AGROMETAL SA,201060.0,ARG,Machinery,False
1,2006-06-30,212935.0,2,2006,77.1485,32.3303,ARAGRO010015,AGROMETAL SA,201060.0,ARG,Machinery,False
2,2006-09-30,212935.0,3,2006,89.3972,36.4684,ARAGRO010015,AGROMETAL SA,201060.0,ARG,Machinery,False
3,2006-12-31,212935.0,4,2006,77.2313,23.2864,ARAGRO010015,AGROMETAL SA,201060.0,ARG,Machinery,False
4,2007-03-31,212935.0,1,2007,83.2715,29.5663,ARAGRO010015,AGROMETAL SA,201060.0,ARG,Machinery,False


## Check that the two sorted frames have the same (date, ISIN) key in every row:

In [None]:
# Key columns of the sorted stock prices, relabelled to the balance-sheet
# column names so the two frames can be compared element-wise.
temp = sorted_stock_price[['Date', 'ISIN']].rename(
    columns={'Date': 'datadate', 'ISIN': 'isin'}
)

In [None]:
# Per key column, count the rows where the sorted stock-price keys differ from
# the sorted balance-sheet keys. Reduce with DataFrame.sum(axis=0) rather than
# np.sum(df): np.sum without an axis goes through pandas' deprecated axis=None
# reduction, which is slated to return a single scalar in a future version.
key_mismatches = (temp != sorted_balance_sheet[['datadate', 'isin']]).sum(axis=0)
# The positional concat further down is only valid if every row lines up, so
# fail loudly instead of relying on someone reading the displayed zeros.
assert not key_mismatches.any(), f'Row keys are misaligned:\n{key_mismatches}'
key_mismatches

datadate    0
isin        0
dtype: int64

# Merge the two datasets

In [None]:
# Stock-price columns without the keys; the balance sheet already carries the
# date and ISIN, so they would be duplicated after the concat.
temp = sorted_stock_price.drop(['Date', 'ISIN'], axis='columns')

In [None]:
# Place the price columns to the right of the balance-sheet columns, matching
# rows by their index labels.
combined_df = pd.concat(objs=[sorted_balance_sheet, temp], axis='columns')

In [None]:
# Preview the first rows of the combined frame.
combined_df.head()

Unnamed: 0,datadate,key,fiscal_quarter,fiscal_year,total_assets,total_liabilities,isin,company_name,gic_industries,iso_country_code,industry_name,is_finance,Open,High,Low,Close,Volume
0,2006-03-31,212935.0,1,2006,74.4419,27.491,ARAGRO010015,AGROMETAL SA,201060.0,ARG,Machinery,False,0.185858,0.202586,0.170246,0.185858,8522223.0
1,2006-06-30,212935.0,2,2006,77.1485,32.3303,ARAGRO010015,AGROMETAL SA,201060.0,ARG,Machinery,False,0.185858,0.187717,0.131959,0.141996,8754235.25
2,2006-09-30,212935.0,3,2006,89.3972,36.4684,ARAGRO010015,AGROMETAL SA,201060.0,ARG,Machinery,False,0.142368,0.148687,0.133818,0.146456,6738297.0
3,2006-12-31,212935.0,4,2006,77.2313,23.2864,ARAGRO010015,AGROMETAL SA,201060.0,ARG,Machinery,False,0.147943,0.174707,0.14497,0.16244,8879853.35
4,2007-03-31,212935.0,1,2007,83.2715,29.5663,ARAGRO010015,AGROMETAL SA,201060.0,ARG,Machinery,False,0.160571,0.179738,0.152226,0.158462,8679271.4


## Check that there are no null values:

In [None]:
# Number of missing values in each column of the combined frame.
combined_df.isna().sum(axis=0)

datadate             0
key                  0
fiscal_quarter       0
fiscal_year          0
total_assets         0
total_liabilities    0
isin                 0
company_name         0
gic_industries       0
iso_country_code     0
industry_name        0
is_finance           0
Open                 0
High                 0
Low                  0
Close                0
Volume               0
dtype: int64

# Check the number of firms and number of financial firms in the combined dataset

In [None]:
def check(df):
  """Print the number of distinct companies and the number of rows in ``df``.

  Companies are counted as distinct values of the ``'isin'`` column.
  Nothing is returned; the two figures are written to stdout.
  """
  row_count = len(df)
  company_count = df['isin'].nunique()
  print(f'Number of companies: {company_count}')
  print(f'Number of rows: {row_count}')

def get_num_finance(df):
  """Print how many firms in ``df`` are flagged as financial.

  Rows are first reduced to distinct (``'isin'``, ``'is_finance'``) pairs so
  that a firm's repeated rows are counted once; the ``'is_finance'`` flags of
  those pairs are then summed. Nothing is returned.
  """
  firm_flags = df.loc[:, ['isin', 'is_finance']].drop_duplicates()
  finance_count = firm_flags['is_finance'].sum()
  print(f'Number of financial firms: {finance_count}')

In [None]:
# Print the company count and row count of the combined frame.
check(combined_df)

Number of companies: 267
Number of rows: 19224


In [None]:
# Print how many of the combined frame's firms are flagged is_finance.
get_num_finance(combined_df)

Number of financial firms: 43


# Export the combined dataset

In [None]:
# NOTE(review): the export is disabled, yet a later cell reads this exact file with
# pd.read_csv, so that cell only works if the CSV already exists on disk. Re-enable
# the line below whenever the combined dataset needs to be regenerated.
#combined_df.to_csv('2.18 combined dataset of stock data and balance sheet data.csv', index = False)

# Load the exported dataset back to check it

In [None]:
# Read the combined CSV back from disk; this cell does not create the file itself.
example_df = pd.read_csv('2.18 combined dataset of stock data and balance sheet data.csv')

In [None]:
# (rows, columns) of the reloaded frame, for comparison with combined_df.
example_df.shape

(19224, 17)