In [2]:
import numpy as np
import pandas as pd

In [3]:
def check(dataframe):
  num_company = dataframe['isin'].nunique()
  num_rows = dataframe.shape[0]
  print(f'Number of companies: {num_company}')
  print(f'Number of rows: {num_rows}')

def get_num_finance(dataframe):
  return dataframe[['isin', 'is_finance']].drop_duplicates()['is_finance'].sum()

# Import original dataset before elimination

In [4]:
df = pd.read_csv('2.18 combined dataset of stock data and balance sheet data.csv')

# Change a problematic company name

In [5]:
df[df['company_name']=='ADMINISTRADORA DE FONDOS DE']['isin'].unique()

array(['CL0000003096', 'CLP7919K1035'], dtype=object)

After doing research we notice that they are two completely different companies with the same abbreviation, one is called 'Administradora de Fondos de Pensiones Capital S.A.' and the other is called 'Administradora de Fondos de Pensiones Provida S.A.'. Thus we modify the name:

In [6]:
df['company_name'] = np.where(df['isin']=='CL0000003096', 'ADMINISTRADORA DE FONDOS DE PENSIONES CAPITAL S.A.', df['company_name'])
df['company_name'] = np.where(df['isin']=='CLP7919K1035', 'ADMINISTRADORA DE FONDOS DE PENSIONES PROVIDA S.A.', df['company_name'])

# Check some basic info

In [7]:
check(df)

Number of companies: 267
Number of rows: 19224


In [8]:
get_num_finance(df)

43

In [9]:
df[df['isin']=='CL0000003096']['datadate'].duplicated().sum()

0

In [10]:
temp = df[df['isin']=='CL0000003096']['datadate']
temp.iloc[0]
temp[temp == '2006-03-31']

9792    2006-03-31
Name: datadate, dtype: object

# 1. Eliminate based on the 0.9 criteria

In [11]:
df[df['total_assets']/df['total_liabilities'] <= 0.9]

Unnamed: 0,datadate,key,fiscal_quarter,fiscal_year,total_assets,total_liabilities,isin,company_name,gic_industries,iso_country_code,industry_name,is_finance,Open,High,Low,Close,Volume
576,2006-03-31,349889.0,1,2006,5.22010,19.246000,ARHULI010014,HULYTEGO S.A.I.C.,151010.0,ARG,Chemicals,False,0.097576,0.097576,0.097576,0.097576,1.950000e+05
577,2006-06-30,349889.0,2,2006,5.22010,19.246000,ARHULI010014,HULYTEGO S.A.I.C.,151010.0,ARG,Chemicals,False,0.097576,0.097576,0.097576,0.097576,1.950000e+05
578,2006-09-30,349889.0,3,2006,5.22010,19.246000,ARHULI010014,HULYTEGO S.A.I.C.,151010.0,ARG,Chemicals,False,0.097576,0.097576,0.097576,0.097576,1.950000e+05
579,2006-12-31,349889.0,4,2006,5.22010,19.246000,ARHULI010014,HULYTEGO S.A.I.C.,151010.0,ARG,Chemicals,False,0.097576,0.097576,0.097576,0.097576,1.950000e+05
580,2007-03-31,349889.0,1,2007,5.34647,18.729918,ARHULI010014,HULYTEGO S.A.I.C.,151010.0,ARG,Chemicals,False,0.096288,0.096288,0.096288,0.096288,1.950000e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16308,2015-03-31,268567.0,1,2015,9322.76000,44900.472000,MX01UR000007,URBI DESARROLLOS URBANOS SA,252010.0,MEX,Household Durables,False,1884.532227,1884.532227,1884.532227,1884.532227,8.790134e+05
16309,2015-06-30,268567.0,2,2015,8540.12900,43951.996000,MX01UR000007,URBI DESARROLLOS URBANOS SA,252010.0,MEX,Household Durables,False,1884.532227,1884.532227,1884.532227,1884.532227,1.032365e+06
16310,2015-09-30,268567.0,3,2015,8470.32400,44264.144000,MX01UR000007,URBI DESARROLLOS URBANOS SA,252010.0,MEX,Household Durables,False,1884.532227,1884.532227,1884.532227,1884.532227,1.192210e+06
16311,2015-12-31,268567.0,4,2015,5613.41100,42636.795000,MX01UR000007,URBI DESARROLLOS URBANOS SA,252010.0,MEX,Household Durables,False,1884.532227,1884.532227,1884.528661,1884.528661,1.337262e+06


In [12]:
isin_to_be_dropped = list(df[df['total_assets']/df['total_liabilities'] <= 0.9]['isin'].unique())

In [13]:
df = df[~df['isin'].isin(isin_to_be_dropped)]

In [14]:
check(df)

Number of companies: 232
Number of rows: 16704


In [15]:
get_num_finance(df)

43

# 2. Eliminate the companies with correlation equal to -1, 1, or 0

## Functions from model

In [16]:
df['total_assets'].replace(0,0.01,inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['total_assets'].replace(0,0.01,inplace = True)


In [17]:
(df[df['company_name'] == df['company_name'].unique()[0]].shape[0])+1

73

In [18]:
len(df['company_name'].unique())

232

### Change fiscal_quarter

In [19]:
series_list = []
for i in range(1,(df[df['company_name'] == df['company_name'].unique()[0]].shape[0])+1):
  series_list.append(i)

In [20]:
temp = pd.DataFrame({'fiscal_quarter': series_list * (len(df['company_name'].unique()))})
df['fiscal_quarter'] = temp['fiscal_quarter']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fiscal_quarter'] = temp['fiscal_quarter']


In [21]:
len(df['company_name'].unique())

232

In [22]:
temp['fiscal_quarter'].shape

(16704,)

In [23]:
df["datadate"] = pd.to_datetime(df["datadate"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["datadate"] = pd.to_datetime(df["datadate"])


In [24]:
df.head(5)

Unnamed: 0,datadate,key,fiscal_quarter,fiscal_year,total_assets,total_liabilities,isin,company_name,gic_industries,iso_country_code,industry_name,is_finance,Open,High,Low,Close,Volume
0,2006-03-31,212935.0,1.0,2006,74.4419,27.491,ARAGRO010015,AGROMETAL SA,201060.0,ARG,Machinery,False,0.185858,0.202586,0.170246,0.185858,8522223.0
1,2006-06-30,212935.0,2.0,2006,77.1485,32.3303,ARAGRO010015,AGROMETAL SA,201060.0,ARG,Machinery,False,0.185858,0.187717,0.131959,0.141996,8754235.25
2,2006-09-30,212935.0,3.0,2006,89.3972,36.4684,ARAGRO010015,AGROMETAL SA,201060.0,ARG,Machinery,False,0.142368,0.148687,0.133818,0.146456,6738297.0
3,2006-12-31,212935.0,4.0,2006,77.2313,23.2864,ARAGRO010015,AGROMETAL SA,201060.0,ARG,Machinery,False,0.147943,0.174707,0.14497,0.16244,8879853.35
4,2007-03-31,212935.0,5.0,2007,83.2715,29.5663,ARAGRO010015,AGROMETAL SA,201060.0,ARG,Machinery,False,0.160571,0.179738,0.152226,0.158462,8679271.4


In [25]:
temp = df['datadate']
print(type(temp.iloc[0]))

<class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [26]:
### These are function producing the estimated parameters used in the Zhou(2001) model
def get_companies():
  return df['company_name'].unique()

def get_industries():
  return df['industry_name'].unique()

def get_date():
  return df['datadate'].unique()

def get_num_quarter():
  return len(df['fiscal_quarter'].value_counts())

def get_companies_from_industry(industry):
  return df[df['industry_name'] == industry]['company_name'].unique()

def get_companies_from_country(country):
  return df[df['iso_country_code'] == country]['company_name'].unique()

def get_Vi0(start,company1,company2):
  '''
  This function takes the a start date and two company names to return the starting value of assets
  Output: [v_company1_0,v_company_2_0]
  '''
  v_10 = df[(df['company_name'] == company1)&(df['datadate'] == start)]['total_assets'].item()
  v_20 = df[(df['company_name'] == company2)&(df['datadate'] == start)]['total_assets'].item()
  return v_10,v_20

def get_Ki(start,company1,company2):
  '''
  This function takes the a start date and two company names to return the starting value of Ki := short-term liability + 50% long-term liability
                                                                                                 = total liability -50% long-term liability
  Output: k_company1,k_company_2
  '''
  total_liability_1 = df.loc[(df['company_name'] == company1)&(df['datadate'] == start),['total_liabilities']].iloc[0,0]
  total_liability_2 = df.loc[(df['company_name'] == company2)&(df['datadate'] == start),['total_liabilities']].iloc[0,0]
  # long_term_liability_1 = df.loc[(df['company_name'] == company1)&(df['datadate'] == start),['long_term_liabilities']].iloc[0,0]
  # long_term_liability_2 = df.loc[(df['company_name'] == company2)&(df['datadate'] == start),['long_term_liabilities']].iloc[0,0]
  return  0.75*total_liability_1,0.75*total_liability_2

def get_t(start,end):
  '''
  This function takes the start date and end date of a time horizon and return t
  in Zhou(2001) formula
  Output: t(in years)
  '''
  start_idx = np.where(df['datadate'] == start)[0][0]
  end_idx = np.where(df['datadate'] == end)[0][0]
  return (end_idx - start_idx)/4

def get_sigma_i(company):
  '''
  This function returns the estimated asset volatility of a firm
  '''
  return 0.4

def get_rho(start,end,companies,column,company_1 = None,company_2 = None):
  '''
  This function takes the start date and end date of a time frame and returns the correlation matrix of asset value in a given time horizon
  Note companies is a numpy array containing the name of companies
  Output: A list object:[whole correlation matrix, corr(companyname1,companyname2)] if company_1 and company_2 specified
          the whole correlation matrix otherwise
  '''
  diff_matrix = np.zeros((int(get_t(start,end)*4),len(companies)))
  for i in [i for i in range(0,len(companies))]:
    col_val = np.diff(np.log(df[(df['datadate'] >= start) & (df['datadate'] <= end)&(df['company_name'] == companies[i])][column]))
    diff_matrix [:,i]= col_val
  diff_matrix = pd.DataFrame(diff_matrix)
  corr_df  = diff_matrix.corr()
  if company_1 == None and company_2 == None:
    return corr_df
  else:
    company_1_idx = companies.tolist().index(company_1)
    company_2_idx = companies.tolist().index(company_2)
    output = corr_df[company_1_idx][company_2_idx]
    return output

## Elimination: stage 1

In [29]:
get_rho(get_date()[0],get_date()[3],get_companies(),'Close')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,222,223,224,225,226,227,228,229,230,231
0,1.000000,-0.633200,0.677572,0.904816,0.989767,0.721817,0.835530,0.495626,-0.650678,0.901614,...,0.982783,0.674592,0.944351,0.904763,0.945420,0.183826,0.653276,,0.565793,0.986711
1,-0.633200,1.000000,0.140197,-0.243364,-0.516281,0.078610,-0.103794,0.358406,-0.175722,-0.236120,...,-0.765304,0.144200,-0.343368,-0.902549,-0.850847,0.644400,-0.999656,,-0.996450,-0.499024
2,0.677572,0.140197,1.000000,0.926237,0.775580,0.998081,0.970224,0.974593,-0.999353,0.929024,...,0.530020,0.999992,0.881786,0.299800,0.400939,0.847479,-0.114188,,-0.223054,0.788068
3,0.904816,-0.243364,0.926237,1.000000,0.956315,0.947803,0.989956,0.818275,-0.912078,0.999972,...,0.810565,0.924705,0.994527,0.637289,0.716682,0.584875,0.268711,,0.160844,0.961979
4,0.989767,-0.516281,0.775580,0.956315,1.000000,0.813185,0.905380,0.614486,-0.752372,0.954108,...,0.946363,0.773021,0.981624,0.834731,0.889250,0.322203,0.538558,,0.442349,0.999799
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227,0.183826,0.644400,0.847479,0.584875,0.322203,0.812979,0.693674,0.944844,-0.866025,0.590911,...,-0.000954,0.849619,0.496930,-0.252338,-0.146508,1.000000,-0.624128,,-0.706488,0.341100
228,0.653276,-0.999656,-0.114188,0.268711,0.538558,-0.052444,0.129836,-0.333805,0.149850,0.261517,...,0.781917,-0.118205,0.367876,0.913529,0.864331,-0.624128,1.000000,,0.993900,0.521573
229,,,,,,,,,,,...,,,,,,,,,,
230,0.565793,-0.996450,-0.223054,0.160844,0.442349,-0.162256,0.019694,-0.435727,0.257975,0.153476,...,0.708399,-0.226994,0.263081,0.863096,0.803594,-0.706488,0.993900,,1.000000,0.424297


In [31]:
temp = get_rho(get_date()[0],get_date()[3],get_companies(),'Close')
series = np.sum(temp.isnull(), axis = 0)==232
problematic_list = list(series.index[series])

problematic_name = []
for i in problematic_list:
  problematic_name.append(get_companies()[i])
problematic_name

['LEYDEN ARGENTINA SA LEID',
 'COMPANHIA CATARINENSE DE AG',
 'COMPANHIA BRASILEIRA DE DIST',
 'SUZANO SA',
 'ULTRAPAR PARTICIPACOES SA',
 'ADMINISTRADORA DE FONDOS DE PENSIONES CAPITAL S.A.',
 'VINEDOS EMILIANA SA',
 'CIA INVERSIONES LA ESPANOLA',
 'FRUTICOLA VICONTO SA',
 'ADMINISTRADORA DE FONDOS DE PENSIONES PROVIDA S.A.',
 'QUILICURA SA',
 'IMPULSORA DEL DESARROLLO ECO',
 'INTERAMERICANA ENTRTENMIENTO',
 'ARC MINERALS LTD']

We decide to remove them all, as they have difference in stock price = 0, which makes no sense.

In [32]:
df = df[~df['company_name'].isin(problematic_name)]

In [33]:
check(df)

Number of companies: 218
Number of rows: 15696


In [34]:
get_num_finance(df)

40

## Elimination: stage 2

In [35]:
get_rho(get_date()[0],get_date()[3],get_companies(),'Close')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,208,209,210,211,212,213,214,215,216,217
0,1.000000,-0.633200,0.677572,0.904816,0.989767,0.721817,0.835530,0.495626,-0.650678,0.901614,...,-0.315740,0.982783,0.674592,0.944351,0.904763,0.945420,0.183826,0.653276,0.565793,0.986711
1,-0.633200,1.000000,0.140197,-0.243364,-0.516281,0.078610,-0.103794,0.358406,-0.175722,-0.236120,...,0.934322,-0.765304,0.144200,-0.343368,-0.902549,-0.850847,0.644400,-0.999656,-0.996450,-0.499024
2,0.677572,0.140197,1.000000,0.926237,0.775580,0.998081,0.970224,0.974593,-0.999353,0.929024,...,0.483899,0.530020,0.999992,0.881786,0.299800,0.400939,0.847479,-0.114188,-0.223054,0.788068
3,0.904816,-0.243364,0.926237,1.000000,0.956315,0.947803,0.989956,0.818275,-0.912078,0.999972,...,0.118335,0.810565,0.924705,0.994527,0.637289,0.716682,0.584875,0.268711,0.160844,0.961979
4,0.989767,-0.516281,0.775580,0.956315,1.000000,0.813185,0.905380,0.614486,-0.752372,0.954108,...,-0.177118,0.946363,0.773021,0.981624,0.834731,0.889250,0.322203,0.538558,0.442349,0.999799
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213,0.945420,-0.850847,0.400939,0.716682,0.889250,0.456903,0.610888,0.185559,-0.367726,0.711459,...,-0.607691,0.989349,0.397230,0.785623,0.994167,1.000000,-0.146508,0.864331,0.803594,0.879910
214,0.183826,0.644400,0.847479,0.584875,0.322203,0.812979,0.693674,0.944844,-0.866025,0.590911,...,0.874635,-0.000954,0.849619,0.496930,-0.252338,-0.146508,1.000000,-0.624128,-0.706488,0.341100
215,0.653276,-0.999656,-0.114188,0.268711,0.538558,-0.052444,0.129836,-0.333805,0.149850,0.261517,...,-0.924655,0.781917,-0.118205,0.367876,0.913529,0.864331,-0.624128,1.000000,0.993900,0.521573
216,0.565793,-0.996450,-0.223054,0.160844,0.442349,-0.162256,0.019694,-0.435727,0.257975,0.153476,...,-0.961012,0.708399,-0.226994,0.263081,0.863096,0.803594,-0.706488,0.993900,1.000000,0.424297


In [43]:
# Check correlation == 1:
temp = get_rho(get_date()[0],get_date()[3],get_companies(),'Close')
series_1 = np.sum(temp==1, axis = 0)
cov1_list = list(series_1[series_1 != 1].index)
cov1_list

[8, 44, 59, 61, 66, 69, 71, 81, 98, 99, 116, 121, 132, 153, 176, 180]

In [47]:
# Check correlation == 0:
series_0 = np.sum(temp==0, axis = 0)
cov0_list = list(series_0[series_0 != 0].index)
cov0_list

[105, 142, 214]

In [48]:
# Check correlation == -1:
series_neg1 = np.sum(temp==-1, axis = 0)
covneg1_list = list(series_neg1[series_0 != 0].index)
covneg1_list

[105, 142, 214]

In [51]:
problematic_cov_list = cov1_list + cov0_list

In [53]:
problematic_name_2 = []

for i in problematic_cov_list:
  problematic_name_2.append(get_companies()[i])

problematic_name_2

['INSUMOS AGROQUIMIC SA',
 'CYRELA BRAZIL REALTY SA',
 'LPS BRASIL-CONSULTORIA DE',
 'M DIAS BRANCO SA IND',
 'PROFARMA DISTRIBUIDORA DE',
 'POSITIVO TECNOLOGIA S.A',
 'RAIA DROGASIL SA',
 'SYN PROP & TECH S.A.',
 'PAZ CORP SA',
 'SONDA S.A.',
 'INVERSIONES UNESPA SA',
 'GRUPO EMPRESAS NAVIERAS SA',
 'MARBELLA COUNTRY CLUB',
 'CLINICA DE MARLY SA',
 'GRUPO AEROPORTUARIO DEL CENT',
 'CASA DE BOLSA FINAMEX SAB DE',
 'AGRICOLA NACIONAL SACEI',
 'HIPODROMO CHILE SA',
 'BANCO NACIONAL DE CREDITO CA']

We decide to remove them all.

In [54]:
df = df[~df['company_name'].isin(problematic_name_2)]

In [55]:
check(df)

Number of companies: 199
Number of rows: 14328


In [56]:
get_num_finance(df)

38

# Export the new dataset

In [None]:
df

In [59]:
#df.to_csv('2.22 combined dataset of stock data and balance sheet data.csv', index = False)