Notebook for filtering the accounts and indicators that will be used for selecting stocks for the Magic Formula

In [1]:
import pandas as pd
# Keep rendered frames compact: truncate long cell text at 30 characters and
# show at most 4 rows per displayed DataFrame.
pd.set_option("display.max_colwidth", 30)
pd.set_option("display.max_rows", 4)

In [2]:
# Load the financial-statements dataset (more than 10 million accounting lines).
# The file is read from a mounted disk; a dataset with the same name also lives
# at s3://aq-dl/FinancialStatements/dataset.feather (presumably the original
# source of this copy -- confirm).
df = pd.read_feather("/mnt/aq_disk/data/FinancialStatements/dataset.feather")
print('Number of companies available for backtesting:', df["cia_id"].nunique())
df

Number of companies available for backtesting: 1108


Unnamed: 0,cia_id,cia_nome,doc_id,doc_arq,doc_tp,doc_ver,doc_env,doc_ref,per_ini,per_fim,per_ref,dem_tp,conta_id,conta_fixa,conta_desc,conta_valor
0,3,CIA MODELO,54947,00000320160331301.zip,ITR,1,2016-04-02 12:04:12,2016-03-31,2016-01-01,2016-03-31,0,IND,3.01,1,Receita de Venda de Bens e...,100000000.0
1,3,CIA MODELO,54947,00000320160331301.zip,ITR,1,2016-04-02 12:04:12,2016-03-31,2016-01-01,2016-03-31,0,IND,3.02,1,Custo dos Bens e/ou Serviç...,-10000000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10546746,11312,OI S.A. - EM RECUPERAÇÃO J...,118172,01131220220331301.zip,ITR,1,2022-06-29 00:05:12,2022-03-31,2022-01-01,2022-03-31,0,IND,9.02.02,1,Ações Preferenciais (em Te...,1812000.0
10546747,11312,OI S.A. - EM RECUPERAÇÃO J...,118172,01131220220331301.zip,ITR,1,2022-06-29 00:05:12,2022-03-31,2022-01-01,2022-03-31,0,IND,9.02.03,1,Total de Ações (em Tesoura...,645862000.0


In [3]:
# Read the pipe-separated file listing the companies that take part in the
# backtest.
df_included = pd.read_csv(
    "../../data/included_companies.psv",
    sep="|",
)
df_included

Unnamed: 0,DENOM_SOCIAL,DT_REG,DT_CANCEL,SIT,DT_INI_SIT,CD_CVM,SETOR_ATIV,CONTROLE_ACIONARIO
0,ACOPALMA CIA INDL ACOS V P...,1978-05-09,2007-08-07,CANCELADA,2007-08-07,60,Metalurgia e Siderurgia,PRIVADO
1,ACOS ANHANGUERA SA,1971-10-20,1994-02-09,CANCELADA,1994-02-09,78,Metalurgia e Siderurgia,PRIVADO
...,...,...,...,...,...,...,...,...
1441,QESTRA TECNOLOGIA ADMINIST...,2022-05-30,,ATIVO,2022-05-30,26816,Comunicação e Informática,PRIVADO
1442,CONCESSIONARIA CATARINENSE...,2022-06-21,,ATIVO,2022-06-21,26859,Emp. Adm. Part. - Serviços...,PRIVADO


In [4]:
# Collect the CVM codes of the companies that take part in the backtest.
# This is an inclusion list: the financials frame is later restricted to
# exactly these codes.
included_codes = df_included["CD_CVM"].tolist()
print(included_codes[:10])

[60, 78, 86, 94, 116, 132, 159, 167, 175, 183]


In [5]:
# Keep only the rows whose cia_id appears in included_codes; every other
# company is dropped from the financials frame.
# Inside query(), comparing a column to a list with == behaves like `in`.
# The filter is applied in place, so df keeps its original row labels.
df.query('cia_id == @included_codes', inplace=True)
print('Number of companies available for backtesting:', df.cia_id.nunique())
df

Number of companies available for backtesting: 585


Unnamed: 0,cia_id,cia_nome,doc_id,doc_arq,doc_tp,doc_ver,doc_env,doc_ref,per_ini,per_fim,per_ref,dem_tp,conta_id,conta_fixa,conta_desc,conta_valor
171,94,PANATLANTICA S.A.,6094,00009420101231401.zip,DFP,1,2011-03-31 10:16:48,2010-12-31,2008-01-01,2008-12-31,-2,IND,1,1,Ativo Total,1.847500e+08
172,94,PANATLANTICA S.A.,6094,00009420101231401.zip,DFP,1,2011-03-31 10:16:48,2010-12-31,2008-01-01,2008-12-31,-2,CON,1,1,Ativo Total,2.045610e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10546289,24775,BCBF Participações S.A.,87098,02477520181231403.zip,DFP,3,2019-09-02 11:46:05,2018-12-31,2018-01-01,2018-12-31,0,IND,9.01.01,1,Ações Ordinárias (Capital ...,1.563080e+09
10546290,24775,BCBF Participações S.A.,87098,02477520181231403.zip,DFP,3,2019-09-02 11:46:05,2018-12-31,2018-01-01,2018-12-31,0,IND,9.01.03,1,Total de Ações (Capital In...,1.563080e+09


In [6]:
# Initial version: only the DFP documents are used (the original author notes
# these are the audited figures).
# The dropped columns are not needed for the backtest:
#   doc_ref          -> already implied by the period start and end dates
#   doc_ver, doc_id  -> document versions are told apart by the submission
#                       timestamp (doc_env) together with the company
df = (
    df.query('doc_tp == "DFP"')
    .drop(columns=["doc_tp", "doc_arq", "doc_ref", "doc_ver", "doc_id"])
    .reset_index(drop=True)
)
print('Number of companies available for backtesting:', df["cia_id"].nunique())
df

Number of companies available for backtesting: 585


Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,per_ref,dem_tp,conta_id,conta_fixa,conta_desc,conta_valor
0,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2008-01-01,2008-12-31,-2,IND,1,1,Ativo Total,1.847500e+08
1,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2008-01-01,2008-12-31,-2,CON,1,1,Ativo Total,2.045610e+08
...,...,...,...,...,...,...,...,...,...,...,...
4265020,24775,BCBF Participações S.A.,2019-09-02 11:46:05,2018-01-01,2018-12-31,0,IND,9.01.01,1,Ações Ordinárias (Capital ...,1.563080e+09
4265021,24775,BCBF Participações S.A.,2019-09-02 11:46:05,2018-01-01,2018-12-31,0,IND,9.01.03,1,Total de Ações (Capital In...,1.563080e+09


In [7]:
# The simulation uses consolidated statements (CON) only.
# The share-count figures, however, are reported in the individual statements
# (IND), so account "9.01.03" -> "Total de Ações (Capital Integralizado)" is
# kept regardless of the statement type.
# dem_tp is dropped afterwards because the backtest no longer needs it.
df = (
    df.query('dem_tp == "CON" or conta_id == "9.01.03"')
    .drop(columns=["dem_tp"])
    .reset_index(drop=True)
)
print('Number of companies available for backtesting:', df["cia_id"].nunique())
df

Number of companies available for backtesting: 585


Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,per_ref,conta_id,conta_fixa,conta_desc,conta_valor
0,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2008-01-01,2008-12-31,-2,1,1,Ativo Total,2.045610e+08
1,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2008-01-01,2008-12-31,-2,1.01,1,Ativo Circulante,1.697710e+08
...,...,...,...,...,...,...,...,...,...,...
1952459,24775,BCBF Participações S.A.,2019-09-02 11:46:05,2018-01-01,2018-12-31,0,7.08.05,1,Outros,1.000000e+03
1952460,24775,BCBF Participações S.A.,2019-09-02 11:46:05,2018-01-01,2018-12-31,0,9.01.03,1,Total de Ações (Capital In...,1.563080e+09


* (p. 138) *For purposes of the study, earnings-related numbers were based on the latest 12-month period, balance sheet items were based on the most recent balance sheet, and market prices were based on the most recent closing price. Utilities, financial stocks and companies where we could not be certain that the information in the database was timely or complete were eliminated. Adjustments were also made for certain non-interest bearing liabilities. The study was structured so that an average of 30 stocks was held during the study period. Stocks with only limited liquidity were eliminated from the study. Market capitalizations were determined based on 2003 dollars. Both the number of companies in each decile as well as the number of companies in each market capitalization group fluctuated as the number of companies in the database varied during the study period.*

In [8]:
# According to the book, only the indicators of the latest 12 months are used,
# so every period other than the current one (per_ref == 0) can be discarded.
# per_ref is dropped afterwards because the backtest no longer needs it.
df = (
    df.query('per_ref == 0')
    .drop(columns=["per_ref"])
    .reset_index(drop=True)
)
print('Number of companies available for backtesting:', df["cia_id"].nunique())
df

Number of companies available for backtesting: 585


Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,conta_id,conta_fixa,conta_desc,conta_valor
0,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,1,1,Ativo Total,2.788580e+08
1,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,1.01,1,Ativo Circulante,2.203650e+08
...,...,...,...,...,...,...,...,...,...
700205,24775,BCBF Participações S.A.,2019-09-02 11:46:05,2018-01-01,2018-12-31,7.08.05,1,Outros,1.000000e+03
700206,24775,BCBF Participações S.A.,2019-09-02 11:46:05,2018-01-01,2018-12-31,9.01.03,1,Total de Ações (Capital In...,1.563080e+09


Plano de contas da CVM (parte inicial "conta_id"):
- 1 -> Balance Sheet - Assets
- 2 -> Balance Sheet - Liabilities and Shareholders’ Equity
- 3 -> Income
- 4 -> Comprehensive Income
- 5 -> Changes in Equity
- 6 -> Cash Flow (Indirect Method)
- 7 -> Added Value

Logo, para usar a fórmula precisamos filtrar as empresas por:
- Liquidez mínima (o livro filtra por valor de mercado no lugar de liquidez)
- Excluir empresas do setor financeiro, seguros e utilities
- ROIC = EBIT / Invested Capital
- Invested Capital = Net Working Capital + Net Fixed Assets
- Earnings yield = EBIT / Enterprise Value*
    - Enterprise Value (EV) = market value of equity + net interest-bearing debt
    - Invested Capital (Damodaran) = PL + Dívida - Caixa (calculados pelo valor contábil)
- Desmembrando os indicadores:
    - EBIT = lucro antes dos juros e impostos -> 3.05
    - Market value of equity = núm. de ações x preço da ação -> 9.01.03
    - Net interest-bearing debt = dívida líquida
        - total_cash = df.loc["1.01.01"] + df.loc["1.01.02"]
        - total_debt = df.loc["2.01.04"] + df.loc["2.02.01"]
        - net_debt = total_debt - total_cash
    - PL -> df.loc["2.03"]

In [9]:
# Keep only the account codes needed for the indicators: cash (1.01.01,
# 1.01.02), debt (2.01.04, 2.02.01), equity (2.03), EBIT (3.05) and the share
# count (9.01.03).
contas = ["1.01.01", "1.01.02", "2.01.04", "2.02.01", "2.03", "3.05", "9.01.03"]
df = df.query('conta_id == @contas').reset_index(drop=True)
df

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,conta_id,conta_fixa,conta_desc,conta_valor
0,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,1.01.01,1,Caixa e Equivalentes de Caixa,4.944000e+06
1,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,1.01.02,1,Aplicações Financeiras,7.979400e+07
...,...,...,...,...,...,...,...,...,...
27421,24775,BCBF Participações S.A.,2019-09-02 11:46:05,2018-01-01,2018-12-31,3.05,1,Resultado Antes do Resulta...,7.234170e+08
27422,24775,BCBF Participações S.A.,2019-09-02 11:46:05,2018-01-01,2018-12-31,9.01.03,1,Total de Ações (Capital In...,1.563080e+09


In [10]:
# Check that every selected account is flagged as fixed (conta_fixa == 1).
df["conta_fixa"].value_counts()

1    27423
0        0
Name: conta_fixa, dtype: int64

In [11]:
# Check whether any financial institution (bank, credit, insurance) is still in
# the selection by searching the company names, case-insensitively.
procurar = "bco |banco|crédito|mercantil|seguradora|seguro|PPLA PARTICIPATIONS"
(
    df.query('cia_nome.str.contains(@procurar, case=False)')
    .drop_duplicates(subset='cia_nome')
)

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,conta_id,conta_fixa,conta_desc,conta_valor


In [12]:
# The conta_fixa flag has served its purpose (checked above); drop it.
df = df.drop(columns=["conta_fixa"]).reset_index(drop=True)
df

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,conta_id,conta_desc,conta_valor
0,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,1.01.01,Caixa e Equivalentes de Caixa,4.944000e+06
1,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,1.01.02,Aplicações Financeiras,7.979400e+07
...,...,...,...,...,...,...,...,...
27421,24775,BCBF Participações S.A.,2019-09-02 11:46:05,2018-01-01,2018-12-31,3.05,Resultado Antes do Resulta...,7.234170e+08
27422,24775,BCBF Participações S.A.,2019-09-02 11:46:05,2018-01-01,2018-12-31,9.01.03,Total de Ações (Capital In...,1.563080e+09


In [13]:
# Drop the account description ahead of the upcoming unstack: every remaining
# column except the value becomes part of the index, so the free-text
# description must not be one of them.
df = df.drop(columns=["conta_desc"])
df

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,conta_id,conta_valor
0,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,1.01.01,4.944000e+06
1,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,1.01.02,7.979400e+07
...,...,...,...,...,...,...,...
27421,24775,BCBF Participações S.A.,2019-09-02 11:46:05,2018-01-01,2018-12-31,3.05,7.234170e+08
27422,24775,BCBF Participações S.A.,2019-09-02 11:46:05,2018-01-01,2018-12-31,9.01.03,1.563080e+09


In [14]:
# Unstack preparation: every column except the last one (the account value)
# goes into the MultiIndex.
colunas_index = list(df.columns[:-1])
colunas_index

['cia_id', 'cia_nome', 'doc_env', 'per_ini', 'per_fim', 'conta_id']

In [15]:
# Move the identifying columns into a MultiIndex, then sort by it.
df = df.set_index(keys=colunas_index)
df = df.sort_index()
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,conta_valor
cia_id,cia_nome,doc_env,per_ini,per_fim,conta_id,Unnamed: 6_level_1
94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,1.01.01,4944000.0
94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,1.01.02,79794000.0
...,...,...,...,...,...,...
26816,Qestra Tecnologia Administração E Participações S.A.,2022-04-14 20:44:45,2021-01-01,2021-12-31,9.01.03,118672.0
26859,Concessionária Catarinense de Rodovias S.A.,2022-03-29 21:09:24,2021-01-01,2021-12-31,9.01.03,991031991.0


In [16]:
# Pivot the conta_id index level into columns (one column per account code).
# Accounts missing for a given filing are filled with 0 rather than NaN.
df = df.unstack('conta_id', fill_value=0)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,conta_valor,conta_valor,conta_valor,conta_valor,conta_valor,conta_valor,conta_valor
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,conta_id,1.01.01,1.01.02,2.01.04,2.02.01,2.03,3.05,9.01.03
cia_id,cia_nome,doc_env,per_ini,per_fim,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,4944000.0,79794000.0,61484000.0,4273000.0,142805000.0,24777000.0,8856000.0
94,PANATLANTICA S.A.,2011-04-01 17:31:56,2010-01-01,2010-12-31,4944000.0,79794000.0,61485000.0,4273000.0,142082000.0,21093000.0,8856000.0
...,...,...,...,...,...,...,...,...,...,...,...
26816,Qestra Tecnologia Administração E Participações S.A.,2022-04-14 20:44:45,2021-01-01,2021-12-31,0.0,0.0,0.0,0.0,0.0,0.0,118672.0
26859,Concessionária Catarinense de Rodovias S.A.,2022-03-29 21:09:24,2021-01-01,2021-12-31,0.0,0.0,0.0,0.0,0.0,0.0,991031991.0


In [17]:
# Flatten the two-level column index: keep only the second level (the account
# codes) and discard the outer "conta_valor" level.
df.columns = df.columns.get_level_values(1)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,conta_id,1.01.01,1.01.02,2.01.04,2.02.01,2.03,3.05,9.01.03
cia_id,cia_nome,doc_env,per_ini,per_fim,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,4944000.0,79794000.0,61484000.0,4273000.0,142805000.0,24777000.0,8856000.0
94,PANATLANTICA S.A.,2011-04-01 17:31:56,2010-01-01,2010-12-31,4944000.0,79794000.0,61485000.0,4273000.0,142082000.0,21093000.0,8856000.0
...,...,...,...,...,...,...,...,...,...,...,...
26816,Qestra Tecnologia Administração E Participações S.A.,2022-04-14 20:44:45,2021-01-01,2021-12-31,0.0,0.0,0.0,0.0,0.0,0.0,118672.0
26859,Concessionária Catarinense de Rodovias S.A.,2022-03-29 21:09:24,2021-01-01,2021-12-31,0.0,0.0,0.0,0.0,0.0,0.0,991031991.0


In [18]:
# Turn the row MultiIndex back into ordinary columns and clear the leftover
# name on the column axis.
df = df.reset_index()
df.columns.name = None
df

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,1.01.01,1.01.02,2.01.04,2.02.01,2.03,3.05,9.01.03
0,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,4944000.0,79794000.0,61484000.0,4273000.0,142805000.0,24777000.0,8856000.0
1,94,PANATLANTICA S.A.,2011-04-01 17:31:56,2010-01-01,2010-12-31,4944000.0,79794000.0,61485000.0,4273000.0,142082000.0,21093000.0,8856000.0
...,...,...,...,...,...,...,...,...,...,...,...,...
5606,26816,Qestra Tecnologia Administ...,2022-04-14 20:44:45,2021-01-01,2021-12-31,0.0,0.0,0.0,0.0,0.0,0.0,118672.0
5607,26859,Concessionária Catarinense...,2022-03-29 21:09:24,2021-01-01,2021-12-31,0.0,0.0,0.0,0.0,0.0,0.0,991031991.0


In [19]:
# Compute the indicators from the CVM account columns.
#   total_cash       = 1.01.01 + 1.01.02
#   total_debt       = 2.01.04 + 2.02.01
#   net_debt         = total_debt - total_cash
#   invested_capital = equity (2.03) + net_debt
#   roic             = ebit (3.05) / invested_capital
df["total_cash"] = df["1.01.01"] + df["1.01.02"]
df["total_debt"] = df["2.01.04"] + df["2.02.01"]
df["net_debt"] = df["total_debt"] - df["total_cash"]
df.rename(
    columns={"2.03": "equity", "3.05": "ebit", "9.01.03": "shares_outstanding"},
    inplace=True
)
df["invested_capital"] = df["equity"] + df["net_debt"]
# Guard the division: a non-zero EBIT over an invested capital of exactly 0
# would yield +/-inf, and +inf passes a later `roic >= 0` filter. Masking the
# zero denominator makes ROIC NaN (undefined) for those rows instead.
df["roic"] = df["ebit"] / df["invested_capital"].where(df["invested_capital"] != 0)
# The raw account columns are no longer needed once the aggregates exist.
df.drop(columns=["1.01.01", "1.01.02", "2.01.04", "2.02.01"], inplace=True)
df

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,equity,ebit,shares_outstanding,total_cash,total_debt,net_debt,invested_capital,roic
0,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,142805000.0,24777000.0,8856000.0,84738000.0,65757000.0,-18981000.0,123824000.0,0.200099
1,94,PANATLANTICA S.A.,2011-04-01 17:31:56,2010-01-01,2010-12-31,142082000.0,21093000.0,8856000.0,84738000.0,65758000.0,-18980000.0,123102000.0,0.171346
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5606,26816,Qestra Tecnologia Administ...,2022-04-14 20:44:45,2021-01-01,2021-12-31,0.0,0.0,118672.0,0.0,0.0,0.0,0.0,
5607,26859,Concessionária Catarinense...,2022-03-29 21:09:24,2021-01-01,2021-12-31,0.0,0.0,991031991.0,0.0,0.0,0.0,0.0,


In [20]:
# Keep only filings with strictly positive equity.
# .copy() yields an independent frame so later in-place edits act on df itself.
df = df.query('equity > 0').copy()
df

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,equity,ebit,shares_outstanding,total_cash,total_debt,net_debt,invested_capital,roic
0,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,142805000.0,24777000.0,8856000.0,84738000.0,65757000.0,-18981000.0,123824000.0,0.200099
1,94,PANATLANTICA S.A.,2011-04-01 17:31:56,2010-01-01,2010-12-31,142082000.0,21093000.0,8856000.0,84738000.0,65758000.0,-18980000.0,123102000.0,0.171346
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5604,26794,Self It Academias Holding ...,2021-12-10 22:49:21,2020-01-01,2020-12-31,73586000.0,-41873000.0,10336691.0,23952000.0,157490000.0,133538000.0,207124000.0,-0.202164
5605,26794,Self It Academias Holding ...,2022-04-29 18:14:40,2021-01-01,2021-12-31,89309000.0,-46981000.0,63790395.0,28204000.0,166279000.0,138075000.0,227384000.0,-0.206615


In [21]:
# Keep only filings that report a strictly positive cash position.
# .copy() yields an independent frame so later in-place edits act on df itself.
df = df.query('total_cash > 0').copy()
df

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,equity,ebit,shares_outstanding,total_cash,total_debt,net_debt,invested_capital,roic
0,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,142805000.0,24777000.0,8856000.0,84738000.0,65757000.0,-18981000.0,123824000.0,0.200099
1,94,PANATLANTICA S.A.,2011-04-01 17:31:56,2010-01-01,2010-12-31,142082000.0,21093000.0,8856000.0,84738000.0,65758000.0,-18980000.0,123102000.0,0.171346
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5604,26794,Self It Academias Holding ...,2021-12-10 22:49:21,2020-01-01,2020-12-31,73586000.0,-41873000.0,10336691.0,23952000.0,157490000.0,133538000.0,207124000.0,-0.202164
5605,26794,Self It Academias Holding ...,2022-04-29 18:14:40,2021-01-01,2021-12-31,89309000.0,-46981000.0,63790395.0,28204000.0,166279000.0,138075000.0,227384000.0,-0.206615


In [22]:
# Drop filings whose EBIT is negative or close to zero (and whose ROIC is
# therefore negative or close to zero): the formula would never select them.
# .copy() yields an independent frame so later in-place edits act on df itself.
df = df.query('ebit >= 0.001').copy()
df

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,equity,ebit,shares_outstanding,total_cash,total_debt,net_debt,invested_capital,roic
0,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,1.428050e+08,2.477700e+07,8856000.0,84738000.0,6.575700e+07,-1.898100e+07,1.238240e+08,0.200099
1,94,PANATLANTICA S.A.,2011-04-01 17:31:56,2010-01-01,2010-12-31,1.420820e+08,2.109300e+07,8856000.0,84738000.0,6.575800e+07,-1.898000e+07,1.231020e+08,0.171346
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5600,26700,EUROFARMA LABORATÓRIOS S.A.,2022-03-29 18:27:45,2021-01-01,2021-12-31,3.385494e+09,1.449802e+09,858714812.0,717372000.0,2.376600e+09,1.659228e+09,5.044722e+09,0.287390
5603,26786,Senior Sistemas S.A.,2022-02-01 19:43:03,2021-01-01,2021-12-31,2.505990e+08,1.372220e+08,1269683.0,125739000.0,3.712200e+07,-8.861700e+07,1.619820e+08,0.847144


In [23]:
# Keep only filings with a non-negative ROIC; rows whose ROIC is NaN fail the
# comparison and are dropped as well.
# .copy() yields an independent frame so later in-place edits act on df itself.
df = df.query('roic >= 0').copy()
df

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,equity,ebit,shares_outstanding,total_cash,total_debt,net_debt,invested_capital,roic
0,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,1.428050e+08,2.477700e+07,8856000.0,84738000.0,6.575700e+07,-1.898100e+07,1.238240e+08,0.200099
1,94,PANATLANTICA S.A.,2011-04-01 17:31:56,2010-01-01,2010-12-31,1.420820e+08,2.109300e+07,8856000.0,84738000.0,6.575800e+07,-1.898000e+07,1.231020e+08,0.171346
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5600,26700,EUROFARMA LABORATÓRIOS S.A.,2022-03-29 18:27:45,2021-01-01,2021-12-31,3.385494e+09,1.449802e+09,858714812.0,717372000.0,2.376600e+09,1.659228e+09,5.044722e+09,0.287390
5603,26786,Senior Sistemas S.A.,2022-02-01 19:43:03,2021-01-01,2021-12-31,2.505990e+08,1.372220e+08,1269683.0,125739000.0,3.712200e+07,-8.861700e+07,1.619820e+08,0.847144


In [24]:
# Drop the intermediate indicators that the backtest does not use: keep the
# first five (identifying) columns plus the four figures needed downstream.
colunas = [
    *df.columns[:5],
    'shares_outstanding', 'net_debt', 'ebit', 'roic',
]
df = df.loc[:, colunas].copy()
df

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,shares_outstanding,net_debt,ebit,roic
0,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,8856000.0,-1.898100e+07,2.477700e+07,0.200099
1,94,PANATLANTICA S.A.,2011-04-01 17:31:56,2010-01-01,2010-12-31,8856000.0,-1.898000e+07,2.109300e+07,0.171346
...,...,...,...,...,...,...,...,...,...
5600,26700,EUROFARMA LABORATÓRIOS S.A.,2022-03-29 18:27:45,2021-01-01,2021-12-31,858714812.0,1.659228e+09,1.449802e+09,0.287390
5603,26786,Senior Sistemas S.A.,2022-02-01 19:43:03,2021-01-01,2021-12-31,1269683.0,-8.861700e+07,1.372220e+08,0.847144


In [25]:
# Inspect the indicators of Petrobras ("Petro", cia_id 9512).
# Original note: the share total in the filing sent on 2011-02-25 21:33:09 was
# corrected in the following version.
# NOTE(review): the recorded output of this cell is empty, i.e. no row with
# cia_id 9512 is left in the frame at this point -- confirm this is intended.
df.loc[df["cia_id"] == 9512]

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,shares_outstanding,net_debt,ebit,roic


In [26]:
# Filings without a share count (shares_outstanding is 0 after the unstack
# fill).
# Original note: check the 2018 Petrobras document 00951220181231401.zip.
df.loc[df["shares_outstanding"] == 0]

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,shares_outstanding,net_debt,ebit,roic
5421,25526,CRUZEIRO DO SUL EDUCACIONA...,2022-03-30 19:50:24,2021-01-01,2021-12-31,0.0,91674000.0,313341000.0,0.202749


In [27]:
# Drop the filings that have no share count.
df = df.query('shares_outstanding != 0')
print('Number of companies available for backtesting:', df["cia_id"].nunique())
df

Number of companies available for backtesting: 375


Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,shares_outstanding,net_debt,ebit,roic
0,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,8856000.0,-1.898100e+07,2.477700e+07,0.200099
1,94,PANATLANTICA S.A.,2011-04-01 17:31:56,2010-01-01,2010-12-31,8856000.0,-1.898000e+07,2.109300e+07,0.171346
...,...,...,...,...,...,...,...,...,...
5600,26700,EUROFARMA LABORATÓRIOS S.A.,2022-03-29 18:27:45,2021-01-01,2021-12-31,858714812.0,1.659228e+09,1.449802e+09,0.287390
5603,26786,Senior Sistemas S.A.,2022-02-01 19:43:03,2021-01-01,2021-12-31,1269683.0,-8.861700e+07,1.372220e+08,0.847144


In [28]:
# Persist the filtered indicators for the backtest; the row index is not
# written to the file.
df.to_csv(
    "../../data/magic_financials.csv",
    index=False,
)

In [29]:
# Sanity check: read the exported file back and display it.
pd.read_csv(filepath_or_buffer="../../data/magic_financials.csv")

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,shares_outstanding,net_debt,ebit,roic
0,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,8856000.0,-1.898100e+07,2.477700e+07,0.200099
1,94,PANATLANTICA S.A.,2011-04-01 17:31:56,2010-01-01,2010-12-31,8856000.0,-1.898000e+07,2.109300e+07,0.171346
...,...,...,...,...,...,...,...,...,...
2914,26700,EUROFARMA LABORATÓRIOS S.A.,2022-03-29 18:27:45,2021-01-01,2021-12-31,858714812.0,1.659228e+09,1.449802e+09,0.287390
2915,26786,Senior Sistemas S.A.,2022-02-01 19:43:03,2021-01-01,2021-12-31,1269683.0,-8.861700e+07,1.372220e+08,0.847144
