Notebook for filtering the accounts and indicators that will be used for selecting stocks for the Magic Formula

In [40]:
import pandas as pd
# Keep rendered frames compact: truncate long cell text and show only a few rows
pd.set_option("display.max_colwidth", 20)
pd.set_option("display.max_rows", 4)

In [41]:
# Read the dataset straight from S3 -> more than 10 million accounting rows!
df = pd.read_feather(path="s3://aq-dl/FinancialStatements/base.feather")
df

Unnamed: 0,cia_id,cia_nome,doc_id,doc_arq,doc_tp,doc_ver,doc_env,doc_ref,per_ini,per_fim,per_ref,dem_tp,conta_id,conta_fixa,conta_desc,conta_valor
0,3,CIA MODELO,54947,0000032016033130...,ITR,1,2016-04-02 12:04:12,2016-03-31,2016-01-01,2016-03-31,0,IND,3.01,1,Receita de Venda...,100000000.0
1,3,CIA MODELO,54947,0000032016033130...,ITR,1,2016-04-02 12:04:12,2016-03-31,2016-01-01,2016-03-31,0,IND,3.02,1,Custo dos Bens e...,-10000000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10540375,25496,Jalles Machado S.A.,116986,0254962022033140...,DFP,2,2022-06-13 21:53:46,2022-03-31,2021-04-01,2022-03-31,0,IND,9.01.01,1,Ações Ordinárias...,294697091.0
10540376,25496,Jalles Machado S.A.,116986,0254962022033140...,DFP,2,2022-06-13 21:53:46,2022-03-31,2021-04-01,2022-03-31,0,IND,9.01.03,1,Total de Ações (...,294697091.0


In [42]:
# Inspect the column dtypes and the memory footprint of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10540377 entries, 0 to 10540376
Data columns (total 16 columns):
 #   Column       Dtype         
---  ------       -----         
 0   cia_id       category      
 1   cia_nome     category      
 2   doc_id       category      
 3   doc_arq      category      
 4   doc_tp       category      
 5   doc_ver      category      
 6   doc_env      datetime64[ns]
 7   doc_ref      datetime64[ns]
 8   per_ini      datetime64[ns]
 9   per_fim      datetime64[ns]
 10  per_ref      category      
 11  dem_tp       category      
 12  conta_id     category      
 13  conta_fixa   category      
 14  conta_desc   category      
 15  conta_valor  float64       
dtypes: category(11), datetime64[ns](4), float64(1)
memory usage: 640.9 MB


In [43]:
# First version: only audited statements (DFP) are used.
# Columns that are not needed for the backtest are dropped along the way:
#   doc_ref            -> already implied by the period start and end dates
#   doc_ver and doc_id -> document versions are tracked by submission time
#                         and company instead
df = (
    df
    .query('doc_tp == "DFP"')
    .drop(columns=["doc_tp", "doc_arq", "doc_ref", "doc_ver", "doc_id"])
    .reset_index(drop=True)
)
df

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,per_ref,dem_tp,conta_id,conta_fixa,conta_desc,conta_valor
0,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2008-01-01,2008-12-31,-2,IND,1,1,Ativo Total,184750000.0
1,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2008-01-01,2008-12-31,-2,CON,1,1,Ativo Total,204561000.0
...,...,...,...,...,...,...,...,...,...,...,...
7047597,25496,Jalles Machado S.A.,2022-06-13 21:53:46,2021-04-01,2022-03-31,0,IND,9.01.01,1,Ações Ordinárias...,294697091.0
7047598,25496,Jalles Machado S.A.,2022-06-13 21:53:46,2021-04-01,2022-03-31,0,IND,9.01.03,1,Total de Ações (...,294697091.0


In [44]:
# Only consolidated statements (CON) are simulated.
# The share count is reported in the individual statement (IND), so account
# "9.01.03" -> "Total de Ações (Capital Integralizado)" is kept regardless of
# the statement type. The dem_tp column is not needed afterwards.
df = (
    df
    .query('dem_tp == "CON" or conta_id == "9.01.03"')
    .drop(columns=["dem_tp"])
    .reset_index(drop=True)
)
df

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,per_ref,conta_id,conta_fixa,conta_desc,conta_valor
0,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2008-01-01,2008-12-31,-2,1,1,Ativo Total,204561000.0
1,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2008-01-01,2008-12-31,-2,1.01,1,Ativo Circulante,169771000.0
...,...,...,...,...,...,...,...,...,...,...
2911404,25496,Jalles Machado S.A.,2022-06-13 21:53:46,2021-04-01,2022-03-31,0,7.08.04.03,1,Lucros Retidos /...,52428000.0
2911405,25496,Jalles Machado S.A.,2022-06-13 21:53:46,2021-04-01,2022-03-31,0,9.01.03,1,Total de Ações (...,294697091.0


* (pág. 138) *For purposes of the study, earnings-related numbers were based on the latest 12-month period, balance sheet items were based on the most recent balance sheet, and market prices were based on the most recent closing price. Utilities, financial stocks and companies where we could not be certain that the information in the database was timely or complete were eliminated. Adjustments were also made for certain non-interest bearing liabilities. The study was structured so that an average of 30 stocks was held during the study period. Stocks with only limited liquidity were eliminated from the study. Market capitalizations were determined based on 2003 dollars. Both the number of companies in each decile as well as the number of companies in each market capitalization group fluctuated as the number of companies in the database varied during the study period.*

In [45]:
# According to the book, only indicators for the latest 12 months are used,
# so every period other than the current one (per_ref == 0) is discarded.
# The per_ref column is not needed afterwards.
df = (
    df
    .query("per_ref == 0")
    .drop(columns=["per_ref"])
    .reset_index(drop=True)
)
df

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,conta_id,conta_fixa,conta_desc,conta_valor
0,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,1,1,Ativo Total,278858000.0
1,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,1.01,1,Ativo Circulante,220365000.0
...,...,...,...,...,...,...,...,...,...
1040230,25496,Jalles Machado S.A.,2022-06-13 21:53:46,2021-04-01,2022-03-31,7.08.04.03,1,Lucros Retidos /...,52428000.0
1040231,25496,Jalles Machado S.A.,2022-06-13 21:53:46,2021-04-01,2022-03-31,9.01.03,1,Total de Ações (...,294697091.0


Plano de contas da CVM (parte inicial "conta_id"):
- 1 -> Balance Sheet - Assets
- 2 -> Balance Sheet - Liabilities and Shareholders’ Equity
- 3 -> Income
- 4 -> Comprehensive Income
- 5 -> Changes in Equity
- 6 -> Cash Flow (Indirect Method)
- 7 -> Added Value

Logo, para usar a fórmula precisamos filtrar as empresas por:
- Liquidez mínima (o livro filtra por valor de mercado no lugar de liquidez)
- Excluir empresas do setor financeiro, seguros e utilities
- ROIC = EBIT / Invested Capital
- Invested Capital = Net Working Capital + Net Fixed Assets
- Earnings yield = EBIT / Enterprise Value*
    - Enterprise Value (EV) = market value of equity + net interest-bearing debt
    - Invested Capital (Damodaran) = PL + Dívida - Caixa (calculados pelo valor contábil)
- Desmembrando os indicadores:
    - EBIT = lucro antes dos juros e impostos -> 3.05
    - Market value of equity = núm. de ações x preço da ação -> 9.01.03
    - Net interest-bearing debt = dívida líquida
        - total_cash = df.loc["1.01.01"] + df.loc["1.01.02"]
        - total_debt = df.loc["2.01.04"] + df.loc["2.02.01"]
        - net_debt = total_debt - total_cash
    - PL -> df.loc["2.03"]

In [46]:
# Keep only the accounts needed to compute the indicators
contas = ["1.01.01", "1.01.02", "2.01.04", "2.02.01", "2.03", "3.05", "9.01.03"]
df = df.query("conta_id in @contas").reset_index(drop=True)
df

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,conta_id,conta_fixa,conta_desc,conta_valor
0,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,1.01.01,1,Caixa e Equivale...,4.944000e+06
1,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,1.01.02,1,Aplicações Finan...,7.979400e+07
...,...,...,...,...,...,...,...,...,...
41773,25496,Jalles Machado S.A.,2022-06-13 21:53:46,2021-04-01,2022-03-31,2.03,1,Patrimônio Líqui...,1.388729e+09
41774,25496,Jalles Machado S.A.,2022-06-13 21:53:46,2021-04-01,2022-03-31,9.01.03,1,Total de Ações (...,2.946971e+08


In [47]:
# Check whether the selected accounts are all flagged as fixed
df["conta_fixa"].value_counts()

1    41540
0      235
Name: conta_fixa, dtype: int64

In [48]:
# Every company with a non-fixed account among the selected accounts is a
# financial institution (IF)
df.query("conta_fixa == 0")["cia_nome"].unique()

['BCO BRADESCO S.A.', 'BCO BRASIL S.A.', 'BCO ESTADO DE S..., 'BANESTES S.A. -..., 'BCO ESTADO DO R..., ..., 'BANCO RCI BRASI..., 'Companhia de Cr..., 'BANCO BMG S/A', 'BANCO PATAGONIA..., 'PPLA PARTICIPAT...]
Length: 20
Categories (1129, object): ['2W Energia S.A.', '3A COMPANHIA SE..., '3R PETROLEUM ÓL..., '521 PARTICIPACO..., ..., 'Águas do Rio 1 ..., 'ÉVORA SA', 'ÓLEO E GÁS PART..., 'Óleos de Palma ...]

In [49]:
# Look for the remaining banks and other financial institutions by name
# (case-insensitive regex over the company name)
procurar = "bco |banco|crédito|mercantil|seguradora|seguro|PPLA PARTICIPATIONS"
df.loc[df["cia_nome"].str.contains(procurar, case=False), "cia_nome"].unique()

['BCO BRADESCO S.A.', 'BCO AMAZONIA S.A.', 'BCO BRASIL S.A.', 'BCO ESTADO DE S..., 'BANESTES S.A. -..., ..., 'BANCO PATAGONIA..., 'PPLA PARTICIPAT..., 'BANCO SANTANDER..., 'BRAMEX - Brasil..., 'Travessia Secur...]
Length: 92
Categories (1129, object): ['2W Energia S.A.', '3A COMPANHIA SE..., '3R PETROLEUM ÓL..., '521 PARTICIPACO..., ..., 'Águas do Rio 1 ..., 'ÉVORA SA', 'ÓLEO E GÁS PART..., 'Óleos de Palma ...]

In [50]:
# Drop every company whose name matches the search pattern
df = df[~df["cia_nome"].str.contains(procurar, case=False)].reset_index(drop=True)
df

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,conta_id,conta_fixa,conta_desc,conta_valor
0,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,1.01.01,1,Caixa e Equivale...,4.944000e+06
1,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,1.01.02,1,Aplicações Finan...,7.979400e+07
...,...,...,...,...,...,...,...,...,...
39465,25496,Jalles Machado S.A.,2022-06-13 21:53:46,2021-04-01,2022-03-31,2.03,1,Patrimônio Líqui...,1.388729e+09
39466,25496,Jalles Machado S.A.,2022-06-13 21:53:46,2021-04-01,2022-03-31,9.01.03,1,Total de Ações (...,2.946971e+08


In [51]:
# Confirm the companies were removed: the same name search is run again
df.loc[df["cia_nome"].str.contains(procurar, case=False), "cia_nome"].unique()

[], Categories (1129, object): ['2W Energia S.A.', '3A COMPANHIA SE..., '3R PETROLEUM ÓL..., '521 PARTICIPACO..., ..., 'Águas do Rio 1 ..., 'ÉVORA SA', 'ÓLEO E GÁS PART..., 'Óleos de Palma ...]

In [52]:
# Check again whether the selected accounts are all fixed
df["conta_fixa"].value_counts()

1    39467
0        0
Name: conta_fixa, dtype: int64

In [53]:
# The conta_fixa column is no longer needed, so drop it
df = df.drop(columns=["conta_fixa"]).reset_index(drop=True)
df

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,conta_id,conta_desc,conta_valor
0,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,1.01.01,Caixa e Equivale...,4.944000e+06
1,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,1.01.02,Aplicações Finan...,7.979400e+07
...,...,...,...,...,...,...,...,...
39465,25496,Jalles Machado S.A.,2022-06-13 21:53:46,2021-04-01,2022-03-31,2.03,Patrimônio Líqui...,1.388729e+09
39466,25496,Jalles Machado S.A.,2022-06-13 21:53:46,2021-04-01,2022-03-31,9.01.03,Total de Ações (...,2.946971e+08


In [54]:
# Drop the account description ahead of the upcoming unstack operation
df = df.drop(columns=["conta_desc"])
df

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,conta_id,conta_valor
0,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,1.01.01,4.944000e+06
1,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,1.01.02,7.979400e+07
...,...,...,...,...,...,...,...
39465,25496,Jalles Machado S.A.,2022-06-13 21:53:46,2021-04-01,2022-03-31,2.03,1.388729e+09
39466,25496,Jalles Machado S.A.,2022-06-13 21:53:46,2021-04-01,2022-03-31,9.01.03,2.946971e+08


In [55]:
# Unstack -> choose the columns that go into the MultiIndex
# (every column except the last one)
colunas_index = list(df.columns[:-1])
colunas_index

['cia_id', 'cia_nome', 'doc_env', 'per_ini', 'per_fim', 'conta_id']

In [56]:
# Move the selected columns into a MultiIndex, then sort by it
df = df.set_index(colunas_index)
df = df.sort_index()
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,conta_valor
cia_id,cia_nome,doc_env,per_ini,per_fim,conta_id,Unnamed: 6_level_1
94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,1.01.01,4944000.0
94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,1.01.02,79794000.0
...,...,...,...,...,...,...
503711,Óleos de Palma S.A. Agroindustrial,2017-03-20 18:30:43,2015-01-01,2015-12-31,9.01.03,425555795.0
503711,Óleos de Palma S.A. Agroindustrial,2021-08-11 11:24:53,2018-01-01,2018-12-31,9.01.03,425555795.0


In [57]:
# Pivot the conta_id index level into columns (one column per account).
# Combinations with no reported value are filled with 0, so a missing account
# cannot be told apart from a reported zero after this step.
df = df.unstack("conta_id", fill_value=0)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,conta_valor,conta_valor,conta_valor,conta_valor,conta_valor,conta_valor,conta_valor
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,conta_id,1.01.01,1.01.02,2.01.04,2.02.01,2.03,3.05,9.01.03
cia_id,cia_nome,doc_env,per_ini,per_fim,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,4944000.0,79794000.0,61484000.0,4273000.0,142805000.0,24777000.0,8856000.0
94,PANATLANTICA S.A.,2011-04-01 17:31:56,2010-01-01,2010-12-31,4944000.0,79794000.0,61485000.0,4273000.0,142082000.0,21093000.0,8856000.0
...,...,...,...,...,...,...,...,...,...,...,...
503711,Óleos de Palma S.A. Agroindustrial,2017-03-20 18:30:43,2015-01-01,2015-12-31,0.0,0.0,0.0,0.0,0.0,0.0,425555795.0
503711,Óleos de Palma S.A. Agroindustrial,2021-08-11 11:24:53,2018-01-01,2018-12-31,0.0,0.0,0.0,0.0,0.0,0.0,425555795.0


In [58]:
# Flatten the column MultiIndex by discarding its outermost level
colunas_sem_topo = df.columns.droplevel(0)
df.columns = colunas_sem_topo
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,conta_id,1.01.01,1.01.02,2.01.04,2.02.01,2.03,3.05,9.01.03
cia_id,cia_nome,doc_env,per_ini,per_fim,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,4944000.0,79794000.0,61484000.0,4273000.0,142805000.0,24777000.0,8856000.0
94,PANATLANTICA S.A.,2011-04-01 17:31:56,2010-01-01,2010-12-31,4944000.0,79794000.0,61485000.0,4273000.0,142082000.0,21093000.0,8856000.0
...,...,...,...,...,...,...,...,...,...,...,...
503711,Óleos de Palma S.A. Agroindustrial,2017-03-20 18:30:43,2015-01-01,2015-12-31,0.0,0.0,0.0,0.0,0.0,0.0,425555795.0
503711,Óleos de Palma S.A. Agroindustrial,2021-08-11 11:24:53,2018-01-01,2018-12-31,0.0,0.0,0.0,0.0,0.0,0.0,425555795.0


In [59]:
# Turn the row MultiIndex back into regular columns and clear the leftover
# name on the columns axis
df = df.reset_index()
df.columns.name = None
df

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,1.01.01,1.01.02,2.01.04,2.02.01,2.03,3.05,9.01.03
0,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,4944000.0,79794000.0,61484000.0,4273000.0,142805000.0,24777000.0,8856000.0
1,94,PANATLANTICA S.A.,2011-04-01 17:31:56,2010-01-01,2010-12-31,4944000.0,79794000.0,61485000.0,4273000.0,142082000.0,21093000.0,8856000.0
...,...,...,...,...,...,...,...,...,...,...,...,...
9473,503711,Óleos de Palma S...,2017-03-20 18:30:43,2015-01-01,2015-12-31,0.0,0.0,0.0,0.0,0.0,0.0,425555795.0
9474,503711,Óleos de Palma S...,2021-08-11 11:24:53,2018-01-01,2018-12-31,0.0,0.0,0.0,0.0,0.0,0.0,425555795.0


In [60]:
# Compute the indicators
#   total_cash       = 1.01.01 + 1.01.02
#   total_debt       = 2.01.04 + 2.02.01
#   net_debt         = total_debt - total_cash
#   invested_capital = equity (2.03) + net_debt
#   roic             = ebit (3.05) / invested_capital
df["total_cash"] = df["1.01.01"] + df["1.01.02"]
df["total_debt"] = df["2.01.04"] + df["2.02.01"]
df["net_debt"] = df["total_debt"] - df["total_cash"]
df = df.rename(
    columns={"2.03": "equity", "3.05": "ebit", "9.01.03": "shares_outstanding"}
)
df["invested_capital"] = df["equity"] + df["net_debt"]
# Dividing by an invested capital of exactly 0 yields +/-inf, and +inf would
# pass a later `roic >= 0` filter. Mask zeros to NaN so those rows get a
# missing ROIC instead.
capital_nao_nulo = df["invested_capital"].where(df["invested_capital"] != 0)
df["roic"] = df["ebit"] / capital_nao_nulo
df = df.drop(columns=["1.01.01", "1.01.02", "2.01.04", "2.02.01"])
df

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,equity,ebit,shares_outstanding,total_cash,total_debt,net_debt,invested_capital,roic
0,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,142805000.0,24777000.0,8856000.0,84738000.0,65757000.0,-18981000.0,123824000.0,0.200099
1,94,PANATLANTICA S.A.,2011-04-01 17:31:56,2010-01-01,2010-12-31,142082000.0,21093000.0,8856000.0,84738000.0,65758000.0,-18980000.0,123102000.0,0.171346
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9473,503711,Óleos de Palma S...,2017-03-20 18:30:43,2015-01-01,2015-12-31,0.0,0.0,425555795.0,0.0,0.0,0.0,0.0,
9474,503711,Óleos de Palma S...,2021-08-11 11:24:53,2018-01-01,2018-12-31,0.0,0.0,425555795.0,0.0,0.0,0.0,0.0,


In [61]:
# Keep only rows with strictly positive equity
df = df[df["equity"] > 0].copy()
df

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,equity,ebit,shares_outstanding,total_cash,total_debt,net_debt,invested_capital,roic
0,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,142805000.0,24777000.0,8856000.0,84738000.0,65757000.0,-18981000.0,123824000.0,0.200099
1,94,PANATLANTICA S.A.,2011-04-01 17:31:56,2010-01-01,2010-12-31,142082000.0,21093000.0,8856000.0,84738000.0,65758000.0,-18980000.0,123102000.0,0.171346
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9467,80195,"G2D Investments,...",2021-03-12 18:33:08,2020-01-01,2020-12-31,378359000.0,92232000.0,66025325.0,1720000.0,105225000.0,103505000.0,481864000.0,0.191407
9471,90212,Multilaser Indus...,2018-05-30 15:43:03,2017-01-01,2017-12-31,859939000.0,241375000.0,216075329.0,133167000.0,62406000.0,-70761000.0,789178000.0,0.305856


In [62]:
# Keep only rows with strictly positive total cash
df = df[df["total_cash"] > 0].copy()
df

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,equity,ebit,shares_outstanding,total_cash,total_debt,net_debt,invested_capital,roic
0,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,142805000.0,24777000.0,8856000.0,84738000.0,65757000.0,-18981000.0,123824000.0,0.200099
1,94,PANATLANTICA S.A.,2011-04-01 17:31:56,2010-01-01,2010-12-31,142082000.0,21093000.0,8856000.0,84738000.0,65758000.0,-18980000.0,123102000.0,0.171346
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9467,80195,"G2D Investments,...",2021-03-12 18:33:08,2020-01-01,2020-12-31,378359000.0,92232000.0,66025325.0,1720000.0,105225000.0,103505000.0,481864000.0,0.191407
9471,90212,Multilaser Indus...,2018-05-30 15:43:03,2017-01-01,2017-12-31,859939000.0,241375000.0,216075329.0,133167000.0,62406000.0,-70761000.0,789178000.0,0.305856


In [63]:
# Remove negative or near-zero EBIT (hence negative or near-zero ROIC) from the
# dataset, since those companies would not be selected by the formula
df = df[df["ebit"] >= 0.001].copy()
df

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,equity,ebit,shares_outstanding,total_cash,total_debt,net_debt,invested_capital,roic
0,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,142805000.0,24777000.0,8856000.0,84738000.0,65757000.0,-18981000.0,123824000.0,0.200099
1,94,PANATLANTICA S.A.,2011-04-01 17:31:56,2010-01-01,2010-12-31,142082000.0,21093000.0,8856000.0,84738000.0,65758000.0,-18980000.0,123102000.0,0.171346
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9467,80195,"G2D Investments,...",2021-03-12 18:33:08,2020-01-01,2020-12-31,378359000.0,92232000.0,66025325.0,1720000.0,105225000.0,103505000.0,481864000.0,0.191407
9471,90212,Multilaser Indus...,2018-05-30 15:43:03,2017-01-01,2017-12-31,859939000.0,241375000.0,216075329.0,133167000.0,62406000.0,-70761000.0,789178000.0,0.305856


In [64]:
# Keep only rows with a non-negative ROIC (rows with a missing ROIC are dropped too)
df = df[df["roic"] >= 0].copy()
df

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,equity,ebit,shares_outstanding,total_cash,total_debt,net_debt,invested_capital,roic
0,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,142805000.0,24777000.0,8856000.0,84738000.0,65757000.0,-18981000.0,123824000.0,0.200099
1,94,PANATLANTICA S.A.,2011-04-01 17:31:56,2010-01-01,2010-12-31,142082000.0,21093000.0,8856000.0,84738000.0,65758000.0,-18980000.0,123102000.0,0.171346
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9467,80195,"G2D Investments,...",2021-03-12 18:33:08,2020-01-01,2020-12-31,378359000.0,92232000.0,66025325.0,1720000.0,105225000.0,103505000.0,481864000.0,0.191407
9471,90212,Multilaser Indus...,2018-05-30 15:43:03,2017-01-01,2017-12-31,859939000.0,241375000.0,216075329.0,133167000.0,62406000.0,-70761000.0,789178000.0,0.305856


In [65]:
# Drop the intermediate indicators that are not used in the backtest:
# keep the first five columns plus the final indicators
colunas = [*df.columns[:5], "shares_outstanding", "net_debt", "ebit", "roic"]
df = df.loc[:, colunas].copy()
df

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,shares_outstanding,net_debt,ebit,roic
0,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,8856000.0,-18981000.0,24777000.0,0.200099
1,94,PANATLANTICA S.A.,2011-04-01 17:31:56,2010-01-01,2010-12-31,8856000.0,-18980000.0,21093000.0,0.171346
...,...,...,...,...,...,...,...,...,...
9467,80195,"G2D Investments,...",2021-03-12 18:33:08,2020-01-01,2020-12-31,66025325.0,103505000.0,92232000.0,0.191407
9471,90212,Multilaser Indus...,2018-05-30 15:43:03,2017-01-01,2017-12-31,216075329.0,-70761000.0,241375000.0,0.305856


In [66]:
# Check the indicators for Petrobras (cia_id 9512).
# The share count in the filing submitted on 2011-02-25 21:33:09 was corrected
# in the following version.
df[df["cia_id"] == 9512]

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,shares_outstanding,net_debt,ebit,roic
1503,9512,PETROLEO BRASILE...,2011-02-25 21:33:09,2010-01-01,2010-12-31,1.304450e+13,6.157476e+10,4.557459e+10,0.122579
1504,9512,PETROLEO BRASILE...,2011-03-04 16:04:52,2010-01-01,2010-12-31,1.304450e+10,6.157476e+10,4.557459e+10,0.122579
...,...,...,...,...,...,...,...,...,...
1520,9512,PETROLEO BRASILE...,2021-03-22 22:06:01,2020-01-01,2020-12-31,1.304450e+10,3.282680e+11,4.962100e+10,0.077603
1521,9512,PETROLEO BRASILE...,2022-02-23 22:36:50,2021-01-01,2021-12-31,1.304450e+10,2.657780e+11,2.108310e+11,0.321703


In [67]:
# Companies without a share count
# TODO: check the 2018 Petrobras document 00951220181231401.zip
df[df["shares_outstanding"] == 0]

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,shares_outstanding,net_debt,ebit,roic
4096,18660,CPFL ENERGIA S.A.,2022-03-17 19:15:29,2021-01-01,2021-12-31,0.0,19630060000.0,7407940000.0,0.203314
9110,25526,CRUZEIRO DO SUL ...,2022-03-30 19:50:24,2021-01-01,2021-12-31,0.0,91674000.0,313341000.0,0.202749


In [91]:
# Drop the rows that have no share count
df = df[df["shares_outstanding"] != 0].copy()
print('Number of companies available for backtesting:', df.cia_id.nunique())
df

Number of companies available for backtesting: 525


Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,shares_outstanding,net_debt,ebit,roic
0,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,8856000.0,-18981000.0,24777000.0,0.200099
1,94,PANATLANTICA S.A.,2011-04-01 17:31:56,2010-01-01,2010-12-31,8856000.0,-18980000.0,21093000.0,0.171346
...,...,...,...,...,...,...,...,...,...
9467,80195,"G2D Investments,...",2021-03-12 18:33:08,2020-01-01,2020-12-31,66025325.0,103505000.0,92232000.0,0.191407
9471,90212,Multilaser Indus...,2018-05-30 15:43:03,2017-01-01,2017-12-31,216075329.0,-70761000.0,241375000.0,0.305856


In [101]:
# Load the CVM table with the registrations of publicly traded companies.
# NOTE(review): CVM_PATH is an absolute, machine-specific path.
CVM_PATH = "/mnt/aq_disk/data/cad_cia_aberta.csv"
cols = ['DENOM_SOCIAL', 'DT_REG', 'DT_CANCEL', 'SIT', 'DT_INI_SIT', 'CD_CVM',
       'SETOR_ATIV', 'SIT_EMISSOR', 'DT_INI_SIT_EMISSOR']
# usecols avoids parsing columns that are thrown away right after; the [cols]
# selection is kept because usecols alone follows the file's column order.
df_cvm = pd.read_csv(CVM_PATH, sep=";", encoding="iso-8859-1", usecols=cols)[cols]
df_cvm

Unnamed: 0,DENOM_SOCIAL,DT_REG,DT_CANCEL,SIT,DT_INI_SIT,CD_CVM,SETOR_ATIV,SIT_EMISSOR,DT_INI_SIT_EMISSOR
0,2W ENERGIA S.A.,2020-10-29,,ATIVO,2020-10-29,25224,Construção Civil...,FASE OPERACIONAL,2020-07-16
1,3A COMPANHIA SEC...,2010-03-08,2015-12-18,CANCELADA,2015-12-18,21954,Securitização de...,FASE PRÉ-OPERACI...,2010-03-08
...,...,...,...,...,...,...,...,...,...
2540,ZIVI SA CUTELARIA,1968-11-01,2003-12-29,CANCELADA,2003-12-29,11843,Metalurgia e Sid...,,
2541,ZOGBI LEASING S/...,1997-09-18,2004-12-23,CANCELADA,2004-12-23,16462,Arrendamento Mer...,,


In [102]:
# There are repeated rows for the same CVM code -> keep the latest status.
# Rows without an activity sector are discarded before deduplicating.
df_cvm = (df_cvm
    # A stable sort keeps the original row order among equal DT_INI_SIT values,
    # so the row picked by keep="last" below is deterministic on ties.
    .sort_values("DT_INI_SIT", kind="mergesort", ignore_index=True)
    # A list is used for `subset` because older pandas releases may not accept
    # a bare string here.
    .dropna(subset=["SETOR_ATIV"])
    .drop_duplicates(subset="CD_CVM", keep="last", ignore_index=True)
    .sort_values("CD_CVM", ignore_index=True)
)
df_cvm

Unnamed: 0,DENOM_SOCIAL,DT_REG,DT_CANCEL,SIT,DT_INI_SIT,CD_CVM,SETOR_ATIV,SIT_EMISSOR,DT_INI_SIT_EMISSOR
0,A J RENNER SA IN...,1969-06-24,1998-06-17,CANCELADA,1998-06-17,35,Emp. Adm. Partic...,,
1,ACOPALMA CIA IND...,1978-05-09,2007-08-07,CANCELADA,2007-08-07,60,Metalurgia e Sid...,,
...,...,...,...,...,...,...,...,...,...
2440,SAP SECURITIZADO...,2022-06-21,,ATIVO,2022-06-21,26840,Securitização de...,FASE PRÉ-OPERACI...,2022-03-10
2441,CONCESSIONARIA C...,2022-06-21,,ATIVO,2022-06-21,26859,Emp. Adm. Part. ...,FASE OPERACIONAL,2022-03-29


In [103]:
# Print every distinct activity sector, one per line, in sorted order
setores_ordenados = df_cvm["SETOR_ATIV"].sort_values().unique()
for nome_setor in setores_ordenados:
    print(nome_setor)

Agricultura (Açúcar, Álcool e Cana)
Alimentos
Arrendamento Mercantil
Bancos
Bebidas e Fumo
Bolsas de Valores/Mercadorias e Futuros
Brinquedos e Lazer
Comunicação e Informática
Comércio (Atacado e Varejo)
Comércio Exterior
Construção Civil, Mat. Constr. e Decoração
Crédito Imobiliário
Educação
Embalagens
Emp. Adm. Part. - Agricultura (Açúcar, Álcool e Cana)
Emp. Adm. Part. - Alimentos
Emp. Adm. Part. - Arrendamento Mercantil
Emp. Adm. Part. - Bancos
Emp. Adm. Part. - Brinquedos e Lazer
Emp. Adm. Part. - Comunicação e Informática
Emp. Adm. Part. - Comércio (Atacado e Varejo)
Emp. Adm. Part. - Const. Civil, Mat. Const. e Decoração
Emp. Adm. Part. - Crédito Imobiliário
Emp. Adm. Part. - Educação
Emp. Adm. Part. - Embalagens
Emp. Adm. Part. - Energia Elétrica
Emp. Adm. Part. - Extração Mineral
Emp. Adm. Part. - Farmacêutico e Higiene
Emp. Adm. Part. - Gráficas e Editoras
Emp. Adm. Part. - Hospedagem e Turismo
Emp. Adm. Part. - Intermediação Financeira
Emp. Adm. Part. - Metalurgia e Siderurg

In [105]:
# CVM activity sectors (SETOR_ATIV values) to be excluded
remover_setores = [
    "Arrendamento Mercantil",
    "Bancos",
    "Bolsas de Valores/Mercadorias e Futuros",
    "Crédito Imobiliário",
    "Emp. Adm. Part. - Arrendamento Mercantil",
    "Emp. Adm. Part. - Bancos",
    "Emp. Adm. Part. - Crédito Imobiliário",
    "Emp. Adm. Part. - Energia Elétrica",
    "Emp. Adm. Part. - Intermediação Financeira",
    "Emp. Adm. Part. - Saneamento, Serv. Água e Gás",
    "Emp. Adm. Part. - Securitização de Recebíveis",
    "Emp. Adm. Part. - Seguradoras e Corretoras",
    "Emp. Adm. Part. - Sem Setor Principal",
    "Emp. Adm. Part.-Bolsas de Valores/Mercadorias e Futuros",
    "Emp. Adm. Participações",
    "Energia Elétrica",
    "Factoring",
    "Intermediação Financeira",
    "Saneamento, Serv. Água e Gás",
    "Securitização de Recebíveis",
    "Seguradoras e Corretoras",
    "Telecomunicações",
]

In [106]:
# Narrow df_cvm down to the companies that belong to the sectors listed in
# remover_setores
df_cvm = df_cvm.query("SETOR_ATIV in @remover_setores").reset_index(drop=True)
df_cvm

Unnamed: 0,DENOM_SOCIAL,DT_REG,DT_CANCEL,SIT,DT_INI_SIT,CD_CVM,SETOR_ATIV,SIT_EMISSOR,DT_INI_SIT_EMISSOR
0,A J RENNER SA IN...,1969-06-24,1998-06-17,CANCELADA,1998-06-17,35,Emp. Adm. Partic...,,
1,AGRILEASING SA A...,1983-01-24,1985-09-30,CANCELADA,1985-09-30,191,Arrendamento Mer...,,
...,...,...,...,...,...,...,...,...,...
886,ANEMUS WIND HOLD...,2022-06-08,,ATIVO,2022-06-08,26832,Energia Elétrica,FASE PRÉ-OPERACI...,2022-04-01
887,SAP SECURITIZADO...,2022-06-21,,ATIVO,2022-06-21,26840,Securitização de...,FASE PRÉ-OPERACI...,2022-03-10


In [109]:
# Drop from the financials every company whose CVM code is listed in df_cvm
remove_codes = list(df_cvm["CD_CVM"].unique())
df = df.query("cia_id not in @remove_codes").copy()
print('Number of companies available for backtesting:', df.cia_id.nunique())

Number of companies available for backtesting: 411


In [None]:
# Ad-hoc check: list the company names in df_cvm that contain the literal
# upper-case text "BANCO" (no case=False is passed here).
# NOTE(review): this cell has no execution count in the saved notebook.
df_cvm.query('DENOM_SOCIAL.str.contains("BANCO")')['DENOM_SOCIAL']

In [110]:
# Export the filtered financial indicators to CSV, without the row index
df.to_csv("../data/1_financials.csv", index=False)

In [31]:
# Sanity check: read the exported file back
pd.read_csv("../data/1_financials.csv")

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,shares_outstanding,net_debt,ebit,roic
0,94,PANATLANTICA S.A.,2011-03-31 10:16:48,2010-01-01,2010-12-31,8856000.0,-18981000.0,24777000.0,0.200099
1,94,PANATLANTICA S.A.,2011-04-01 17:31:56,2010-01-01,2010-12-31,8856000.0,-18980000.0,21093000.0,0.171346
...,...,...,...,...,...,...,...,...,...
4056,80195,"G2D Investments,...",2021-03-12 18:33:08,2020-01-01,2020-12-31,66025325.0,103505000.0,92232000.0,0.191407
4057,90212,Multilaser Indus...,2018-05-30 15:43:03,2017-01-01,2017-12-31,216075329.0,-70761000.0,241375000.0,0.305856
