In [1]:
from pathlib import Path
import pandas as pd
# Mostrar floats com duas casas decimas
pd.set_option('display.float_format',  lambda x: '%.2f' % x)
pd.options.display.max_colwidth = 20
pd.options.display.max_columns = 20
pd.options.display.max_rows = 6

In [2]:
# Carregar base NÃO AJUSTADA do S3 na seguinte raíz: s3://aq-dl/HistoricalQuotations/
# Caminho local:
DATA_FOLDER = Path("/mnt/aq_disk/data/HistoricalQuotations/interim")
# BASE_ADJ = DATA_FOLDER / "base_adj.feather"
DATASET1 = DATA_FOLDER / "dataset_95-21.feather"
DATASET2 = DATA_FOLDER / "dataset_22.feather"
cols = ['datneg', 'codneg', 'nomres', 'especi', 'codbdi', 'tpmerc', 'preult', 'premed', 'totneg', 'voltot']
df_magic = (pd
    .concat([pd.read_feather(DATASET1), pd.read_feather(DATASET2)], ignore_index=True)
    [cols]
    .query('tpmerc == 10')
)
df_magic

Unnamed: 0,datneg,codneg,nomres,especi,codbdi,tpmerc,preult,premed,totneg,voltot
0,2020-02-07,A1AP34,ADVANCE AUTO,DRN,2,10,143.12,143.12,2,6139848.00
1,2020-02-10,A1AP34,ADVANCE AUTO,DRN,2,10,142.27,142.27,1,512172.00
2,2020-02-13,A1AP34,ADVANCE AUTO,DRN,2,10,147.37,147.37,1,235792.00
...,...,...,...,...,...,...,...,...,...,...
11050662,2022-07-08,EQPA5,EQTL PARA,PNA,2,10,6.70,6.70,1,670.00
11050663,2022-07-08,EQTL3,EQUATORIAL,ON NM,2,10,22.94,23.13,16729,116353756.00
11050665,2022-07-08,ESGB11,ETF ESG BTG,CI,14,10,91.54,91.83,7,8815.84


#### Filtrar:
1. Cotações após 2011
2. Lote padrão (codbdi == 2) -> remover empresas em concordata, recuperação judicial, etc. da entrada na seleção (não da saída!)
3. Ações ON, PN ou PNA

In [3]:
df_magic.query('\
    codbdi == 2 and \
    datneg >= "2011.01.01" and \
    especi.str.contains("ON |PN |PNA")'
    , inplace=True
)
df_magic.reset_index(drop=True, inplace=True)
df_magic

Unnamed: 0,datneg,codneg,nomres,especi,codbdi,tpmerc,preult,premed,totneg,voltot
0,2016-10-28,AALR3,ALLIAR,ON NM,2,10,19.20,19.28,4460,122334647.00
1,2016-10-31,AALR3,ALLIAR,ON NM,2,10,18.06,18.17,4238,45857231.00
2,2016-11-01,AALR3,ALLIAR,ON NM,2,10,17.90,17.74,2072,17676981.00
...,...,...,...,...,...,...,...,...,...,...
612753,2022-07-08,ENJU3,ENJOEI,ON NM,2,10,1.13,1.16,1335,3389415.00
612754,2022-07-08,EQPA5,EQTL PARA,PNA,2,10,6.70,6.70,1,670.00
612755,2022-07-08,EQTL3,EQUATORIAL,ON NM,2,10,22.94,23.13,16729,116353756.00


In [4]:
# Manter somente as colunas que serão usadas para fazer o corte nas datas
cols = ['datneg', 'codneg', 'nomres', 'premed', 'totneg']
df_magic = df_magic.loc[:, cols]
# Inserir o cód. dos emissores -> 4 primeiros caracteres do código de negociação
df_magic['codemi'] = df_magic['codneg'].str[0:4]
# Inserir o dia do ano e o ano para a operação de corte das ações
df_magic['day_year'] = df_magic['datneg'].dt.day_of_year
df_magic['year'] = df_magic['datneg'].dt.year
# Ordenar o dataframe por ativo e data
df_magic.sort_values(by=['codneg', 'datneg'], inplace=True)
print('Number of companies available for backtesting', df_magic.codemi.nunique())
df_magic

Number of companies available for backtesting 511


Unnamed: 0,datneg,codneg,nomres,premed,totneg,codemi,day_year,year
0,2016-10-28,AALR3,ALLIAR,19.28,4460,AALR,302,2016
1,2016-10-31,AALR3,ALLIAR,18.17,4238,AALR,305,2016
2,2016-11-01,AALR3,ALLIAR,17.74,2072,AALR,306,2016
...,...,...,...,...,...,...,...,...
612059,2022-07-13,YDUQ3,YDUQS PART,13.88,19815,YDUQ,194,2022
612620,2022-07-14,YDUQ3,YDUQS PART,13.95,9591,YDUQ,195,2022
610427,2022-07-15,YDUQ3,YDUQS PART,14.44,10135,YDUQ,196,2022


In [5]:
# Criar coluna com a média móvel de 30 dias do número de negociações de cada ativo
df_magic['totneg_sma30'] = df_magic.groupby('codneg')['totneg'].transform(lambda x: x.rolling(30, 1).mean())
df_magic

Unnamed: 0,datneg,codneg,nomres,premed,totneg,codemi,day_year,year,totneg_sma30
0,2016-10-28,AALR3,ALLIAR,19.28,4460,AALR,302,2016,4460.00
1,2016-10-31,AALR3,ALLIAR,18.17,4238,AALR,305,2016,4349.00
2,2016-11-01,AALR3,ALLIAR,17.74,2072,AALR,306,2016,3590.00
...,...,...,...,...,...,...,...,...,...
612059,2022-07-13,YDUQ3,YDUQS PART,13.88,19815,YDUQ,194,2022,10983.07
612620,2022-07-14,YDUQ3,YDUQS PART,13.95,9591,YDUQ,195,2022,11001.23
610427,2022-07-15,YDUQ3,YDUQS PART,14.44,10135,YDUQ,196,2022,11073.77


In [6]:
# Definir o dia de corte e remover negociações posteriores ao corte
df_magic.query('day_year >= 100', inplace=True)
df_magic.reset_index(drop=True, inplace=True)
df_magic

Unnamed: 0,datneg,codneg,nomres,premed,totneg,codemi,day_year,year,totneg_sma30
0,2016-10-28,AALR3,ALLIAR,19.28,4460,AALR,302,2016,4460.00
1,2016-10-31,AALR3,ALLIAR,18.17,4238,AALR,305,2016,4349.00
2,2016-11-01,AALR3,ALLIAR,17.74,2072,AALR,306,2016,3590.00
...,...,...,...,...,...,...,...,...,...
440549,2022-07-13,YDUQ3,YDUQS PART,13.88,19815,YDUQ,194,2022,10983.07
440550,2022-07-14,YDUQ3,YDUQS PART,13.95,9591,YDUQ,195,2022,11001.23
440551,2022-07-15,YDUQ3,YDUQS PART,14.44,10135,YDUQ,196,2022,11073.77


In [7]:
# Calcular o dia mais próximo de 100 em cada ano
df_balancing = df_magic.groupby(by=['year'])['day_year'].min().reset_index()
df_balancing

Unnamed: 0,year,day_year
0,2011,101
1,2012,100
2,2013,100
...,...,...
9,2020,100
10,2021,102
11,2022,101


In [8]:
# Filtrar o dataframe de ações com dados somente nos dias de corte
# Usar 'year' e 'day_year' como chaves para a operação de união entre os dois dataframes
df_magic = df_magic.merge(
    right=df_balancing, how='inner', on=['year', 'day_year']
)
# A coluna 'day_year' não será mais usada
df_magic.drop(columns=['day_year'], inplace=True)
print('Number of companies available for backtesting:', df_magic.codemi.nunique())
df_magic

Number of companies available for backtesting: 395


Unnamed: 0,datneg,codneg,nomres,premed,totneg,codemi,year,totneg_sma30
0,2017-04-10,AALR3,ALLIAR,15.36,315,AALR,2017,456.73
1,2017-04-10,ABCB4,ABC BRASIL,18.36,1005,ABCB,2017,2294.30
2,2017-04-10,AGRO3,BRASILAGRO,12.29,199,AGRO,2017,330.10
...,...,...,...,...,...,...,...,...
2587,2016-04-11,VLID3,VALID,31.31,2720,VLID,2016,2778.93
2588,2016-04-11,VVAR3,VIAVAREJO,2.38,1,VVAR,2016,8.40
2589,2016-04-11,WEGE3,WEG,13.55,9383,WEGE,2016,10665.47


In [9]:
# Excluir empresas financeiras e prestadoras de serviços públicos (utilities) conforme
# lista extraída do seguinte arquivo da B3, sem versionamento:
# https://bvmf.bmfbovespa.com.br/InstDados/InformacoesEmpresas/ClassifSetorial.zip
excluded_companies = pd.read_csv('../data/external/excluded_companies.csv')
excluded_companies = excluded_companies['company_code'].to_list()
df_magic.query('codemi != @excluded_companies', inplace=True)
print('Number of companies available for backtesting', df_magic.codemi.nunique())
df_magic

Number of companies available for backtesting 323


Unnamed: 0,datneg,codneg,nomres,premed,totneg,codemi,year,totneg_sma30
0,2017-04-10,AALR3,ALLIAR,15.36,315,AALR,2017,456.73
2,2017-04-10,AGRO3,BRASILAGRO,12.29,199,AGRO,2017,330.10
3,2017-04-10,ALPA3,ALPARGATAS,10.59,3,ALPA,2017,4.97
...,...,...,...,...,...,...,...,...
2587,2016-04-11,VLID3,VALID,31.31,2720,VLID,2016,2778.93
2588,2016-04-11,VVAR3,VIAVAREJO,2.38,1,VVAR,2016,8.40
2589,2016-04-11,WEGE3,WEG,13.55,9383,WEGE,2016,10665.47


In [10]:
# Carregar dataframe com o código CVM das empresas listadas
df_cod = pd.read_pickle('/mnt/aq_disk/data/AQ/cod_emissor.pkl')
df_cod

Unnamed: 0,codcvm,cnpj,densoc,situac,codemi
0,60,18451005000104,ACOPALMA CIA IND...,CANCELADA,ZWVZ
1,94,92693019000189,PANATLANTICA SA,ATIVO,PATI
2,108,60664810000174,AÇOS VILLARES SA,CANCELADA,AVIL
...,...,...,...,...,...
1766,26824,43335774000186,TRAVESSIA SECURI...,ATIVO,TMER
1767,26832,38482780000126,ANEMUS WIND HOLD...,ATIVO,ANEM
1768,26840,44841035000129,SAP SECURITIZADO...,ATIVO,SAPS


In [11]:
# A chave da união será o cód. do emissor (codemi)
# Somente o cód. CVM (codcvm) será inserido na união -> Remover colunas que não serão 
# usadas na operação de merge
df_cod = df_cod[['codcvm', 'codemi']].copy()
df_cod

Unnamed: 0,codcvm,codemi
0,60,ZWVZ
1,94,PATI
2,108,AVIL
...,...,...
1766,26824,TMER
1767,26832,ANEM
1768,26840,SAPS


In [12]:
# Criar um set com a lista de ativos antes da operação de união dos dataframes
s0 = set(df_magic.codemi.unique())
# Obter os códigos de emissão dos ativos unindo os dataframes 
df_magic = df_magic.merge(right=df_cod, how='inner', on='codemi')
df_magic.reset_index(drop=True, inplace=True)
print('Number of companies available for backtesting:', df_magic.codemi.nunique())
df_magic

Number of companies available for backtesting: 287


Unnamed: 0,datneg,codneg,nomres,premed,totneg,codemi,year,totneg_sma30,codcvm
0,2017-04-10,AALR3,ALLIAR,15.36,315,AALR,2017,456.73,24058
1,2018-04-10,AALR3,ALLIAR,15.17,175,AALR,2018,557.40,24058
2,2019-04-10,AALR3,ALLIAR,14.64,156,AALR,2019,489.83,24058
...,...,...,...,...,...,...,...,...,...
1869,2014-04-10,CGRA4,GRAZZIOTIN,17.35,6,CGRA,2014,15.97,4537
1870,2015-04-10,RUMO3,RUMO LOG,1.65,14416,RUMO,2015,10434.43,23450
1871,2016-04-11,RUMO3,RUMO LOG,3.31,26340,RUMO,2016,6106.43,23450


In [13]:
# Verificar a diferença entre os dois sets
s1 = set(df_magic.codemi.unique())
print('Núm. de empresas cujo código não foi localizado', len(s0-s1))
# 34 empresas não tiveram seu código localizado. Analisando os dados, são empresas cujo
# código de listagem foi alterado: BVMF->B3SA, VVAR->VIIA, etc
print(s0 - s1)

Núm. de empresas cujo código não foi localizado 36
{'BRDT', 'CCPR', 'CELP', 'ESTC', 'TBLE', 'BVMF', 'RNAR', 'BTOW', 'VVAR', 'QGEP', 'SNSL', 'BBRK', 'HRTP', 'ABRE', 'ENMA', 'BRIN', 'CTAX', 'SSBR', 'LIQO', 'DTEX', 'LLXL', 'PRTX', 'INPR', 'BPNM', 'MPXE', 'PARC', 'IDNT', 'ECOD', 'ALLL', 'CNTO', 'FJTA', 'TIBR', 'DROG', 'VAGR', 'KROT', 'OHLB'}


In [14]:
# Carregar dataframe com os dados financeiros das empresas
df_fin = (pd
    .read_csv(
        '../data/magic_financials.csv',
        parse_dates=['doc_env', 'per_ini', 'per_fim']
        )
)
# Renomear coluna com o código CVM para coincidir com as outras bases
df_fin.rename(columns={'cia_id': 'codcvm'}, inplace=True)
# As colunas 'per_ini' e 'cia_nome' não serão usadas
df_fin.drop(columns=['per_ini', 'cia_nome'], inplace=True)
# year = ano em que a informação será usada -> ano seguinte ao fim do período
df_fin['year'] = df_fin.per_fim.dt.year + 1
df_fin

Unnamed: 0,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,year
0,94,2011-03-31 10:16:48,2010-12-31,8856000.00,-18981000.00,24777000.00,0.13,2011
1,94,2011-04-01 17:31:56,2010-12-31,8856000.00,-18980000.00,21093000.00,0.11,2011
2,94,2012-03-20 15:20:37,2011-12-31,9480000.00,-46182000.00,8720000.00,0.05,2012
...,...,...,...,...,...,...,...,...
2800,26700,2021-12-16 16:23:45,2020-12-31,855863854.00,607732000.00,1086628000.00,0.21,2021
2801,26700,2022-03-29 18:27:45,2021-12-31,858714812.00,1659228000.00,1449802000.00,0.19,2022
2802,26786,2022-02-01 19:43:03,2021-12-31,1269683.00,-88617000.00,137222000.00,0.56,2022


In [15]:
# Incluir os dados contábeis em 'df_magic' 
df_magic = df_magic.merge(right=df_fin, how='inner', on=['year', 'codcvm'])
df_magic

Unnamed: 0,datneg,codneg,nomres,premed,totneg,codemi,year,totneg_sma30,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic
0,2017-04-10,AALR3,ALLIAR,15.36,315,AALR,2017,456.73,24058,2017-03-22 23:42:13,2016-12-31,114898767.00,308503000.00,100612000.00,0.04
1,2018-04-10,AALR3,ALLIAR,15.17,175,AALR,2018,557.40,24058,2018-03-28 20:20:31,2017-12-31,118292816.00,507645000.00,70337000.00,0.03
2,2019-04-10,AALR3,ALLIAR,14.64,156,AALR,2019,489.83,24058,2019-03-19 19:58:58,2018-12-31,118292816.00,532346000.00,137286000.00,0.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1709,2014-04-10,CGRA4,GRAZZIOTIN,17.35,6,CGRA,2014,15.97,4537,2014-02-27 08:09:24,2013-12-31,21815945.00,-87381310.00,53334519.00,0.13
1710,2015-04-10,RUMO3,RUMO LOG,1.65,14416,RUMO,2015,10434.43,23450,2015-03-03 20:02:32,2014-12-31,1026488214.00,699234000.00,206689000.00,0.07
1711,2016-04-11,RUMO3,RUMO LOG,3.31,26340,RUMO,2016,6106.43,23450,2016-02-25 20:38:25,2015-12-31,299015898.00,8003920000.00,1051477000.00,0.06


In [16]:
# Converter os dados contábeis para milhões para facilitar verificações posteriores
accounting_columns = ['shares_outstanding', 'net_debt', 'ebit']
df_magic[accounting_columns] = df_magic[accounting_columns] / 1_000_000
df_magic

Unnamed: 0,datneg,codneg,nomres,premed,totneg,codemi,year,totneg_sma30,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic
0,2017-04-10,AALR3,ALLIAR,15.36,315,AALR,2017,456.73,24058,2017-03-22 23:42:13,2016-12-31,114.90,308.50,100.61,0.04
1,2018-04-10,AALR3,ALLIAR,15.17,175,AALR,2018,557.40,24058,2018-03-28 20:20:31,2017-12-31,118.29,507.64,70.34,0.03
2,2019-04-10,AALR3,ALLIAR,14.64,156,AALR,2019,489.83,24058,2019-03-19 19:58:58,2018-12-31,118.29,532.35,137.29,0.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1709,2014-04-10,CGRA4,GRAZZIOTIN,17.35,6,CGRA,2014,15.97,4537,2014-02-27 08:09:24,2013-12-31,21.82,-87.38,53.33,0.13
1710,2015-04-10,RUMO3,RUMO LOG,1.65,14416,RUMO,2015,10434.43,23450,2015-03-03 20:02:32,2014-12-31,1026.49,699.23,206.69,0.07
1711,2016-04-11,RUMO3,RUMO LOG,3.31,26340,RUMO,2016,6106.43,23450,2016-02-25 20:38:25,2015-12-31,299.02,8003.92,1051.48,0.06


In [17]:
# Calcular os indicadores que dependem do preço do ação
df_magic['market_cap'] = df_magic['shares_outstanding'] * df_magic['premed']
df_magic['enterprise_value'] = df_magic['market_cap'] + df_magic['net_debt']
df_magic['earnings_yield'] = df_magic['ebit'] / df_magic['enterprise_value']
# A coluna 'premed' não será mais usada
df_magic.drop(columns=['premed'], inplace=True)
df_magic

Unnamed: 0,datneg,codneg,nomres,totneg,codemi,year,totneg_sma30,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield
0,2017-04-10,AALR3,ALLIAR,315,AALR,2017,456.73,24058,2017-03-22 23:42:13,2016-12-31,114.90,308.50,100.61,0.04,1764.85,2073.35,0.05
1,2018-04-10,AALR3,ALLIAR,175,AALR,2018,557.40,24058,2018-03-28 20:20:31,2017-12-31,118.29,507.64,70.34,0.03,1794.50,2302.15,0.03
2,2019-04-10,AALR3,ALLIAR,156,AALR,2019,489.83,24058,2019-03-19 19:58:58,2018-12-31,118.29,532.35,137.29,0.05,1731.81,2264.15,0.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1709,2014-04-10,CGRA4,GRAZZIOTIN,6,CGRA,2014,15.97,4537,2014-02-27 08:09:24,2013-12-31,21.82,-87.38,53.33,0.13,378.51,291.13,0.18
1710,2015-04-10,RUMO3,RUMO LOG,14416,RUMO,2015,10434.43,23450,2015-03-03 20:02:32,2014-12-31,1026.49,699.23,206.69,0.07,1693.71,2392.94,0.09
1711,2016-04-11,RUMO3,RUMO LOG,26340,RUMO,2016,6106.43,23450,2016-02-25 20:38:25,2015-12-31,299.02,8003.92,1051.48,0.06,989.74,8993.66,0.12


In [18]:
# Existem algumas empresas, como a COGN3, com baixo valor de mercado e excesso de
# dívida líquida resultando em um EV negativo e, consequentemente, um EY (EBIT/EV) 
# negativo ou distorcido (tendendo a infinito). Apesar do lívro não deixar isso claro,
# iremos remover essas empresas da seleção.
df_magic.query('enterprise_value > 100', inplace=True)
print('Number of companies available for backtesting', df_magic.codneg.nunique())

Number of companies available for backtesting 231


In [19]:
# Remover revisões de DFPs publicadas no mesmo dia ou posteriores ao corte
# No livro, o corte é de uma semana
df_magic.query('doc_env.dt.date < datneg', inplace=True)
df_magic

Unnamed: 0,datneg,codneg,nomres,totneg,codemi,year,totneg_sma30,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield
0,2017-04-10,AALR3,ALLIAR,315,AALR,2017,456.73,24058,2017-03-22 23:42:13,2016-12-31,114.90,308.50,100.61,0.04,1764.85,2073.35,0.05
1,2018-04-10,AALR3,ALLIAR,175,AALR,2018,557.40,24058,2018-03-28 20:20:31,2017-12-31,118.29,507.64,70.34,0.03,1794.50,2302.15,0.03
2,2019-04-10,AALR3,ALLIAR,156,AALR,2019,489.83,24058,2019-03-19 19:58:58,2018-12-31,118.29,532.35,137.29,0.05,1731.81,2264.15,0.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1709,2014-04-10,CGRA4,GRAZZIOTIN,6,CGRA,2014,15.97,4537,2014-02-27 08:09:24,2013-12-31,21.82,-87.38,53.33,0.13,378.51,291.13,0.18
1710,2015-04-10,RUMO3,RUMO LOG,14416,RUMO,2015,10434.43,23450,2015-03-03 20:02:32,2014-12-31,1026.49,699.23,206.69,0.07,1693.71,2392.94,0.09
1711,2016-04-11,RUMO3,RUMO LOG,26340,RUMO,2016,6106.43,23450,2016-02-25 20:38:25,2015-12-31,299.02,8003.92,1051.48,0.06,989.74,8993.66,0.12


In [20]:
# Manter somente a DFP mais recente ao corte para cada ativo
df_magic.sort_values('doc_env', inplace=True)
df_magic.drop_duplicates(subset=['codneg', 'year'], keep='last', inplace=True)
df_magic

Unnamed: 0,datneg,codneg,nomres,totneg,codemi,year,totneg_sma30,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield
1225,2011-04-11,TOTS3,TOTVS,244,TOTS,2011,485.33,19992,2011-01-31 19:05:59,2010-12-31,31.46,179.42,211.67,0.17,994.10,1173.53,0.18
680,2011-04-11,LREN3,LOJAS RENNER,3604,LREN,2011,3465.70,8133,2011-02-16 19:53:52,2010-12-31,122.35,-27.16,404.47,0.27,6682.70,6655.54,0.06
52,2011-04-11,AMAR3,LOJAS MARISA,183,AMAR,2011,407.27,22055,2011-02-18 15:38:17,2010-12-31,184.55,37.21,285.95,0.22,4981.03,5018.24,0.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1593,2022-04-11,KRSA3,KORA SAUDE,201,KRSA,2022,802.60,25879,2022-03-30 23:29:03,2021-12-31,767.17,1290.38,132.81,0.03,2631.41,3921.78,0.03
1491,2022-04-11,CEDO4,CEDRO,5,CEDO,2022,16.27,3077,2022-03-31 11:19:14,2021-12-31,10.00,206.73,44.58,0.08,50.20,256.93,0.17
1489,2022-04-11,CEDO3,CEDRO,1,CEDO,2022,7.63,3077,2022-03-31 11:19:14,2021-12-31,10.00,206.73,44.58,0.08,70.00,276.73,0.16


In [21]:
# Remover o ativo menos líquido da empresa (coluna 'totneg_sma30')
df_magic.sort_values(by=['year', 'codemi', 'totneg_sma30'], inplace=True)
df_magic.drop_duplicates(
    subset=['codemi', 'year'], keep='last', inplace=True, ignore_index=True
)
# A coluna 'codemi' não será mais necessária
df_magic.drop(columns='codemi', inplace=True)
print('Number of companies available for backtesting', df_magic.codneg.nunique())
df_magic

Number of companies available for backtesting 210


Unnamed: 0,datneg,codneg,nomres,totneg,year,totneg_sma30,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield
0,2011-04-11,AEDU3,ANHANGUERA,978,2011,1031.77,18961,2011-03-30 00:09:51,2010-12-31,145690.26,-571.50,157.01,0.07,5537686.82,5537115.32,0.00
1,2011-04-11,ALPA4,ALPARGATAS,158,2011,210.80,10456,2011-03-30 16:55:23,2010-12-31,353.46,-358.70,324.23,0.23,4011.73,3653.02,0.09
2,2011-04-11,AMAR3,LOJAS MARISA,183,2011,407.27,22055,2011-02-18 15:38:17,2010-12-31,184.55,37.21,285.95,0.22,4981.03,5018.24,0.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1091,2022-04-11,WEGE3,WEG,25960,2022,28835.07,5410,2022-02-16 07:05:37,2021-12-31,4197.32,-1428.02,4158.34,0.22,138427.55,136999.53,0.03
1092,2022-04-11,WLMM4,WLM IND COM,14,2022,11.37,11070,2022-03-22 22:18:18,2021-12-31,36.41,-152.00,136.13,0.23,1258.13,1106.12,0.12
1093,2022-04-11,YDUQ3,YDUQS PART,10252,2022,14513.70,21016,2022-03-15 18:09:20,2021-12-31,309.09,3692.84,546.59,0.05,6092.14,9784.98,0.06


In [22]:
# Verificar erro no núm de ações na DFP da Braskem de 2012-03-14 09:48:31	
df_magic.query('codneg.str.startswith("BRKM")')

Unnamed: 0,datneg,codneg,nomres,totneg,year,totneg_sma30,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield
11,2011-04-11,BRKM5,BRASKEM,2593,2011,2518.33,4820,2011-03-17 09:14:46,2010-12-31,801.66,9867.90,3214.96,0.11,18085.56,27953.46,0.12
97,2012-04-09,BRKM5,BRASKEM,1670,2012,3455.03,4820,2012-03-26 20:08:37,2011-12-31,801.66,12006.80,1929.90,0.06,11560.01,23566.81,0.08
183,2013-04-10,BRKM5,BRASKEM,6716,2013,6376.77,4820,2013-02-07 10:13:32,2012-12-31,797.27,14051.87,1538.60,0.04,12453.28,26505.15,0.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585,2018-04-10,BRKM5,BRASKEM,7898,2018,6297.00,4820,2018-03-29 05:11:07,2017-12-31,797.26,17569.80,9359.06,0.27,38204.58,55774.38,0.17
666,2019-04-10,BRKM5,BRASKEM,8495,2019,8476.80,4820,2019-03-13 19:01:57,2018-12-31,797.22,17259.68,8303.94,0.24,38282.43,55542.12,0.15
969,2022-04-11,BRKM5,BRASKEM,20793,2022,13829.07,4820,2022-03-16 19:43:41,2021-12-31,797.21,22861.69,26043.55,0.59,36137.43,58999.13,0.44


In [23]:
# O Livro fala em empresas com pelos menos USD 50 milhões de valor de mercado
# Remover empresas com menos de R$ 250 milhões de valor de mercado
df_magic.query('market_cap > 250', inplace=True)
# Remover ações com baixíssima liquidez no dia
df_magic.query('totneg_sma30 > 100', inplace=True)
# As colunas 'totneg' e 'totneg_sma30' não são mais necessárias
df_magic.drop(columns=['totneg', 'totneg_sma30'], inplace=True)
print('Number of companies available for backtesting', df_magic.codneg.nunique())
df_magic

Number of companies available for backtesting 191


Unnamed: 0,datneg,codneg,nomres,year,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield
0,2011-04-11,AEDU3,ANHANGUERA,2011,18961,2011-03-30 00:09:51,2010-12-31,145690.26,-571.50,157.01,0.07,5537686.82,5537115.32,0.00
1,2011-04-11,ALPA4,ALPARGATAS,2011,10456,2011-03-30 16:55:23,2010-12-31,353.46,-358.70,324.23,0.23,4011.73,3653.02,0.09
2,2011-04-11,AMAR3,LOJAS MARISA,2011,22055,2011-02-18 15:38:17,2010-12-31,184.55,37.21,285.95,0.22,4981.03,5018.24,0.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1090,2022-04-11,VVEO3,VIVEO,2022,25682,2022-03-30 18:25:34,2021-12-31,286.12,-76.53,492.30,0.16,4506.44,4429.92,0.11
1091,2022-04-11,WEGE3,WEG,2022,5410,2022-02-16 07:05:37,2021-12-31,4197.32,-1428.02,4158.34,0.22,138427.55,136999.53,0.03
1093,2022-04-11,YDUQ3,YDUQS PART,2022,21016,2022-03-15 18:09:20,2021-12-31,309.09,3692.84,546.59,0.05,6092.14,9784.98,0.06


In [24]:
df_magic['rank_roic'] = (df_magic
    .groupby(by=['year'])['roic']
    .rank(method='dense', ascending=False)
)
df_magic['rank_ey'] = (df_magic
    .groupby(by=['year'])['earnings_yield']
    .rank(method='dense', ascending=False)
)
df_magic['ranks_sum'] = df_magic['rank_roic'] + df_magic['rank_ey']
df_magic['rank_final'] = (df_magic
    .groupby(by=['year'])['ranks_sum']
    .rank(method='first', ascending=True)
)
cols_integer = ['rank_roic', 'rank_ey', 'ranks_sum', 'rank_final']
df_magic[cols_integer] = df_magic[cols_integer].astype(int)
df_magic

Unnamed: 0,datneg,codneg,nomres,year,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_roic,rank_ey,ranks_sum,rank_final
0,2011-04-11,AEDU3,ANHANGUERA,2011,18961,2011-03-30 00:09:51,2010-12-31,145690.26,-571.50,157.01,0.07,5537686.82,5537115.32,0.00,51,68,119,62
1,2011-04-11,ALPA4,ALPARGATAS,2011,10456,2011-03-30 16:55:23,2010-12-31,353.46,-358.70,324.23,0.23,4011.73,3653.02,0.09,7,33,40,13
2,2011-04-11,AMAR3,LOJAS MARISA,2011,22055,2011-02-18 15:38:17,2010-12-31,184.55,37.21,285.95,0.22,4981.03,5018.24,0.06,9,53,62,32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1090,2022-04-11,VVEO3,VIVEO,2022,25682,2022-03-30 18:25:34,2021-12-31,286.12,-76.53,492.30,0.16,4506.44,4429.92,0.11,43,55,98,41
1091,2022-04-11,WEGE3,WEG,2022,5410,2022-02-16 07:05:37,2021-12-31,4197.32,-1428.02,4158.34,0.22,138427.55,136999.53,0.03,28,113,141,71
1093,2022-04-11,YDUQ3,YDUQS PART,2022,21016,2022-03-15 18:09:20,2021-12-31,309.09,3692.84,546.59,0.05,6092.14,9784.98,0.06,103,89,192,106


In [25]:
df_magic.query('year == 2012').sort_values('rank_final')

Unnamed: 0,datneg,codneg,nomres,year,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_roic,rank_ey,ranks_sum,rank_final
95,2012-04-09,BRAP4,BRADESPAR,2012,18724,2012-03-16 19:07:53,2011-12-31,349.55,582.93,2167.70,0.85,11874.15,12457.08,0.17,3,3,6,1
170,2012-04-09,VALE5,VALE,2012,4170,2012-02-16 11:24:28,2011-12-31,5365.31,39166.50,53087.42,0.20,217616.77,256783.27,0.21,11,2,13,2
112,2012-04-09,FHER3,FER HERINGER,2012,20621,2012-03-14 08:59:20,2011-12-31,48.47,611.98,306.05,0.19,640.30,1252.28,0.24,15,1,16,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124,2012-04-09,IMCH3,IMC HOLDINGS,2012,22438,2012-03-14 21:14:16,2011-12-31,83.68,104.34,33.84,0.02,1394.12,1498.46,0.02,68,68,136,68
149,2012-04-09,RADL3,RAIADROGASIL,2012,5258,2012-03-26 20:52:19,2011-12-31,330.39,-177.66,79.56,0.03,5900.69,5723.03,0.01,66,70,136,69
113,2012-04-09,FIBR3,FIBRIA,2012,12793,2012-02-01 20:40:29,2011-12-31,467.59,9264.58,377.66,0.01,7004.53,16269.10,0.02,70,67,137,70


In [26]:
df_magic.sort_values(by=['year', 'rank_final'], inplace=True)
# Renomear a coluna 'datneg'
df_magic.rename(columns={'datneg': 'balancing_on'}, inplace=True)
# A coluna 'year' já está implícita na coluna 'balancing_on' e não será mais usada
df_magic.drop(columns=['year'], inplace=True)
# Remover colunas de cálculo intemediário
df_magic.drop(columns=['rank_roic', 'rank_ey', 'ranks_sum'], inplace=True)
df_magic

Unnamed: 0,balancing_on,codneg,nomres,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_final
21,2011-04-11,ETER3,ETERNIT,5762,2011-03-11 11:18:46,2010-12-31,89.50,-56.72,123.66,0.23,987.18,930.47,0.13,1
78,2011-04-11,TOTS3,TOTVS,19992,2011-01-31 19:05:59,2010-12-31,31.46,179.42,211.67,0.17,994.10,1173.53,0.18,2
66,2011-04-11,RAPT4,RANDON PART,14109,2011-03-16 07:37:57,2010-12-31,243.78,69.82,450.23,0.18,2913.23,2983.05,0.15,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
976,2022-04-11,COGN3,COGNA ON,17973,2022-03-24 21:24:09,2021-12-31,1876.61,5867.08,78.40,0.00,5048.07,10915.15,0.01,133
1020,2022-04-11,LWSA3,LOCAWEB,24910,2022-03-29 15:08:01,2021-12-31,589.58,-1480.19,8.53,0.00,5052.71,3572.52,0.00,134
958,2022-04-11,AMAR3,LOJAS MARISA,22055,2022-03-16 19:26:20,2021-12-31,261.67,1201.01,4.80,0.00,761.45,1962.46,0.00,135


In [27]:
df_magic.sort_values('rank_final')

Unnamed: 0,balancing_on,codneg,nomres,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_final
21,2011-04-11,ETER3,ETERNIT,5762,2011-03-11 11:18:46,2010-12-31,89.50,-56.72,123.66,0.23,987.18,930.47,0.13,1
185,2013-04-10,CSAN3,COSAN,19836,2012-05-31 00:22:35,2012-03-31,407.21,3397.91,4229.14,0.22,18491.59,21889.50,0.19,1
306,2014-04-10,MGLU3,MAGAZ LUIZA,22470,2014-02-24 07:35:59,2013-12-31,186.49,548.69,374.97,0.25,1333.43,1882.12,0.20,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
976,2022-04-11,COGN3,COGNA ON,17973,2022-03-24 21:24:09,2021-12-31,1876.61,5867.08,78.40,0.00,5048.07,10915.15,0.01,133
1020,2022-04-11,LWSA3,LOCAWEB,24910,2022-03-29 15:08:01,2021-12-31,589.58,-1480.19,8.53,0.00,5052.71,3572.52,0.00,134
958,2022-04-11,AMAR3,LOJAS MARISA,22055,2022-03-16 19:26:20,2021-12-31,261.67,1201.01,4.80,0.00,761.45,1962.46,0.00,135


In [28]:
# Verificar se a Petrobras foi cortada do ranking
df_magic.query('codneg == "PETR4"')

Unnamed: 0,balancing_on,codneg,nomres,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_final


In [29]:
# Verificar o ranking da Braskem antes do corte para teste
df_magic.query('codneg == "BRKM5"')

Unnamed: 0,balancing_on,codneg,nomres,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_final
11,2011-04-11,BRKM5,BRASKEM,4820,2011-03-17 09:14:46,2010-12-31,801.66,9867.90,3214.96,0.11,18085.56,27953.46,0.12,24
97,2012-04-09,BRKM5,BRASKEM,4820,2012-03-26 20:08:37,2011-12-31,801.66,12006.80,1929.90,0.06,11560.01,23566.81,0.08,46
183,2013-04-10,BRKM5,BRASKEM,4820,2013-02-07 10:13:32,2012-12-31,797.27,14051.87,1538.60,0.04,12453.28,26505.15,0.06,59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585,2018-04-10,BRKM5,BRASKEM,4820,2018-03-29 05:11:07,2017-12-31,797.26,17569.80,9359.06,0.27,38204.58,55774.38,0.17,1
666,2019-04-10,BRKM5,BRASKEM,4820,2019-03-13 19:01:57,2018-12-31,797.22,17259.68,8303.94,0.24,38282.43,55542.12,0.15,2
969,2022-04-11,BRKM5,BRASKEM,4820,2022-03-16 19:43:41,2021-12-31,797.21,22861.69,26043.55,0.59,36137.43,58999.13,0.44,4


In [30]:
# Select only the first 30 companies in the magic rank for each year
df_magic.query('rank_final <= 30', inplace=True)
df_magic.reset_index(drop=True, inplace=True)
print('Number of selected companies for backtesting', df_magic.codneg.nunique())
df_magic

Number of selected companies for backtesting 110


Unnamed: 0,balancing_on,codneg,nomres,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_final
0,2011-04-11,ETER3,ETERNIT,5762,2011-03-11 11:18:46,2010-12-31,89.50,-56.72,123.66,0.23,987.18,930.47,0.13,1
1,2011-04-11,TOTS3,TOTVS,19992,2011-01-31 19:05:59,2010-12-31,31.46,179.42,211.67,0.17,994.10,1173.53,0.18,2
2,2011-04-11,RAPT4,RANDON PART,14109,2011-03-16 07:37:57,2010-12-31,243.78,69.82,450.23,0.18,2913.23,2983.05,0.15,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357,2022-04-11,JALL3,JALLESMACHAD,25496,2021-06-23 08:15:11,2021-03-31,294.70,707.22,519.71,0.20,2890.98,3598.19,0.14,28
358,2022-04-11,RAPT4,RANDON PART,14109,2022-02-24 19:36:51,2021-12-31,329.33,1812.93,1100.52,0.14,3392.11,5205.04,0.21,29
359,2022-04-11,MLAS3,MULTILASER,26034,2022-03-24 23:45:31,2021-12-31,820.54,-362.41,778.48,0.14,4824.77,4462.36,0.17,30


In [31]:
# Save file
df_magic.to_csv('../data/magic_stocks.csv', index=False)

In [32]:
# Test file
pd.read_csv('../data/magic_stocks.csv')

Unnamed: 0,balancing_on,codneg,nomres,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_final
0,2011-04-11,ETER3,ETERNIT,5762,2011-03-11 11:18:46,2010-12-31,89.50,-56.72,123.66,0.23,987.18,930.47,0.13,1
1,2011-04-11,TOTS3,TOTVS,19992,2011-01-31 19:05:59,2010-12-31,31.46,179.42,211.67,0.17,994.10,1173.53,0.18,2
2,2011-04-11,RAPT4,RANDON PART,14109,2011-03-16 07:37:57,2010-12-31,243.78,69.82,450.23,0.18,2913.23,2983.05,0.15,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357,2022-04-11,JALL3,JALLESMACHAD,25496,2021-06-23 08:15:11,2021-03-31,294.70,707.22,519.71,0.20,2890.98,3598.19,0.14,28
358,2022-04-11,RAPT4,RANDON PART,14109,2022-02-24 19:36:51,2021-12-31,329.33,1812.93,1100.52,0.14,3392.11,5205.04,0.21,29
359,2022-04-11,MLAS3,MULTILASER,26034,2022-03-24 23:45:31,2021-12-31,820.54,-362.41,778.48,0.14,4824.77,4462.36,0.17,30


In [33]:
# Some of the stocks will be selected in multiple periods
print(df_magic.codneg.value_counts().head(5).to_markdown())

|       |   codneg |
|:------|---------:|
| BEEF3 |        9 |
| IGTA3 |        9 |
| GRND3 |        9 |
| ODPV3 |        8 |
| JHSF3 |        8 |


In [34]:
# Check 2022 data
df_22 = df_magic.query('balancing_on > "2022-01-01"')[['codneg', 'nomres', 'roic', 'earnings_yield']].reset_index(drop=True)
df_22.index += 1
df_22[['roic', 'earnings_yield']] = df_22[['roic', 'earnings_yield']].round(2)
print(df_22.to_markdown(mode='github'))

|    | codneg   | nomres       |   roic |   earnings_yield |
|---:|:---------|:-------------|-------:|-----------------:|
|  1 | BRAP4    | BRADESPAR    |  33.13 |             0.64 |
|  2 | ENAT3    | ENAUTA PART  |   0.73 |             0.6  |
|  3 | CMIN3    | CSNMINERACAO |   0.97 |             0.37 |
|  4 | BRKM5    | BRASKEM      |   0.59 |             0.44 |
|  5 | USIM5    | USIMINAS     |   0.34 |             0.74 |
|  6 | GOAU4    | GERDAU MET   |   0.3  |             1.09 |
|  7 | TASA4    | TAURUS ARMAS |   0.54 |             0.29 |
|  8 | VALE3    | VALE         |   0.47 |             0.29 |
|  9 | GGBR4    | GERDAU       |   0.3  |             0.36 |
| 10 | MRFG3    | MARFRIG      |   0.3  |             0.33 |
| 11 | CURY3    | CURY S/A     |   0.72 |             0.2  |
| 12 | LEVE3    | METAL LEVE   |   0.29 |             0.25 |
| 13 | ALLD3    | ALLIED       |   0.23 |             0.39 |
| 14 | JHSF3    | JHSF PART    |   0.37 |             0.2  |
| 15 | DEXP3    | DEXXOS

In [35]:
df_magic.query('codneg == "PRIO3"')

Unnamed: 0,balancing_on,codneg,nomres,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_final
180,2017-04-10,PRIO3,PETRORIO,2017-03-28 17:48:01,2016-12-31,13.19,-539.18,259.08,0.88,553.09,1092.27,0.24,1
271,2020-04-09,PRIO3,PETRORIO,2020-02-22 00:56:46,2019-12-31,143.19,1511.96,924.34,0.27,3800.15,2288.19,0.4,2
