df_fin -> input dataframe com os dados financeiros das empresas

df_cod -> input dataframe com os códigos CVM dos ativos

df_magic -> output dataframe que irá armazenar o resultado com as magic stocks

In [1]:
from pathlib import Path
import pandas as pd
# Mostrar floats com duas casas decimas
pd.set_option('display.float_format',  lambda x: '%.2f' % x)
pd.options.display.max_colwidth = 20
pd.options.display.max_columns = 20
pd.options.display.max_rows = 6

In [2]:
# Carregar base NÃO AJUSTADA do S3 na seguinte raíz: s3://aq-dl/HistoricalQuotations/
# Caminho local:
DATA_FOLDER = Path("/mnt/aq_disk/data/HistoricalQuotations/interim")
# BASE_ADJ = DATA_FOLDER / "base_adj.feather"
DATASET1 = DATA_FOLDER / "dataset_95-21.feather"
DATASET2 = DATA_FOLDER / "dataset_22.feather"
cols = ['datneg', 'codneg', 'nomres', 'especi', 'codbdi', 'tpmerc', 'preult', 'premed', 'totneg', 'voltot']
df_magic = (pd
    .concat([pd.read_feather(DATASET1), pd.read_feather(DATASET2)], ignore_index=True)
    [cols]
    .query('tpmerc == 10')
)
df_magic

Unnamed: 0,datneg,codneg,nomres,especi,codbdi,tpmerc,preult,premed,totneg,voltot
0,2020-02-07,A1AP34,ADVANCE AUTO,DRN,2,10,143.12,143.12,2,6139848.00
1,2020-02-10,A1AP34,ADVANCE AUTO,DRN,2,10,142.27,142.27,1,512172.00
2,2020-02-13,A1AP34,ADVANCE AUTO,DRN,2,10,147.37,147.37,1,235792.00
...,...,...,...,...,...,...,...,...,...,...
11045279,2022-07-12,SLED11,SARAIVA LIVR,BNS N2,22,10,1.50,1.50,1,150.00
11045280,2022-07-12,TASA17,TAURUS ARMAS,BNS PRE N2,22,10,10.00,9.99,7,42981.00
11045281,2022-07-12,VLID11,VALID,BNS ORD NM,22,10,0.68,0.67,14,17280.00


#### Filtrar:
1. Cotações após 2011
2. Lote padrão (codbdi == 2) -> remover empresas em concordata, recuperação judicial, etc. da entrada na seleção (não da saída!)
3. Ações ON, PN ou PNA

In [3]:
df_magic.query('\
    codbdi == 2 and \
    datneg >= "2011.01.01" and \
    especi.str.contains("ON |PN |PNA")'
    , inplace=True
)
df_magic.reset_index(drop=True, inplace=True)
df_magic

Unnamed: 0,datneg,codneg,nomres,especi,codbdi,tpmerc,preult,premed,totneg,voltot
0,2016-10-28,AALR3,ALLIAR,ON NM,2,10,19.20,19.28,4460,122334647.00
1,2016-10-31,AALR3,ALLIAR,ON NM,2,10,18.06,18.17,4238,45857231.00
2,2016-11-01,AALR3,ALLIAR,ON NM,2,10,17.90,17.74,2072,17676981.00
...,...,...,...,...,...,...,...,...,...,...
611920,2022-07-07,QUAL3,QUALICORP,ON NM,2,10,12.47,12.54,7508,21304784.00
611921,2022-07-07,RADL3,RAIADROGASIL,ON EJ NM,2,10,20.00,19.89,12391,121884011.00
611922,2022-07-07,RAIL3,RUMO S.A.,ON NM,2,10,16.12,16.24,16254,177053334.00


In [4]:
# Manter somente as colunas que serão usadas para fazer o corte nas datas
cols = ['datneg', 'codneg', 'nomres', 'premed', 'totneg']
df_magic = df_magic.loc[:, cols]
# Inserir o cód. dos emissores -> 4 primeiros caracteres do código de negociação
df_magic['codemi'] = df_magic['codneg'].str[0:4]
# Inserir o dia do ano e o ano para a operação de corte das ações
df_magic['day_year'] = df_magic['datneg'].dt.day_of_year
df_magic['year'] = df_magic['datneg'].dt.year
# Ordenar o dataframe por ativo e data
df_magic.sort_values(by=['codneg', 'datneg'], inplace=True)
print('Number of companies available for backtesting', df_magic.codemi.nunique())
df_magic

Number of companies available for backtesting 511


Unnamed: 0,datneg,codneg,nomres,premed,totneg,codemi,day_year,year
0,2016-10-28,AALR3,ALLIAR,19.28,4460,AALR,302,2016
1,2016-10-31,AALR3,ALLIAR,18.17,4238,AALR,305,2016
2,2016-11-01,AALR3,ALLIAR,17.74,2072,AALR,306,2016
...,...,...,...,...,...,...,...,...
610719,2022-07-08,YDUQ3,YDUQS PART,14.93,12980,YDUQ,189,2022
611428,2022-07-11,YDUQ3,YDUQS PART,14.41,9675,YDUQ,192,2022
611887,2022-07-12,YDUQ3,YDUQS PART,14.39,10528,YDUQ,193,2022


In [5]:
# Criar coluna com a média móvel de 30 dias do número de negociações de cada ativo
df_magic['totneg_sma30'] = df_magic.groupby('codneg')['totneg'].transform(lambda x: x.rolling(30, 1).mean())
df_magic

Unnamed: 0,datneg,codneg,nomres,premed,totneg,codemi,day_year,year,totneg_sma30
0,2016-10-28,AALR3,ALLIAR,19.28,4460,AALR,302,2016,4460.00
1,2016-10-31,AALR3,ALLIAR,18.17,4238,AALR,305,2016,4349.00
2,2016-11-01,AALR3,ALLIAR,17.74,2072,AALR,306,2016,3590.00
...,...,...,...,...,...,...,...,...,...
610719,2022-07-08,YDUQ3,YDUQS PART,14.93,12980,YDUQ,189,2022,10388.60
611428,2022-07-11,YDUQ3,YDUQS PART,14.41,9675,YDUQ,192,2022,10460.63
611887,2022-07-12,YDUQ3,YDUQS PART,14.39,10528,YDUQ,193,2022,10624.23


In [6]:
# Definir o dia de corte e remover negociações posteriores ao corte
df_magic.query('day_year >= 100', inplace=True)
df_magic.reset_index(drop=True, inplace=True)
df_magic

Unnamed: 0,datneg,codneg,nomres,premed,totneg,codemi,day_year,year,totneg_sma30
0,2016-10-28,AALR3,ALLIAR,19.28,4460,AALR,302,2016,4460.00
1,2016-10-31,AALR3,ALLIAR,18.17,4238,AALR,305,2016,4349.00
2,2016-11-01,AALR3,ALLIAR,17.74,2072,AALR,306,2016,3590.00
...,...,...,...,...,...,...,...,...,...
439716,2022-07-08,YDUQ3,YDUQS PART,14.93,12980,YDUQ,189,2022,10388.60
439717,2022-07-11,YDUQ3,YDUQS PART,14.41,9675,YDUQ,192,2022,10460.63
439718,2022-07-12,YDUQ3,YDUQS PART,14.39,10528,YDUQ,193,2022,10624.23


In [7]:
# Calcular o dia mais próximo de 100 em cada ano
df_balancing = df_magic.groupby(by=['year'])['day_year'].min().reset_index()
df_balancing

Unnamed: 0,year,day_year
0,2011,101
1,2012,100
2,2013,100
...,...,...
9,2020,100
10,2021,102
11,2022,101


In [8]:
# Filtrar o dataframe de ações com dados somente nos dias de corte
# Usar 'year' e 'day_year' como chaves para a operação de união entre os dois dataframes
df_magic = df_magic.merge(
    right=df_balancing, how='inner', on=['year', 'day_year']
)
# A coluna 'day_year' não será mais usada
df_magic.drop(columns=['day_year'], inplace=True)
print('Number of companies available for backtesting:', df_magic.codemi.nunique())
df_magic

Number of companies available for backtesting: 395


Unnamed: 0,datneg,codneg,nomres,premed,totneg,codemi,year,totneg_sma30
0,2017-04-10,AALR3,ALLIAR,15.36,315,AALR,2017,456.73
1,2017-04-10,ABCB4,ABC BRASIL,18.36,1005,ABCB,2017,2294.30
2,2017-04-10,AGRO3,BRASILAGRO,12.29,199,AGRO,2017,330.10
...,...,...,...,...,...,...,...,...
2587,2016-04-11,VLID3,VALID,31.31,2720,VLID,2016,2778.93
2588,2016-04-11,VVAR3,VIAVAREJO,2.38,1,VVAR,2016,8.40
2589,2016-04-11,WEGE3,WEG,13.55,9383,WEGE,2016,10665.47


In [9]:
# Excluir empresas financeiras e prestadoras de serviços públicos (utilities) conforme
# lista extraída do seguinte arquivo da B3, sem versionamento:
# https://bvmf.bmfbovespa.com.br/InstDados/InformacoesEmpresas/ClassifSetorial.zip
excluded_companies = pd.read_csv('../data/external/excluded_companies.csv')
excluded_companies = excluded_companies['company_code'].to_list()
df_magic.query('codemi != @excluded_companies', inplace=True)
print('Number of companies available for backtesting', df_magic.codemi.nunique())
df_magic

Number of companies available for backtesting 323


Unnamed: 0,datneg,codneg,nomres,premed,totneg,codemi,year,totneg_sma30
0,2017-04-10,AALR3,ALLIAR,15.36,315,AALR,2017,456.73
2,2017-04-10,AGRO3,BRASILAGRO,12.29,199,AGRO,2017,330.10
3,2017-04-10,ALPA3,ALPARGATAS,10.59,3,ALPA,2017,4.97
...,...,...,...,...,...,...,...,...
2587,2016-04-11,VLID3,VALID,31.31,2720,VLID,2016,2778.93
2588,2016-04-11,VVAR3,VIAVAREJO,2.38,1,VVAR,2016,8.40
2589,2016-04-11,WEGE3,WEG,13.55,9383,WEGE,2016,10665.47


In [10]:
# Carregar dataframe com o código CVM das empresas listadas
df_cod = pd.read_pickle('/mnt/aq_disk/data/AQ/cod_emissor.pkl')
df_cod

Unnamed: 0,codcvm,cnpj,densoc,situac,codemi
0,60,18451005000104,ACOPALMA CIA IND...,CANCELADA,ZWVZ
1,94,92693019000189,PANATLANTICA SA,ATIVO,PATI
2,108,60664810000174,AÇOS VILLARES SA,CANCELADA,AVIL
...,...,...,...,...,...
1766,26824,43335774000186,TRAVESSIA SECURI...,ATIVO,TMER
1767,26832,38482780000126,ANEMUS WIND HOLD...,ATIVO,ANEM
1768,26840,44841035000129,SAP SECURITIZADO...,ATIVO,SAPS


In [11]:
# A chave da união será o cód. do emissor (codemi)
# Somente o cód. CVM (codcvm) será inserido na união -> Remover colunas que não serão 
# usadas na operação de merge
df_cod = df_cod[['codcvm', 'codemi']].copy()
df_cod

Unnamed: 0,codcvm,codemi
0,60,ZWVZ
1,94,PATI
2,108,AVIL
...,...,...
1766,26824,TMER
1767,26832,ANEM
1768,26840,SAPS


In [12]:
# Criar um set com a lista de ativos antes da operação de união dos dataframes
s0 = set(df_magic.codemi.unique())
# Obter os códigos de emissão dos ativos unindo os dataframes 
df_magic = df_magic.merge(right=df_cod, how='inner', on='codemi')
df_magic.reset_index(drop=True, inplace=True)
print('Number of companies available for backtesting:', df_magic.codemi.nunique())
df_magic

Number of companies available for backtesting: 287


Unnamed: 0,datneg,codneg,nomres,premed,totneg,codemi,year,totneg_sma30,codcvm
0,2017-04-10,AALR3,ALLIAR,15.36,315,AALR,2017,456.73,24058
1,2018-04-10,AALR3,ALLIAR,15.17,175,AALR,2018,557.40,24058
2,2019-04-10,AALR3,ALLIAR,14.64,156,AALR,2019,489.83,24058
...,...,...,...,...,...,...,...,...,...
1869,2014-04-10,CGRA4,GRAZZIOTIN,17.35,6,CGRA,2014,15.97,4537
1870,2015-04-10,RUMO3,RUMO LOG,1.65,14416,RUMO,2015,10434.43,23450
1871,2016-04-11,RUMO3,RUMO LOG,3.31,26340,RUMO,2016,6106.43,23450


In [13]:
# Verificar a diferença entre os dois sets
s1 = set(df_magic.codemi.unique())
print('Núm. de empresas cujo código não foi localizado', len(s0-s1))
# 34 empresas não tiveram seu código localizado. Analisando os dados, são empresas cujo
# código de listagem foi alterado: BVMF->B3SA, VVAR->VIIA, etc
print(s0 - s1)

Núm. de empresas cujo código não foi localizado 36
{'CTAX', 'ESTC', 'ECOD', 'ENMA', 'CELP', 'BPNM', 'BRIN', 'IDNT', 'DROG', 'TIBR', 'VVAR', 'HRTP', 'VAGR', 'MPXE', 'RNAR', 'PRTX', 'SSBR', 'DTEX', 'CNTO', 'LLXL', 'LIQO', 'INPR', 'TBLE', 'SNSL', 'KROT', 'QGEP', 'BRDT', 'OHLB', 'CCPR', 'PARC', 'BVMF', 'ALLL', 'BBRK', 'ABRE', 'FJTA', 'BTOW'}


In [14]:
# Carregar dataframe com os dados financeiros das empresas
df_fin = (pd
    .read_csv(
        '../data/magic_financials.csv',
        parse_dates=['doc_env', 'per_ini', 'per_fim']
        )
)
# Renomear coluna com o código CVM para coincidir com as outras bases
df_fin.rename(columns={'cia_id': 'codcvm'}, inplace=True)
# As colunas 'per_ini' e 'cia_nome' não serão usadas
df_fin.drop(columns=['per_ini', 'cia_nome'], inplace=True)
# year = ano em que a informação será usada -> ano seguinte ao fim do período
df_fin['year'] = df_fin.per_fim.dt.year + 1
df_fin

Unnamed: 0,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,year
0,94,2011-03-31 10:16:48,2010-12-31,8856000.00,-18981000.00,24777000.00,0.20,2011
1,94,2011-04-01 17:31:56,2010-12-31,8856000.00,-18980000.00,21093000.00,0.17,2011
2,94,2012-03-20 15:20:37,2011-12-31,9480000.00,-46182000.00,8720000.00,0.08,2012
...,...,...,...,...,...,...,...,...
2913,26700,2021-12-16 16:23:45,2020-12-31,855863854.00,607732000.00,1086628000.00,0.32,2021
2914,26700,2022-03-29 18:27:45,2021-12-31,858714812.00,1659228000.00,1449802000.00,0.29,2022
2915,26786,2022-02-01 19:43:03,2021-12-31,1269683.00,-88617000.00,137222000.00,0.85,2022


In [15]:
# Incluir os dados contábeis em 'df_magic' 
df_magic = df_magic.merge(right=df_fin, how='inner', on=['year', 'codcvm'])
df_magic

Unnamed: 0,datneg,codneg,nomres,premed,totneg,codemi,year,totneg_sma30,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic
0,2017-04-10,AALR3,ALLIAR,15.36,315,AALR,2017,456.73,24058,2017-03-22 23:42:13,2016-12-31,114898767.00,308503000.00,100612000.00,0.07
1,2018-04-10,AALR3,ALLIAR,15.17,175,AALR,2018,557.40,24058,2018-03-28 20:20:31,2017-12-31,118292816.00,507645000.00,70337000.00,0.04
2,2019-04-10,AALR3,ALLIAR,14.64,156,AALR,2019,489.83,24058,2019-03-19 19:58:58,2018-12-31,118292816.00,532346000.00,137286000.00,0.07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1769,2014-04-10,CGRA4,GRAZZIOTIN,17.35,6,CGRA,2014,15.97,4537,2014-02-27 08:09:24,2013-12-31,21815945.00,-87381310.00,53334519.00,0.20
1770,2015-04-10,RUMO3,RUMO LOG,1.65,14416,RUMO,2015,10434.43,23450,2015-03-03 20:02:32,2014-12-31,1026488214.00,699234000.00,206689000.00,0.10
1771,2016-04-11,RUMO3,RUMO LOG,3.31,26340,RUMO,2016,6106.43,23450,2016-02-25 20:38:25,2015-12-31,299015898.00,8003920000.00,1051477000.00,0.09


In [16]:
# Converter os dados contábeis para milhões para facilitar verificações posteriores
accounting_columns = ['shares_outstanding', 'net_debt', 'ebit']
df_magic[accounting_columns] = df_magic[accounting_columns] / 1_000_000
df_magic

Unnamed: 0,datneg,codneg,nomres,premed,totneg,codemi,year,totneg_sma30,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic
0,2017-04-10,AALR3,ALLIAR,15.36,315,AALR,2017,456.73,24058,2017-03-22 23:42:13,2016-12-31,114.90,308.50,100.61,0.07
1,2018-04-10,AALR3,ALLIAR,15.17,175,AALR,2018,557.40,24058,2018-03-28 20:20:31,2017-12-31,118.29,507.64,70.34,0.04
2,2019-04-10,AALR3,ALLIAR,14.64,156,AALR,2019,489.83,24058,2019-03-19 19:58:58,2018-12-31,118.29,532.35,137.29,0.07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1769,2014-04-10,CGRA4,GRAZZIOTIN,17.35,6,CGRA,2014,15.97,4537,2014-02-27 08:09:24,2013-12-31,21.82,-87.38,53.33,0.20
1770,2015-04-10,RUMO3,RUMO LOG,1.65,14416,RUMO,2015,10434.43,23450,2015-03-03 20:02:32,2014-12-31,1026.49,699.23,206.69,0.10
1771,2016-04-11,RUMO3,RUMO LOG,3.31,26340,RUMO,2016,6106.43,23450,2016-02-25 20:38:25,2015-12-31,299.02,8003.92,1051.48,0.09


In [17]:
# Calcular os indicadores que dependem do preço do ação
df_magic['market_cap'] = df_magic['shares_outstanding'] * df_magic['premed']
df_magic['enterprise_value'] = df_magic['market_cap'] - df_magic['net_debt']
df_magic['earnings_yield'] = df_magic['ebit'] / df_magic['enterprise_value']
# A coluna 'premed' não será mais usada
df_magic.drop(columns=['premed'], inplace=True)
df_magic

Unnamed: 0,datneg,codneg,nomres,totneg,codemi,year,totneg_sma30,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield
0,2017-04-10,AALR3,ALLIAR,315,AALR,2017,456.73,24058,2017-03-22 23:42:13,2016-12-31,114.90,308.50,100.61,0.07,1764.85,1456.34,0.07
1,2018-04-10,AALR3,ALLIAR,175,AALR,2018,557.40,24058,2018-03-28 20:20:31,2017-12-31,118.29,507.64,70.34,0.04,1794.50,1286.86,0.05
2,2019-04-10,AALR3,ALLIAR,156,AALR,2019,489.83,24058,2019-03-19 19:58:58,2018-12-31,118.29,532.35,137.29,0.07,1731.81,1199.46,0.11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1769,2014-04-10,CGRA4,GRAZZIOTIN,6,CGRA,2014,15.97,4537,2014-02-27 08:09:24,2013-12-31,21.82,-87.38,53.33,0.20,378.51,465.89,0.11
1770,2015-04-10,RUMO3,RUMO LOG,14416,RUMO,2015,10434.43,23450,2015-03-03 20:02:32,2014-12-31,1026.49,699.23,206.69,0.10,1693.71,994.47,0.21
1771,2016-04-11,RUMO3,RUMO LOG,26340,RUMO,2016,6106.43,23450,2016-02-25 20:38:25,2015-12-31,299.02,8003.92,1051.48,0.09,989.74,-7014.18,-0.15


In [18]:
# Existem algumas empresas, como a COGN3, com baixo valor de mercado e excesso de
# dívida líquida resultando em um EV negativo e, consequentemente, um EY (EBIT/EV) 
# negativo ou distorcido (tendendo a infinito). Apesar do lívro não deixar isso claro,
# iremos remover essas empresas da seleção.
df_magic.query('enterprise_value > 100', inplace=True)
print('Number of companies available for backtesting', df_magic.codneg.nunique())

Number of companies available for backtesting 229


In [19]:
# Remover revisões de DFPs publicadas no mesmo dia ou posteriores ao corte
# No livro, o corte é de uma semana
df_magic.query('doc_env.dt.date < datneg', inplace=True)
df_magic

Unnamed: 0,datneg,codneg,nomres,totneg,codemi,year,totneg_sma30,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield
0,2017-04-10,AALR3,ALLIAR,315,AALR,2017,456.73,24058,2017-03-22 23:42:13,2016-12-31,114.90,308.50,100.61,0.07,1764.85,1456.34,0.07
1,2018-04-10,AALR3,ALLIAR,175,AALR,2018,557.40,24058,2018-03-28 20:20:31,2017-12-31,118.29,507.64,70.34,0.04,1794.50,1286.86,0.05
2,2019-04-10,AALR3,ALLIAR,156,AALR,2019,489.83,24058,2019-03-19 19:58:58,2018-12-31,118.29,532.35,137.29,0.07,1731.81,1199.46,0.11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1768,2014-04-10,CGRA3,GRAZZIOTIN,2,CGRA,2014,6.76,4537,2014-02-27 08:09:24,2013-12-31,21.82,-87.38,53.33,0.20,376.98,464.36,0.11
1769,2014-04-10,CGRA4,GRAZZIOTIN,6,CGRA,2014,15.97,4537,2014-02-27 08:09:24,2013-12-31,21.82,-87.38,53.33,0.20,378.51,465.89,0.11
1770,2015-04-10,RUMO3,RUMO LOG,14416,RUMO,2015,10434.43,23450,2015-03-03 20:02:32,2014-12-31,1026.49,699.23,206.69,0.10,1693.71,994.47,0.21


In [20]:
# Manter somente a DFP mais recente ao corte para cada ativo
df_magic.sort_values('doc_env', inplace=True)
df_magic.drop_duplicates(subset=['codneg', 'year'], keep='last', inplace=True)
df_magic

Unnamed: 0,datneg,codneg,nomres,totneg,codemi,year,totneg_sma30,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield
1277,2011-04-11,TOTS3,TOTVS,244,TOTS,2011,485.33,19992,2011-01-31 19:05:59,2010-12-31,31.46,179.42,211.67,0.26,994.10,814.68,0.26
732,2011-04-11,LREN3,LOJAS RENNER,3604,LREN,2011,3465.70,8133,2011-02-16 19:53:52,2010-12-31,122.35,-27.16,404.47,0.41,6682.70,6709.86,0.06
67,2011-04-11,AMAR3,LOJAS MARISA,183,AMAR,2011,407.27,22055,2011-02-18 15:38:17,2010-12-31,184.55,37.21,285.95,0.33,4981.03,4943.82,0.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319,2022-04-11,EUCA4,EUCATEX,231,EUCA,2022,353.73,5770,2022-03-30 20:18:52,2021-12-31,92.62,380.81,549.72,0.24,928.97,548.16,1.00
318,2022-04-11,EUCA3,EUCATEX,15,EUCA,2022,22.27,5770,2022-03-30 20:18:52,2021-12-31,92.62,380.81,549.72,0.24,1497.65,1116.84,0.49
1650,2022-04-11,KRSA3,KORA SAUDE,201,KRSA,2022,802.60,25879,2022-03-30 23:29:03,2021-12-31,767.17,1290.38,132.81,0.05,2631.41,1341.03,0.10


In [21]:
# Remover o ativo menos líquido da empresa (coluna 'totneg_sma30')
df_magic.sort_values(by=['year', 'codemi', 'totneg_sma30'], inplace=True)
df_magic.drop_duplicates(
    subset=['codemi', 'year'], keep='last', inplace=True, ignore_index=True
)
# A coluna 'codemi' não será mais necessária
df_magic.drop(columns='codemi', inplace=True)
print('Number of companies available for backtesting', df_magic.codneg.nunique())
df_magic

Number of companies available for backtesting 211


Unnamed: 0,datneg,codneg,nomres,totneg,year,totneg_sma30,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield
0,2011-04-11,AEDU3,ANHANGUERA,978,2011,1031.77,18961,2011-03-30 00:09:51,2010-12-31,145690.26,-571.50,157.01,0.11,5537686.82,5538258.32,0.00
1,2011-04-11,ALPA4,ALPARGATAS,158,2011,210.80,10456,2011-03-30 16:55:23,2010-12-31,353.46,-358.70,324.23,0.33,4011.73,4370.43,0.07
2,2011-04-11,ALSC3,ALIANSCE,102,2011,274.13,21300,2011-03-29 11:28:24,2010-12-31,139.47,-141.72,99.44,0.09,1941.38,2083.10,0.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
958,2022-04-11,WEGE3,WEG,25960,2022,28835.07,5410,2022-02-16 07:05:37,2021-12-31,4197.32,-1428.02,4158.34,0.33,138427.55,139855.57,0.03
959,2022-04-11,WLMM4,WLM IND COM,14,2022,11.37,11070,2022-03-22 22:18:18,2021-12-31,36.41,-152.00,136.13,0.35,1258.13,1410.13,0.10
960,2022-04-11,YDUQ3,YDUQS PART,10252,2022,14513.70,21016,2022-03-15 18:09:20,2021-12-31,309.09,3692.84,546.59,0.08,6092.14,2399.31,0.23


In [22]:
# Verificar erro no núm de ações na DFP da Braskem de 2012-03-14 09:48:31	
df_magic.query('codneg.str.startswith("BRKM")')

Unnamed: 0,datneg,codneg,nomres,totneg,year,totneg_sma30,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield
11,2011-04-11,BRKM5,BRASKEM,2593,2011,2518.33,4820,2011-03-17 09:14:46,2010-12-31,801.66,9867.9,3214.96,0.16,18085.56,8217.67,0.39
89,2012-04-09,BRKM5,BRASKEM,1670,2012,3455.03,4820,2012-03-14 09:48:31,2011-12-31,801665.62,12006.8,1929.9,0.09,11560018.2,11548011.4,0.0
492,2018-04-10,BRKM5,BRASKEM,7898,2018,6297.0,4820,2018-03-29 05:11:07,2017-12-31,797.26,17569.8,9359.06,0.4,38204.58,20634.79,0.45
566,2019-04-10,BRKM5,BRASKEM,8495,2019,8476.8,4820,2019-03-13 19:01:57,2018-12-31,797.22,17259.68,8303.94,0.36,38282.43,21022.75,0.39
845,2022-04-11,BRKM5,BRASKEM,20793,2022,13829.07,4820,2022-03-16 19:43:41,2021-12-31,797.21,22861.69,26043.55,0.9,36137.43,13275.74,1.96


In [23]:
# O Livro fala em empresas com pelos menos USD 50 milhões de valor de mercado
# Remover empresas com menos de R$ 250 milhões de valor de mercado
df_magic.query('market_cap > 250', inplace=True)
# Remover ações com baixíssima liquidez no dia
df_magic.query('totneg_sma30 > 100', inplace=True)
# As colunas 'totneg' e 'totneg_sma30' não são mais necessárias
df_magic.drop(columns=['totneg', 'totneg_sma30'], inplace=True)
print('Number of companies available for backtesting', df_magic.codneg.nunique())
df_magic

Number of companies available for backtesting 192


Unnamed: 0,datneg,codneg,nomres,year,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield
0,2011-04-11,AEDU3,ANHANGUERA,2011,18961,2011-03-30 00:09:51,2010-12-31,145690.26,-571.50,157.01,0.11,5537686.82,5538258.32,0.00
1,2011-04-11,ALPA4,ALPARGATAS,2011,10456,2011-03-30 16:55:23,2010-12-31,353.46,-358.70,324.23,0.33,4011.73,4370.43,0.07
2,2011-04-11,ALSC3,ALIANSCE,2011,21300,2011-03-29 11:28:24,2010-12-31,139.47,-141.72,99.44,0.09,1941.38,2083.10,0.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
957,2022-04-11,VVEO3,VIVEO,2022,25682,2022-03-30 18:25:34,2021-12-31,286.12,-76.53,492.30,0.24,4506.44,4582.97,0.11
958,2022-04-11,WEGE3,WEG,2022,5410,2022-02-16 07:05:37,2021-12-31,4197.32,-1428.02,4158.34,0.33,138427.55,139855.57,0.03
960,2022-04-11,YDUQ3,YDUQS PART,2022,21016,2022-03-15 18:09:20,2021-12-31,309.09,3692.84,546.59,0.08,6092.14,2399.31,0.23


In [24]:
df_magic['rank_roic'] = (df_magic
    .groupby(by=['year'])['roic']
    .rank(method='dense', ascending=False)
)
df_magic['rank_ey'] = (df_magic
    .groupby(by=['year'])['earnings_yield']
    .rank(method='dense', ascending=False)
)
df_magic['ranks_sum'] = df_magic['rank_roic'] + df_magic['rank_ey']
df_magic['rank_final'] = (df_magic
    .groupby(by=['year'])['ranks_sum']
    .rank(method='first', ascending=True)
)
cols_integer = ['rank_roic', 'rank_ey', 'ranks_sum', 'rank_final']
df_magic[cols_integer] = df_magic[cols_integer].astype(int)
df_magic

Unnamed: 0,datneg,codneg,nomres,year,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_roic,rank_ey,ranks_sum,rank_final
0,2011-04-11,AEDU3,ANHANGUERA,2011,18961,2011-03-30 00:09:51,2010-12-31,145690.26,-571.50,157.01,0.11,5537686.82,5538258.32,0.00,48,63,111,59
1,2011-04-11,ALPA4,ALPARGATAS,2011,10456,2011-03-30 16:55:23,2010-12-31,353.46,-358.70,324.23,0.33,4011.73,4370.43,0.07,8,42,50,17
2,2011-04-11,ALSC3,ALIANSCE,2011,21300,2011-03-29 11:28:24,2010-12-31,139.47,-141.72,99.44,0.09,1941.38,2083.10,0.05,56,55,111,60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
957,2022-04-11,VVEO3,VIVEO,2022,25682,2022-03-30 18:25:34,2021-12-31,286.12,-76.53,492.30,0.24,4506.44,4582.97,0.11,36,64,100,47
958,2022-04-11,WEGE3,WEG,2022,5410,2022-02-16 07:05:37,2021-12-31,4197.32,-1428.02,4158.34,0.33,138427.55,139855.57,0.03,21,108,129,67
960,2022-04-11,YDUQ3,YDUQS PART,2022,21016,2022-03-15 18:09:20,2021-12-31,309.09,3692.84,546.59,0.08,6092.14,2399.31,0.23,94,43,137,72


In [25]:
df_magic.query('year == 2012').sort_values('rank_final')

Unnamed: 0,datneg,codneg,nomres,year,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_roic,rank_ey,ranks_sum,rank_final
155,2012-04-09,VALE5,VALE,2012,4170,2012-02-16 11:24:28,2011-12-31,5365.31,39166.50,53087.42,0.29,217616.77,178450.27,0.30,11,8,19,1
143,2012-04-09,SLED4,SARAIVA LIVR,2012,10472,2012-03-22 17:05:21,2011-12-31,28.60,274.13,139.59,0.19,616.24,342.12,0.41,24,5,29,2
87,2012-04-09,BRAP4,BRADESPAR,2012,18724,2012-03-16 19:07:53,2011-12-31,349.55,582.93,2167.70,0.23,11874.15,11291.21,0.19,19,11,30,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78,2012-04-09,AEDU3,ANHANGUERA,2012,18961,2012-03-31 23:15:31,2011-12-31,145.69,292.39,86.10,0.04,3752.98,3460.59,0.02,65,64,129,65
113,2012-04-09,IMCH3,IMC HOLDINGS,2012,22438,2012-03-14 21:14:16,2011-12-31,83.68,104.34,33.84,0.04,1394.12,1289.78,0.03,66,63,129,66
136,2012-04-09,RADL3,RAIADROGASIL,2012,5258,2012-03-26 20:52:19,2011-12-31,330.39,-177.66,79.56,0.04,5900.69,6078.35,0.01,64,65,129,67


In [26]:
df_magic.sort_values(by=['year', 'rank_final'], inplace=True)
# Renomear a coluna 'datneg'
df_magic.rename(columns={'datneg': 'balancing_on'}, inplace=True)
# A coluna 'year' já está implícita na coluna 'balancing_on' e não será mais usada
df_magic.drop(columns=['year'], inplace=True)
# Remover colunas de cálculo intemediário
df_magic.drop(columns=['rank_roic', 'rank_ey', 'ranks_sum'], inplace=True)
df_magic

Unnamed: 0,balancing_on,codneg,nomres,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_final
72,2011-04-11,TOTS3,TOTVS,19992,2011-01-31 19:05:59,2010-12-31,31.46,179.42,211.67,0.26,994.10,814.68,0.26,1
6,2011-04-11,AUTM3,AUTOMETAL,22381,2011-02-28 14:52:34,2010-12-31,94.42,252.11,247.23,0.29,1606.17,1354.06,0.18,2
76,2011-04-11,VALE5,VALE,4170,2011-02-24 23:49:15,2010-12-31,5365.31,30321.40,40442.26,0.28,254959.29,224637.90,0.18,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
855,2022-04-11,DASA3,DASA,19623,2022-03-29 12:56:36,2021-12-31,560.51,4032.79,88.52,0.01,12953.41,8920.62,0.01,122
859,2022-04-11,ELMD3,ELETROMIDIA,25569,2022-03-29 19:03:12,2021-12-31,139.14,28.30,9.11,0.01,2177.62,2149.31,0.00,123
894,2022-04-11,LWSA3,LOCAWEB,24910,2022-03-29 15:08:01,2021-12-31,589.58,-1480.19,8.53,0.01,5052.71,6532.91,0.00,124


In [27]:
df_magic.sort_values('rank_final')

Unnamed: 0,balancing_on,codneg,nomres,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_final
72,2011-04-11,TOTS3,TOTVS,19992,2011-01-31 19:05:59,2010-12-31,31.46,179.42,211.67,0.26,994.10,814.68,0.26,1
316,2015-04-10,CVCB3,CVC BRASIL,23310,2015-02-09 20:27:56,2014-12-31,131.47,-50.50,324.08,0.79,2107.39,2157.89,0.15,1
378,2016-04-11,CVCB3,CVC BRASIL,23310,2016-02-19 10:27:38,2015-12-31,134.33,154.36,384.45,0.47,2510.63,2356.28,0.16,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
855,2022-04-11,DASA3,DASA,19623,2022-03-29 12:56:36,2021-12-31,560.51,4032.79,88.52,0.01,12953.41,8920.62,0.01,122
859,2022-04-11,ELMD3,ELETROMIDIA,25569,2022-03-29 19:03:12,2021-12-31,139.14,28.30,9.11,0.01,2177.62,2149.31,0.00,123
894,2022-04-11,LWSA3,LOCAWEB,24910,2022-03-29 15:08:01,2021-12-31,589.58,-1480.19,8.53,0.01,5052.71,6532.91,0.00,124


In [28]:
# Verificar se a Petrobras foi cortada do ranking
df_magic.query('codneg == "PETR4"')

Unnamed: 0,balancing_on,codneg,nomres,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_final


In [29]:
# Verificar o ranking da Braskem antes do corte para teste
df_magic.query('codneg == "BRKM5"')

Unnamed: 0,balancing_on,codneg,nomres,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_final
11,2011-04-11,BRKM5,BRASKEM,4820,2011-03-17 09:14:46,2010-12-31,801.66,9867.9,3214.96,0.16,18085.56,8217.67,0.39,8
89,2012-04-09,BRKM5,BRASKEM,4820,2012-03-14 09:48:31,2011-12-31,801665.62,12006.8,1929.9,0.09,11560018.2,11548011.4,0.0,62
492,2018-04-10,BRKM5,BRASKEM,4820,2018-03-29 05:11:07,2017-12-31,797.26,17569.8,9359.06,0.4,38204.58,20634.79,0.45,1
566,2019-04-10,BRKM5,BRASKEM,4820,2019-03-13 19:01:57,2018-12-31,797.22,17259.68,8303.94,0.36,38282.43,21022.75,0.39,1
845,2022-04-11,BRKM5,BRASKEM,4820,2022-03-16 19:43:41,2021-12-31,797.21,22861.69,26043.55,0.9,36137.43,13275.74,1.96,1


In [30]:
# Select only the first 30 companies in the magic rank for each year
df_magic.query('rank_final <= 30', inplace=True)
df_magic.reset_index(drop=True, inplace=True)
print('Number of selected companies for backtesting', df_magic.codneg.nunique())
df_magic

Number of selected companies for backtesting 113


Unnamed: 0,balancing_on,codneg,nomres,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_final
0,2011-04-11,TOTS3,TOTVS,19992,2011-01-31 19:05:59,2010-12-31,31.46,179.42,211.67,0.26,994.10,814.68,0.26,1
1,2011-04-11,AUTM3,AUTOMETAL,22381,2011-02-28 14:52:34,2010-12-31,94.42,252.11,247.23,0.29,1606.17,1354.06,0.18,2
2,2011-04-11,VALE5,VALE,4170,2011-02-24 23:49:15,2010-12-31,5365.31,30321.40,40442.26,0.28,254959.29,224637.90,0.18,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357,2022-04-11,DXCO3,DEXCO,21091,2022-02-09 20:32:27,2021-12-31,760.96,2448.35,1891.39,0.23,10021.88,7573.54,0.25,28
358,2022-04-11,JHSF3,JHSF PART,20605,2022-02-24 19:37:23,2021-12-31,686.22,1099.50,1113.51,0.20,4412.42,3312.92,0.34,29
359,2022-04-11,CSAN3,COSAN,19836,2022-02-24 14:39:01,2021-12-31,1874.07,32752.58,8676.31,0.14,43440.96,10688.38,0.81,30


In [31]:
# Save file
df_magic.to_csv('../data/magic_stocks.csv', index=False)

In [32]:
# Test file
pd.read_csv('../data/magic_stocks.csv')

Unnamed: 0,balancing_on,codneg,nomres,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_final
0,2011-04-11,TOTS3,TOTVS,19992,2011-01-31 19:05:59,2010-12-31,31.46,179.42,211.67,0.26,994.10,814.68,0.26,1
1,2011-04-11,AUTM3,AUTOMETAL,22381,2011-02-28 14:52:34,2010-12-31,94.42,252.11,247.23,0.29,1606.17,1354.06,0.18,2
2,2011-04-11,VALE5,VALE,4170,2011-02-24 23:49:15,2010-12-31,5365.31,30321.40,40442.26,0.28,254959.29,224637.90,0.18,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357,2022-04-11,DXCO3,DEXCO,21091,2022-02-09 20:32:27,2021-12-31,760.96,2448.35,1891.39,0.23,10021.88,7573.54,0.25,28
358,2022-04-11,JHSF3,JHSF PART,20605,2022-02-24 19:37:23,2021-12-31,686.22,1099.50,1113.51,0.20,4412.42,3312.92,0.34,29
359,2022-04-11,CSAN3,COSAN,19836,2022-02-24 14:39:01,2021-12-31,1874.07,32752.58,8676.31,0.14,43440.96,10688.38,0.81,30


In [34]:
# Some of the stocks will be selected in multiple periods
print(df_magic.codneg.value_counts().head(5).to_markdown())

|       |   codneg |
|:------|---------:|
| TGMA3 |       10 |
| KLBN4 |        9 |
| CCRO3 |        8 |
| LREN3 |        8 |
| SEER3 |        8 |


In [34]:
# Check 2022 data
df_22 = df_magic.query('balancing_on > "2022-01-01"')[['codneg', 'nomres', 'roic', 'earnings_yield']].reset_index(drop=True)
df_22.index += 1
df_22[['roic', 'earnings_yield']] = df_22[['roic', 'earnings_yield']].round(2)
print(df_22.to_markdown(mode='github'))

|    | codneg   | nomres       |   roic |   earnings_yield |
|---:|:---------|:-------------|-------:|-----------------:|
|  1 | BRKM5    | BRASKEM      |   0.9  |             1.96 |
|  2 | GOAU4    | GERDAU MET   |   0.42 |             3.37 |
|  3 | BRAP4    | BRADESPAR    |   1.11 |             0.61 |
|  4 | USIM5    | USIMINAS     |   0.49 |             0.68 |
|  5 | TASA4    | TAURUS ARMAS |   0.82 |             0.4  |
|  6 | GGBR4    | GERDAU       |   0.42 |             0.49 |
|  7 | JBSS3    | JBS          |   0.28 |             1.4  |
|  8 | VALE3    | VALE         |   0.68 |             0.3  |
|  9 | BEEF3    | MINERVA      |   0.3  |             0.95 |
| 10 | CMIN3    | CSNMINERACAO |   1.22 |             0.25 |
| 11 | ENAT3    | ENAUTA PART  |   1.1  |             0.26 |
| 12 | SUZB3    | SUZANO S.A.  |   0.25 |             1.17 |
| 13 | DEXP3    | DEXXOS PAR   |   0.33 |             0.55 |
| 14 | EUCA4    | EUCATEX      |   0.24 |             1    |
| 15 | PTBL3    | PORTOB

In [35]:
df_magic.query('codneg == "PRIO3"')

Unnamed: 0,balancing_on,codneg,nomres,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_final
180,2017-04-10,PRIO3,PETRORIO,2017-03-28 17:48:01,2016-12-31,13.19,-539.18,259.08,0.88,553.09,1092.27,0.24,1
271,2020-04-09,PRIO3,PETRORIO,2020-02-22 00:56:46,2019-12-31,143.19,1511.96,924.34,0.27,3800.15,2288.19,0.4,2
