Notebook for filtering the accounts and indicators that will be used for selecting stocks for the Magic Formula

In [1]:
import pandas as pd
# Notebook-wide settings: keep rendered tables compact and define shared constants.
pd.set_option("display.max_colwidth", 30)  # cap the displayed width of each column
pd.set_option("display.max_rows", 4)       # frames longer than 4 rows are shown truncated
# NOTE(review): TAX_RATE is not referenced by any cell visible in this notebook view.
TAX_RATE = 0.34

In [2]:
# The dataset can be read straight from S3 (disabled line below) -> more than 10 million accounting lines!
# df = pd.read_feather("s3://aq-dl/FinancialStatements/processed/dataset.feather")
# Active load: the processed feather file is read from a local path instead.
df = pd.read_feather("/mnt/aq_disk/data/FinancialStatements/processed/dataset.feather")
print('Number of companies available for backtesting:', df.cia_id.nunique())
df

Number of companies available for backtesting: 1112


Unnamed: 0,cia_id,cia_nome,doc_tp,doc_env,per_ini,per_fim,per_ref,dem_tp,shares_outstanding,revenue,ebit,total_cash,total_debt,net_debt,total_equity,invested_capital,roic
0,21377,BANCO INDUSTRIAL DO BRASIL,DFP,2011-01-27 12:30:29,2010-01-01,2010-12-31,0,CON,168.545105,,,,,,,,
1,21377,BANCO INDUSTRIAL DO BRASIL,DFP,2011-01-27 12:30:29,2008-01-01,2008-12-31,-2,IND,,292.816,36.948,134.250,,,0.835,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102599,25720,RIO ALTO ENERGIAS RENOVÁVE...,ITR,2022-07-15 16:55:20,2021-01-01,2021-12-31,0,CON,46.000000,2.554,-40.589,193.350,713.544,520.194,-36.078,484.116,
102600,25720,RIO ALTO ENERGIAS RENOVÁVE...,ITR,2022-07-15 16:55:20,2021-01-01,2021-12-31,0,IND,46.000000,,-47.917,211.138,,,-36.078,,


In [3]:
# Read the pipe-separated file listing the companies that will take part in the backtest
df_included = pd.read_csv("../data/included_companies.psv", sep="|")
df_included

Unnamed: 0,DENOM_SOCIAL,DT_REG,DT_CANCEL,SIT,DT_INI_SIT,CD_CVM,SETOR_ATIV,CONTROLE_ACIONARIO
0,ACOPALMA CIA INDL ACOS V P...,1978-05-09,2007-08-07,CANCELADA,2007-08-07,60,Metalurgia e Siderurgia,PRIVADO
1,ACOS ANHANGUERA SA,1971-10-20,1994-02-09,CANCELADA,1994-02-09,78,Metalurgia e Siderurgia,PRIVADO
...,...,...,...,...,...,...,...,...
1441,QESTRA TECNOLOGIA ADMINIST...,2022-05-30,,ATIVO,2022-05-30,26816,Comunicação e Informática,PRIVADO
1442,CONCESSIONARIA CATARINENSE...,2022-06-21,,ATIVO,2022-06-21,26859,Emp. Adm. Part. - Serviços...,PRIVADO


In [4]:
# CVM codes (CD_CVM) of the companies that are allowed into the backtest;
# the first ten are printed as a quick look at the list.
included_companies = df_included["CD_CVM"].tolist()
print(included_companies[0:10])

[60, 78, 86, 94, 116, 132, 159, 167, 175, 183]


In [5]:
# Keep only the rows whose company (cia_id) appears in the inclusion list;
# every other company is dropped from the financials dataframe.
# `in` states the membership test explicitly (pandas also reads `== <list>` as membership).
# inplace=True is kept so that `df` stays the same object for the in-place steps that follow.
df.query('cia_id in @included_companies', inplace=True)
print('Number of companies available for backtesting:', df.cia_id.nunique())
df

Number of companies available for backtesting: 585


Unnamed: 0,cia_id,cia_nome,doc_tp,doc_env,per_ini,per_fim,per_ref,dem_tp,shares_outstanding,revenue,ebit,total_cash,total_debt,net_debt,total_equity,invested_capital,roic
4,19992,TOTVS S.A.,DFP,2011-01-31 19:05:59,2008-12-31,2009-01-01,-2,CON,,,,134.978,386.403,251.425,475.730,727.155,
5,19992,TOTVS S.A.,DFP,2011-01-31 19:05:59,2009-01-01,2009-12-31,-1,CON,,988.679,179.347,230.190,441.478,211.288,542.025,753.313,0.162784
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102597,24228,CAMIL ALIMENTOS S.A.,ITR,2022-07-14 18:09:59,2022-03-01,2022-05-31,0,CON,360.0,2396.625,190.008,1342.144,3637.976,2295.832,2801.618,5097.450,
102598,24228,CAMIL ALIMENTOS S.A.,ITR,2022-07-14 18:09:59,2022-03-01,2022-05-31,0,IND,360.0,1849.619,188.592,890.196,2699.112,1808.916,2801.729,4610.645,


* (p. 138) *For purposes of the study, earnings-related numbers were based on the latest 12-month period, balance sheet items were based on the most recent balance sheet, and market prices were based on the most recent closing price. Utilities, financial stocks and companies where we could not be certain that the information in the database was timely or complete were eliminated. Adjustments were also made for certain non-interest bearing liabilities. The study was structured so that an average of 30 stocks was held during the study period. Stocks with only limited liquidity were eliminated from the study. Market capitalizations were determined based on 2003 dollars. Both the number of companies in each decile as well as the number of companies in each market capitalization group fluctuated as the number of companies in the database varied during the study period.*

In [6]:
# According to the book, only the indicators of the latest 12 months are used,
# so rows that do not refer to the current period (per_ref != 0) are discarded.
# Only audited and consolidated data is used -> doc_tp "DFP" and dem_tp "CON".
df.query('doc_tp == "DFP" and dem_tp == "CON" and per_ref == 0', inplace=True)
# Drop the columns that will not be used in the backtest
# (after the filter above each of them holds a single value).
df.drop(["doc_tp", "dem_tp", "per_ref"], axis="columns", inplace=True)
df.reset_index(inplace=True, drop=True)
print('Number of companies available for backtesting:', df["cia_id"].nunique())
df

Number of companies available for backtesting: 585


Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,shares_outstanding,revenue,ebit,total_cash,total_debt,net_debt,total_equity,invested_capital,roic
0,19992,TOTVS S.A.,2011-01-31 19:05:59,2010-01-01,2010-12-31,31.459000,1129.475,211.669,238.825,418.246,179.421,631.576,810.997,0.185450
1,16446,Numeral 80 Participações S.A.,2011-02-03 20:05:28,2010-01-01,2010-12-31,655.776000,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5606,26522,CERRADINHO BIOENERGIA S.A.,2022-06-21 19:15:55,2021-04-01,2022-03-31,458.277128,2622.623,795.159,1146.144,1820.215,674.071,1128.168,1802.239,0.360617
5607,25496,Jalles Machado S.A.,2022-06-22 20:02:33,2021-04-01,2022-03-31,294.697091,1449.073,751.944,1300.450,2157.896,857.446,1388.729,2246.175,0.269251


In [7]:
# Spot check: the row for cia_id 5410 (WEG S.A. in the output below) with period end 2021-12-31.
df.query('cia_id == 5410 and per_fim == "2021-12-31"')

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,shares_outstanding,revenue,ebit,total_cash,total_debt,net_debt,total_equity,invested_capital,roic
5133,5410,WEG S.A.,2022-02-16 07:05:37,2021-01-01,2021-12-31,4197.317998,23563.338,4158.343,3217.135,1789.115,-1428.02,14010.672,12582.652,0.300539


In [8]:
# Spot check: all rows kept so far for cia_id 23272 (LOG COMMERCIAL PROPERTIES in the output below).
df.query('cia_id == 23272')

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,shares_outstanding,revenue,ebit,total_cash,total_debt,net_debt,total_equity,invested_capital,roic
1580,23272,LOG COMMERCIAL PROPERTIES ...,2013-11-13 17:43:51,2012-01-01,2012-12-31,134.362,27.656,8.923,36.931,508.746,471.815,435.822,907.637,0.012471
1727,23272,LOG COMMERCIAL PROPERTIES ...,2014-03-13 20:26:39,2013-01-01,2013-12-31,171.453,89.255,39.330,151.200,766.916,615.716,739.985,1355.701,0.028201
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4610,23272,LOG COMMERCIAL PROPERTIES ...,2021-02-09 18:30:05,2020-01-01,2020-12-31,102.159,141.537,250.749,453.855,800.663,346.808,3079.961,3426.769,0.049599
5123,23272,LOG COMMERCIAL PROPERTIES ...,2022-02-08 18:45:25,2021-01-01,2021-12-31,102.159,149.367,413.701,207.564,1267.705,1060.141,3340.742,4400.883,0.079679


In [9]:
# Check whether any financial institution is still left in the selection.
# The pattern is an alternation of name fragments, matched case-insensitively against
# cia_nome; one row per distinct company name is shown, so an empty frame means no match.
procurar = "bco |banco|crédito|mercantil|seguradora|seguro|PPLA PARTICIPATIONS"
(
    df
    .query('cia_nome.str.contains(@procurar, case=False)')
    .drop_duplicates(subset='cia_nome')
)

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,shares_outstanding,revenue,ebit,total_cash,total_debt,net_debt,total_equity,invested_capital,roic


In [10]:
# Keep only the rows in which every indicator listed below is strictly positive.
# Rows with a missing value in any of them are dropped as well, because a
# comparison against NaN evaluates to False.
positive_columns = [
    'revenue',
    'roic',
    'invested_capital',
    'ebit',
    'total_cash',
    'total_equity',
    'shares_outstanding',
]
# The expression is generated from the list and joined with `and`, so it no longer
# depends on backslash line-continuations inside a string literal nor on pandas
# treating `&` with boolean (rather than bitwise) precedence inside query().
query_exp = " and ".join(f"{column} > 0" for column in positive_columns)
df.query(query_exp, inplace=True)
df.reset_index(drop=True, inplace=True)
print('Number of companies available for backtesting:', df.cia_id.nunique())
df

Number of companies available for backtesting: 340


Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,shares_outstanding,revenue,ebit,total_cash,total_debt,net_debt,total_equity,invested_capital,roic
0,19992,TOTVS S.A.,2011-01-31 19:05:59,2010-01-01,2010-12-31,31.459000,1129.475,211.669,238.825,418.246,179.421,631.576,810.997,0.185450
1,7510,INDUSTRIAS ROMI S.A.,2011-02-08 20:00:11,2010-01-01,2010-12-31,74.758000,673.529,76.901,246.935,995.425,748.490,702.017,1450.507,0.036732
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2628,26522,CERRADINHO BIOENERGIA S.A.,2022-06-21 19:15:55,2021-04-01,2022-03-31,458.277128,2622.623,795.159,1146.144,1820.215,674.071,1128.168,1802.239,0.360617
2629,25496,Jalles Machado S.A.,2022-06-22 20:02:33,2021-04-01,2022-03-31,294.697091,1449.073,751.944,1300.450,2157.896,857.446,1388.729,2246.175,0.269251


In [11]:
# Drop the intermediate indicators that will not be used in the backtest.
# Columns are selected by name instead of by position (previously df.columns[:5]):
# if the upstream column order ever changes, this raises a KeyError or keeps the
# right columns, rather than silently exporting whatever happens to come first.
colunas = [
    'cia_id', 'cia_nome', 'doc_env', 'per_ini', 'per_fim',
    'shares_outstanding', 'net_debt', 'ebit', 'roic',
]
# .copy() detaches the result from the wider frame it was selected from.
df = df[colunas].copy()
df

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,shares_outstanding,net_debt,ebit,roic
0,19992,TOTVS S.A.,2011-01-31 19:05:59,2010-01-01,2010-12-31,31.459000,179.421,211.669,0.185450
1,7510,INDUSTRIAS ROMI S.A.,2011-02-08 20:00:11,2010-01-01,2010-12-31,74.758000,748.490,76.901,0.036732
...,...,...,...,...,...,...,...,...,...
2628,26522,CERRADINHO BIOENERGIA S.A.,2022-06-21 19:15:55,2021-04-01,2022-03-31,458.277128,674.071,795.159,0.360617
2629,25496,Jalles Machado S.A.,2022-06-22 20:02:33,2021-04-01,2022-03-31,294.697091,857.446,751.944,0.269251


In [12]:
# Export the filtered indicators to CSV; the row index is not written to the file.
df.to_csv("../data/magic_financials.csv", index=False)

In [13]:
# Test the file: read the exported CSV back and display it
pd.read_csv("../data/magic_financials.csv")

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,shares_outstanding,net_debt,ebit,roic
0,19992,TOTVS S.A.,2011-01-31 19:05:59,2010-01-01,2010-12-31,31.459000,179.421,211.669,0.185450
1,7510,INDUSTRIAS ROMI S.A.,2011-02-08 20:00:11,2010-01-01,2010-12-31,74.758000,748.490,76.901,0.036732
...,...,...,...,...,...,...,...,...,...
2628,26522,CERRADINHO BIOENERGIA S.A.,2022-06-21 19:15:55,2021-04-01,2022-03-31,458.277128,674.071,795.159,0.360617
2629,25496,Jalles Machado S.A.,2022-06-22 20:02:33,2021-04-01,2022-03-31,294.697091,857.446,751.944,0.269251


In [14]:
# Spot check on the final frame: rows that remain for cia_id 23272 after all filters.
df.query('cia_id == 23272')

Unnamed: 0,cia_id,cia_nome,doc_env,per_ini,per_fim,shares_outstanding,net_debt,ebit,roic
770,23272,LOG COMMERCIAL PROPERTIES ...,2013-11-13 17:43:51,2012-01-01,2012-12-31,134.362,471.815,8.923,0.012471
861,23272,LOG COMMERCIAL PROPERTIES ...,2014-03-13 20:26:39,2013-01-01,2013-12-31,171.453,615.716,39.330,0.028201
...,...,...,...,...,...,...,...,...,...
2129,23272,LOG COMMERCIAL PROPERTIES ...,2021-02-09 18:30:05,2020-01-01,2020-12-31,102.159,346.808,250.749,0.049599
2381,23272,LOG COMMERCIAL PROPERTIES ...,2022-02-08 18:45:25,2021-01-01,2021-12-31,102.159,1060.141,413.701,0.079679
