In [1]:
import pandas as pd
import numpy as np

# Show floats with two decimal places
def _two_decimals(value):
    """Format a float with exactly two digits after the decimal point."""
    return '%.2f' % value

pd.set_option('display.float_format', _two_decimals)
# Keep rendered tables compact: narrow cells, few columns, few rows
pd.set_option('display.max_colwidth', 20)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 4)

In [2]:
# Read the stock list from CSV, parsing `balancing_on` as a date and
# casting `rank_final` to an integer column.
df_magic = (
    pd.read_csv("../data/magic_stocks.csv", parse_dates=["balancing_on"])
    .astype({"rank_final": int})
)
df_magic

Unnamed: 0,balancing_on,codneg,nomres,codcvm,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_final
0,2011-04-11,TOTS3,TOTVS,19992,2011-01-31 19:05:59,2010-12-31,31.46,179.42,211.67,0.26,994.10,814.68,0.26,1
1,2011-04-11,AUTM3,AUTOMETAL,22381,2011-02-28 14:52:34,2010-12-31,94.42,252.11,247.23,0.29,1606.17,1354.06,0.18,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
358,2022-04-11,JHSF3,JHSF PART,20605,2022-02-24 19:37:23,2021-12-31,686.22,1099.50,1113.51,0.20,4412.42,3312.92,0.34,29
359,2022-04-11,CSAN3,COSAN,19836,2022-02-24 14:39:01,2021-12-31,1874.07,32752.58,8676.31,0.14,43440.96,10688.38,0.81,30


In [3]:
# Keep only the two columns used downstream: the balancing date and the ticker
cols = ['balancing_on', 'codneg']
df_magic = df_magic[cols]
df_magic

Unnamed: 0,balancing_on,codneg
0,2011-04-11,TOTS3
1,2011-04-11,AUTM3
...,...,...
358,2022-04-11,JHSF3
359,2022-04-11,CSAN3


In [4]:
# Distinct tickers that appear in the stock list, in order of first appearance
magic_tickers = df_magic['codneg'].unique().tolist()
print('Number of tickers =', len(magic_tickers))
print(magic_tickers)

Number of tickers = 113
['TOTS3', 'AUTM3', 'VALE5', 'BRAP4', 'UGPA4', 'RAPT4', 'CCIM3', 'BRKM5', 'ETER3', 'EVEN3', 'HBOR3', 'POMO4', 'TGMA3', 'MYPK3', 'NATU3', 'JHSF3', 'ALPA4', 'BISA3', 'JSLG3', 'LREN3', 'POSI3', 'RENT3', 'ARZZ3', 'HGTX3', 'AMAR3', 'DASA3', 'MAGG3', 'EZTC3', 'GOLL4', 'SLED4', 'LAME4', 'MRVE3', 'TAMM4', 'GSHP3', 'ODPV3', 'CCRO3', 'SHOW3', 'BRML3', 'MGLU3', 'SLCE3', 'PCAR4', 'PDGR3', 'CSAN3', 'BEEF3', 'PTBL3', 'KLBN4', 'EUCA4', 'RDNI3', 'EMBR3', 'CYRE3', 'JBSS3', 'TIMP3', 'LCAM3', 'IGTA3', 'PFRM3', 'DIRR3', 'CVCB3', 'LEVE3', 'TPIS3', 'SEER3', 'RLOG3', 'ALSC3', 'BEMA3', 'WEGE3', 'GGBR4', 'RUMO3', 'TECN3', 'ANIM3', 'SMLE3', 'FIBR3', 'FESA4', 'BRFS3', 'UGPA3', 'TUPY3', 'SMTO3', 'GRND3', 'PRIO3', 'SUZB5', 'FRAS3', 'MDIA3', 'FLRY3', 'VULC3', 'VALE3', 'GUAR3', 'MOVI3', 'SUZB3', 'PARD3', 'SMLS3', 'TEND3', 'CAML3', 'CEAB3', 'LOGN3', 'MTRE3', 'VIVA3', 'YDUQ3', 'ROMI3', 'CMIN3', 'INTB3', 'TASA4', 'ASAI3', 'PLPL3', 'ALLD3', 'PCAR3', 'CURY3', 'GOAU4', 'RANI3', 'USIM5', 'BMOB3', 'EN

In [5]:
# Load B3 adjusted price data
# (source: s3://aq-dl/HistoricalQuotations/processed/dataset.feather).
# Only the columns needed for backtesting are read from disk, instead of
# loading the complete dataset and discarding the rest afterwards.
# Keep the magic stocks from 2011 onwards (first year with available accounting data);
# the cutoff is written as an ISO date so that its parsing is unambiguous.
# Daily average stock price (premed) will be used for entering and exiting positions.
# NOTE(review): this is an absolute, machine-specific path - consider moving it
# to a configurable data directory.
file_path = "/mnt/aq_disk/data/HistoricalQuotations/processed/dataset.feather"
price_cols = ['datneg', 'codneg', 'nomres', 'preult', 'premed']
df_prices = (pd
    .read_feather(file_path, columns=price_cols)
    .query('codneg == @magic_tickers and datneg >= "2011-01-01"')
    .reset_index(drop=True)
    [price_cols]
)
df_prices

Unnamed: 0,datneg,codneg,nomres,preult,premed
0,2021-04-12,ALLD3,ALLIED,15.92,16.27
1,2021-04-13,ALLD3,ALLIED,15.85,15.96
...,...,...,...,...,...
237348,2022-07-14,YDUQ3,YDUQS PART,14.15,13.95
237349,2022-07-15,YDUQ3,YDUQS PART,14.72,14.44


In [6]:
# Join price dataframe with magic stocks dataframe.
# A ticker can appear in several balancing periods, so every price row is
# repeated once for each balancing date listed for its ticker.
# The join key is named explicitly: without `on`, merge keys on every column
# the two frames happen to share, so an extra common column would silently
# change the result.
df_prices = df_prices.merge(right=df_magic, how='inner', on='codneg')
df_prices

Unnamed: 0,datneg,codneg,nomres,preult,premed,balancing_on
0,2021-04-12,ALLD3,ALLIED,15.92,16.27,2021-04-12
1,2021-04-12,ALLD3,ALLIED,15.92,16.27,2022-04-11
...,...,...,...,...,...,...
876342,2022-07-14,YDUQ3,YDUQS PART,14.15,13.95,2020-04-09
876343,2022-07-15,YDUQ3,YDUQS PART,14.72,14.44,2020-04-09


In [7]:
# Ordered list of the distinct balancing dates found in the price frame
values = df_prices['balancing_on'].drop_duplicates().sort_values().tolist()
# Extend the list with one more date, a year after the latest balancing date
values = values + [values[-1] + pd.DateOffset(years=1)]
# Dictionary from 0-based position to balancing date
balancing_dict = dict(enumerate(values))
keys = list(balancing_dict)
balancing_dict

{0: Timestamp('2011-04-11 00:00:00'),
 1: Timestamp('2012-04-09 00:00:00'),
 2: Timestamp('2013-04-10 00:00:00'),
 3: Timestamp('2014-04-10 00:00:00'),
 4: Timestamp('2015-04-10 00:00:00'),
 5: Timestamp('2016-04-11 00:00:00'),
 6: Timestamp('2017-04-10 00:00:00'),
 7: Timestamp('2018-04-10 00:00:00'),
 8: Timestamp('2019-04-10 00:00:00'),
 9: Timestamp('2020-04-09 00:00:00'),
 10: Timestamp('2021-04-12 00:00:00'),
 11: Timestamp('2022-04-11 00:00:00'),
 12: Timestamp('2023-04-11 00:00:00')}

In [8]:
# Dense-rank the balancing dates to obtain integer lookup keys for balancing_dict.
# Dense ranks start at 1 while balancing_dict is keyed from 0, so a date's rank
# is the dictionary key of the date that follows it.
df_prices = df_prices.assign(
    balancing_key=lambda frame: frame['balancing_on'].rank(method='dense').astype(int)
)
df_prices

Unnamed: 0,datneg,codneg,nomres,preult,premed,balancing_on,balancing_key
0,2021-04-12,ALLD3,ALLIED,15.92,16.27,2021-04-12,11
1,2021-04-12,ALLD3,ALLIED,15.92,16.27,2022-04-11,12
...,...,...,...,...,...,...,...
876342,2022-07-14,YDUQ3,YDUQS PART,14.15,13.95,2020-04-09,10
876343,2022-07-15,YDUQ3,YDUQS PART,14.72,14.44,2020-04-09,10


In [9]:
# Look up each row's key in balancing_dict to get the following balancing date,
# then discard the helper key column.
df_prices = (
    df_prices
    .assign(next_balancing=lambda frame: frame['balancing_key'].map(balancing_dict))
    .drop(columns='balancing_key')
)
df_prices

Unnamed: 0,datneg,codneg,nomres,preult,premed,balancing_on,next_balancing
0,2021-04-12,ALLD3,ALLIED,15.92,16.27,2021-04-12,2022-04-11
1,2021-04-12,ALLD3,ALLIED,15.92,16.27,2022-04-11,2023-04-11
...,...,...,...,...,...,...,...
876342,2022-07-14,YDUQ3,YDUQS PART,14.15,13.95,2020-04-09,2021-04-12
876343,2022-07-15,YDUQ3,YDUQS PART,14.72,14.44,2020-04-09,2021-04-12


In [10]:
# Keep only the prices dated within their row's balancing interval
# (both end dates included), then order the rows by ticker and trading date
# with a fresh 0..n-1 index.
df_prices = (
    df_prices
    .query('balancing_on <= datneg <= next_balancing')
    .sort_values(['codneg', 'datneg'], ignore_index=True)
)
df_prices

Unnamed: 0,datneg,codneg,nomres,preult,premed,balancing_on,next_balancing
0,2021-04-12,ALLD3,ALLIED,15.92,16.27,2021-04-12,2022-04-11
1,2021-04-13,ALLD3,ALLIED,15.85,15.96,2021-04-12,2022-04-11
...,...,...,...,...,...,...,...
82792,2021-04-09,YDUQ3,YDUQS PART,30.57,30.53,2020-04-09,2021-04-12
82793,2021-04-12,YDUQ3,YDUQS PART,30.58,30.59,2020-04-09,2021-04-12


In [11]:
# Write the resulting price table to CSV, leaving out the row index
df_prices.to_csv(path_or_buf='../data/adjusted_prices.csv', index=False)