In [2]:
import pandas as pd
import numpy as np

# Display settings: render floats with two decimal places and keep
# printed frames compact (narrow columns, few columns, few rows).
pd.set_option('display.float_format', lambda value: '%.2f' % value)
pd.set_option('display.max_colwidth', 20)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 4)

In [3]:
# Read the magic stock list; cutoff_date is parsed as a datetime column
# and rank_final is cast to integer.
df_magic = (
    pd.read_csv("../data/2_magic_stocks.csv", parse_dates=["cutoff_date"])
    .astype({"rank_final": int})
)
df_magic

Unnamed: 0,cutoff_date,codneg,nomres,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_final
0,2011-04-11,BRPR3,BR PROPERT,2011-03-21 17:07:01,2010-12-31,139.51,1597.73,1208.17,0.28,2433.09,835.35,1.45,1
1,2011-04-11,TOTS3,TOTVS,2011-01-31 19:05:59,2010-12-31,31.46,179.42,211.67,0.26,994.10,814.68,0.26,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
358,2022-04-11,POSI3,POSITIVO TEC,2022-03-30 18:17:00,2021-12-31,141.80,541.60,303.15,0.18,1210.97,669.38,0.45,29
359,2022-04-11,DXCO3,DEXCO,2022-02-09 20:32:27,2021-12-31,760.96,2448.35,1891.39,0.23,10021.88,7573.54,0.25,30


In [4]:
# Count how many periods each ticker appears in; a ticker can be
# selected in more than one period.
df_magic['codneg'].value_counts()

TGMA3    9
KLBN4    9
        ..
CAML3    1
DXCO3    1
Name: codneg, Length: 122, dtype: int64

In [5]:
# Keep only the two columns used from here on: the period start and the ticker
cols = ['cutoff_date', 'codneg']
df_magic = df_magic[cols]
df_magic

Unnamed: 0,cutoff_date,codneg
0,2011-04-11,BRPR3
1,2011-04-11,TOTS3
...,...,...
358,2022-04-11,POSI3
359,2022-04-11,DXCO3


In [6]:
# Distinct tickers in the magic list, in order of first appearance
magic_tickers = df_magic['codneg'].drop_duplicates().tolist()
print('Number of tickers =', len(magic_tickers))
print(magic_tickers)

Number of tickers = 122
['BRPR3', 'TOTS3', 'AUTM3', 'ECOR3', 'VALE5', 'BRAP4', 'UGPA4', 'RAPT4', 'VLID3', 'CCIM3', 'BRKM5', 'ETER3', 'TGMA3', 'EVEN3', 'HBOR3', 'MYPK3', 'POMO4', 'VIVO4', 'NATU3', 'LPSB3', 'ALPA4', 'BISA3', 'JHSF3', 'JSLG3', 'LREN3', 'POSI3', 'MAGG3', 'RENT3', 'ARZZ3', 'GOLL4', 'SLED4', 'LAME4', 'TAMM4', 'MRVE3', 'ODPV3', 'CNFB4', 'GSHP3', 'SHOW3', 'CCRO3', 'MGLU3', 'BRML3', 'EZTC3', 'CSAN3', 'TPIS3', 'PTBL3', 'BEEF3', 'ARTR3', 'KLBN4', 'EUCA4', 'PCAR4', 'CTIP3', 'RDNI3', 'HGTX3', 'EMBR3', 'CYRE3', 'JBSS3', 'AMAR3', 'LCAM3', 'CVCB3', 'SCAR3', 'GFSA3', 'DIRR3', 'LEVE3', 'SEER3', 'TEMP3', 'RLOG3', 'ALSC3', 'BEMA3', 'WEGE3', 'GGBR4', 'AGRO3', 'FIBR3', 'FESA4', 'TIMP3', 'BRFS3', 'UGPA3', 'SLCE3', 'TUPY3', 'PRIO3', 'TIET4', 'SMLE3', 'SUZB5', 'PFRM3', 'IGTA3', 'FRAS3', 'VULC3', 'CRFB3', 'VALE3', 'MOVI3', 'GUAR3', 'SMTO3', 'OMGE3', 'PARD3', 'FLRY3', 'TEND3', 'SMLS3', 'SUZB3', 'CAML3', 'PETR4', 'CEAB3', 'LOGN3', 'MTRE3', 'VIVA3', 'YDUQ3', 'ROMI3', 'CMIN3', 'INTB3', 'TASA4', 'PO

In [7]:
# Load the complete B3 adjusted price history.
# Remote copy: s3://aq-dl/HistoricalQuotations/base_adj.feather
# Only the columns needed for backtesting are requested from the feather
# file, so the unused columns of the full history are never loaded.
# Rows are kept for the magic tickers from 2011-01-01 onwards (2011 is the
# first year with available accounting data).
# Daily average stock price (premed) will be used for entering and exiting positions.
file_path = "/mnt/aq_disk/data/HistoricalQuotations/processed/base_adj.feather"
price_cols = ['datneg', 'codneg', 'nomres', 'preult', 'premed']
df_prices = (pd
    .read_feather(file_path, columns=price_cols)
    .query('codneg in @magic_tickers and datneg >= "2011-01-01"')
    .reset_index(drop=True)
    [price_cols]  # enforce the column order regardless of how the file was read
)
df_prices

Unnamed: 0,datneg,codneg,nomres,preult,premed
0,2011-01-04,AGRO3,BRASILAGRO,6.65,6.63
1,2011-01-06,AGRO3,BRASILAGRO,6.45,6.73
...,...,...,...,...,...
249790,2022-06-15,YDUQ3,YDUQS PART,13.74,13.72
249791,2022-06-17,YDUQ3,YDUQS PART,13.33,13.31


In [8]:
# Attach the selection periods to the price history.
# The join key is stated explicitly so the merge cannot silently pick up any
# other column the two frames might happen to share.
# A ticker can appear in several periods, so each price row is repeated once
# per cutoff_date of its ticker (a many-to-many join on codneg).
df_prices = df_prices.merge(right=df_magic, how='inner', on='codneg')
df_prices

Unnamed: 0,datneg,codneg,nomres,preult,premed,cutoff_date
0,2011-01-04,AGRO3,BRASILAGRO,6.65,6.63,2016-04-11
1,2011-01-04,AGRO3,BRASILAGRO,6.65,6.63,2019-04-10
...,...,...,...,...,...,...
857091,2022-06-15,YDUQ3,YDUQS PART,13.74,13.72,2020-04-09
857092,2022-06-17,YDUQ3,YDUQS PART,13.33,13.31,2020-04-09


In [9]:
# Collect every distinct cutoff date, earliest first
values = list(df_prices['cutoff_date'].drop_duplicates().sort_values())
# Close the last period with a date one year after the final cutoff
values = values + [values[-1] + pd.DateOffset(years=1)]
# Number the dates by position: integer key -> cutoff date
cutoff_dict = dict(enumerate(values))
keys = list(cutoff_dict)
cutoff_dict

{0: Timestamp('2011-04-11 00:00:00'),
 1: Timestamp('2012-04-09 00:00:00'),
 2: Timestamp('2013-04-10 00:00:00'),
 3: Timestamp('2014-04-10 00:00:00'),
 4: Timestamp('2015-04-10 00:00:00'),
 5: Timestamp('2016-04-11 00:00:00'),
 6: Timestamp('2017-04-10 00:00:00'),
 7: Timestamp('2018-04-10 00:00:00'),
 8: Timestamp('2019-04-10 00:00:00'),
 9: Timestamp('2020-04-09 00:00:00'),
 10: Timestamp('2021-04-12 00:00:00'),
 11: Timestamp('2022-04-11 00:00:00'),
 12: Timestamp('2023-04-11 00:00:00')}

In [10]:
# Dense-rank the cutoff dates to obtain an integer key for each row.
# Ranks start at 1 while cutoff_dict keys start at 0, so a row's key
# points at the cutoff date that follows its own.
dense_rank = df_prices['cutoff_date'].rank(method='dense')
df_prices['cutoff_key'] = dense_rank.astype(int)
df_prices

Unnamed: 0,datneg,codneg,nomres,preult,premed,cutoff_date,cutoff_key
0,2011-01-04,AGRO3,BRASILAGRO,6.65,6.63,2016-04-11,6
1,2011-01-04,AGRO3,BRASILAGRO,6.65,6.63,2019-04-10,9
...,...,...,...,...,...,...,...
857091,2022-06-15,YDUQ3,YDUQS PART,13.74,13.72,2020-04-09,10
857092,2022-06-17,YDUQ3,YDUQS PART,13.33,13.31,2020-04-09,10


In [11]:
# Translate each key into the following cutoff date; pop() removes the
# helper cutoff_key column from the frame as it is consumed.
df_prices['next_cutoff'] = df_prices.pop('cutoff_key').map(cutoff_dict)
df_prices

Unnamed: 0,datneg,codneg,nomres,preult,premed,cutoff_date,next_cutoff
0,2011-01-04,AGRO3,BRASILAGRO,6.65,6.63,2016-04-11,2017-04-10
1,2011-01-04,AGRO3,BRASILAGRO,6.65,6.63,2019-04-10,2020-04-09
...,...,...,...,...,...,...,...
857091,2022-06-15,YDUQ3,YDUQS PART,13.74,13.72,2020-04-09,2021-04-12
857092,2022-06-17,YDUQ3,YDUQS PART,13.33,13.31,2020-04-09,2021-04-12


In [12]:
# Keep only the prices dated inside their own period (both ends inclusive),
# then order the rows by ticker and trading date with a fresh 0..n-1 index.
in_window = df_prices['datneg'].between(df_prices['cutoff_date'], df_prices['next_cutoff'])
df_prices = df_prices.loc[in_window].sort_values(['codneg', 'datneg'], ignore_index=True)
df_prices

Unnamed: 0,datneg,codneg,nomres,preult,premed,cutoff_date,next_cutoff
0,2016-04-11,AGRO3,BRASILAGRO,7.90,7.86,2016-04-11,2017-04-10
1,2016-04-12,AGRO3,BRASILAGRO,7.90,7.86,2016-04-11,2017-04-10
...,...,...,...,...,...,...,...
81830,2021-04-09,YDUQ3,YDUQS PART,30.57,30.53,2020-04-09,2021-04-12
81831,2021-04-12,YDUQ3,YDUQS PART,30.58,30.59,2020-04-09,2021-04-12


In [13]:
# Write the filtered prices to CSV, without the index column
output_path = '../data/3_prices.csv'
df_prices.to_csv(output_path, index=False)