In [1]:
import pandas as pd
import numpy as np

# Display floats with four significant digits (general format),
# and keep rendered frames compact: narrow cells, few columns, few rows.
pd.set_option('display.float_format', '{:.4g}'.format)
pd.set_option('display.max_colwidth', 20)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 4)

In [2]:
# Read the selected-stocks list (cutoff_date parsed as datetime) and
# store the final rank as integers.
df_stocks = (
    pd.read_csv("../data/2_magic_stocks.csv", parse_dates=["cutoff_date"])
    .astype({'rank_final': int})
)
df_stocks

Unnamed: 0,cutoff_date,codneg,nomres,doc_env,per_fim,shares_outstanding,net_debt,ebit,roic,market_cap,enterprise_value,earnings_yield,rank_final
0,2011-04-11,BRPR3,BR PROPERT,2011-03-21 17:07:01,2010-12-31,139.5,1598,1208,0.2805,2433,835.4,1.446,1
1,2011-04-11,TOTS3,TOTVS,2011-01-31 19:05:59,2010-12-31,31.46,179.4,211.7,0.261,994.1,814.7,0.2598,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
358,2022-04-11,POSI3,POSITIVO TEC,2022-03-30 18:17:00,2021-12-31,141.8,541.6,303.1,0.1759,1211,669.4,0.4529,29
359,2022-04-11,DXCO3,DEXCO,2022-02-09 20:32:27,2021-12-31,761,2448,1891,0.2311,1.002e+04,7574,0.2497,30


In [3]:
# Count how often each ticker (codneg) appears: a stock can be selected
# in more than one period.
df_stocks['codneg'].value_counts()

KLBN4    10
VALE3     9
         ..
BISA3     1
DXCO3     1
Name: codneg, Length: 126, dtype: int64

In [4]:
# Keep only the two columns used from here on: selection date and ticker
cols = ['cutoff_date', 'codneg']
df_stocks = df_stocks[cols]
df_stocks

Unnamed: 0,cutoff_date,codneg
0,2011-04-11,BRPR3
1,2011-04-11,TOTS3
...,...,...
358,2022-04-11,POSI3
359,2022-04-11,DXCO3


In [5]:
# Read the complete B3 adjusted price history from the local feather file.
# Remote copy: s3://aq-dl/HistoricalQuotations/base_adj.feather
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
file_path = "/mnt/aq_disk/data/HistoricalQuotations/processed/base_adj.feather"
df_b3 = pd.read_feather(path=file_path)
df_b3

Unnamed: 0,datneg,codneg,codisi,nomres,especi,codbdi,tpmerc,dismes,datven,prazot,...,preult,preofc,preofv,preexe,totneg,quatot,voltot,evento,valpro,ajuste
0,2022-02-03,5GTK11,BR5GTKCTF000,INVESTO 5GTK,CI,14,10,100,NaT,0,...,94.86,94.86,97.7,0,85,2.085e+04,1.995e+06,,,1
1,2022-02-04,5GTK11,BR5GTKCTF000,INVESTO 5GTK,CI,14,10,100,NaT,0,...,95.84,95.37,95.84,0,50,1107,1.06e+05,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10971867,2013-12-18,ZNTE6L,BRZNTEACNPB8,FERR ZANETTE,PNB,52,17,104,NaT,0,...,0.82,0,0,0,1,7.442e+05,6.102e+05,,,1
10971868,2012-12-13,ZNTE7L,BRZNTEACNPC6,FERR ZANETTE,PNC*,52,17,111,NaT,0,...,0.82,0,0,0,1,4.76e+08,3.903e+05,,,1


In [6]:
# Keep quotes from 2011-01-01 onwards (2011 is the first year with available
# accounting data) and drop other assets (stock options, ETFs, etc) through the
# codbdi / especi filters.
# Only the columns needed for backtesting are kept; the daily average price
# (premed) will be used for entering and exiting positions.
# query() already returns a new frame, so df_b3 is left untouched without
# paying for a full copy of the raw data first.
cols = ['datneg', 'codneg', 'nomres', 'preult', 'premed']
df_prices = (
    df_b3
    .query(
        'codbdi == 2 and '
        'datneg >= "2011.01.01" and '
        'especi.str.contains("ON |PN ")'
    )
    .loc[:, cols]
    .reset_index(drop=True)
)
df_prices

Unnamed: 0,datneg,codneg,nomres,preult,premed
0,2016-10-28,AALR3,ALLIAR,18.93,19.01
1,2016-10-31,AALR3,ALLIAR,17.81,17.92
...,...,...,...,...,...
578462,2022-06-15,YDUQ3,YDUQS PART,13.74,13.72
578463,2022-06-17,YDUQ3,YDUQS PART,13.33,13.31


In [7]:
# Attach to each stock's prices every cutoff_date in which it was selected.
# codneg repeats on both sides (many price rows per stock in df_prices, one row
# per selected period in df_stocks), so this is a many-to-many join: each price
# row is repeated once per period the stock was selected in.
# The join key is stated explicitly so the merge cannot silently widen to other
# shared columns (e.g. nomres) if df_stocks carries more than the two columns.
df_prices = df_prices.merge(right=df_stocks, how='inner', on='codneg')
df_prices

Unnamed: 0,datneg,codneg,nomres,preult,premed,cutoff_date
0,2011-01-04,AGRO3,BRASILAGRO,6.646,6.627,2020-04-09
1,2011-01-06,AGRO3,BRASILAGRO,6.45,6.725,2020-04-09
...,...,...,...,...,...,...
837323,2022-06-15,YDUQ3,YDUQS PART,13.74,13.72,2020-04-09
837324,2022-06-17,YDUQ3,YDUQS PART,13.33,13.31,2020-04-09


In [8]:
# Distinct cutoff dates in chronological order
values = df_prices['cutoff_date'].drop_duplicates().sort_values().tolist()
# Close the last period by appending a date one year after the final cutoff
values.append(values[-1] + pd.DateOffset(years=1))
keys = list(range(len(values)))
# Position (0, 1, 2, ...) -> cutoff date at that position
cutoff_dict = dict(enumerate(values))
cutoff_dict

{0: Timestamp('2011-04-11 00:00:00'),
 1: Timestamp('2012-04-09 00:00:00'),
 2: Timestamp('2013-04-10 00:00:00'),
 3: Timestamp('2014-04-10 00:00:00'),
 4: Timestamp('2015-04-10 00:00:00'),
 5: Timestamp('2016-04-11 00:00:00'),
 6: Timestamp('2017-04-10 00:00:00'),
 7: Timestamp('2018-04-10 00:00:00'),
 8: Timestamp('2019-04-10 00:00:00'),
 9: Timestamp('2020-04-09 00:00:00'),
 10: Timestamp('2021-04-12 00:00:00'),
 11: Timestamp('2022-04-11 00:00:00'),
 12: Timestamp('2023-04-11 00:00:00')}

In [9]:
# Dense-rank the cutoff dates (earliest = 1). cutoff_dict is keyed from 0, so
# looking this rank up in cutoff_dict gives the cutoff that follows the row's own.
cutoff_rank = df_prices['cutoff_date'].rank(method='dense')
df_prices['cutoff_key'] = cutoff_rank.astype(int)
df_prices

Unnamed: 0,datneg,codneg,nomres,preult,premed,cutoff_date,cutoff_key
0,2011-01-04,AGRO3,BRASILAGRO,6.646,6.627,2020-04-09,10
1,2011-01-06,AGRO3,BRASILAGRO,6.45,6.725,2020-04-09,10
...,...,...,...,...,...,...,...
837323,2022-06-15,YDUQ3,YDUQS PART,13.74,13.72,2020-04-09,10
837324,2022-06-17,YDUQ3,YDUQS PART,13.33,13.31,2020-04-09,10


In [10]:
# Translate each key into the following period's cutoff date, then discard
# the helper key column.
df_prices = (
    df_prices
    .assign(next_cutoff=lambda frame: frame['cutoff_key'].map(cutoff_dict))
    .drop(columns='cutoff_key')
)
df_prices

Unnamed: 0,datneg,codneg,nomres,preult,premed,cutoff_date,next_cutoff
0,2011-01-04,AGRO3,BRASILAGRO,6.646,6.627,2020-04-09,2021-04-12
1,2011-01-06,AGRO3,BRASILAGRO,6.45,6.725,2020-04-09,2021-04-12
...,...,...,...,...,...,...,...
837323,2022-06-15,YDUQ3,YDUQS PART,13.74,13.72,2020-04-09,2021-04-12
837324,2022-06-17,YDUQ3,YDUQS PART,13.33,13.31,2020-04-09,2021-04-12


In [11]:
# Keep only the quotes that fall inside the period in which the stock was
# selected. Both ends are inclusive, so a quote dated exactly on next_cutoff
# is kept for the period that ends there.
df_prices = (
    df_prices
    .query('cutoff_date <= datneg <= next_cutoff')
    .sort_values(['codneg', 'datneg'], ignore_index=True)
)
df_prices

Unnamed: 0,datneg,codneg,nomres,preult,premed,cutoff_date,next_cutoff
0,2020-04-09,AGRO3,BRASILAGRO,14.46,14.33,2020-04-09,2021-04-12
1,2020-04-13,AGRO3,BRASILAGRO,14.36,14.28,2020-04-09,2021-04-12
...,...,...,...,...,...,...,...
81347,2021-04-09,YDUQ3,YDUQS PART,30.57,30.53,2020-04-09,2021-04-12
81348,2021-04-12,YDUQ3,YDUQS PART,30.58,30.59,2020-04-09,2021-04-12


In [12]:
# Persist the per-period price table (without the index column)
df_prices.to_csv(path_or_buf='../data/3_prices.csv', index=False)