# Build Initial Dataset 

Example of how to build the initial investment dataset.


### 0. Importing Necessary Libs

In [1]:
import Extractors.Extract as methods
from pathlib import Path
import pandas as pd 

# Root directory of the dataset files. It is kept both as a plain string
# (the form passed to the `methods.build_*` helpers) and as a `Path`
# (the form used to build CSV file paths in later cells).
data_path = "./DATA"
path = Path(data_path)

### 1. First Build Fundamentus data

In [1]:
# Build the Fundamentus data for the dataset rooted at `data_path`.
# NOTE(review): presumably this writes `fundamentos.csv` under `data_path`
# (the next cell reads that file) — confirm in Extractors.Extract.
methods.build_fundamentos(data_path)

In [2]:
# Load the fundamentals table from the CSV stored under the data directory.
fundamentos_csv = Path(data_path) / "fundamentos.csv"
fundamentos = pd.read_csv(fundamentos_csv)

In [3]:
# Build the full assets table from the fundamentals loaded above.
# NOTE(review): presumably this also persists `assets.csv` under `data_path`
# (section 2 reads that file back) — confirm in Extractors.Extract.
assets = methods.build_full_assets(data_path, fundamentos)

In [4]:
# Total P/L per company code, with 'Codigo' restored as a regular column.
grouping = (
    assets
    .groupby('Codigo')['P/L']
    .sum()
    .reset_index()
)

# Codes whose summed P/L is exactly zero are treated as "not found".
pl_is_zero = grouping['P/L'] == 0
not_found_codes = grouping.loc[pl_is_zero, 'Codigo']

In [5]:
# Simple Fundamentus snapshot; its index holds the tickers (later cells copy
# the index into a 'Ticker' column).
teste = methods.get_fundamentos_simple()

# The code is the first four characters of the ticker.
teste['Codigo'] = teste.index.str[:4]

# Tickers from the snapshot whose code is in the "not found" set.
not_found_tickers = teste[teste['Codigo'].isin(not_found_codes)].index

# Keep only the codes that are absent from the snapshot as well.
# A set gives O(1) membership tests and avoids `Series.ravel()`, which is
# deprecated since pandas 2.2; the resulting list is unchanged.
known_codes = set(teste['Codigo'])
not_found_codes = [x for x in not_found_codes if x not in known_codes]

In [6]:
# Build fundamentals for the tickers selected in `not_found_tickers`.
# The recorded run processes 65 tickers at several seconds each and prints an
# "error in trying to get data from ticker ..." line for each ticker that fails.
# NOTE(review): the failing tickers are presumably the ones written to
# `log.txt`, which a later cell reads — confirm in Extractors.Extract.
new_fundamentos = methods.build_fundamentos_tickers(data_path, not_found_tickers)

  0%|          | 0/65 [00:00<?, ?it/s]Building Fundamentus data

  2%|▏         | 1/65 [00:03<04:10,  3.92s/it]
error in trying to get data from ticker NTCO3
  5%|▍         | 3/65 [00:27<09:07,  8.83s/it]
error in trying to get data from ticker CEAB3
  9%|▉         | 6/65 [01:00<09:07,  9.29s/it]
error in trying to get data from ticker COGN3
 12%|█▏        | 8/65 [01:15<07:38,  8.05s/it]
error in trying to get data from ticker GPIV33
 14%|█▍        | 9/65 [01:19<06:18,  6.75s/it]
error in trying to get data from ticker ATMP3
 18%|█▊        | 12/65 [01:33<04:37,  5.23s/it]
error in trying to get data from ticker AVLL3
 20%|██        | 13/65 [01:37<04:11,  4.84s/it]
error in trying to get data from ticker CASH3
 22%|██▏       | 14/65 [01:40<03:49,  4.50s/it]
error in trying to get data from ticker CURY3
 23%|██▎       | 15/65 [01:44<03:38,  4.38s/it]
error in trying to get data from ticker DMVF3
 25%|██▍       | 16/65 [01:48<03:24,  4.17s/it]
error in trying to get data from ticker ENJU3

In [7]:
# Add the newly fetched rows to the fundamentals table.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0;
# `pd.concat` is the documented replacement and keeps the same rows and index.
# NOTE(review): assumes `new_fundamentos` is a DataFrame — confirm the return
# type of `build_fundamentos_tickers`.
# Caution: this cell is not idempotent; re-running it appends the rows again.
fundamentos = pd.concat([fundamentos, new_fundamentos])

# Persist the extended table, then rebuild the assets table from it.
fundamentos.to_csv(Path(data_path) / "fundamentos.csv", index = False)
assets = methods.build_full_assets(data_path, fundamentos)

In [9]:
# Read log.txt into a list of lines with the newline characters removed.
log_path = Path(data_path) / 'log.txt'
with open(log_path, 'r') as log_file:
    log = [line.replace("\n", "") for line in log_file]

In [49]:
# Placeholder target name: every column renamed to it is dropped afterwards.
_DROP = "REMOVER"

# Maps the column names of the simple Fundamentus snapshot to the column
# names used in the fundamentals table.
renaming = {
    "Cotacao":       "Última Cotação ON",
    "P/L":           "P/L",
    "P/VP":          "P/VPA",
    "PSR":           "PSR",
    "DY":            "DY",
    "P/Ativo":       "P/Ativo",
    "P/Cap.Giro":    "P/Capital de Giro",
    "P/EBIT":        "P/EBIT",
    "P/ACL":         "P/ACL",
    "EV/EBIT":       "EV/EBIT",
    "EV/EBITDA":     _DROP,
    "Mrg.Ebit":      "Margem EBIT",
    "Mrg.Liq.":      _DROP,
    "Liq.Corr.":     _DROP,
    "ROIC":          _DROP,
    "ROE":           "ROE",
    "Liq.2meses":    _DROP,
    "Pat.Liq":       _DROP,
    "Div.Brut/Pat.": _DROP,
    "Cresc.5anos":   _DROP,
    "Codigo":        "Codigo",
}

In [48]:
# Reshape the snapshot so it matches the fundamentals table:
#   1. copy the index (the tickers) into a 'Ticker' column and reset the index;
#   2. keep only the tickers listed in `log`;
#   3. rename the columns and drop the "REMOVER" placeholders plus the old index;
#   4. stamp every row with the year "2020".
teste = (
    teste
    .assign(Ticker=teste.index)
    .reset_index()
    .loc[lambda frame: frame['Ticker'].isin(log)]
    .rename(columns=renaming)
    .drop(columns=["REMOVER", 'index'])
    .assign(Data="2020")
)

In [68]:
# Add the reshaped snapshot rows to the fundamentals table.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0;
# `pd.concat` is the documented replacement and keeps the same rows and index.
# Caution: this cell is not idempotent; re-running it appends the rows again.
fundamentos = pd.concat([fundamentos, teste])

# Persist the extended table, then rebuild the assets table from it.
fundamentos.to_csv(Path(data_path) / "fundamentos.csv", index = False)
assets = methods.build_full_assets(data_path, fundamentos)

### 2. Extract Macro Price data 

Extracting 2 years of daily data for every ticker in the assets database. This data is used only to derive some metrics; more precise data will be extracted later for use in ML models.

In [5]:
# Reload the assets table from disk and collect its distinct tickers.
assets_csv = path / "assets.csv"
assets = pd.read_csv(assets_csv)
tickers = assets['Ticker'].unique()

In [6]:
# Build the price history for every ticker collected from the assets table.
# NOTE(review): presumably this also persists `history.csv` under `data_path`
# (the next cell reads that file) — confirm in Extractors.Extract.
history = methods.build_price_history(data_path, tickers)

In [7]:
# Reload the price history from disk. This rebinds `history`, replacing the
# value returned by `build_price_history`.
history = pd.read_csv(path / "history.csv")

In [9]:
# Number of distinct symbols present in the price history.
history['symbol'].nunique()

464

### 3. Extract Cripto data 

Extracting 2 years of daily data for every currency code listed in `Extractors/cripto/digital_currency_list.csv`. This data is used only to derive some metrics; more precise data will be extracted later for use in ML models.

In [2]:
# Load the digital-currency list stored under ./Extractors/cripto; its
# 'currency code' column supplies the codes for the crypto history download.
criptos = pd.read_csv(Path("./Extractors/cripto/digital_currency_list.csv"))

In [3]:
# Distinct currency codes from the digital-currency list, as a flat array.
currency_codes = criptos['currency code'].unique().ravel()

# Build the crypto history for those codes. This reuses the name `history`,
# so any price history bound to it earlier is replaced.
history = methods.build_cripto_history(data_path, currency_codes)

 history from ELEC 

 43%|████▎     | 234/542 [02:51<01:40,  3.08it/s]Could not read history from ELIX 

 43%|████▎     | 235/542 [02:52<01:31,  3.34it/s]Could not read history from EMB 

 44%|████▎     | 236/542 [02:52<01:29,  3.41it/s]Could not read history from EMC 

 44%|████▍     | 238/542 [02:53<02:15,  2.24it/s]Could not read history from ENG 

 44%|████▍     | 239/542 [02:53<02:00,  2.52it/s]Could not read history from ENRG 

 44%|████▍     | 240/542 [02:54<01:46,  2.83it/s]Could not read history from EOT 

 44%|████▍     | 241/542 [02:54<01:38,  3.05it/s]Could not read history from EQT 

 45%|████▍     | 242/542 [02:54<01:34,  3.18it/s]Could not read history from ERC 

 45%|████▍     | 243/542 [02:54<01:28,  3.39it/s]Could not read history from ETHD 

 45%|████▌     | 246/542 [02:58<03:20,  1.47it/s]Could not read history from ETT 

 46%|████▌     | 247/542 [02:58<02:46,  1.77it/s]Could not read history from EVE 

 46%|████▌     | 248/542 [02:58<02:23,  2.05it/s]Could not read

In [5]:
# Number of distinct symbols in `history`, which in this section is the
# result of `build_cripto_history`.
history['symbol'].nunique()

169

### 4. Extract Fundos data 

Extracting data for investment funds (fundos de investimento).

In [2]:
# Build the investment-fund history for the dataset rooted at `data_path`.
# The recorded run iterates over 25 items and prints a "Could not get data
# for ..." line for an item it fails to fetch (201912 in that run).
fundos = methods.build_fundos_history(data_path)

 44%|████▍     | 11/25 [01:17<04:36, 19.78s/it]Could not get data for 201912 

100%|██████████| 25/25 [01:48<00:00,  4.34s/it]


In [4]:
# Summarise the funds frame: column names, non-null counts and dtypes.
fundos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1219220 entries, 0 to 1565424
Data columns (total 16 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   CNPJ_FUNDO       1219220 non-null  object 
 1   DT_COMPTC        1219220 non-null  object 
 2   VL_TOTAL         1219220 non-null  float64
 3   VL_QUOTA         1219220 non-null  float64
 4   VL_PATRIM_LIQ    1219220 non-null  float64
 5   CAPTC_DIA        1219220 non-null  float64
 6   RESG_DIA         1219220 non-null  float64
 7   NR_COTST         1219220 non-null  int64  
 8   DENOM_SOCIAL     1219220 non-null  object 
 9   SIT              1219220 non-null  object 
 10  CLASSE           1219220 non-null  object 
 11  FUNDO_EXCLUSIVO  1219220 non-null  object 
 12  TAXA_PERFM       1113857 non-null  float64
 13  TAXA_ADM         1128977 non-null  float64
 14  INF_TAXA_PERFM   253073 non-null   object 
 15  INVEST_QUALIF    1219220 non-null  object 
dtypes: float64(7), int

In [5]:
# Preview the first rows of the funds frame.
fundos.head()

Unnamed: 0,CNPJ_FUNDO,DT_COMPTC,VL_TOTAL,VL_QUOTA,VL_PATRIM_LIQ,CAPTC_DIA,RESG_DIA,NR_COTST,DENOM_SOCIAL,SIT,CLASSE,FUNDO_EXCLUSIVO,TAXA_PERFM,TAXA_ADM,INF_TAXA_PERFM,INVEST_QUALIF
0,00.068.305/0001-35,2019-02-01,60965397.58,26.077652,60945563.17,11908.19,66929.27,7567,FUNDO DE INVESTIMENTO EM COTAS DE FUNDOS DE IN...,EM FUNCIONAMENTO NORMAL,Fundo de Renda Fixa,N,0.0,1.5,,N
2,00.068.305/0001-35,2019-02-04,60670915.16,26.082503,60647004.3,4295.79,314192.51,7566,FUNDO DE INVESTIMENTO EM COTAS DE FUNDOS DE IN...,EM FUNCIONAMENTO NORMAL,Fundo de Renda Fixa,N,0.0,1.5,,N
4,00.068.305/0001-35,2019-02-05,60586886.21,26.08711,60577143.92,39870.0,120442.61,7565,FUNDO DE INVESTIMENTO EM COTAS DE FUNDOS DE IN...,EM FUNCIONAMENTO NORMAL,Fundo de Renda Fixa,N,0.0,1.5,,N
6,00.068.305/0001-35,2019-02-06,60690808.39,26.091727,60677577.24,119907.81,30195.72,7562,FUNDO DE INVESTIMENTO EM COTAS DE FUNDOS DE IN...,EM FUNCIONAMENTO NORMAL,Fundo de Renda Fixa,N,0.0,1.5,,N
8,00.068.305/0001-35,2019-02-07,60694891.02,26.096755,60678202.82,5459.37,16527.13,7565,FUNDO DE INVESTIMENTO EM COTAS DE FUNDOS DE IN...,EM FUNCIONAMENTO NORMAL,Fundo de Renda Fixa,N,0.0,1.5,,N
