# ***Coleta final dos dados***
---

## Descrição

A partir dos scripts de teste individuais, criaremos funções para coletar todos os dados e, posteriormente, juntarmos todos os dados em um dataframe final: `now_data_raw.csv` e `.xlsx`.

# Bibliotecas
---

In [41]:
import pandas as pd
import numpy as np
import datetime as dt
from functools import reduce

import sys
sys.path.insert(0, '..')

from bcb import sgs
import ipeadatapy as ip
import pandas_datareader.data as web

# Definindo funções

In [42]:
def get_bcb_data(path, start, end):
    # Lendo a planilha completa
    df1 = pd.read_csv(path)

    # Filtrando apenas para os dados que constam no SGS-BCB
    df2 = df1[df1['where'] == 'BCB']

    # Definindo os nomes e os códigos das séries
    names_df = list(df2['name_df'])
    codes_bcb = list(map(int, df2['code']))

    # Coletando as séries com o API: criando uma lista compacta com os nomes e códigos para coletar todos os dados simultâneamente
    df3 = sgs.get(list(zip(names_df, codes_bcb)), start = start, end = end, multi = True) # multi: coletando diversas séries em uma única requisição
    df3 = df3.reset_index()

    return df3

In [43]:
def get_ipea_data(path, year_greater):
    # Lendo a planilha completa
    df1 = pd.read_csv(path)

    # Filtrando apenas para os dados que constam no Ipeadata
    df2 = df1[df1['where'] == 'Ipeadata']

    # Criando lista de códigos
    codes = list(df2['code'])

    # Coletando os dados
    df3 = pd.DataFrame()
    for code in codes:
        df3[code] = ip.timeseries(series = code, yearGreaterThan = year_greater).iloc[:, -1:]
    
    df3 = df3.reset_index()

    return df3

In [44]:
def get_data_yahoo(path, start, end, interval, series):
    # Lendo a planilha completa
    df1 = pd.read_csv(path)

    # Filtrando apenas para os dados que constam no Yahoo! Finance
    df2 = df1[df1['where'] == 'Yahoo! Finance']

    # Criando lista de códigos
    codes = list(df2['code'])

    # Coletando os dados
    df3 = pd.DataFrame()

    for code in codes:
        df3[code] = web.get_data_yahoo(code, start = start, end = end, interval = interval)[series]
    
    df3 = df3.reset_index()

    return df3

# Banco Central do Brasil (BCB)

In [45]:
path = 'meta_data.csv'
start = '2002-01-01'
end = '2022-09-01'
bcb_raw = get_bcb_data(path, start, end)
bcb_raw

'''bcb_raw = pd.read_csv('../bcb_data_raw.csv')
bcb_raw.drop('Unnamed: 0', axis = 1, inplace = True)
bcb_raw.rename({'ref.date' : 'Date'}, axis = 1, inplace = True)
bcb_raw['Date'] = pd.to_datetime(bcb_raw['Date'])
bcb_raw'''

ValueError: Expected object or value

In [46]:
bcb_raw

Unnamed: 0,Date,gdp,ibc_br,mon_gdp,ind_prod,ind_prod_ext,ind_prod_transf,ind_cap_goods,ind_prod_int_goods,ind_prod_man,...,bndes_desem_comserv,bndes_desem_ind_ext,exr_usd_mon_mean,household_debt,serv_conf_index_fgv,serv_curr_situ_index_fgv,serv_expec_index_fgv,ind_conf_cons_feco,ind_eco_cond_feco,ind_fut_expec_feco
0,2002-01-01,110.63,,112374.8,72.4,62.4,73.0,51.7,77.2,70.9,...,509.0,20.0,2.3779,,,,,87.04,70.70,97.93
1,2002-02-01,,,111477.1,69.7,58.9,70.4,50.4,74.8,67.4,...,2046.0,22.0,2.4196,,,,,90.66,74.34,101.54
2,2002-03-01,,,118444.7,77.4,67.8,78.0,57.4,83.3,73.9,...,2791.0,55.0,2.3466,,,,,91.30,77.69,100.37
3,2002-04-01,115.41,,120385.9,79.6,66.5,80.4,61.2,84.3,77.6,...,3379.0,63.0,2.3204,,,,,96.00,74.39,110.41
4,2002-05-01,,,123552.5,80.4,68.2,81.2,59.2,87.0,76.4,...,4030.0,69.0,2.4804,,,,,96.15,74.99,110.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
244,2022-05-01,,142.70,803819.1,88.8,83.7,89.5,95.1,90.6,84.4,...,16405.0,96.0,4.9551,31.58,96.9,96.0,98.1,105.87,69.73,129.96
245,2022-06-01,,142.13,809548.5,87.5,85.4,87.8,90.1,90.4,82.0,...,20500.0,125.0,5.0492,31.60,95.5,94.1,97.2,103.60,66.19,128.53
246,2022-07-01,182.14,149.62,847876.8,92.0,91.6,92.0,90.2,95.7,86.6,...,,,5.3681,31.70,98.0,97.2,99.1,105.58,70.01,129.29
247,2022-08-01,,150.09,847439.1,96.0,91.8,96.5,99.4,98.3,91.9,...,,,5.1433,31.63,98.9,97.1,100.6,106.76,70.19,131.14


# BCB Focus - Expectativas de mercado 

In [47]:
focus_raw = pd.read_csv('focus_data.csv')
focus_raw.drop('Unnamed: 0', axis = 1, inplace = True)
focus_raw['Date'] = pd.to_datetime(bcb_raw['Date'])
focus_raw

Unnamed: 0,Date,expec_gdp,expec_ipca,expec_selic,expec_exr_usd
0,2002-01-01,2.4000,4.8000,17.00,2.5500
1,2002-02-01,2.4500,4.8700,16.53,2.5500
2,2002-03-01,2.4200,5.0400,16.50,2.5000
3,2002-04-01,2.4000,5.4600,16.60,2.5000
4,2002-05-01,2.3000,5.4600,17.00,2.5000
...,...,...,...,...,...
244,2022-05-01,1.1885,8.7936,13.25,5.0456
245,2022-06-01,1.5074,8.0733,13.75,5.0850
246,2022-07-01,1.9717,7.1517,13.75,5.2000
247,2022-08-01,2.0959,6.6661,13.75,5.2000


# Ipeadata

In [48]:
path = 'meta_data.csv'
year_greater = 2001
ipea_raw = get_ipea_data(path, year_greater)
ipea_raw.columns = ['Date', 'exp_fob', 'exp_kgood_fob', 'exp_comgood_fob', 'imp_fob', 'imp_kgood_fob', 'imp_comgood_fob', 'revenue_rf']
ipea_raw = ipea_raw[ipea_raw['Date'] <= '2022-09-01']
ipea_raw

Unnamed: 0,Date,exp_fob,exp_kgood_fob,exp_comgood_fob,imp_fob,imp_kgood_fob,imp_comgood_fob,revenue_rf
0,2002-01-01,3952.038148,677.815093,824.920064,3886.422498,701.691574,373.487327,22680.845000
1,2002-02-01,3652.691211,383.777102,957.020453,3483.558340,554.428338,341.300112,17503.650936
2,2002-03-01,4251.755321,462.735576,1028.145481,3749.115387,550.711189,442.221529,17285.790167
3,2002-04-01,4612.069084,448.902045,1016.324080,4225.843874,621.889543,424.275893,19831.720577
4,2002-05-01,4424.075300,383.111035,1027.540607,4154.270359,585.755356,395.439583,18065.018868
...,...,...,...,...,...,...,...,...
244,2022-05-01,29641.748838,1265.370823,3595.572861,24684.032424,2306.095136,2217.875599,153448.964730
245,2022-06-01,32734.865006,1412.615240,4000.330737,23851.356788,2139.732156,2316.953419,166705.347128
246,2022-07-01,29848.660624,1179.832712,3850.994179,24486.094504,2190.884826,2133.615540,181804.319492
247,2022-08-01,30770.081998,1482.972580,4288.625474,26668.439655,2654.372865,2696.037729,


# Yahoo! Finance
---

In [49]:
path = 'meta_data.csv'
start = '2002-01-01'
end = '2022-09-01'
interval = 'm'
series = 'Adj Close'
yahoo_raw = get_data_yahoo(path, start, end, interval, series)
# yahoo_raw.drop('^N100', axis = 1, inplace = True)
yahoo_raw.columns = ['Date', 'brent_oil_price', 'ibov', 'nasdaq', 'snp_500', 'dji']
yahoo_raw 

Unnamed: 0,Date,brent_oil_price,ibov,nasdaq,snp_500,dji
0,2007-08-01,72.690002,54637,2596.360107,1473.989990,13357.740234
1,2007-09-01,79.169998,60465,2701.500000,1526.750000,13895.629883
2,2007-10-01,90.629997,65318,2859.120117,1549.380005,13930.009766
3,2007-11-01,88.260002,63006,2660.959961,1481.140015,13371.719727
4,2007-12-01,93.849998,63886,2652.280029,1468.359985,13264.820312
...,...,...,...,...,...,...
151,2022-04-01,109.339996,107876,12334.639648,4131.930176,32977.210938
152,2022-06-01,114.809998,98542,11028.740234,3785.379883,30775.429688
153,2022-07-01,110.010002,103165,12390.690430,4130.290039,32845.128906
154,2022-08-01,96.489998,109523,11816.200195,3955.000000,31510.429688


# Juntando as bases de dados
---

In [50]:
# Lendo base de dados da FGV com os índices de confiança 

fgv_raw = pd.read_excel('fgv_expect_nuci.xlsx')
fgv_raw.columns = ['Date', 'emp_conf_index_fgv', 'emp_curr_situ_index_fgv', 'emp_expec_index_fgv', 'ind_conf_index_fgv', 
                   'ind_curr_situ_index_fgv', 'ind_expec_index_fgv', 'nuci_gv', 'cons_conf_index_fgv', 'cons_curr_situ_index_fgv', 
                   'cons_expec_index_fgv']
fgv_raw.set_index(['Date'], inplace = True)
fgv_raw = fgv_raw.reset_index()
fgv_raw

Unnamed: 0,Date,emp_conf_index_fgv,emp_curr_situ_index_fgv,emp_expec_index_fgv,ind_conf_index_fgv,ind_curr_situ_index_fgv,ind_expec_index_fgv,nuci_gv,cons_conf_index_fgv,cons_curr_situ_index_fgv,cons_expec_index_fgv
0,2002-01-01,93.0,93.7,92.6,94.9,95.5,94.8,78.6,,,
1,2002-02-01,97.1,93.8,100.5,98.9,95.6,102.5,79.2,,,
2,2002-03-01,97.8,92.8,102.9,99.6,94.6,104.8,78.3,,,
3,2002-04-01,99.3,93.0,105.6,101.1,94.8,107.5,77.4,,,
4,2002-05-01,95.2,88.2,102.6,97.0,90.2,104.5,77.6,,,
...,...,...,...,...,...,...,...,...,...,...,...
244,2022-05-01,96.7,97.3,96.5,100.4,100.4,100.6,80.3,75.3,68.4,81.6
245,2022-06-01,96.8,97.3,96.5,101.4,101.1,101.5,80.8,78.1,68.7,86.0
246,2022-07-01,97.3,97.8,97.4,101.0,100.0,102.1,81.7,78.6,69.5,86.3
247,2022-08-01,100.2,99.5,100.9,103.0,102.2,103.5,82.4,82.9,71.3,92.1


In [51]:
# Lendo base de dados Google Trends

gtrends_data = pd.read_csv('gtrends_data.csv')
gtrends_data.drop('Unnamed: 0', axis = 1, inplace = True)
gtrends_data['Date'] = pd.to_datetime(gtrends_data['Date'])

In [54]:
dfs = [bcb_raw, fgv_raw, focus_raw, ipea_raw, yahoo_raw, gtrends_data]

now_data_raw = reduce(lambda left, right: pd.merge(left, right, on = ['Date'], how = 'outer'), dfs)

# Deslocando duas linhas abaixo para as séries trimestrais para que o lançamento dos dados fique no fim do período

now_data_raw['gdp'] = now_data_raw['gdp'].shift(2)

# Escrevendo em csv e xlsx

now_data_raw.to_excel('now_data_raw.xlsx')
now_data_raw.to_csv('now_data_raw.csv')

In [53]:
now_data_raw

Unnamed: 0,Date,gdp,ibc_br,mon_gdp,ind_prod,ind_prod_ext,ind_prod_transf,ind_cap_goods,ind_prod_int_goods,ind_prod_man,...,dji,gt_crise,gt_desemprego,gt_seguro_desemprego,gt_empregos,gt_trabalho,gt_fgts,gt_renda,gt_salario,gt_economia
0,2002-01-01,,,112374.8,72.4,62.4,73.0,51.7,77.2,70.9,...,,,,,,,,,,
1,2002-02-01,,,111477.1,69.7,58.9,70.4,50.4,74.8,67.4,...,,,,,,,,,,
2,2002-03-01,110.63,,118444.7,77.4,67.8,78.0,57.4,83.3,73.9,...,,,,,,,,,,
3,2002-04-01,,,120385.9,79.6,66.5,80.4,61.2,84.3,77.6,...,,,,,,,,,,
4,2002-05-01,,,123552.5,80.4,68.2,81.2,59.2,87.0,76.4,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
244,2022-05-01,,142.70,803819.1,88.8,83.7,89.5,95.1,90.6,84.4,...,,20.0,17.0,14.0,18.0,33.0,20.0,27.0,17.0,5.0
245,2022-06-01,177.74,142.13,809548.5,87.5,85.4,87.8,90.1,90.4,82.0,...,30775.429688,21.0,17.0,14.0,17.0,32.0,12.0,16.0,15.0,5.0
246,2022-07-01,,149.62,847876.8,92.0,91.6,92.0,90.2,95.7,86.6,...,32845.128906,16.0,17.0,14.0,16.0,26.0,9.0,14.0,15.0,4.0
247,2022-08-01,,150.09,847439.1,96.0,91.8,96.5,99.4,98.3,91.9,...,31510.429688,19.0,18.0,14.0,17.0,31.0,9.0,13.0,16.0,6.0


In [None]:
now_data_raw.columns

Index(['Date', 'gdp', 'ibc_br', 'mon_gdp', 'ipca', 'ipca_ex', 'ipca_exi',
       'ipca_exii', 'ipca_exiii', 'inpc',
       ...
       'dji', 'gt_crise', 'gt_desemprego', 'gt_seguro_desemprego',
       'gt_empregos', 'gt_trabalho', 'gt_fgts', 'gt_renda', 'gt_salario',
       'gt_economia'],
      dtype='object', length=161)

# Observações

1. Completar os NAs da série do salério mínimo com 1212,00 reais pois a base tem apenas esses NAs quando o valor do SM em voga era esse. Provável erro de cômputo na base de dados.

2. Descartar série de horas trabalhadas na produção (hrs_wkd_prod) da fiesp: série incompleta. 

3. Descartar série de Utilização da capacidade instalada - Geral (CNI) (uci_cni): série incompleta.

4. Descartar série de vendas de máquinas agrícolas (agri_mach_sales): série incompleta

5. Para o fluxo de veículos (road_flow_gross_vehi), vamos precisar de apenas uma decomposição para estacionarizar a série, dado que sua série já existe apenas na forma dessazonalizada, poupando uma defasagem.

6. Decobrir coleta de PMIs