# Libraries

In [1]:
import pandas as pd
import os
import quandl

# Let pandas display up to 200 columns so wide frames are not truncated in cell output.
pd.options.display.max_columns = 200

# Quandl Connection

In [2]:
# Read the Quandl API key from the QUANDL_API_KEY environment variable
# (None when the variable is unset) and pass it to the quandl client configuration.
quandl_api_key = os.getenv('QUANDL_API_KEY')
quandl.ApiConfig.api_key = quandl_api_key

### Oil Price

In [3]:
# Download the 'EIA/PET_RWTC_D' series for 2004-05-03..2019-06-23 and
# label its 'Value' column as 'Oil'.
wti_crude = quandl.get(
    'EIA/PET_RWTC_D',
    start_date='2004-05-03',
    end_date='2019-06-23',
).rename(columns={'Value': 'Oil'})

### Selic

In [4]:
# Download the 'BCB/432' series for 2004-05-03..2019-06-23 and
# label its 'Value' column as 'Selic'.
selic = quandl.get(
    'BCB/432',
    start_date='2004-05-03',
    end_date='2019-06-23',
).rename(columns={'Value': 'Selic'})

### Dollar

In [5]:
# Download the 'BCB/10813' series for 2004-05-03..2019-06-23 and
# label its 'Value' column as 'Dollar'.
dollar = quandl.get(
    'BCB/10813',
    start_date='2004-05-03',
    end_date='2019-06-23',
).rename(columns={'Value': 'Dollar'})

### PIB BR

In [6]:
# Download the 'ODA/BRA_NGDPD' series for 2004-05-03..2019-06-23 and
# label its 'Value' column as 'PIB_BR'.
pib_BR = quandl.get(
    'ODA/BRA_NGDPD',
    start_date='2004-05-03',
    end_date='2019-06-23',
).rename(columns={'Value': 'PIB_BR'})

### IPCA

In [7]:
# Download the 'BCB/13522' series for 2004-05-03..2019-06-23 and
# label its 'Value' column as 'IPCA'.
ipca = quandl.get(
    'BCB/13522',
    start_date='2004-05-03',
    end_date='2019-06-23',
).rename(columns={'Value': 'IPCA'})

### GOLD BR

In [8]:
# Download the 'BCB/4' series for 2004-05-03..2019-06-23 and
# label its 'Value' column as 'Gold'.
gold = quandl.get(
    'BCB/4',
    start_date='2004-05-03',
    end_date='2019-06-23',
).rename(columns={'Value': 'Gold'})

### CDI

In [9]:
# Download the 'BCB/4392' series for 2004-05-03..2019-06-23 and
# label its 'Value' column as 'CDI'.
cdi = quandl.get(
    'BCB/4392',
    start_date='2004-05-03',
    end_date='2019-06-23',
).rename(columns={'Value': 'CDI'})

# Join all the economic data into one DataFrame

In [10]:
# Place the seven indicator frames side by side, aligned on their index.
# pd.concat keeps the union of index labels by default, so a date that is
# absent from one series shows up as NaN in that series' column.
economic_data = pd.concat(
    [
        wti_crude,
        selic,
        dollar,
        pib_BR,
        ipca,
        gold,
        cdi,
    ],
    axis='columns',
)

In [11]:
# Preview the first ten rows of the combined frame.
economic_data.iloc[:10]

Unnamed: 0_level_0,Oil,Selic,Dollar,PIB_BR,IPCA,Gold,CDI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2004-05-03,38.26,16.0,2.9561,,,36.65,
2004-05-04,38.86,16.0,2.9688,,,36.9,
2004-05-05,39.69,16.0,2.9608,,,36.8,
2004-05-06,39.41,16.0,2.9891,,,36.7,
2004-05-07,39.98,16.0,3.0496,,,37.2,
2004-05-08,,16.0,,,,,
2004-05-09,,16.0,,,,,
2004-05-10,38.9,16.0,3.1241,,,38.0,
2004-05-11,40.3,16.0,3.1043,,,37.0,
2004-05-12,40.3,16.0,3.1203,,,37.7,


# Verification of Missing Values

In [12]:
# Fraction of missing values per column (mean of the boolean null mask).
economic_data.isna().mean()

Oil       0.309042
Selic     0.000000
Dollar    0.312658
PIB_BR    0.997288
IPCA      0.967269
Gold      0.331826
CDI       0.967269
dtype: float64

#### Economic indicators with a high incidence of null values (PIB_BR, IPCA and CDI, each more than 96% missing) will be discarded

# Statistical Summary

In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
# NOTE(review): the saved output below lists only Oil/Selic/Dollar/Gold, and this cell's
# execution count (15) is higher than that of the drop cell that follows it (13), so it
# was evidently run after PIB_BR/IPCA/CDI had been dropped. A top-to-bottom run would
# summarise all seven columns here.
economic_data.describe()

Unnamed: 0,Oil,Selic,Dollar,Gold
count,3821.0,5530.0,3801.0,3695.0
mean,71.350667,11.690416,2.491177,87.971137
std,22.384423,3.418981,0.719662,39.184392
min,26.19,6.5,1.5337,32.0
25%,52.19,8.75,1.8806,47.9
50%,67.81,11.25,2.2483,92.0
75%,91.07,14.0,3.1358,121.645
max,145.31,19.75,4.1942,170.5


# Drop columns with a high incidence of null values

In [13]:
# Remove the three indicators that are almost entirely null.
economic_data = economic_data.drop(columns=['PIB_BR', 'IPCA', 'CDI'])

In [14]:
# Inspect ten randomly chosen rows (unseeded, so the selection differs on every run).
economic_data.sample(n=10)

Unnamed: 0_level_0,Oil,Selic,Dollar,Gold
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-11-02,46.12,14.25,,
2012-07-08,,8.5,,
2005-11-21,57.75,19.0,2.2173,34.89
2014-08-05,97.34,11.0,2.275,
2013-09-14,,9.0,,
2011-11-22,97.76,11.5,1.8062,98.5
2018-05-27,,6.5,,
2004-09-12,,16.0,,
2012-06-24,,8.5,,
2006-03-24,63.9,16.5,2.162,38.7


# Interpolate to fill the missing values

In [16]:
# Columns that contain at least one missing value.
columns_with_null_values = economic_data.columns[economic_data.isnull().any()]

# Fill the gaps by linear interpolation (the pandas default method).
# The result is assigned back to the frame instead of calling
# `economic_data[col].interpolate(inplace=True)`: the in-place call operates on the
# Series returned by `economic_data[col]`, which is chained assignment. pandas warns
# about it and, with Copy-on-Write enabled, it leaves `economic_data` unchanged.
for col in columns_with_null_values:
    economic_data[col] = economic_data[col].interpolate()

In [17]:
# Re-check the fraction of missing values per column after interpolation.
economic_data.isna().mean()

Oil       0.0
Selic     0.0
Dollar    0.0
Gold      0.0
dtype: float64

# Checking for missing days in the selected interval

In [18]:
# For every row, store the gap to the NEXT index entry: diff() gives the gap to the
# previous entry and shift(-1) moves it up one row, leaving the last row empty.
economic_data['diff_Date'] = economic_data.index.to_series().diff().shift(-1)

In [20]:
# Count how often each gap size occurs; a single "1 days" entry means no day is missing.
economic_data.loc[:, 'diff_Date'].value_counts()

1 days    5529
Name: diff_Date, dtype: int64

In [31]:
# The helper column was only needed for the gap check above; remove it again.
economic_data = economic_data.drop(columns=['diff_Date'])

# Resample the DataFrame to weekly frequency

In [32]:
# Keep the current column labels; the resampling cells below append a metric suffix
# ('_mean', '_median', '_std') to these names.
col_names = economic_data.columns

In [34]:
# Weekly mean of every indicator; each output column gets a '_mean' suffix.
economic_data_mean = economic_data.resample(rule='W').mean()
economic_data_mean.columns = [name + '_mean' for name in col_names]

In [36]:
# Weekly median of every indicator; each output column gets a '_median' suffix.
economic_data_median = economic_data.resample(rule='W').median()
economic_data_median.columns = [name + '_median' for name in col_names]

In [35]:
# Weekly standard deviation of every indicator; each output column gets a '_std' suffix.
economic_data_std = economic_data.resample(rule='W').std()
economic_data_std.columns = [name + '_std' for name in col_names]

# Join all the resampled metrics into one DataFrame

In [37]:
# Combine the weekly mean, median and std frames column-wise.
# This rebinds `economic_data` from the daily frame to the weekly metrics frame.
economic_data = pd.concat(
    [
        economic_data_mean,
        economic_data_median,
        economic_data_std,
    ],
    axis='columns',
)

In [38]:
# Preview the first five rows of the weekly metrics frame.
economic_data.head(n=5)

Unnamed: 0_level_0,Oil_mean,Selic_mean,Dollar_mean,Gold_mean,Oil_median,Selic_median,Dollar_median,Gold_median,Oil_std,Selic_std,Dollar_std,Gold_std
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2004-05-09,39.297143,16.0,3.014014,37.064286,39.41,16.0,2.9891,36.9,0.578179,0.0,0.059211,0.414151
2004-05-16,40.687143,16.0,3.113114,37.492857,40.94,16.0,3.113267,37.45,0.943112,0.0,0.011204,0.332757
2004-05-23,40.867143,16.0,3.158643,38.29,40.92,16.0,3.1805,38.3,0.664057,0.0,0.042535,0.569077
2004-05-30,40.693214,16.0,3.1375,38.778571,40.6,16.0,3.1516,38.7,0.939225,0.0,0.031106,0.305007
2004-06-06,39.843214,16.0,3.133371,38.828571,39.29,16.0,3.1294,38.7,1.59166,0.0,0.012524,0.541322


In [42]:
# Write the weekly metrics frame to CSV, with the index written as the first column.
economic_data.to_csv(path_or_buf='../01-Data/economic_data.csv', index=True)