# Retrieval of Macroeconomic Predictors from the World Bank and Yahoo Finance

In [1]:
import wbgapi as wb
import pandas as pd
import yfinance as yf

In [2]:
# Query the World Bank series catalogue for "gdp growth" to look up the
# indicator id that the data download further down relies on.
indicators = wb.series.info(
    q='gdp growth',
)
indicators

id,value
NY.GDP.MKTP.KD.ZG,GDP growth (annual %)
,1 elements


In [3]:

# GDP growth rate: download series NY.GDP.MKTP.KD.ZG for the years 2006-2021
# and reshape it to long format, one row per (economy, Year).
gdp = wb.data.DataFrame('NY.GDP.MKTP.KD.ZG', time=range(2006, 2022), labels=True).reset_index()
gdp = gdp.melt(id_vars=['economy', 'Country'], var_name='Year', value_name='GDP_growth_rate')
# The melted year labels are strings with a two-character prefix in front of the
# digits (presumably 'YR2006' - confirm against wbgapi); strip it and store an integer.
gdp['Year'] = gdp['Year'].str[2:].astype('int64')
# Sort so each economy's years are in order, then discard rows with missing values.
gdp = gdp.sort_values(['economy', 'Year']).dropna()
print(gdp.shape)
# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
gdp.info()

(4084, 4)
<class 'pandas.core.frame.DataFrame'>
Index: 4084 entries, 207 to 3990
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   economy          4084 non-null   object 
 1   Country          4084 non-null   object 
 2   Year             4084 non-null   int64  
 3   GDP_growth_rate  4084 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 159.5+ KB
None


In [4]:
# Preview the first five rows of the long-format GDP table
gdp.head(n=5)

Unnamed: 0,economy,Country,Year,GDP_growth_rate
207,ABW,Aruba,2006,1.127412
473,ABW,Aruba,2007,3.089544
739,ABW,Aruba,2008,1.835756
1005,ABW,Aruba,2009,-11.677742
1271,ABW,Aruba,2010,-2.733456


In [5]:
# create gdp growth rate lag variable (value of the previous calendar year, per economy)
#
# Rows with missing values were dropped before this point, so two consecutive
# rows of the same economy can be more than one year apart. A plain shift(1)
# would then label an older observation as "last year's" growth. The shifted
# value is therefore kept only where the previous row is exactly Year - 1;
# otherwise the lag stays missing. Relies on gdp being sorted by economy, Year.
by_economy = gdp.groupby('economy')
previous_growth = by_economy['GDP_growth_rate'].shift(1)
is_consecutive_year = by_economy['Year'].diff().eq(1)
gdp['GDP_growth_rate_lag'] = previous_growth.where(is_consecutive_year)

gdp.head(18)

Unnamed: 0,economy,Country,Year,GDP_growth_rate,GDP_growth_rate_lag
207,ABW,Aruba,2006,1.127412,
473,ABW,Aruba,2007,3.089544,1.127412
739,ABW,Aruba,2008,1.835756,3.089544
1005,ABW,Aruba,2009,-11.677742,1.835756
1271,ABW,Aruba,2010,-2.733456,-11.677742
1537,ABW,Aruba,2011,3.369238,-2.733456
1803,ABW,Aruba,2012,-1.0408,3.369238
2069,ABW,Aruba,2013,6.431482,-1.0408
2335,ABW,Aruba,2014,-1.586575,6.431482
2601,ABW,Aruba,2015,-0.623626,-1.586575


In [6]:
# Prerequisite: yfinance must be installed in the kernel's environment (e.g. `%pip install yfinance`).

In [7]:
# Download the ^VIX history from Yahoo Finance for the window given below
vix = yf.download('^VIX', start='2006-12-01', end='2022-01-01')

# Collapse to one row per calendar month, taking for every column the last
# value available within that month
vix_monthly = vix.resample(rule='M').last()

# Show the first few monthly rows
print(vix_monthly.head(n=5))

[*********************100%%**********************]  1 of 1 completed

                 Open       High    Low  Close  Adj Close  Volume
Date                                                             
2006-12-31  10.950000  11.650000  10.71  11.56      11.56       0
2007-01-31  11.090000  11.260000  10.27  10.42      10.42       0
2007-02-28  17.209999  17.290001  14.50  15.42      15.42       0
2007-03-31  14.940000  15.820000  14.14  14.64      14.64       0
2007-04-30  12.900000  14.310000  12.78  14.22      14.22       0





In [8]:
# Move the Date index into a regular column, keep only the closing level,
# and expose it under the name VIX.
vix_monthly = (
    vix_monthly
    .reset_index()
    .loc[:, ['Date', 'Close']]
    .rename(columns={'Close': 'VIX'})
)

vix_monthly.head()

Unnamed: 0,Date,VIX
0,2006-12-31,11.56
1,2007-01-31,10.42
2,2007-02-28,15.42
3,2007-03-31,14.64
4,2007-04-30,14.22


In [9]:
# Re-stamp every row with the first day of its month: go through a monthly
# period and back to a timestamp, so the day component is always 01.
vix_monthly = vix_monthly.assign(
    Date=lambda frame: frame['Date'].dt.to_period('M').dt.to_timestamp()
)

vix_monthly.head()

Unnamed: 0,Date,VIX
0,2006-12-01,11.56
1,2007-01-01,10.42
2,2007-02-01,15.42
3,2007-03-01,14.64
4,2007-04-01,14.22


In [10]:
# VIX_lag holds the previous row's VIX, i.e. the prior month's value
# (the first row has no predecessor and stays missing).
vix_monthly = vix_monthly.assign(VIX_lag=vix_monthly['VIX'].shift(periods=1))

vix_monthly.head()

Unnamed: 0,Date,VIX,VIX_lag
0,2006-12-01,11.56,
1,2007-01-01,10.42,11.56
2,2007-02-01,15.42,10.42
3,2007-03-01,14.64,15.42
4,2007-04-01,14.22,14.64


In [11]:
# Calendar year of each monthly row; used as the key when joining the
# yearly GDP table onto the monthly VIX table.
vix_monthly = vix_monthly.assign(Year=vix_monthly['Date'].dt.year)

vix_monthly.head()

Unnamed: 0,Date,VIX,VIX_lag,Year
0,2006-12-01,11.56,,2006
1,2007-01-01,10.42,11.56,2007
2,2007-02-01,15.42,10.42,2007
3,2007-03-01,14.64,15.42,2007
4,2007-04-01,14.22,14.64,2007


In [12]:
# Left-join the monthly VIX rows onto the yearly GDP rows by Year, so every
# GDP row is repeated once for each month of that year present in the VIX table.

df = gdp.merge(vix_monthly, how='left', on='Year')
df.head()

Unnamed: 0,economy,Country,Year,GDP_growth_rate,GDP_growth_rate_lag,Date,VIX,VIX_lag
0,ABW,Aruba,2006,1.127412,,2006-12-01,11.56,
1,ABW,Aruba,2007,3.089544,1.127412,2007-01-01,10.42,11.56
2,ABW,Aruba,2007,3.089544,1.127412,2007-02-01,15.42,10.42
3,ABW,Aruba,2007,3.089544,1.127412,2007-03-01,14.64,15.42
4,ABW,Aruba,2007,3.089544,1.127412,2007-04-01,14.22,14.64


In [13]:
# Restrict the merged table to rows whose economy code is USA
is_usa = df['economy'].eq('USA')
df = df[is_usa]

df.head()

Unnamed: 0,economy,Country,Year,GDP_growth_rate,GDP_growth_rate_lag,Date,VIX,VIX_lag
43858,USA,United States,2006,2.782811,,2006-12-01,11.56,
43859,USA,United States,2007,2.010508,2.782811,2007-01-01,10.42,11.56
43860,USA,United States,2007,2.010508,2.782811,2007-02-01,15.42,10.42
43861,USA,United States,2007,2.010508,2.782811,2007-03-01,14.64,15.42
43862,USA,United States,2007,2.010508,2.782811,2007-04-01,14.22,14.64


In [14]:
# Remove the identifier columns and the contemporaneous GDP / VIX values,
# leaving the date and the two lagged series.
df = df.drop(
    ['economy', 'Country', 'Year', 'GDP_growth_rate', 'VIX'],
    axis='columns',
)

df.head()

Unnamed: 0,GDP_growth_rate_lag,Date,VIX_lag
43858,,2006-12-01,
43859,2.782811,2007-01-01,11.56
43860,2.782811,2007-02-01,10.42
43861,2.782811,2007-03-01,15.42
43862,2.782811,2007-04-01,14.64


In [15]:
# Give the lagged series their short names: GDP_growth_rate_lag -> GDP, VIX_lag -> VIX
df = df.rename(
    {'GDP_growth_rate_lag': 'GDP', 'VIX_lag': 'VIX'},
    axis='columns',
)

df.head()

Unnamed: 0,GDP,Date,VIX
43858,,2006-12-01,
43859,2.782811,2007-01-01,11.56
43860,2.782811,2007-02-01,10.42
43861,2.782811,2007-03-01,15.42
43862,2.782811,2007-04-01,14.64


In [17]:
# Latest month covered by the merged table
df.loc[:, 'Date'].max()

Timestamp('2021-12-01 00:00:00')

In [16]:
# Write the table to CSV, leaving the row index out of the file
df.to_csv(
    '../Data/Macro.csv',
    index=False,
)

In [28]:
# get also inflation data: download series FP.CPI.TOTL for the years 2006-2021
# and reshape it to long format, one row per (economy, Year).
# NOTE(review): the values stored for this series look like a price-index level
# (the United States rows read 100.0 in 2010), not a yearly percentage change.
# If an inflation *rate* is intended, 'FP.CPI.TOTL.ZG' is probably the series to
# use - confirm before switching, because the saved CSV contents would change.
inflation = wb.data.DataFrame('FP.CPI.TOTL', time=range(2006, 2022), labels=True).reset_index()
inflation = inflation.melt(id_vars=['economy', 'Country'], var_name='Year', value_name='Inflation')
# The melted year labels are strings with a two-character prefix in front of the
# digits (presumably 'YR2006' - confirm against wbgapi); strip it and store an integer.
inflation['Year'] = inflation['Year'].str[2:].astype('int64')
# Sort so each economy's years are in order, then discard rows with missing values.
inflation = inflation.sort_values(['economy', 'Year']).dropna()
print(inflation.shape)
# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
inflation.info()

(2938, 4)
<class 'pandas.core.frame.DataFrame'>
Index: 2938 entries, 207 to 3990
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   economy    2938 non-null   object 
 1   Country    2938 non-null   object 
 2   Year       2938 non-null   int64  
 3   Inflation  2938 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 114.8+ KB
None


In [29]:
# Restrict the table to rows whose economy code is USA
usa_rows = inflation['economy'].eq('USA')
inflation = inflation[usa_rows]

inflation.head(n=18)

Unnamed: 0,economy,Country,Year,Inflation
10,USA,United States,2006,92.449705
276,USA,United States,2007,95.086992
542,USA,United States,2008,98.737477
808,USA,United States,2009,98.38642
1074,USA,United States,2010,100.0
1340,USA,United States,2011,103.156842
1606,USA,United States,2012,105.291505
1872,USA,United States,2013,106.833849
2138,USA,United States,2014,108.566932
2404,USA,United States,2015,108.695722


In [30]:
# Turn the integer Year into a timestamp (parsed with format '%Y') and drop the
# identifier columns, leaving only Inflation and Date.
# assign() is used instead of item assignment because `inflation` is a
# boolean-mask selection of a larger frame at this point, and writing a new
# column into such a selection can raise pandas' SettingWithCopyWarning.
inflation = (
    inflation
    .assign(Date=pd.to_datetime(inflation['Year'], format='%Y'))
    .drop(columns=['economy', 'Country', 'Year'])
)
inflation.head()

Unnamed: 0,Inflation,Date
10,92.449705,2006-01-01
276,95.086992,2007-01-01
542,98.737477,2008-01-01
808,98.38642,2009-01-01
1074,100.0,2010-01-01


In [31]:
# Write the table to CSV, leaving the row index out of the file
inflation.to_csv(
    '../Data/Macro_inflation.csv',
    index=False,
)