### data_collection.ipynb

This notebook downloads a set of indicators from the World Bank API, combines them into a single pandas DataFrame, and writes that DataFrame to disk as a pickle file.

In [34]:
import pandas as pd
import json
from pandas_datareader import wb

In [35]:
# Directory the output is written to, relative to the current working directory.
# Forward slashes are used because they are accepted on Windows as well as on
# POSIX systems, whereas backslash separators only work on Windows.
# The trailing slash is required: the path is later built by plain string
# concatenation (data_dir + world_bank_file_out).
data_dir = '../../data/'
# Name of the output pickle file
world_bank_file_out = "world_bank_api_data.pkl"

In [36]:
# World Bank indicator codes to request from the World Bank API.
# NOTE(review): the per-code descriptions below are the reviewer's reading of
# the World Bank indicator catalogue -- confirm against data.worldbank.org.
codes = [
    "SI.POV.DDAY",        # presumably: poverty headcount ratio at the international poverty line
    "SI.POV.GINI",        # presumably: Gini index
    "EN.POP.SLUM.UR.ZS",  # presumably: population living in slums (% of urban population)
    "SI.SPR.PC40",        # presumably: survey mean consumption/income per capita, bottom 40%
    "SE.PRM.UNER",        # presumably: children out of school, primary
    "SE.XPD.TOTL.GD.ZS",  # presumably: government expenditure on education (% of GDP)
    "SL.TLF.TOTL.IN",     # presumably: labor force, total
    "NY.GDP.MKTP.KD.ZG",  # presumably: GDP growth (annual %)
    "SP.URB.TOTL.IN.ZS",  # presumably: urban population (% of total population)
]


In [37]:
# Download each indicator through the pandas-datareader World Bank client,
# one request per indicator code, for all countries over 1972-2018.
# The result of each request is stored in `data`, keyed by its indicator code.
data = {}
for code in codes:
    # Progress marker: shows which indicator is currently being fetched.
    print(code)
    data[code] = wb.download(
        indicator=code,
        country=['all'],
        start=1972,
        end=2018,
    )

SI.POV.DDAY
SI.POV.GINI
EN.POP.SLUM.UR.ZS
SI.SPR.PC40
SE.PRM.UNER
SE.XPD.TOTL.GD.ZS
SL.TLF.TOTL.IN
NY.GDP.MKTP.KD.ZG
SP.URB.TOTL.IN.ZS


In [41]:
# Combine the per-indicator downloads into a single DataFrame: the frames are
# placed side by side (axis=1), in the same order as `codes`, and aligned on
# their shared index.
to_concatentate = [data[code] for code in codes]
world_bank_data = pd.concat(to_concatentate, axis=1)

# According to the pandas docs on MultiIndex usage, objects need to be sorted
# to be indexed and sliced effectively.
world_bank_data = world_bank_data.sort_index()

# Name the two index levels (level 0 -> 'Country', level 1 -> 'Year').
# This must go through MultiIndex.set_names: assigning `.name` on
# `index.levels[i]` targets the level's value Index rather than the MultiIndex
# itself, and newer pandas releases reject it with a RuntimeError.
world_bank_data.index = world_bank_data.index.set_names(['Country', 'Year'])

# Preview the first rows as the cell output.
world_bank_data.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,SI.POV.DDAY,SI.POV.GINI,EN.POP.SLUM.UR.ZS,SI.SPR.PC40,SE.PRM.UNER,SE.XPD.TOTL.GD.ZS,SL.TLF.TOTL.IN,NY.GDP.MKTP.KD.ZG,SP.URB.TOTL.IN.ZS
Country,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Afghanistan,1972,,,,,,1.11718,,,12.41
Afghanistan,1973,,,,,,1.42788,,,12.809
Afghanistan,1974,,,,,1426190.0,,,,13.219
Afghanistan,1975,,,,,,1.30332,,,13.641
Afghanistan,1976,,,,,,,,,14.074


#### Write the combined data to a pickle file:

In [33]:
# Persist the combined DataFrame as a pickle file. The target path is the
# output directory and the output file name joined by plain string
# concatenation, so `data_dir` must already end with a path separator.
pickle_path = data_dir + world_bank_file_out
world_bank_data.to_pickle(pickle_path)