This is a Python script to pull data from the World Development Indicators (WDI) BigQuery dataset; the cells below retrieve the indicators through the `wbdata` package. It is mostly for economists looking for a quick way to iterate over and populate data for multiple countries and/or economic unions.

In [79]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.options.mode.chained_assignment = None
from fancyimpute import KNN

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%pip install -U wbdata

In [None]:
import wbdata                                                                 
# Left as the bare last expression of the cell so the notebook shows its result.
# NOTE(review): presumably this lists the available World Bank sources, from
# which the `source` id used further down is picked - confirm against wbdata.
wbdata.get_source()

In [81]:
def call_query(indicators, country_list, source):
    """Download the requested indicators for each country and stack the results.

    One request is issued per country code. Countries whose request fails are
    skipped, and a single warning listing them (with the error raised for
    each) is emitted at the end instead of dropping them silently.

    Args:
        indicators: Dictionary keyed by WDI indicator code, one entry per
            desired variable.
        country_list: Iterable of country codes for the relevant countries;
            each code is converted to ``str`` before use.
        source: Identifier of the World Bank database to query.

    Returns:
        A dataframe made of the per-country frames concatenated row-wise,
        with an added ``country_id`` column holding the code that was used
        for the request.

    Raises:
        ValueError: If no data could be retrieved for any country.
    """
    import warnings  # function-scope import keeps this cell self-contained

    frames = []
    failures = {}

    # NOTE(review): the result of this call is discarded; it is kept only so
    # that an error for an unusable `source` is still raised before the
    # per-country requests start. Confirm whether it can be dropped.
    wbdata.get_indicator(source=source)

    for code in map(str, country_list):
        try:
            frame = wbdata.get_dataframe(
                indicators, country=code, source=source, cache=True
            )
        except Exception as error:
            # Best effort: remember what went wrong and move on to the next
            # country so one bad code does not abort the whole download.
            failures[code] = error
        else:
            frame["country_id"] = code
            frames.append(frame)

    if failures:
        details = "; ".join(
            f"{code}: {error!r}" for code, error in failures.items()
        )
        warnings.warn(
            f"No data retrieved for {len(failures)} country code(s): {details}",
            stacklevel=2,
        )

    if not frames:
        # pd.concat would raise a ValueError here as well, just a cryptic one.
        raise ValueError(
            "call_query could not retrieve data for any of the requested countries"
        )

    return pd.concat(frames, axis=0)


In [67]:
def remap(df):
    """Swap the country codes in ``country_id`` for the full country names.

    Args:
        df: Dataframe holding a ``country_id`` column of country codes, such
            as the one produced by ``call_query``.

    Returns:
        A new dataframe in which every ``country_id`` value found among the
        ids returned by ``wbdata.get_country()`` is replaced by the matching
        name, and whose first index level has been moved into a column.
    """
    countries = pd.DataFrame(wbdata.get_country())
    # Lookup table: country code -> country name.
    names_by_code = countries.set_index('id')['name'].to_dict()
    renamed = df.replace({"country_id": names_by_code})
    return renamed.reset_index(level=0)

In [76]:
def fill_values(dataset, neighbors, country_list):
    """Fill the NAs of each country with a K-nearest-neighbours model.

    The imputation runs separately on the rows of every country in
    ``country_list``. The input dataframe is left untouched; all the work
    happens on a copy.

    Args:
        dataset: Dataframe with a ``country_id`` column; its index is copied
            into a ``date`` column (assumed to hold the observation years as
            produced by ``call_query`` - TODO confirm).
        neighbors: Value handed to ``KNN`` as ``k``. Usually integer 3 or 5.
        country_list: Country codes to impute, matched against ``country_id``.

    Returns:
        A dataframe with ``country_id`` first, the remaining input columns
        next and an integer ``date`` column last, holding the imputed rows of
        every country that had data. Countries without any row are skipped
        with a warning; if nothing could be imputed an empty dataframe with
        the same columns is returned.
    """
    import warnings  # function-scope import keeps this cell self-contained

    # Work on a copy so the caller's dataframe is not reordered or extended.
    working = dataset.copy()
    working.insert(0, 'country_id', working.pop('country_id'))
    working['date'] = working.index
    column_names = list(working.columns)

    imputed_frames = []
    for country in country_list:
        # Select on the country column only, then hand the remaining columns
        # to the imputer.
        country_rows = working.loc[working['country_id'] == country]
        country_rows = country_rows.drop(columns=['country_id'])
        if country_rows.empty:
            warnings.warn(
                f"No rows found for country {country!r}; skipping imputation",
                stacklevel=2,
            )
            continue

        imputed = pd.DataFrame(KNN(k=neighbors).fit_transform(country_rows))
        imputed.insert(0, 'country_id', country)
        imputed_frames.append(imputed)

    if not imputed_frames:
        return pd.DataFrame(columns=column_names)

    # Concatenate once at the end instead of growing a frame inside the loop.
    filled_data = pd.concat(imputed_frames)
    # The imputer returns positional columns, so restore the original names.
    filled_data.columns = column_names
    filled_data['date'] = filled_data['date'].astype(int)
    return filled_data

In [89]:
def save(dataframe, path='MyData.csv'):
    """Write the dataframe to a CSV file, index included.

    Args:
        dataframe: The dataframe to write out.
        path: Destination of the CSV file. Defaults to ``'MyData.csv'``,
            the file name this function has always written to.
    """
    dataframe.to_csv(path, index=True)  # output our result

In [57]:
def create_data(indicators, country_list, neighbors, source, fill_missing=True):
    """Run the whole pipeline: query, optional NA filling, country-name remap.

    Args:
        indicators: Our WDI indicator code variables, forwarded to
            ``call_query``.
        country_list: A list of our desired countries.
        neighbors: Number of KNN neighbours forwarded to ``fill_values``;
            ignored when ``fill_missing`` is false.
        source: The database identifier forwarded to ``call_query``.
        fill_missing: When true, ``fill_values`` is applied to the queried
            data (and the first rows of the result are printed) before the
            remap.

    Returns:
        The dataframe returned by ``remap``.
    """
    data = call_query(indicators, country_list, source)

    if fill_missing:
        data = fill_values(data, neighbors, country_list)
        # Quick visual check of the imputed values.
        print(data.head())

    return remap(data)

In [11]:
#Uncomment and run this cell if you want to get all WB country codes
#countries = [i['id'] for i in wbdata.get_country()]
#print(countries)

In [85]:
# Run settings: edit the values below to choose which WDI indicators are
# downloaded and for which countries.

# Dictionary of WDI indicator code -> descriptive label, one entry per variable.
indicators = {
    "SP.DYN.CBRT.IN": "Birth rate, crude (per 1,000 people)",
    "SP.DYN.TFRT.IN": "Fertility rate, total (births per woman)",
    "EG.USE.ELEC.KH.PC": "Electric power consumption (kWh per capita)",
    "DT.ODA.ODAT.CD": "Net official development assistance received (current US$)",
    "SP.DYN.CDRT.IN": "Death rate, crude (per 1,000 people)",
    "SP.DYN.IMRT.FE.IN": "Mortality rate, infant, female (per 1,000 live births)",
    "SP.DYN.LE00.IN": "Life expectancy at birth, total (years)",
}

# Codes of the countries to download data for.
country_list = ["NGA", "ZMB", "ZWE"]

# True means the KNN nearest-neighbours model is used to fill NA values.
fill_missing = True

# 3 or 5 neighbours are recommended to fill missing values on a country basis.
neighbors = 3

# 2 specifies World Development Indicators; this can still be changed.
source = 2

In [None]:
# Build the dataset from the configured settings, then write it out as CSV.
data = create_data(
    indicators=indicators,
    country_list=country_list,
    neighbors=neighbors,
    source=source,
    fill_missing=fill_missing,
)
save(data)