<a href="https://colab.research.google.com/github/Ayanlola2002/DATA-SCIENCE-PROJECTS/blob/master/WBI_data_extraction_omdena.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Extraction from World Bank API 

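Extracts World Bank data for a list of indicators: each indicator's ID is looked up on the World Bank website with Selenium, and the data is then downloaded through the `world_bank_data` API client.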

Needed input:
* `indicators`: List of Indicators
* `countries`: List of countries
* `years`: List of years 

In [None]:
# install the World Bank API client used to download the indicator series
!pip install -q world_bank_data
 
# install chromium, its driver, and selenium (used to look up indicator IDs on the World Bank website)
!apt update
!apt install -q chromium-chromedriver
!pip install -q selenium

Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:6 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Hit:7 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:9 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Get:10 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:12 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:13 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease
Get:14 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Fetched 252 kB in 2s (116 kB/s

In [None]:

import world_bank_data as wb
import pandas as pd
import re
import time
from google.colab import files

## Auxiliary functions

* `wb_interface_search(indicator)`: Returns indicator's ID  
* `indicators_dict(indicators)`: Returns dictionary of indicators and respective ID
* `build_wb_df(ind_dict)`: Returns dataframe with all selected indicators 
* `filter_wb(df, years, countries)`: Returns filtered dataframe for specific `years` and `countries`


In [None]:
#Search 

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import re
import time


def wb_interface_search(indicator):
  '''Returns indicator's ID, or '' when no ID could be extracted.

  Starts a headless Chrome session, types `indicator` into the search box of
  the World Bank indicator listing and reads the ID out of the link of the
  first search result. The browser is always closed before returning.

  `indicator`: world bank indicator as string
  Example: 'Access to electricity, rural (% of rural population)'
  '''
  # Local import: `By` locators work on both Selenium 3 and Selenium 4, whereas
  # the `find_element_by_*` helpers were removed in Selenium 4.
  from selenium.webdriver.common.by import By

  t_start = time.time()
  options = webdriver.ChromeOptions()
  options.add_argument('--headless')
  options.add_argument('--no-sandbox')
  options.add_argument('--disable-dev-shm-usage')
  options.add_argument("--disable-notifications")

  weblink = 'https://data.worldbank.org/indicator?tab=all'
  # The driver executable is resolved from PATH; passing its name positionally
  # is no longer accepted by recent Selenium releases.
  wd = webdriver.Chrome(options=options)
  t_end = time.time()
  print('Initialized web driver in {} s'.format(t_end - t_start))

  try:
    t_start = time.time()
    wd.get(weblink)
    t_end = time.time()
    print('Downloaded website in {} s'.format(t_end - t_start))

    t_start = time.time()
    search_key = wd.find_element(By.XPATH, "//input[@id='overviewSearch']")
    time.sleep(5)    # Need to let the page load completely, otherwise the search result won't be narrowed down
    search_key.send_keys(indicator + Keys.RETURN + Keys.ENTER)
    t_end = time.time()
    print('Searched key in {} s'.format(t_end - t_start))

    t_start = time.time()
    all_results = wd.find_elements(By.XPATH, '//section[@class="nav-item"]/*/li/a')
    try:
      expected_result = all_results[0]   # Assume first result is the most relevant one, since we copied exact title
      link = expected_result.get_attribute("href")
      # The ID is the part of the URL from the first capital letter up to the query string.
      code = re.search(r'([A-Z].*)\?', link)[0].replace('?', '')
    except Exception as err:
      # Best effort: no result, no href or an unexpected URL shape all yield ''.
      # `Exception` (not a bare except) so Ctrl-C still interrupts the run.
      print('No indicator ID found for {!r}: {!r}'.format(indicator, err))
      return ''

    t_end = time.time()
    print('Found result in {} s'.format(t_end - t_start))
    return code
  finally:
    # Always shut the browser down; otherwise every call leaks a Chrome process.
    wd.quit()

def indicators_dict(indicators):
  '''Returns dictionary of indicators and respective ID
  `indicators`: list of world bank indicator as string
  Example: ['Access to electricity, rural (% of rural population)', 'Arable land (% of land area)']

  Each indicator is looked up with `wb_interface_search`; the total elapsed
  time is printed once all lookups are done.
  '''

  t_start = time.time()

  # Single pass over `indicators`: pairing the input with a lazy `map` over the
  # same object would misalign names and IDs if a one-shot iterable were passed.
  ind_dict = {ind: wb_interface_search(ind) for ind in indicators}

  t_end = time.time()
  print('Did everything in {} s'.format(t_end - t_start))

  return ind_dict



In [None]:
# Indicator titles, copied exactly as they appear on the World Bank website.
indicators = [
    'Access to electricity, rural (% of rural population)',
    'Arable land (% of land area)',
    'Social contributions (current LCU)',
]

# Map every indicator title to its World Bank ID.
ind_dict = indicators_dict(indicators)


Initialized web driver in 1.224775791168213 s
Downloaded website in 5.64798903465271 s
Searched key in 6.6349780559539795 s
Found result in 0.02378392219543457 s
Initialized web driver in 1.206981897354126 s
Downloaded website in 5.143914461135864 s
Searched key in 6.377284288406372 s
Found result in 0.019243955612182617 s
Initialized web driver in 1.3375539779663086 s
Downloaded website in 5.571833610534668 s
Searched key in 6.3849937915802 s
Found result in 0.024185657501220703 s
Did everything in 39.6002197265625 s


In [None]:
# Extract data

def build_wb_df(ind_dict):
  '''
   Returns dataframe with all selected indicators
   `ind_dict`: dictionary of indicators and respective IDs
   Example: {'Access to electricity, rural (% of rural population)': 'EG.ELC.ACCS.RU.ZS', 'Arable land (% of land area)': 'AG.LND.ARBL.ZS'}

   The result has the columns 'Country' and 'Year' followed by one column per
   indicator, named after the indicator (the keys of `ind_dict`). An empty
   `ind_dict` yields an empty dataframe.
  '''

  frames = []
  for name, indicator_id in ind_dict.items():
    #'date' parameter seems to be able to select one year only. So currently extracting all data available
    time.sleep(1)   # short pause between consecutive downloads

    # Assumes the downloaded series is indexed by Country, Series and Year,
    # as the column names used below require.
    df = wb.get_series(indicator_id).to_frame().reset_index()
    df = df.drop(['Series'], axis=1)

    # reset_index puts the former index levels first, so the values are in the
    # last column. Naming it here (instead of renaming by position after the
    # merge) keeps every column tied to its own indicator.
    df = df.rename(columns={df.columns[-1]: name})
    frames.append(df)

  if not frames:
    return pd.DataFrame()

  data_frame = frames[0]
  for df in frames[1:]:
    # Outer join keeps country/year pairs that only some indicators report.
    data_frame = pd.merge(data_frame, df, on = ['Country','Year'], how = 'outer')

  return data_frame
    


In [None]:
# Download every indicator in `ind_dict` and combine them into one table.
df1 = build_wb_df(ind_dict)

In [None]:
# Filtering year and countries

# Years to keep, as strings: 2010 through 2020 inclusive.
years = [str(year) for year in range(2010, 2021)]

# Countries to keep, spelled as they should appear in the 'Country' column.
# NOTE(review): the World Bank may label 'Gambia' as 'Gambia, The' - confirm
# that this entry actually matches rows in the downloaded data.
countries = [
    'Algeria', 'Angola', 'Burkina Faso', 'Benin',
    'Botswana', 'Burundi', 'Cabo Verde', 'Cameroon',
    'Central African Republic', 'Chad', 'Comoros', 'Congo, Dem. Rep.',
    'Djibouti', 'Egypt, Arab Rep.', 'Equatorial Guinea', 'Eritrea',
    'Eswatini', 'Ethiopia', 'Gambia', 'Gabon',
    'Ghana', 'Guinea', 'Guinea-Bissau', 'Kenya',
    'Lesotho', 'Liberia', 'Libya', 'Madagascar',
    'Malawi', 'Mali', 'Mauritania', 'Mauritius',
    'Morocco', 'Mozambique', 'Namibia', 'Niger',
    'Nigeria', 'Rwanda', 'Sao Tome and Principe', 'Senegal',
    'Seychelles', 'Sierra Leone', 'Somalia', 'South Africa',
    'South Sudan', 'Sudan', 'Tanzania', 'Togo',
    'Tunisia', 'Uganda', 'Zambia', 'Zimbabwe',
]

def filter_wb(df, years, countries):
  '''
   Returns filtered dataframe for specific `years` and `countries`
   `df`: dataframe returned by function `build_wb_df`
   `years`: list of input years as strings. Example: ['2010', '2011']
   `countries`: list of countries as strings. Example: ['Algeria', 'Angola']

   A row is kept when its 'Year' is in `years` and its 'Country' is in
   `countries`; an empty list therefore yields an empty result. The row labels
   of `df` are preserved in an extra leading 'index' column.
  '''

  # Membership tests replace the hand-built chain of `==` comparisons and also
  # work when a list is empty.
  mask_year = df['Year'].isin(list(years))
  mask_country = df['Country'].isin(list(countries))

  return df[mask_year & mask_country].reset_index()

In [None]:
# Keep only the rows for the selected years and countries.
df = filter_wb(df1, years, countries)

# Preview the first rows of the filtered table.
df.head(20)

Unnamed: 0,index,Country,Year,"Access to electricity, rural (% of rural population)",Arable land (% of land area),Social contributions (current LCU)
0,3039,Algeria,2010,97.4341,3.149798,
1,3040,Algeria,2011,97.563957,3.149798,
2,3041,Algeria,2012,97.673732,3.151687,
3,3042,Algeria,2013,98.573072,3.147363,
4,3043,Algeria,2014,99.479064,3.136111,
5,3044,Algeria,2015,99.764565,3.133046,
6,3045,Algeria,2016,99.965142,3.108736,
7,3046,Algeria,2017,100.0,,
8,3047,Algeria,2018,100.0,,
9,3048,Algeria,2019,,,
