In [1]:
# dependencies
import json
import requests
import pandas as pd
from pprint import pprint
# need to pip install world bank data to access more detailed info
# pip install -U wbdata
import wbdata

# testing wbdata install: if the import above worked, this call succeeds and,
# as the last expression of the cell, shows the id/name table of the data
# sources wbdata reports
wbdata.get_source()

  id  name
----  --------------------------------------------------------------------
   1  Doing Business
   2  World Development Indicators
   3  Worldwide Governance Indicators
   5  Subnational Malnutrition Database
   6  International Debt Statistics
  11  Africa Development Indicators
  12  Education Statistics
  13  Enterprise Surveys
  14  Gender Statistics
  15  Global Economic Monitor
  16  Health Nutrition and Population Statistics
  18  IDA Results Measurement System
  19  Millennium Development Goals
  20  Quarterly Public Sector Debt
  22  Quarterly External Debt Statistics SDDS
  23  Quarterly External Debt Statistics GDDS
  24  Poverty and Equity
  25  Jobs
  27  Global Economic Prospects
  28  Global Financial Inclusion
  29  The Atlas of Social Protection: Indicators of Resilience and Equity
  30  Exporter Dynamics Database – Indicators at Country-Year Level
  31  Country Policy and Institutional Assessment
  32  Global Financial Development
  33  G20 Financial Inclus

In [2]:
# finding available indicators to refine data:
# search for 'gdp per capita' and list the matching indicator ids and names;
# the id NY.GDP.PCAP.CD (GDP per capita, current US$) requested further down
# is taken from this listing
wbdata.search_indicators('gdp per capita')

id                          name
--------------------------  ----------------------------------------------------------------------------------------
6.0.GDPpc_constant          GDP per capita, PPP (constant 2011 international $)
FB.DPT.INSU.PC.ZS           Deposit insurance coverage (% of GDP per capita)
NV.AGR.PCAP.KD.ZG           Real agricultural GDP per capita growth rate (%)
NY.GDP.PCAP.CD              GDP per capita (current US$)
NY.GDP.PCAP.CN              GDP per capita (current LCU)
NY.GDP.PCAP.KD              GDP per capita (constant 2010 US$)
NY.GDP.PCAP.KD.ZG           GDP per capita growth (annual %)
NY.GDP.PCAP.KN              GDP per capita (constant LCU)
NY.GDP.PCAP.PP.CD           GDP per capita, PPP (current international $)
NY.GDP.PCAP.PP.KD           GDP per capita, PPP (constant 2017 international $)
NY.GDP.PCAP.PP.KD.87        GDP per capita, PPP (constant 1987 international $)
NY.GDP.PCAP.PP.KD.ZG        GDP per capita, PPP annual growth (%)
SE.XPD.PRIM.PC.ZS   

In [3]:
# confirming country list
# NOTE: the listing mixes individual countries with regional / lending-group
# aggregates (e.g. "Africa", "Arab World"), so data requested for 'all'
# contains aggregate rows as well as countries
wbdata.get_country()

id    name
----  --------------------------------------------------------------------------------
ABW   Aruba
AFG   Afghanistan
AFR   Africa
AGO   Angola
ALB   Albania
AND   Andorra
ARB   Arab World
ARE   United Arab Emirates
ARG   Argentina
ARM   Armenia
ASM   American Samoa
ATG   Antigua and Barbuda
AUS   Australia
AUT   Austria
AZE   Azerbaijan
BDI   Burundi
BEA   East Asia & Pacific (IBRD-only countries)
BEC   Europe & Central Asia (IBRD-only countries)
BEL   Belgium
BEN   Benin
BFA   Burkina Faso
BGD   Bangladesh
BGR   Bulgaria
BHI   IBRD countries classified as high income
BHR   Bahrain
BHS   Bahamas, The
BIH   Bosnia and Herzegovina
BLA   Latin America & the Caribbean (IBRD-only countries)
BLR   Belarus
BLZ   Belize
BMN   Middle East & North Africa (IBRD-only countries)
BMU   Bermuda
BOL   Bolivia
BRA   Brazil
BRB   Barbados
BRN   Brunei Darussalam
BSS   Sub-Saharan Africa (IBRD-only countries)
BTN   Bhutan
BWA   Botswana
CAA   Sub-Saharan Africa (IFC classification)
CAF   Centr

In [4]:
# Collect the name of every entry wbdata lists. NOTE: this list is not passed
# to the request below, which asks for 'all' directly.
countries = list(map(lambda entry: entry['name'], wbdata.get_country()))

# World Bank indicator id -> column name used in the resulting dataframe
indicators = {
    "NY.GDP.PCAP.CD": "gdppc",
}

# Fetch Gross Domestic Product per capita for all entries into a dataframe
df = wbdata.get_dataframe(indicators, country='all')

# last expression of the cell: display the fetched frame
df

Unnamed: 0_level_0,Unnamed: 1_level_0,gdppc
country,date,Unnamed: 2_level_1
Arab World,2020,
Arab World,2019,6584.740241
Arab World,2018,6601.825189
Arab World,2017,6108.588220
Arab World,2016,6112.105953
...,...,...
Zimbabwe,1964,281.558440
Zimbabwe,1963,277.479715
Zimbabwe,1962,276.688781
Zimbabwe,1961,280.828951


In [6]:
# saving dataframe as a csv to make edits easier
# (df is indexed by country and date, so both index levels are written out
# and come back as ordinary columns when the file is read again)
df.to_csv(r'gdppc.csv')

In [13]:
# Reload the saved CSV and keep only the rows for 2019, a recent year
# with a decent amount of data.
df2 = pd.read_csv('gdppc.csv').loc[lambda frame: frame['date'] == 2019]

In [14]:
# display the 2019 subset (row labels are the positions from the full file)
df2

Unnamed: 0,country,date,gdppc
1,Arab World,2019,6584.740241
62,Caribbean small states,2019,10500.971902
123,Central Europe and the Baltics,2019,16298.478764
184,Early-demographic dividend,2019,3644.014158
245,East Asia & Pacific,2019,11502.865111
306,East Asia & Pacific (excluding high income),2019,8192.143336
367,East Asia & Pacific (IDA & IBRD countries),2019,8278.073975
428,Euro area,2019,38976.34288
489,Europe & Central Asia,2019,24742.719826
550,Europe & Central Asia (excluding high income),2019,8133.573458


In [15]:
# confirming that most countries this year have gdp info
# Lift pandas' row/column display limits only while printing: option_context
# restores the previous settings when the block exits, so the rest of the
# session keeps the normal truncated display instead of inheriting a
# notebook-wide change.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df2)

                                                 country  date          gdppc
1                                             Arab World  2019    6584.740241
62                                Caribbean small states  2019   10500.971902
123                       Central Europe and the Baltics  2019   16298.478764
184                           Early-demographic dividend  2019    3644.014158
245                                  East Asia & Pacific  2019   11502.865111
306          East Asia & Pacific (excluding high income)  2019    8192.143336
367           East Asia & Pacific (IDA & IBRD countries)  2019    8278.073975
428                                            Euro area  2019   38976.342880
489                                Europe & Central Asia  2019   24742.719826
550        Europe & Central Asia (excluding high income)  2019    8133.573458
611         Europe & Central Asia (IDA & IBRD countries)  2019    9017.538951
672                                       European Union  2019  

In [20]:
# saving as a new csv
# (to_csv writes the row index by default, which is why the next cell finds
# an "Unnamed: 0" column when it reads this file back)
df2.to_csv(r'gdppc2019.csv')

In [23]:
# dependencies
import pandas as pd

# Reload the 2019 snapshot saved earlier.
df3 = pd.read_csv('gdppc2019.csv')

# The snapshot was written together with its row index, which comes back as
# an "Unnamed: 0" column. Drop it by name; errors="ignore" keeps this cell
# working if the file was written without an index column.
df3 = df3.drop(columns=["Unnamed: 0"], errors="ignore")

# Remove rows that have a missing value (dropna() works on rows by default),
# then renumber the remaining rows from 0.
df3 = df3.dropna().reset_index(drop=True)

# Verifying dataframe is how we want it: show gdppc as dollars with
# thousands separators and two decimals, without the index column.
format_dict = {'gdppc':'${0:,.2f}'}
styled = df3.style.format(format_dict)

# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0 in
# favour of Styler.hide(axis="index"); use whichever this pandas provides.
styled.hide(axis="index") if hasattr(styled, "hide") else styled.hide_index()

country,date,gdppc
Arab World,2019,"$6,584.74"
Caribbean small states,2019,"$10,500.97"
Central Europe and the Baltics,2019,"$16,298.48"
Early-demographic dividend,2019,"$3,644.01"
East Asia & Pacific,2019,"$11,502.87"
East Asia & Pacific (excluding high income),2019,"$8,192.14"
East Asia & Pacific (IDA & IBRD countries),2019,"$8,278.07"
Euro area,2019,"$38,976.34"
Europe & Central Asia,2019,"$24,742.72"
Europe & Central Asia (excluding high income),2019,"$8,133.57"


In [24]:
# saving the new csv
# index=False: after the reset above the row index is just 0..n-1, and
# writing it would put an unnamed first column back into the "clean" file.
df3.to_csv(r'gdppc2019clean.csv', index=False)