# WEB SCRAPING


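The Wikipedia page 'https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)' lists the nominal Gross Domestic Product (GDP) of each country, with separate estimates from the IMF, the World Bank and the UN. Because GDP is a common input for many analyses, this script loads the table from that page into a pandas DataFrame and cleans it (column names and reference years).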

### Libraries

In [11]:
# libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import html5lib

### Helper function

In [12]:
def clean_year(years):
    """Return a list with one integer year per entry of *years*.

    Each entry is converted to text and searched for four-digit years; the
    last one found is used, so a footnote marker before the year (for
    example ``'[n 1]2022'``) is ignored.  Entries that contain no four-digit
    year (a dash placeholder, an empty string, NaN, ...) are treated as
    missing and inherit the year of the preceding entry.

    Args:
        years: Iterable of raw year cells.  Strings, ints and floats are all
            accepted, so a pandas column works whether or not it was parsed
            as numeric.

    Returns:
        A list of ``int`` with the same length as *years*.

    Raises:
        IndexError: If an entry has no year and there is no preceding entry
            to copy a year from.
    """
    import re  # local import: keeps the helper usable without touching the file's import cell

    # Four digits not followed by another digit, i.e. the last four digits of
    # any digit run -- the same digits the previous `y[-4:]` slice selected.
    year_pattern = re.compile(r"\d{4}(?!\d)")

    cleaned = []
    for raw in years:
        # str() lets numeric cells through; calling len() on them raised TypeError.
        candidates = year_pattern.findall(str(raw))
        if candidates:
            cleaned.append(int(candidates[-1]))
        elif cleaned:
            # No year in this cell: carry the previous year forward.
            cleaned.append(cleaned[-1])
        else:
            raise IndexError(
                f"cannot infer a year for leading entry {raw!r}: "
                "no preceding year to carry forward"
            )

    return cleaned

### DATA GATHERING

In [13]:
# libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import html5lib

# Source page holding the nominal-GDP-by-country table.
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)'

# Keep only tables whose text matches 'Country/Territory' and take the first
# hit. skiprows=1 skips one row of the parsed table and header=0 uses the
# first remaining row as the column names.
gdp = pd.read_html(
    url,
    match='Country/Territory',
    skiprows=1,
    header=0,
)[0]

### DATA CLEANING

In [14]:
# Rename the scraped headers to upper-case analysis names. pandas labels the
# repeated 'Estimate' / 'Year' headers with '.1' and '.2' suffixes; they are
# mapped here to the IMF, World Bank and UN columns respectively.
dt = gdp.rename(
    columns={
        'Country/Territory': 'COUNTRY',
        'UN Region': 'REGION',
        'Estimate': 'IMF',
        'Year': 'IMF_YEAR',
        'Estimate.1': 'WORLD_BANK',
        'Year.1': 'WORLD_BANK_YEAR',
        'Estimate.2': 'UN',
        'Year.2': 'UN_YEAR',
    },
)

# Normalise each reference-year column to plain integers.
for y in ('IMF_YEAR', 'WORLD_BANK_YEAR', 'UN_YEAR'):
    dt[y] = clean_year(dt[y])

In [15]:
# Preview the first three cleaned rows as a quick sanity check.
dt.head(n=3)

Unnamed: 0,COUNTRY,REGION,IMF,IMF_YEAR,WORLD_BANK,WORLD_BANK_YEAR,UN,UN_YEAR
0,World,—,101560901,2022,96513077,2021,85328323,2020
1,United States,Americas,25035164,2022,22996100,2021,20893746,2020
2,China,Asia,18321197,2022,17734063,2021,14722801,2020
