ETL is a process that allows you to extract data from various sources, transform it according to your requirements and finally load it into a database or data format of your choice.

### Extract

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import json
import numpy as np

# get html data first
html_data = requests.get("https://en.wikipedia.org/wiki/World_Happiness_Report")

# check if status is 200 -> shows that its allowed to scrape the webpage
print(html_data.status_code)

# parse html data now using BeautifulSoup
soup = BeautifulSoup(html_data.text, "html.parser")

# get all tables from wikipedia page
tables = soup.find_all('table',{'class':"wikitable"})

# store target table
table = tables[4]

# convert table html code to pandas df
data = pd.read_html(str(table))
df_happiness = pd.DataFrame(data[0]) 
# print(table)
display(df_happiness)

200


Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Freedom to make life choices,Generosity,Perceptions of corruption,Unnamed: 8
0,1,Finland,7.632,1.305,1.592,0.874,0.681,0.202,0.393
1,2,Norway,7.594,1.456,1.582,0.861,0.686,0.286,0.340
2,3,Denmark,7.555,1.351,1.590,0.868,0.683,0.284,0.408
3,4,Iceland,7.495,1.343,1.644,0.914,0.677,0.353,0.138
4,5,Switzerland,7.487,1.420,1.549,0.927,0.660,0.256,0.357
...,...,...,...,...,...,...,...,...,...
151,152,Yemen,3.355,0.442,1.073,0.343,0.244,0.083,0.064
152,153,Tanzania,3.303,0.455,0.991,0.381,0.481,0.270,0.097
153,154,South Sudan,3.254,0.337,0.608,0.177,0.112,0.224,0.106
154,155,Central African Republic,3.083,0.024,0.000,0.010,0.305,0.218,0.038


In [2]:
from tqdm import tqdm

# rename some countries to later match the country names from RapidAPI
df_happiness = df_happiness.apply(lambda x: x.replace("Congo (Kinshasa)", "DR Congo"))
df_happiness = df_happiness.apply(lambda x: x.replace("Congo (Brazzaville)", "Congo"))
df_happiness = df_happiness.apply(lambda x: x.replace("Ivory Coast", "Côte d'Ivoire"))

# create URL and headers for API call
url = "https://world-population.p.rapidapi.com/allcountriesname"

# the headers can be found when logging in to your RapidAPI account and opening the link above
headers = {
    'x-rapidapi-host': "world-population.p.rapidapi.com",
    'x-rapidapi-key': "1ae0131274msh2c8337983915cbep16eba8jsnce5983341241"
}

# add population column first by setting all values to NaN
df_happiness["Population"] = np.nan

# loop over countries and get population
for country in tqdm(df_happiness["Country or region"].to_list()):
    # create querystring for API call
    querystring = {"country_name" : country}
    
    # create request and fetch response
    response = requests.request("GET", url, headers=headers, params=querystring)
    
    # add population to dataframe in case response is okay
    response_dict = json.loads(response.text)
    if response_dict["ok"] == True:
        population = response_dict["body"]["population"]
        df_happiness.loc[df_happiness["Country or region"] == country, "Population"] = population


  0%|                                                                                          | 0/156 [00:01<?, ?it/s]


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [4]:
# get html data first
html_data = requests.get("https://www.worlddata.info/average-age.php")

# check if status is 200 -> shows that its allowed to scrape the webpage
print(html_data.status_code)

# parse html data now using BeautifulSoup
soup = BeautifulSoup(html_data.text, "html.parser")

# get all tables from wikipedia page
tables = soup.find_all('table',{'class':"std100 hover"})

# store target table
table = tables[0]

# convert table html code to pandas df
data = pd.read_html(str(table))
df_average_age = pd.DataFrame(data[0]) 

display(df_average_age)

200


Unnamed: 0,Country,Median age in years,Population under 20 years old,Life expectancy in years
0,Japan,48.6,16.3 %,84.6
1,Germany,47.8,18.5 %,81.0
2,Italy,46.5,17.5 %,82.9
3,Hong Kong *,45.6,15.7 %,85.7
4,Greece,45.3,19.3 %,80.3
...,...,...,...,...
122,Chad,16.1,58.4 %,52.6
123,Mali,16.0,58.5 %,59.0
124,Angola,15.9,55.6 %,61.7
125,Uganda,15.7,57.1 %,62.7


In [5]:
df_final = df_happiness.set_index("Country or region").join(df_average_age.set_index("Country")).reset_index()

### Transform

In [6]:
# Compute GDP by using GDP per capita and the Population columns
df_final["GDP"] = df_final["GDP per capita"] * df_final["Population"]

# Remove % sign of Population under 20 years old column and convert it to be of type float
def transform_col(col_val):
    try: 
        return float(col_val.replace(" %", ""))
    except: # value is NaN
        return col_val

df_final["Population under 20 years old in %"] = df_final["Population under20 years old"].apply(transform_col)
df_final = df_final.drop(columns=["Population under20 years old"])

KeyError: 'Population under20 years old'

### Load

The Load step is the last step of the ETL pipeline.

In [8]:
def load(dataset):
    dataset.to_csv("final_dataset.csv", index=False)

load(df_final)