# Create your own Dataset

# Extract

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import json
import numpy as np

# get html data first
html_data = requests.get("https://en.wikipedia.org/wiki/World_Happiness_Report")

# check if status is 200 -> shows that its allowed to scrape the webpage
print(html_data.status_code)


200


In [5]:
# parse html data now using BeautifulSoup
soup = BeautifulSoup(html_data.text, "html.parser")

# get all tables from wikipedia page
tables = soup.find_all('table',{'class':"wikitable"})

In [7]:
# store target table
table = tables[0]
# convert table html code to pandas df
data = pd.read_html(str(table))
df_happiness = pd.DataFrame(data[0]) 
df_happiness

Unnamed: 0,Overall rank,Country or region
0,1,Finland
1,2,Denmark
2,3,Iceland
3,4,Israel
4,5,Netherlands
...,...,...
132,133,"Congo, Democratic Republic of"
133,134,Zimbabwe
134,135,Sierra Leone
135,136,Lebanon


In [9]:
from tqdm import tqdm

# rename some countries to later match the country names from RapidAPI
df_happiness = df_happiness.apply(lambda x: x.replace("Congo (Kinshasa)", "DR Congo"))
df_happiness = df_happiness.apply(lambda x: x.replace("Congo (Brazzaville)", "Congo"))
df_happiness = df_happiness.apply(lambda x: x.replace("Ivory Coast", "Côte d'Ivoire"))

# create URL and headers for API call
url = "https://world-population.p.rapidapi.com/population"

# the headers can be found when logging in to your RapidAPI account and opening the link above
headers = {
    'x-rapidapi-host': "world-population.p.rapidapi.com",
    'x-rapidapi-key': "***************************"
}

# add population column first by setting all values to NaN
df_happiness["Population"] = np.nan

# loop over countries and get population
for country in tqdm(df_happiness["Country or region"].to_list()):
    # create querystring for API call
    querystring = {"country_name" : country}
    
    # create request and fetch response
    response = requests.request("GET", url, headers=headers, params=querystring)
    
    # add population to dataframe in case response is okay
    response_dict = json.loads(response.text)
    if response_dict["ok"] == True:
        population = response_dict["body"]["population"]
        df_happiness.loc[df_happiness["Country or region"] == country, "Population"] = population

  0%|                                                                                          | 0/137 [00:00<?, ?it/s]


KeyError: 'ok'

In [10]:
# get html data first
html_data = requests.get("https://www.worlddata.info/average-age.php")

# check if status is 200 -> shows that its allowed to scrape the webpage
print(html_data.status_code)

# parse html data now using BeautifulSoup
soup = BeautifulSoup(html_data.text, "html.parser")

# get all tables from wikipedia page
tables = soup.find_all('table',{'class':"std100 hover"})

# store target table
table = tables[0]

# convert table html code to pandas df
data = pd.read_html(str(table))
df_average_age = pd.DataFrame(data[0])
df_average_age

200


Unnamed: 0,Country,Median age in years,Population under 20 years old,Life expectancy in years
0,Japan,48.6,16.3 %,84.6
1,Germany,47.8,18.5 %,81.0
2,Italy,46.5,17.5 %,82.9
3,Hong Kong *,45.6,15.7 %,85.7
4,Greece,45.3,19.3 %,80.3
...,...,...,...,...
122,Chad,16.1,58.4 %,52.6
123,Mali,16.0,58.5 %,59.0
124,Angola,15.9,55.6 %,61.7
125,Uganda,15.7,57.1 %,62.7


In [12]:
# let's use pandas join functionality for joining these tables together
df_final = df_happiness.set_index("Country or region").join(df_average_age.set_index("Country")).reset_index()
df_final

Unnamed: 0,Country or region,Overall rank,Population,Median age in years,Population under 20 years old,Life expectancy in years
0,Finland,1,,42.8,20.9 %,82.0
1,Denmark,2,,42.0,22.0 %,81.5
2,Iceland,3,,,,
3,Israel,4,,30.4,36.0 %,82.6
4,Netherlands,5,,42.8,21.3 %,81.5
...,...,...,...,...,...,...
132,"Congo, Democratic Republic of",133,,,,
133,Zimbabwe,134,,20.5,52.1 %,59.3
134,Sierra Leone,135,,19.1,50.3 %,60.1
135,Lebanon,136,,,,


# Transform

In [13]:
# Compute GDP by using GDP per capita and the Population columns
df_final["GDP"] = df_final["GDP per capita"] * df_final["Population"]

# Remove % sign of Population under 20 years old column and convert it to be of type float
def transform_col(col_val):
    try: 
        return float(col_val.replace(" %", ""))
    except: # value is NaN
        return col_val

df_final["Population under 20 years old in %"] = df_final["Population under20 years old"].apply(transform_col)
df_final = df_final.drop(columns=["Population under20 years old"])

KeyError: 'GDP per capita'

# Load

In [14]:
def load(dataset):
    dataset.to_csv("final_dataset.csv", index=False)

load(df_final)