In [None]:
""" 

Creating Data Frame

1. Collect the name of all the countries per continent from English Wikipedia.

2. Create countries-continents pandas dataframe. Dataframe should have two columns: country, continent.

3. Collect the happiness score, GDP per capita, social support, healthy life expectancy, freedom to make life choices, generosity, and perceptions of corruption per country in 2019 from English Wikipedia and put all collected information in a dataframe.

4. Create a new dataframe with all the information that you collected and save it in a CSV.

 

Links:

https://en.wikipedia.org/wiki/World_Happiness_Report#2019_report
https://simple.wikipedia.org/wiki/List_of_countries_by_continents
Recommended libraries to use: 

BeautifulSoup - https://www.crummy.com/software/BeautifulSoup/bs4/doc/  #For HTML parsing 
requests   - https://pypi.org/project/requests/ #For downloading the HTML code of the Wikipedia page, we need to import the requests library


Attention: you can get creative and use different libraries. 
There is not one particular solution. Be creative and try to find your way. Collaborate with your peers, if you feel like you can not do it alone. 

Even if you do just part of it - bring it with you to the tutorial, so that we can discuss and improve it together.  


Disclaimer: if no one does it - then we can not have a proper discussion.  So it is in your best interests to actually invest time in this task!

Good luck! Looking forward to seeing your solutions! 

"""

In [None]:
""" 
Example DataFrame: 
Country | Continent
India   | Asia
Germany | Europe
China   | Asia


Flow:
    pypi -> requests module for downloading the HTML code from Wikipedia
    BeautifulSoup for parsing the downloaded HTML file

"""

In [181]:
import requests

# Download the countries-by-continent page. requests applies no timeout by
# default, so one is set explicitly to keep the cell from hanging forever on
# a stalled connection.
# NOTE(review): `r` is not read by any later cell visible here, and the next
# cell downloads the same URL again - this cell looks like a leftover.
r = requests.get(
    "https://simple.wikipedia.org/wiki/List_of_countries_by_continents#Asia",
    timeout=30,
)

In [184]:
import requests

url = "https://simple.wikipedia.org/wiki/List_of_countries_by_continents#Asia"  # Replace with the target website

# requests has no default timeout; set one so a stalled connection cannot
# block the notebook indefinitely.
response = requests.get(url, timeout=30)

# Stop here when the download failed. Only printing a message would leave
# `html_content` unassigned (or holding the page from an earlier run), so the
# parsing cells would fail later with a NameError or silently parse old data.
if response.status_code != 200:
    raise RuntimeError(f"Failed to retrieve data, Status Code: {response.status_code}")

html_content = response.text  # Get the raw HTML content


In [185]:
from bs4 import BeautifulSoup

In [186]:
# Parse the downloaded markup with Python's built-in HTML parser.
soup = BeautifulSoup(markup=html_content, features="html.parser")
# Accumulator for the scraped rows; starts out empty.
data = list()

In [187]:

# Show the text of the page's <title> element
title = soup.title.text
print("Page Title:", title)

# Print the href of every anchor tag (None is printed when an anchor has no href)
for anchor in soup("a"):
    print(anchor.attrs.get("href"))

# Print the text content of every paragraph element
for para in soup("p"):
    print(para.get_text())

Page Title: List of countries by continents - Simple English Wikipedia, the free encyclopedia
#bodyContent
/wiki/Main_Page
/wiki/Wikipedia:Simple_start
/wiki/Wikipedia:Simple_talk
/wiki/Special:RecentChanges
/wiki/Special:Random
/wiki/Help:Contents
//simple.wikipedia.org/wiki/Wikipedia:Contact_us
/wiki/Wikipedia:About
/wiki/Main_Page
/wiki/Special:Search
https://donate.wikimedia.org/?wmf_source=donate&wmf_medium=sidebar&wmf_campaign=simple.wikipedia.org&uselang=en
/w/index.php?title=Special:CreateAccount&returnto=List+of+countries+by+continents
/w/index.php?title=Special:UserLogin&returnto=List+of+countries+by+continents
https://donate.wikimedia.org/?wmf_source=donate&wmf_medium=sidebar&wmf_campaign=simple.wikipedia.org&uselang=en
/w/index.php?title=Special:CreateAccount&returnto=List+of+countries+by+continents
/w/index.php?title=Special:UserLogin&returnto=List+of+countries+by+continents
/wiki/Help:Introduction
/wiki/Special:MyContributions
/wiki/Special:MyTalk
#
#Africa
#Sovereign_sta

In [188]:
# Structure of the Wikipedia page:
#     <div class="mw-heading mw-heading2">  precedes each wikitable and
#                                           contains the continent's name
#     <table class="wikitable">             contains the countries

# Start from an empty list so that re-running this cell does not append
# every row a second time.
data = []

# find_all is the current spelling; findAll is a deprecated alias.
tables = soup.find_all("table", class_="wikitable")
for table in tables:
    # Nearest preceding section heading, e.g.
    # <div class="mw-heading mw-heading2"><h2 id="Africa">Africa</h2></div>
    continent_div = table.find_previous("div", class_="mw-heading2")
    if continent_div is None:
        # No heading before this table: there is no continent to assign.
        continue
    continent_name = continent_div.text.strip()

    # The first <tr> is skipped (presumably the header row - confirm).
    for row in table.find_all("tr")[1:]:
        cols = row.find_all("td")
        if len(cols) < 3:
            # The name is read from the third cell; shorter rows cannot be
            # indexed safely.
            continue
        country_td = cols[2].find("a")
        if country_td is None:
            # No link in the third cell, so there is no title to read.
            continue
        # NOTE(review): the recorded output lists entries such as
        # "Santa Cruz de Tenerife" and "Saint-Pierre (not yet started)" under
        # Africa, so the third cell's link is apparently not a country in
        # every table - confirm which tables should be included.
        country_name = country_td.get("title")
        if not country_name:
            continue
        print(continent_name, country_name)
        data.append([continent_name, country_name])

Africa Algeria
Africa Angola
Africa Benin
Africa Botswana
Africa Burkina Faso
Africa Burundi
Africa Cameroon
Africa Cape Verde
Africa Central African Republic
Africa Chad
Africa Comoros
Africa Democratic Republic of the Congo
Africa Republic of the Congo
Africa Djibouti
Africa Egypt
Africa Equatorial Guinea
Africa Eritrea
Africa Eswatini
Africa Ethiopia
Africa Gabon
Africa The Gambia
Africa Ghana
Africa Guinea
Africa Guinea-Bissau
Africa Ivory Coast
Africa Kenya
Africa Lesotho
Africa Liberia
Africa Libya
Africa Madagascar
Africa Malawi
Africa Mali
Africa Mauritania
Africa Mauritius
Africa Morocco
Africa Mozambique
Africa Namibia
Africa Niger
Africa Nigeria
Africa Rwanda
Africa São Tomé and Príncipe
Africa Senegal
Africa Seychelles
Africa Sierra Leone
Africa Somalia
Africa South Africa
Africa South Sudan
Africa Sudan
Africa Tanzania
Africa Togo
Africa Tunisia
Africa Uganda
Africa Zambia
Africa Zimbabwe
Africa Santa Cruz de Tenerife
Africa Saint-Pierre (not yet started)
Africa Funchal
Af

In [189]:
import pandas as pd
# Tabulate the scraped rows under the two column labels below.
df = pd.DataFrame(
    data,
    columns=["Continent", "Country"],
)

In [190]:
# Column dtypes and non-null counts of the country/continent frame.
df.info()
# Rows 80-99 of the frame. A notebook only renders a bare expression when it
# is the last statement of the cell, so the slice is placed last; previously
# it was evaluated and then discarded.
df.iloc[80:100]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204 entries, 0 to 203
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Continent  204 non-null    object
 1   Country    204 non-null    object
dtypes: object(2)
memory usage: 3.3+ KB


In [None]:
""" 
Collect the 
happiness score, GDP per capita, social support, healthy life expectancy, 
freedom to make life choices, generosity, and perceptions of corruption 
per country in 2019 from English Wikipedia and put all collected information in a dataframe.


https://en.wikipedia.org/wiki/World_Happiness_Report#2019_report


structure:
    parent: div, class = mw-heading, mw-heading3
        h3: id: 2019_report
    sibling of parent: div
        child: table

"""

In [191]:
import requests

url = "https://en.wikipedia.org/wiki/World_Happiness_Report#2019_report"  # Replace with the target website

# requests has no default timeout; set one so a stalled connection cannot
# block the notebook indefinitely.
response = requests.get(url, timeout=30)

# Stop here when the download failed. Only printing a message would leave
# `html_content` holding the page fetched by an earlier cell (or unassigned),
# which is easy to miss.
if response.status_code != 200:
    raise RuntimeError(f"Failed to retrieve data, Status Code: {response.status_code}")

# NOTE(review): the next cell requests the same URL again and reads
# response.text directly, so `html_content` from this cell appears unused.
html_content = response.text  # Get the raw HTML content

In [192]:
countries_content_data = []
# Wikipedia URL for World Happiness Report 2019
url = "https://en.wikipedia.org/wiki/World_Happiness_Report#2019_report"

# Fetch the webpage. The timeout prevents an indefinite hang and the status
# check keeps an HTTP error page from being parsed as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
# NOTE(review): this rebinds `soup`, which earlier cells use for the
# countries-by-continent page; re-running those cells afterwards would read
# this page instead.
soup = BeautifulSoup(response.text, "html.parser")

# Walk from the section heading to its table one step at a time. Each step
# yields None when the previous one failed, so the checks further down can
# report exactly which step broke.

# Step 1: Find the h3 tag with id="2019_report"
heading = soup.find("h3", id="2019_report")

# Step 2: Get its parent div containing the 2019 report section.
# Matching the single class "mw-heading3" does not depend on the exact
# spelling or order of the whole class attribute.
parent_div = (
    heading.find_parent("div", class_="mw-heading3") if heading is not None else None
)

# Step 3: Find the sibling div that contains the table
sibling_div = (
    parent_div.find_next_sibling("div") if parent_div is not None else None
)

# Step 4: Find the table within the sibling div
table = (
    sibling_div.find("table", class_="wikitable") if sibling_div is not None else None
)

rows = []
if table is not None:
    # html.parser only yields a <tbody> when the markup contains one, so fall
    # back to the table element itself.
    tbody = table.find("tbody")
    if tbody is None:
        tbody = table
    rows = tbody.find_all("tr")

if heading is None:
    print("The 2019 report heading (h3) not found.")
elif parent_div is None:
    print("Parent div containing the 2019 report section not found.")
elif sibling_div is None:
    print("No sibling div found containing the table.")
elif not rows:
    print("No table rows found in the 2019 report section.")
else:
    # Collect the happiness score, GDP per capita, social support, healthy
    # life expectancy, freedom to make life choices, generosity, and
    # perceptions of corruption.

    # The first row holds the headings; every <abbr> title becomes a column name.
    column_headings = rows[0]
    column_rows = column_headings.find_all("abbr")
    df_columns = ["Country"]
    for col in column_rows:
        # Fall back to the visible text when an <abbr> carries no title
        # attribute (a missing title would otherwise raise a TypeError below).
        title = col.get("title") or col.get_text(strip=True)
        # Keep only what follows the first colon, when there is one.
        _, colon, result = title.partition(":")
        df_columns.append(result.strip() if colon else title)

    # The header row goes first; the data rows follow.
    countries_content_data.append(df_columns)

    for row in rows[1:]:
        # The first cell of each row is dropped (presumably the rank column -
        # confirm against the page).
        tds = row.find_all("td")[1:]
        if not tds:
            # Rows without data cells would only add an empty record.
            continue
        print("Start")
        country_data = []
        for position, td in enumerate(tds):
            # Only the country cell (the first remaining one) is read through
            # its link; in the other cells a link could only be a footnote and
            # must not replace the value.
            country_aref = td.find("a") if position == 0 else None
            if country_aref is not None:
                print(country_aref.text)
                country_data.append(country_aref.text.strip())
            else:
                country_data.append(td.text.strip())
        print(country_data)
        countries_content_data.append(country_data)
        print("End")

Start
Finland
['Finland', '7.769', '1.340', '1.587', '0.986', '0.596', '0.153', '0.393']
End
Start
Denmark
['Denmark', '7.600', '1.383', '1.573', '0.996', '0.592', '0.252', '0.410']
End
Start
Norway
['Norway', '7.554', '1.488', '1.582', '1.028', '0.603', '0.271', '0.341']
End
Start
Iceland
['Iceland', '7.494', '1.380', '1.624', '1.026', '0.591', '0.354', '0.118']
End
Start
Netherlands
['Netherlands', '7.488', '1.396', '1.522', '0.999', '0.557', '0.322', '0.298']
End
Start
Switzerland
['Switzerland', '7.480', '1.452', '1.526', '1.052', '0.572', '0.263', '0.343']
End
Start
Sweden
['Sweden', '7.343', '1.387', '1.487', '1.009', '0.574', '0.267', '0.373']
End
Start
New Zealand
['New Zealand', '7.307', '1.303', '1.557', '1.026', '0.585', '0.330', '0.380']
End
Start
Canada
['Canada', '7.278', '1.365', '1.505', '1.039', '0.584', '0.285', '0.308']
End
Start
Austria
['Austria', '7.246', '1.376', '1.475', '1.016', '0.532', '0.244', '0.226']
End
Start
Australia
['Australia', '7.228', '1.372', '1.5

In [193]:
# Further notes on the page structure:
#   table
#       has tbody
#           has 2 rows
#               the 2nd row has a td
#                   that td holds the table of interest
#                       (a table whose class is "wikitable")
#
# So the table can be found by searching for a child table with class
# "wikitable".

# Dump the scraped rows (header row first) to check them by eye.
print(countries_content_data)

[['Country', 'Happiness score', 'GDP', 'Social support', 'Healthy life expectancy', 'Freedom to make life choices', 'Generosity', 'Perceptions of corruption'], ['Finland', '7.769', '1.340', '1.587', '0.986', '0.596', '0.153', '0.393'], ['Denmark', '7.600', '1.383', '1.573', '0.996', '0.592', '0.252', '0.410'], ['Norway', '7.554', '1.488', '1.582', '1.028', '0.603', '0.271', '0.341'], ['Iceland', '7.494', '1.380', '1.624', '1.026', '0.591', '0.354', '0.118'], ['Netherlands', '7.488', '1.396', '1.522', '0.999', '0.557', '0.322', '0.298'], ['Switzerland', '7.480', '1.452', '1.526', '1.052', '0.572', '0.263', '0.343'], ['Sweden', '7.343', '1.387', '1.487', '1.009', '0.574', '0.267', '0.373'], ['New Zealand', '7.307', '1.303', '1.557', '1.026', '0.585', '0.330', '0.380'], ['Canada', '7.278', '1.365', '1.505', '1.039', '0.584', '0.285', '0.308'], ['Austria', '7.246', '1.376', '1.475', '1.016', '0.532', '0.244', '0.226'], ['Australia', '7.228', '1.372', '1.548', '1.036', '0.557', '0.332', '0.

In [194]:
# Row 0 of the scraped list is the header; every later row is one country's
# record.
# NOTE(review): the scraped values are strings, so every column ends up with
# object dtype; convert the metric columns (e.g. pd.to_numeric) before doing
# numeric work on them.
content_df = pd.DataFrame(
    data=countries_content_data[1:],
    columns=list(countries_content_data[0]),
)

In [195]:
# Summarise the happiness-statistics frame: column names, non-null counts and dtypes.
content_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 8 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Country                       156 non-null    object
 1   Happiness score               156 non-null    object
 2   GDP                           156 non-null    object
 3   Social support                156 non-null    object
 4   Healthy life expectancy       156 non-null    object
 5   Freedom to make life choices  156 non-null    object
 6   Generosity                    156 non-null    object
 7   Perceptions of corruption     156 non-null    object
dtypes: object(8)
memory usage: 9.9+ KB


In [196]:
# Summarise the country/continent frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204 entries, 0 to 203
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Continent  204 non-null    object
 1   Country    204 non-null    object
dtypes: object(2)
memory usage: 3.3+ KB


In [197]:
# Outer join on the shared "Country" column: countries present in only one of
# the two frames are kept, with the other frame's columns left empty.
merged_df = content_df.merge(df, how="outer", on="Country")

In [198]:
# Summarise the merged frame; columns whose non-null count is below the row
# total contain countries that were missing from one of the two sources.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Country                       214 non-null    object
 1   Happiness score               156 non-null    object
 2   GDP                           156 non-null    object
 3   Social support                156 non-null    object
 4   Healthy life expectancy       156 non-null    object
 5   Freedom to make life choices  156 non-null    object
 6   Generosity                    156 non-null    object
 7   Perceptions of corruption     156 non-null    object
 8   Continent                     204 non-null    object
dtypes: object(9)
memory usage: 15.2+ KB


In [199]:
# Display the merged frame (the notebook renders the cell's final expression).
merged_df

Unnamed: 0,Country,Happiness score,GDP,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Continent
0,Afghanistan,3.203,0.350,0.517,0.361,0.000,0.158,0.025,Asia
1,Albania,4.719,0.947,0.848,0.874,0.383,0.178,0.027,Europe
2,Algeria,5.211,1.002,1.160,0.785,0.086,0.073,0.114,Africa
3,Andorra,,,,,,,,Europe
4,Angola,,,,,,,,Africa
...,...,...,...,...,...,...,...,...,...
209,Venezuela,4.707,0.960,1.427,0.805,0.154,0.064,0.047,South America
210,Vietnam,5.175,0.741,1.346,0.851,0.543,0.147,0.073,Asia
211,Yemen,3.380,0.287,1.163,0.463,0.143,0.108,0.077,Asia
212,Zambia,4.107,0.578,1.058,0.426,0.431,0.247,0.087,Africa


In [200]:
# Write each frame to its own UTF-8 CSV file, without the index column.
for frame, csv_name in (
    (merged_df, "merged_output.csv"),
    (df, "country_continent.csv"),
    (content_df, "country_statistics.csv"),
):
    frame.to_csv(csv_name, index=False, encoding="utf-8")