In [None]:
""" 

Creating Data Frame

1. Collect the name of all the countries per continent from English Wikipedia.

2. Create countries-continents pandas dataframe. Dataframe should have two columns: country, continent.

3. Collect the happiness score, GDP per capital, social support, healthy life expectancy, freedom to make life choices, generosity, and perceptions of corruption per country in 2019 from English Wikipedia and put all collected information in a dataframe.

4. Create a new dataframe with all the information that you collected and save it in a CSV.

 

Links:

https://en.wikipedia.org/wiki/World_Happiness_Report#2019_report
https://simple.wikipedia.org/wiki/List_of_countries_by_continents
Recommended libraries to use: 

Beautifuisoup - https://www.crummy.com/software/BeautifulSoup/bs4/doc/  #For HTM parsing 
requests   - https://pypi.org/project/requests/ #For downloading the HTML code for the Wikipedia page, we need to import the requests' library


Attention: you can get creative and use different libraries. 
There is not one particular solution. Be creative and try to find your way. Collaborate with your peers, if you feel like you can not do it alone. 

Even if you do just part of it - bring it with you to the tutorial, so that we can discuss and improve it together.  


Disclaimer: if no one does it - then we can not have a proper discussion.  So it is in your best interests to actually invest time in this task!

Good luck! Looking forward to see your solutions! 

"""

In [None]:
""" 
Example DataFrame: 
Country | Continent
India   | Asia
Germany | Europe
China   | Asia


Flow:
    pypi -> request module for downloading the HTML code from wikipedia
    BeautifulSoup for parsing the downloaded HTML file

"""

In [203]:
import requests

r = requests.get("https://simple.wikipedia.org/wiki/List_of_countries_by_continents#Asia")

In [204]:
import requests

url = "https://simple.wikipedia.org/wiki/List_of_countries_by_continents#Asia"  # Replace with the target website
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    html_content = response.text  # Get the raw HTML content
    # print(html_content)
else:
    print(f"Failed to retrieve data, Status Code: {response.status_code}")


In [205]:
from bs4 import BeautifulSoup

In [206]:
soup = BeautifulSoup(html_content, "html.parser")
data = []

In [208]:
"""
Structure for wikipedia:
    <div class = "mw-heading mw-heading2">
    Before each wikitable, there is a div 
    the div contains the continents name
    table contains the countries 
    <table class = "wikitable">
"""


tables = soup.findAll("table", class_="wikitable")
# print(tables)
for table in tables:
    continent_div = table.find_previous("div", class_="mw-heading2")
    # continent div: <div class="mw-heading mw-heading2"><h2 id="Africa">Africa</h2></div>
    # print(continent_div.text.strip())
    continent_name = continent_div.text.strip()
    for row in table.find_all("tr")[1:]:
        cols = row.find_all("td")
        if(cols):
            country_td = cols[2].find("a")
            country_name = country_td.get("title")
            data.append([continent_name, country_name])

In [209]:
import pandas as pd
df = pd.DataFrame(data=data, columns=["Continent", "Country"])

In [210]:
df.head(100)[80:]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 408 entries, 0 to 407
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Continent  408 non-null    object
 1   Country    408 non-null    object
dtypes: object(2)
memory usage: 6.5+ KB


In [None]:
""" 
Collect the 
happiness score, GDP per capital, social support, healthy life expectancy, 
freedom to make life choices, generosity, and perceptions of corruption 
per country in 2019 from English Wikipedia and put all collected information in a dataframe.


https://en.wikipedia.org/wiki/World_Happiness_Report#2019_report


structure:
    parent: div, class = mw-heading, mw-heading3
        h3: id: 2019_report
    sibling of parent: div
        child: table

"""

In [211]:
import requests

url = "https://en.wikipedia.org/wiki/World_Happiness_Report#2019_report"  # Replace with the target website
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    html_content = response.text  # Get the raw HTML content
    # print(html_content)
else:
    print(f"Failed to retrieve data, Status Code: {response.status_code}")

In [212]:
countries_content_data = []
# Wikipedia URL for World Happiness Report 2019
url = "https://en.wikipedia.org/wiki/World_Happiness_Report#2019_report"

# Fetch the webpage
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

# Step 1: Find the h3 tag with id="2019_report"
heading = soup.find("h3", id="2019_report")

if heading:
    # Step 2: Get its parent div containing the 2019 report section
    parent_div = heading.find_parent("div", class_="mw-heading mw-heading3")
    if parent_div:
        # Step 3: Find the sibling div that contains the table
        sibling_div = parent_div.find_next_sibling("div")
        if sibling_div:
            # Step 4: Find the table within the sibling div
            table = sibling_div.find("table", class_="wikitable")
            # print(table)
            # Print or process the table
            # print(table)
            tbody = table.find("tbody")
            # print(tbody)
            # print(tbody)

            """
            Collect the 
            happiness score, GDP per capital, social support, healthy life expectancy, 
            freedom to make life choices, generosity, and perceptions of corruption 
            """
           
            rows = tbody.find_all("tr")

            column_headings = rows[0]
            column_rows = column_headings.find_all("abbr")
            # print(column_headings)
            df_columns = ["Country"]
            for col in column_rows:
                title = col.get("title")
                if ":" in title:
                    result = title.split(":", 1)[1].strip() 
                    df_columns.append(result)
                else:
                    df_columns.append(title)
            

            countries_content_data.append(df_columns)

            for row in rows[1:]:
                # print(row)
                tds = row.find_all("td")[1:]
                
                country_data = []
                for td in tds:
                    # print(td.text.strip())
                    country_aref = td.find("a")
                    if(country_aref):
                       
                        country_data.append(country_aref.text.strip())
                    else:
                        country_data.append(td.text.strip())
                        
               
                countries_content_data.append(country_data)
           
                

            # print(column_headings)
            # print(rows)
        
        else:
            print("No sibling div found containing the table.")
    else:
        print("Parent div containing the 2019 report section not found.")
else:
    print("The 2019 report heading (h3) not found.")

In [214]:
content_df = pd.DataFrame(countries_content_data[1:], columns=countries_content_data[0][:])

In [215]:
content_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 8 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Country                       156 non-null    object
 1   Happiness score               156 non-null    object
 2   GDP                           156 non-null    object
 3   Social support                156 non-null    object
 4   Healthy life expectancy       156 non-null    object
 5   Freedom to make life choices  156 non-null    object
 6   Generosity                    156 non-null    object
 7   Perceptions of corruption     156 non-null    object
dtypes: object(8)
memory usage: 9.9+ KB


In [216]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 408 entries, 0 to 407
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Continent  408 non-null    object
 1   Country    408 non-null    object
dtypes: object(2)
memory usage: 6.5+ KB


In [217]:
merged_df = pd.merge(content_df, df, on="Country", how="outer")

In [198]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Country                       214 non-null    object
 1   Happiness score               156 non-null    object
 2   GDP                           156 non-null    object
 3   Social support                156 non-null    object
 4   Healthy life expectancy       156 non-null    object
 5   Freedom to make life choices  156 non-null    object
 6   Generosity                    156 non-null    object
 7   Perceptions of corruption     156 non-null    object
 8   Continent                     204 non-null    object
dtypes: object(9)
memory usage: 15.2+ KB


In [218]:
merged_df.head(10)

Unnamed: 0,Country,Happiness score,GDP,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Continent
0,Afghanistan,3.203,0.35,0.517,0.361,0.0,0.158,0.025,Asia
1,Afghanistan,3.203,0.35,0.517,0.361,0.0,0.158,0.025,Asia
2,Albania,4.719,0.947,0.848,0.874,0.383,0.178,0.027,Europe
3,Albania,4.719,0.947,0.848,0.874,0.383,0.178,0.027,Europe
4,Algeria,5.211,1.002,1.16,0.785,0.086,0.073,0.114,Africa
5,Algeria,5.211,1.002,1.16,0.785,0.086,0.073,0.114,Africa
6,Andorra,,,,,,,,Europe
7,Andorra,,,,,,,,Europe
8,Angola,,,,,,,,Africa
9,Angola,,,,,,,,Africa


In [219]:
merged_df.to_csv("merged_output.csv", index=False, encoding="utf-8")
df.to_csv("country_continent.csv", index=False, encoding="utf-8")
content_df.to_csv("country_statistics.csv", index=False, encoding="utf-8")