In [1]:
pip install requests beautifulsoup4 pandas

Note: you may need to restart the kernel to use updated packages.


In [25]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Wikipedia page URL
url = 'https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_area'

# Send a request to fetch the content of the page
response = requests.get(url)

# Check if the page was fetched successfully
if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all tables on the page
    tables = soup.find_all('table')

    # Loop through the tables and find the correct one
    target_table = None
    for table in tables:
        # Check if the table has the right number of columns (6 in this case)
        headers = table.find_all('th')
        if len(headers) >= 6 and 'Country / dependency' in headers[1].text:
            target_table = table
            break

    if target_table:  # Ensure the target table is found
        # Initialize lists to store the data
        countries = []
        total_areas = []
        land_areas = []
        water_areas = []
        water_percentage = []

        # Loop through the rows of the table, skipping the header
        for row in target_table.find_all('tr')[1:]:
            cells = row.find_all('td')
            if len(cells) >= 5:
                countries.append(cells[1].text.strip())
                total_areas.append(cells[2].text.strip())
                land_areas.append(cells[3].text.strip())
                water_areas.append(cells[4].text.strip())
                water_percentage.append(cells[5].text.strip())

        # Create a DataFrame from the scraped data
        df = pd.DataFrame({
            'Country/Dependency': countries,
            'Total Area (km²)': total_areas,
            'Land Area (km²)': land_areas,
            'Water Area (km²)': water_areas,
            '% Water': water_percentage
        })

        # Display the DataFrame
        print(df)

        # Optional: Save the DataFrame to a CSV file
        df.to_csv('countries_by_area.csv', index=False)
        print("Data saved to 'countries_by_area.csv'")
    else:
        print("Target table not found on the page.")
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")


                          Country/Dependency           Total Area (km²)  \
0                                      Earth  510,072,000 (196,940,000)   
1                                     Russia     17,098,246 (6,601,667)   
2                                 Antarctica     14,200,000 (5,480,000)   
3                                     Canada      9,984,670 (3,855,100)   
4                                      China      9,596,960 (3,705,410)   
..                                       ...                        ...   
259  Ashmore and Cartier Islands (Australia)                  5.0 (1.9)   
260            Coral Sea Islands (Australia)                  3.0 (1.2)   
261               Spratly Islands (disputed)                 2.0 (0.77)   
262                                   Monaco                 2.0 (0.77)   
263                             Vatican City                0.49 (0.19)   

              Land Area (km²)           Water Area (km²) % Water  
0    148,940,000 (57,506,000)  3

In [27]:
# Path to the CSV file
file_path = '/Users/barry/Desktop/Python/countries_by_area.csv'

# Load the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Display the first few rows of the DataFrame to ensure it is loaded correctly
print(df.head())


  Country/Dependency           Total Area (km²)           Land Area (km²)  \
0              Earth  510,072,000 (196,940,000)  148,940,000 (57,506,000)   
1             Russia     17,098,246 (6,601,667)    16,376,870 (6,323,142)   
2         Antarctica     14,200,000 (5,480,000)    14,200,000 (5,480,000)   
3             Canada      9,984,670 (3,855,100)     9,093,507 (3,511,021)   
4              China      9,596,960 (3,705,410)     9,326,410 (3,600,950)   

            Water Area (km²) % Water  
0  361,132,000 (139,434,000)    70.8  
1          721,380 (278,530)     4.2  
2                          0       0  
3          891,163 (344,080)     8.9  
4          270,550 (104,460)     2.8  


In [29]:
import pandas as pd
import re

# Function to clean the area string and extract only the numeric part (in km²)
def clean_area(area_str):
    # Ensure area_str is a string and not bytes
    if isinstance(area_str, bytes):
        area_str = area_str.decode("utf-8")  # Decode bytes to string
    
    # Use regular expressions to extract the first number (before any parentheses)
    match = re.search(r'[\d,]+', area_str)
    if match:
        return float(match.group(0).replace(',', ''))  # Remove commas and convert to float
    else:
        return None  # Return None if no valid number is found

# Define the function to calculate how many times one country fits into another
def how_many_fits(country1, country2, df):
    # Extract the area of the two countries
    try:
        area1_str = df.loc[df['Country/Dependency'] == country1, 'Total Area (km²)'].values[0]
        area2_str = df.loc[df['Country/Dependency'] == country2, 'Total Area (km²)'].values[0]
        
        # Clean the area strings to extract the numeric part
        area1 = clean_area(area1_str)
        area2 = clean_area(area2_str)

        # Check if both areas were successfully extracted
        if area1 is None or area2 is None:
            print(f"Could not extract valid area for {country1} or {country2}.")
            return

        # Calculate how many times country1 fits into country2
        fit_ratio = area2 / area1

        # Print the result
        print(f"{country1} fits into {country2} approximately {fit_ratio:.4f} times.")
    
    except IndexError:
        print(f"One or both countries not found in the dataset.")

# Example usage: Calculate how many times "Qatar" fits into "Canada"
how_many_fits("Syria", "Canada", df)


Syria fits into Canada approximately 53.9187 times.


In [31]:
import pandas as pd
import ipywidgets as widgets
from IPython.display import display
import re

# Step 1: Load the dataset
file_path = '/Users/barry/Desktop/Python/countries_by_area.csv'
df = pd.read_csv(file_path)

# Step 2: Clean the data by removing text inside parentheses and commas
df['Total Area (km²)'] = df['Total Area (km²)'].apply(lambda x: re.sub(r'\s*\(.*?\)', '', str(x)))  # Remove parentheses
df['Total Area (km²)'] = df['Total Area (km²)'].str.replace(',', '').astype(float)  # Remove commas and convert to float

# Step 3: Sort countries by name
sorted_countries = sorted(df['Country/Dependency'].tolist())

# Step 4: Create dropdown widgets for country selection
country1_dropdown = widgets.Dropdown(
    options=sorted_countries,
    description='Country 1:',
    style={'description_width': 'initial'}
)

country2_dropdown = widgets.Dropdown(
    options=sorted_countries,
    description='fits in Country 2:',
    style={'description_width': 'initial'}
)

# Step 5: Define a function to calculate how many times country1 fits into country2
def calculate_fits(country1, country2):
    # Get the area of both countries
    area1 = df.loc[df['Country/Dependency'] == country1, 'Total Area (km²)'].values[0]
    area2 = df.loc[df['Country/Dependency'] == country2, 'Total Area (km²)'].values[0]

    # Calculate how many times country1 fits into country2
    fits = area2 / area1
    print(f"\n{country1} fits into {country2} {fits:.5f} times.")

# Step 6: Create a button to trigger the calculation
button = widgets.Button(description="Calculate")
output = widgets.Output()

# Step 7: Define button click event handler
def on_button_click(b):
    with output:
        output.clear_output()
        calculate_fits(country1_dropdown.value, country2_dropdown.value)

button.on_click(on_button_click)

# Step 8: Display the dropdowns, button, and output
display(country1_dropdown, country2_dropdown, button, output)


Dropdown(description='Country 1:', options=('9,525,067 (3,677,647)', 'Abkhazia', 'Afghanistan', 'Akrotiri and …

Dropdown(description='fits in Country 2:', options=('9,525,067 (3,677,647)', 'Abkhazia', 'Afghanistan', 'Akrot…

Button(description='Calculate', style=ButtonStyle())

Output()