# In [3]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Download the population index page. The timeout keeps the script from
# hanging indefinitely on a stalled connection, and raise_for_status() stops
# right away on an HTTP error instead of silently parsing an error page.
response = requests.get('https://www.worldometers.info/population/', timeout=30)
response.raise_for_status()
content = response.content

# Parse the HTML page using BeautifulSoup
soup = BeautifulSoup(content, 'html.parser')

# Keep every <a> element whose href contains '/world-population/'
country_links = soup.find_all('a', href=lambda href: href and '/world-population/' in href)

# The first 6 matches are treated as non-country links and skipped.
# NOTE(review): this is a positional assumption about the page layout --
# re-check it if the site structure changes.
country_url = [link['href'] for link in country_links[6:]]
country_name = [link.text for link in country_links[6:]]

# Column-oriented accumulator: every output column starts as its own empty
# list, in the order the columns are later written out.
data = {
    column: []
    for column in (
        "Country",
        "Year",
        "Population",
        "Yearly % Change",
        "Yearly Change",
        "Migrants (net)",
        "Median Age",
        "Fertility Rate",
        "Density (P/Km²)",
        "Urban Pop %",
        "Urban Population",
        "Country's Share of World Pop",
        "World Population",
        "Global Rank",
    )
}

# Visit every country page and copy the rows of its population table into
# `data`, one value per column, tagging each row with the country name.
#
# Table columns in the order their <td> cells are read (cell 0 -> "Year",
# cell 1 -> "Population", ...). "Country" is filled separately from `name`.
table_columns = (
    "Year",
    "Population",
    "Yearly % Change",
    "Yearly Change",
    "Migrants (net)",
    "Median Age",
    "Fertility Rate",
    "Density (P/Km²)",
    "Urban Pop %",
    "Urban Population",
    "Country's Share of World Pop",
    "World Population",
    "Global Rank",
)

# A single Session reuses the underlying connection across the many requests
# made to the same host.
with requests.Session() as session:
    for name, url in zip(country_name, country_url):
        # This one page is explicitly excluded from the scrape.
        if url == '/world-population/channel-islands-population/':
            continue

        # A failure on one country must not discard the data already
        # collected for the others: report it and move on.
        try:
            web_page = session.get('https://www.worldometers.info' + url, timeout=30)
            web_page.raise_for_status()
        except requests.RequestException as error:
            print(f"Request failed for country: {name} ({error})")
            continue

        soup = BeautifulSoup(web_page.content, 'html.parser')
        table = soup.find('table', class_='table table-striped table-bordered table-hover table-condensed table-list')

        if not table:
            print(f"Table not found for country: {name}")
            continue

        for row in table.find_all('tr'):
            cells = row.find_all('td')
            if not cells:
                # Rows without any <td> cell (presumably header rows) carry no data.
                continue

            data["Country"].append(name)
            # Short rows are padded with None so every column list keeps the
            # same length; cells beyond the known columns are ignored.
            for index, column in enumerate(table_columns):
                value = cells[index].text.strip() if index < len(cells) else None
                data[column].append(value)

# Build the final table from the column lists gathered in `data`
df_final = pd.DataFrame(data=data)

# Write the table to disk as CSV, leaving out the row index column
df_final.to_csv(path_or_buf='world_population.csv', index=False)

print("Data scraping complete. Data saved to 'world_population.csv'.")

# Output: Data scraping complete. Data saved to 'world_population.csv'.
