<a href="https://colab.research.google.com/github/Chaimaferdia/Tp-Big-Data-Analytics/blob/main/Worldometers_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import os
import unicodedata
import re

# Create the output directory that the CSV files are written into further
# down; exist_ok=True keeps re-running the cell from raising when the
# directory is already there.
os.makedirs("/content/data", exist_ok=True)

def scrape_table(url, table_id=None, timeout=30):
    """Download a page and return one of its HTML tables as a DataFrame.

    Args:
        url: Address of the page to fetch.
        table_id: ``id`` attribute of the wanted ``<table>``. When falsy, the
            first ``<table>`` found on the page is used.
        timeout: Seconds to wait for the server before the request is
            abandoned, so a stalled connection cannot hang the whole run.

    Returns:
        A DataFrame holding the cleaned text of every row that contains
        ``<td>`` cells, or None when the request fails, no table is found,
        or parsing raises. Column names are taken from the table's ``<th>``
        cells when their count matches the data width; otherwise pandas'
        default positional column names are used.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=request_headers, timeout=timeout)
        response.encoding = 'utf-8'
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")
        table = soup.find("table", id=table_id) if table_id else soup.find("table")

        if not table:
            print(f"There is no table {url}")
            return None

        table_rows = table.find_all("tr")

        # Data rows: only rows that actually carry <td> cells are kept.
        rows = []
        for tr in table_rows:
            cells = [clean_text(td.text.strip()) for td in tr.find_all("td")]
            if cells:
                rows.append(cells)

        # Header candidates are compared against the widest data row, since
        # that is the column count the DataFrame constructor will expect.
        width = max((len(cells) for cells in rows), default=0)

        # Candidate 1: every <th> in the table (the historical behaviour).
        every_th = [clean_text(th.text.strip()) for th in table.find_all("th")]

        # Candidate 2: only the first row that has <th> cells, which survives
        # tables that repeat headers in a footer or use <th> in later rows.
        first_header_cells = next(
            (tr.find_all("th") for tr in table_rows if tr.find_all("th")),
            [],
        )
        first_row_th = [clean_text(th.text.strip()) for th in first_header_cells]

        column_names = None
        for candidate in (every_th, first_row_th):
            if candidate and len(candidate) == width:
                column_names = candidate
                break

        if column_names and rows:
            df = pd.DataFrame(rows, columns=column_names)
        else:
            if every_th and rows:
                # Keep the data rather than failing the whole page when the
                # header cells cannot be matched to the columns.
                print(f"Header/column mismatch for {url}; using positional column names")
            df = pd.DataFrame(rows)

        return df

    except Exception as e:
        # Deliberately broad: one bad page must not abort the whole scrape;
        # the caller treats None as "this page failed".
        print(f"Data loading error from {url}: {e}")
        return None

# Unicode minus sign, en dash and em dash are folded to an ASCII hyphen.
_DASH_TABLE = str.maketrans({'\u2212': '-', '\u2013': '-', '\u2014': '-'})

# The same three characters as they appear when their UTF-8 bytes were
# mis-decoded as Latin-1 or Windows-1252. Matching the full sequences lets a
# genuine "â" in ordinary text survive.
_MOJIBAKE_DASH_RE = re.compile('|'.join(re.escape(seq) for seq in (
    '\u00e2\u0088\u0092',  # U+2212 decoded as Latin-1
    '\u00e2\u02c6\u2019',  # U+2212 decoded as Windows-1252
    '\u00e2\u0080\u0093',  # U+2013 decoded as Latin-1
    '\u00e2\u20ac\u201c',  # U+2013 decoded as Windows-1252
    '\u00e2\u0080\u0094',  # U+2014 decoded as Latin-1
    '\u00e2\u20ac\u201d',  # U+2014 decoded as Windows-1252
)))

# Combining Diacritical Marks block: the accents NFKD splits off base letters.
_COMBINING_MARKS_RE = re.compile(r'[\u0300-\u036F]+')

# Anything that is neither ASCII nor in the Arabic ranges that are kept.
_DISALLOWED_RE = re.compile(
    r'[^\x00-\x7F\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]+'
)

_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Normalise a scraped cell value to plain ASCII/Arabic text.

    Dash-like characters become ``-``, accented letters lose their accents
    (``"Côte"`` -> ``"Cote"``), any other character outside ASCII and the
    kept Arabic ranges becomes a space, and runs of whitespace collapse to a
    single space with the ends trimmed.

    Args:
        text: Value to clean. Anything that is not a ``str`` is returned
            unchanged.

    Returns:
        The cleaned string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    # Repair mis-decoded dashes first, while their byte pattern is intact.
    text = _MOJIBAKE_DASH_RE.sub('-', text)
    text = text.translate(_DASH_TABLE)

    text = unicodedata.normalize('NFKD', text)
    # Drop the detached accents instead of letting the next step turn them
    # into spaces, which would split words such as "Co te".
    text = _COMBINING_MARKS_RE.sub('', text)
    text = _DISALLOWED_RE.sub(' ', text)
    text = _WHITESPACE_RE.sub(' ', text).strip()

    return text

def clean_dataframe(df):
    """Run clean_text over every object-dtype column of ``df``, in place.

    Args:
        df: DataFrame to clean. ``None`` and empty frames are returned
            untouched.

    Returns:
        The same object that was passed in (the DataFrame is modified in
        place), or ``None`` when ``None`` was given.
    """
    if df is None or df.empty:
        return df

    # Columns are visited by position rather than by label: scraped headers
    # can repeat, and df[label] then yields a DataFrame, which has no .dtype.
    for position in range(df.shape[1]):
        column = df.iloc[:, position]
        if column.dtype == 'object':
            # Assign raw values so no index alignment is attempted.
            df.iloc[:, position] = column.apply(clean_text).to_numpy()

    return df

# Scrape targets, keyed by the name that also becomes the CSV file name.
# Each value is (page URL, id of the <table> to extract); an id of None
# makes scrape_table take the first table it finds on the page.
pages = dict(
    coronavirus=(
        "https://www.worldometers.info/coronavirus/",
        "main_table_countries_today",
    ),
    population=(
        "https://www.worldometers.info/world-population/population-by-country/",
        None,
    ),
    gdp=("https://www.worldometers.info/gdp/gdp-by-country/", None),
    energy=("https://www.worldometers.info/energy/", None),
    oil_reserves=("https://www.worldometers.info/oil/", None),
    oil_production=(
        "https://www.worldometers.info/oil/oil-production-by-country/",
        None,
    ),
    oil_consumption=(
        "https://www.worldometers.info/oil/oil-consumption-by-country/",
        None,
    ),
    coal=("https://www.worldometers.info/coal/", None),
    gas=("https://www.worldometers.info/gas/", None),
    water_usage=("https://www.worldometers.info/water/", None),
    co2=("https://www.worldometers.info/co2-emissions/", None),
)

# Scrape every registered page in turn and store each table as
# /content/data/<name>.csv.
last_index = len(pages) - 1
for index, (name, (url, tid)) in enumerate(pages.items()):
    print(f"Data loading  {name}...")
    df = scrape_table(url, tid)
    # An empty frame means the table had no data rows; report it as a
    # failure instead of writing a blank CSV.
    if df is not None and not df.empty:
        df = clean_dataframe(df)
        path = f"/content/data/{name}.csv"
        # The utf-8-sig codec prepends a byte-order mark to the file.
        df.to_csv(path, index=False, encoding='utf-8-sig')
    else:
        print(f"Failed to load data {name}")
    # Pause between consecutive requests; no request follows the last page,
    # so the wait is skipped there.
    if index < last_index:
        time.sleep(3)

print("\n The process is done")

Data loading  coronavirus...
Data loading  population...
Data loading  gdp...
Data loading  energy...
Data loading  oil_reserves...
Data loading  oil_production...
Data loading  oil_consumption...
Data loading  coal...
Data loading  gas...
Data loading  water_usage...
Data loading  co2...

 The process is done
