### Imports

In [27]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import unidecode
import os
import re
import time

### Configuration & City Lists (Traffic Index)

In [28]:
# Target European cities. Spellings (including the parenthesised alternates)
# are aligned with the naming used by the scraped source.
SIGNIFICANT_EUROPEAN_CITIES = [
    "Lisbon", "Barcelona", "Budapest", "Istanbul", "Bucharest",
    "Madrid", "Sofia", "Krakow (Cracow)", "Belgrade", "Prague",
    "Porto", "Valencia", "Kiev (Kyiv)", "Moscow", "Berlin",
    "Vienna", "Malaga", "Seville (Sevilla)", "Rome", "Faro",
    "Athens", "Warsaw", "Minsk", "Paris", "Ljubljana",
    "Florence", "Liverpool", "Tallinn", "Zagreb", "Hamburg",
    "Naples", "Milan", "Split", "Brussels", "Dublin",
    "Riga", "Lyon", "Palma de Mallorca", "Vilnius", "London",
    "Stockholm", "Munich", "Marseille", "Cologne", "Amsterdam",
    "Hvar", "Dusseldorf", "Helsinki", "Bordeaux", "Frankfurt",
    "Stuttgart", "Hanover", "Copenhagen", "Dresden", "Manchester",
    "Rotterdam", "Saint Petersburg", "Edinburgh", "Dubrovnik", "Oslo",
    "Glasgow", "Belfast", "Salzburg", "Zurich", "Geneva",
    "Valletta", "Reykjavik",
]

# Cities dropped from the analysis because their data is inconsistent or missing
CITIES_TO_EXCLUDE = [
    "Bordeaux",
    "Faro",
    "Hvar",
]

### Numbeo URLs (Traffic Index)

In [29]:
# Numbeo ranking pages to scrape. Each key is the tag handed to the scraper
# together with its URL.
URLS_TRAFFIC = {
    "2025_Current": "https://www.numbeo.com/traffic/rankings_current.jsp",
    "2023_Integration": "https://www.numbeo.com/traffic/region_rankings.jsp?title=2023&region=150",
}

### Scraping Function (Traffic Index)

In [30]:
def scrape_numbeo_traffic(url, year_tag, timeout=30):
    """Scrape one Numbeo traffic ranking page into a cleaned DataFrame.

    Parameters
    ----------
    url : str
        Address of the ranking page to download.
    year_tag : str
        Label written to the ``data_source`` column of every returned row.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 30).
        Without a timeout a stalled connection would block indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per table row that has at least seven cells, with columns
        ``rank, city, traffic_index, time_index, time_exp_index,
        inefficiency_index, co2_emission_index, data_source, country``.
        An empty DataFrame (no columns) is returned when the table with
        ``id="t2"`` is absent or when any error occurs; the error is printed,
        not raised.
    """
    # Names for the first seven cells of each table row
    columns = [
        "Rank",
        "City",
        "Traffic_Index",
        "Time_Index",
        "Time_Exp_Index",
        "Inefficiency_Index",
        "CO2_Emission_Index",
    ]
    # Everything after Rank/City is converted to numbers
    numeric_cols = columns[2:]

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/91.0.4472.124 Safari/537.36"
        )
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")

        # Main ranking table
        table = soup.find("table", {"id": "t2"})
        if table is None:
            print(f"Table not found for {year_tag}")
            return pd.DataFrame()

        rows = []
        for tr in table.find_all("tr")[1:]:  # skip header
            cells = tr.find_all("td")
            if len(cells) >= 7:
                rows.append([cell.get_text(strip=True) for cell in cells[:7]])

        df = pd.DataFrame(rows, columns=columns)
        df["data_source"] = year_tag

        # Convert metric columns; thousands separators are stripped first so
        # that a value such as "4,086.0" is parsed instead of becoming NaN.
        for col in numeric_cols:
            df[col] = pd.to_numeric(
                df[col].str.replace(",", "", regex=False),
                errors="coerce",
            )

        # The City cell is comma-separated: the last part is taken as the
        # country, the first part as the city name. Both are transliterated
        # to ASCII.
        city_parts = df["City"].str.split(",")
        df["Country"] = city_parts.str[-1].str.strip().apply(unidecode.unidecode)
        df["City"] = city_parts.str[0].str.strip().apply(unidecode.unidecode)

        # Normalize column names to lower snake_case
        df.columns = (
            df.columns
            .str.strip()
            .str.lower()
            .str.replace(" ", "_")
        )

        return df

    except Exception as e:
        # Deliberate best-effort: one failing source must not abort the run.
        print(f"Error scraping {year_tag}: {e}")
        return pd.DataFrame()

### Main Execution (Traffic Index)

In [31]:
print("Starting TRAFFIC scraping process...")

# Columns kept in the final table
cols_to_keep = [
    "city",
    "country",
    "traffic_index",
    "time_index",
    "time_exp_index",
    "inefficiency_index",
    "co2_emission_index",
    "data_source"
]

# Scrape all configured URLs
df_list = [
    scrape_numbeo_traffic(url, tag)
    for tag, url in URLS_TRAFFIC.items()
]

# Keep only non-empty results
df_list = [df for df in df_list if not df.empty]

if df_list:
    # Combine all results
    df_combined = pd.concat(df_list, ignore_index=True)

    # Prioritize 2025 data. The sort must be stable: the default quicksort
    # may reorder rows that share a sort key, which would make the row kept
    # by drop_duplicates(keep="first") vary between runs.
    df_combined["sort_helper"] = df_combined["data_source"].apply(
        lambda x: 0 if "2025" in x else 1
    )
    df_combined = df_combined.sort_values("sort_helper", kind="stable")

    # Remove duplicates: keep first occurrence (2025 preferred).
    # NOTE(review): de-duplication is by city name only, so two same-named
    # cities in different countries collapse into one row - confirm that the
    # surviving row is the intended one.
    df_unique = df_combined.drop_duplicates(subset=["city"], keep="first").copy()

    # Remove excluded cities
    df_unique = df_unique[~df_unique["city"].isin(CITIES_TO_EXCLUDE)]

    # Keep only significant European cities
    refined_cities = [
        city for city in SIGNIFICANT_EUROPEAN_CITIES
        if city not in CITIES_TO_EXCLUDE
    ]
    df_final = df_unique[df_unique["city"].isin(refined_cities)].copy()

    # Select final columns (this also drops the sort helper)
    df_final = df_final[cols_to_keep]

    # Sort alphabetically by city
    df_final = df_final.sort_values("city").reset_index(drop=True)

    # Add 1-based index
    df_final.index = df_final.index + 1
    df_final.index.name = "city_index"

    # Display result
    print("\n--- Final Scraped Traffic Data ---")
    print(df_final)

    # Show source stats
    print("\nSource usage stats:")
    print(df_final["data_source"].value_counts())

else:
    # Keep df_final defined (with the expected columns) so that the following
    # cells do not fail with a NameError when nothing could be scraped.
    df_final = pd.DataFrame(columns=cols_to_keep)
    print("No data scraped.")

Starting TRAFFIC scraping process...

--- Final Scraped Traffic Data ---
                        city         country  traffic_index  time_index  \
city_index                                                                
1                  Amsterdam     Netherlands           76.0        22.1   
2                     Athens          Greece          151.9        37.2   
3                  Barcelona           Spain           99.0        30.0   
4                    Belfast  United Kingdom          152.1        37.6   
5                   Belgrade          Serbia          137.9        36.5   
6                     Berlin         Germany           98.9        33.3   
7                   Brussels         Belgium          147.8        36.8   
8                  Bucharest         Romania          166.0        41.0   
9                   Budapest         Hungary          140.1        38.7   
10                   Cologne         Germany          135.0        36.7   
11                Copenhage

### Identify Missing Cities (Traffic Index)

In [32]:
# Cities we expected after exclusions
cities_we_wanted = {
    city for city in SIGNIFICANT_EUROPEAN_CITIES
    if city not in CITIES_TO_EXCLUDE
}

# Cities we actually got from scraping
cities_we_got = set(df_final["city"].unique())

# Determine missing cities
missing_cities = cities_we_wanted - cities_we_got

# Report
print("\n--- Missing Cities Report ---")
print(f"Number of searched cities: {len(cities_we_wanted)}")
print(f"Found cities: {len(cities_we_got)}")
print(f"Missing cities: {len(missing_cities)}")

# The status markers were stored as mis-decoded bytes; they are restored to
# the intended cross-mark / check-mark characters.
if missing_cities:
    print("\nCities not found:")
    for city in sorted(missing_cities):
        print(f"❌ {city}")
else:
    print("✅ No missing city!")


--- Missing Cities Report ---
Number of searched cities: 64
Found cities: 56
Missing cities: 8

Cities not found:
❌ Dresden
❌ Dubrovnik
❌ Florence
❌ Marseille
❌ Palma de Mallorca
❌ Salzburg
❌ Seville (Sevilla)
❌ Valletta


### Smart Recovery for Missing Cities (Traffic Index)

In [33]:
# --- 0. Country lookup for the cities handled by the recovery step ---
CITY_TO_COUNTRY = dict([
    ("Dresden", "Germany"),
    ("Dubrovnik", "Croatia"),
    ("Florence", "Italy"),
    ("Marseille", "France"),
    ("Palma de Mallorca", "Spain"),
    ("Salzburg", "Austria"),
    ("Seville (Sevilla)", "Spain"),
    ("Valletta", "Malta"),
])

# --- 1. Work out which target cities are still absent from df_final ---
wanted_cities = {
    c
    for c in SIGNIFICANT_EUROPEAN_CITIES
    if c not in CITIES_TO_EXCLUDE
}
current_cities = set(df_final["city"].unique())
still_missing = wanted_cities.difference(current_cities)

print(f"Cities still missing (Traffic): {still_missing}")

# --- 2. Robust Recovery Function ---
def _numbeo_slug_candidates(city_name):
    """Return URL slugs to try for a city: the name before any parentheses,
    then the name inside them (if present)."""
    raw_names = [city_name.split("(")[0]]
    match = re.search(r"\((.*?)\)", city_name)
    if match:
        raw_names.append(match.group(1))
    slugs = [name.strip().replace(" ", "-").title() for name in raw_names]
    # An empty slug would just request the section root, so skip it
    return [slug for slug in slugs if slug]


def _extract_traffic_values(soup):
    """Read the five traffic metrics from a parsed city page; a metric that
    cannot be located or parsed stays None."""
    # Label text searched for on the page, per output column
    label_map = {
        "traffic_index": "Traffic Index:",
        "time_index": "Time Index",
        "time_exp_index": "Time Exp. Index:",
        "inefficiency_index": "Inefficiency Index:",
        "co2_emission_index": "Emission Index:"  # avoids <sub>2</sub>
    }
    values = dict.fromkeys(label_map)

    for col, text in label_map.items():
        label_el = soup.find(string=re.compile(re.escape(text)))
        if not label_el:
            continue
        # The value is read from the first <td> following the label's parent
        val_td = label_el.parent.find_next("td")
        if not val_td:
            continue
        clean_text = val_td.get_text(strip=True).replace(",", "")
        # Prefer a decimal number; fall back to a plain integer so that a
        # value printed without a fractional part is not silently dropped.
        match_num = re.search(r"\d+\.\d+", clean_text) or re.search(r"\d+", clean_text)
        if match_num:
            values[col] = float(match_num.group(0))

    return values


def scrape_traffic_full_columns(city_name, timeout=30):
    """
    Attempts multiple name variants to extract all traffic metrics for a city.

    The single-city page is requested once per candidate slug (name before the
    parentheses first, then the name inside them). Labels such as CO2 are
    matched on the text after the <sub> tag so the markup does not interfere.

    Parameters
    ----------
    city_name : str
        City name, optionally with an alternate spelling in parentheses,
        e.g. "Seville (Sevilla)".
    timeout : float, optional
        Seconds to wait for each HTTP request (default 30), so that a stalled
        connection cannot block the notebook indefinitely.

    Returns
    -------
    dict or None
        Mapping with keys ``traffic_index``, ``time_index``,
        ``time_exp_index``, ``inefficiency_index`` and ``co2_emission_index``
        (float, or None where a metric was not found), taken from the first
        candidate page that yields at least one value. None when no candidate
        yields any value.
    """
    headers = {"User-Agent": "Mozilla/5.0"}

    for candidate in _numbeo_slug_candidates(city_name):
        url = f"https://www.numbeo.com/traffic/in/{candidate}"
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            if response.status_code != 200:
                continue
            soup = BeautifulSoup(response.text, "html.parser")
            values = _extract_traffic_values(soup)
        except Exception:
            # Best-effort lookup: any failure on this variant simply moves on
            # to the next one instead of aborting the whole recovery.
            continue

        if any(value is not None for value in values.values()):
            return values

    return None

# --- 3. Execution ---
if still_missing:
    print(f"\n--- RETRYING FULL RECOVERY FOR {len(still_missing)} CITIES ---")
    new_rows = []

    # Iterate in sorted order: a set has no guaranteed order, so the request
    # sequence and the printed log would otherwise differ between runs.
    for city in sorted(still_missing):
        print(f"🔍 Deep Search for: {city}...", end=" ")
        time.sleep(1)  # wait one second before each lookup

        val_dict = scrape_traffic_full_columns(city)

        if val_dict:
            print(f"✅ FOUND (Traffic: {val_dict.get('traffic_index')} | CO2: {val_dict.get('co2_emission_index')})")
            row = {
                "city": city,
                "country": CITY_TO_COUNTRY.get(city, None),  # country from the lookup table
                "data_source": "Single_Page_Recovery"
            }
            row.update(val_dict)
            new_rows.append(row)
        else:
            print("❌ Data not found.")

    if new_rows:
        df_new = pd.DataFrame(new_rows)
        df_final = pd.concat([df_final, df_new], ignore_index=True)

        # Ensure numeric columns are float (missing metrics become NaN)
        for col in ["traffic_index", "time_index", "time_exp_index", "inefficiency_index", "co2_emission_index"]:
            if col in df_final.columns:
                df_final[col] = df_final[col].astype(float)

        # Cleanup & reindex: alphabetical by city with a 1-based index
        df_final = df_final.sort_values("city").reset_index(drop=True)
        df_final.index = df_final.index + 1
        df_final.index.name = "city_index"

        print("\n--- UPDATE COMPLETED ---")
        recovered_cities = [r["city"] for r in new_rows]
        print(df_final[df_final["city"].isin(recovered_cities)][["city", "country", "traffic_index", "co2_emission_index"]])
    else:
        print("No new data added.")
else:
    print("All cities are already present! No action needed.")

Cities still missing (Traffic): {'Salzburg', 'Dresden', 'Marseille', 'Seville (Sevilla)', 'Valletta', 'Florence', 'Dubrovnik', 'Palma de Mallorca'}

--- RETRYING FULL RECOVERY FOR 8 CITIES ---
✅ FOUND (Traffic: 95.39 | CO2: 1407.14)
✅ FOUND (Traffic: 85.49 | CO2: 2351.73)
✅ FOUND (Traffic: 112.79 | CO2: 3736.11)
✅ FOUND (Traffic: 125.18 | CO2: 4086.0)
✅ FOUND (Traffic: 106.46 | CO2: 3010.0)
✅ FOUND (Traffic: 108.23 | CO2: 3815.13)
✅ FOUND (Traffic: 105.26 | CO2: 4887.75)
✅ FOUND (Traffic: 125.41 | CO2: 5260.17)

--- UPDATE COMPLETED ---
                         city  country  traffic_index  co2_emission_index
city_index                                                               
12                    Dresden  Germany          85.49             2351.73
14                  Dubrovnik  Croatia         105.26             4887.75
17                   Florence    Italy         108.23             3815.13
35                  Marseille   France         112.79             37

### Display Recovered Cities (Traffic Index)

In [34]:
# Rows whose values came from the single-page recovery step
recovered_cities = df_final.loc[df_final["data_source"] == "Single_Page_Recovery"]

# Metrics shown for a manual sanity check of the recovered rows
review_columns = [
    "city",
    "traffic_index",
    "time_index",
    "time_exp_index",
    "inefficiency_index",
    "co2_emission_index",
    "data_source",
]
print(recovered_cities[review_columns])

                         city  traffic_index  time_index  time_exp_index  \
city_index                                                                 
12                    Dresden          85.49       23.82           23.82   
14                  Dubrovnik         105.26       22.12           22.12   
17                   Florence         108.23       28.78           65.99   
35                  Marseille         112.79       30.24          120.42   
42          Palma de Mallorca         125.41       30.17          117.00   
51                   Salzburg          95.39       32.36          259.32   
52          Seville (Sevilla)         125.18       32.41          263.99   
59                   Valletta         106.46       30.75          146.89   

            inefficiency_index  co2_emission_index           data_source  
city_index                                                                
12                       68.82             2351.73  Single_Page_Recovery  
14            

### Remove Unnecessary Columns (Traffic Index)

In [35]:
# Metrics that are not carried into the exported dataset
cols_to_drop = ["time_exp_index", "inefficiency_index", "co2_emission_index"]

# errors="ignore" skips any listed column that is already absent,
# so re-running this cell does not raise
df_final = df_final.drop(columns=cols_to_drop, errors="ignore")

print("\nColumns removed successfully")


Columns removed successfully


### Save Final Traffic Data to CSV

In [36]:
# Output file, written relative to the current working directory
file_name = "significant_european_cities_traffic_index.csv"

# Write the table without its index column and report where it landed;
# any failure is printed instead of raised
try:
    df_final.to_csv(
        file_name,
        encoding="utf-8",
        index=False,
    )
    print(f"\nSUCCESS! File saved as: {file_name}")
    print(f"Directory: {os.getcwd()}")
except Exception as err:
    print(f"Error saving CSV: {err}")


SUCCESS! File saved as: significant_european_cities_traffic_index.csv
Directory: C:\Users
