### Import

In [25]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import unidecode
import re
import time
import os

### Configuration & City Lists (Pollution Index)

In [26]:
# Target cities for the analysis. Names follow the spelling used by the data
# source, including parenthesised alternatives such as "Krakow (Cracow)".
SIGNIFICANT_EUROPEAN_CITIES = [
    "Lisbon", "Barcelona", "Budapest", "Istanbul", "Bucharest",
    "Madrid", "Sofia", "Krakow (Cracow)", "Belgrade", "Prague",
    "Porto", "Valencia", "Kiev (Kyiv)", "Moscow", "Berlin",
    "Vienna", "Malaga", "Seville (Sevilla)", "Rome", "Faro",
    "Athens", "Warsaw", "Minsk", "Paris", "Ljubljana",
    "Florence", "Liverpool", "Tallinn", "Zagreb", "Hamburg",
    "Naples", "Milan", "Split", "Brussels", "Dublin",
    "Riga", "Lyon", "Palma de Mallorca", "Vilnius", "London",
    "Stockholm", "Munich", "Marseille", "Cologne", "Amsterdam",
    "Hvar", "Dusseldorf", "Helsinki", "Bordeaux", "Frankfurt",
    "Stuttgart", "Hanover", "Copenhagen", "Dresden", "Manchester",
    "Rotterdam", "Saint Petersburg", "Edinburgh", "Dubrovnik", "Oslo",
    "Glasgow", "Belfast", "Salzburg", "Zurich", "Geneva",
    "Valletta", "Reykjavik",
]

# Members of the list above that are left out of the final dataset because
# their data is inconsistent or missing.
CITIES_TO_EXCLUDE = ["Bordeaux", "Faro", "Hvar"]

### Numbeo URLs (Pollution Index)

In [27]:
# Numbeo pollution ranking pages, keyed by the tag that ends up in `data_source`
URLS_POLLUTION = dict([
    ('2025_Current', 'https://www.numbeo.com/pollution/rankings_current.jsp'),
    ('2023_Integration', 'https://www.numbeo.com/pollution/rankings.jsp?title=2023'),
])

### Scraping Function (Pollution Index)

In [28]:
def scrape_numbeo_pollution(url, year_tag, timeout=30):
    """
    Scrape a Numbeo pollution ranking page into a cleaned DataFrame.

    Parameters
    ----------
    url : str
        Address of the ranking page. The ranking is read from the HTML table
        whose id is "t2".
    year_tag : str
        Label copied into the `data_source` column of every row and used in
        the error message.
    timeout : float, optional
        Seconds to wait for the server before the request is abandoned
        (default 30).

    Returns
    -------
    pandas.DataFrame
        Columns `rank`, `city`, `pollution_index`, `pollution_exp_index`,
        `data_source`, `country`. The two index columns are numeric
        (unparseable values become NaN); `city` and `country` are
        ASCII-transliterated. An empty DataFrame is returned when the table
        is not found or when any exception occurs (the error is printed,
        not raised).
    """
    try:
        headers_ua = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        # Without a timeout, requests waits indefinitely on a server that
        # accepts the connection but never answers.
        response = requests.get(url, headers=headers_ua, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")
        table = soup.find("table", {"id": "t2"})
        if not table:
            return pd.DataFrame()

        # Labels given to the first four cells of every data row.
        column_names = ["Rank", "City", "Pollution_Index", "Pollution_Exp_Index"]

        rows_data = []
        # The first <tr> is skipped (presumably the header row).
        for tr in table.find_all("tr")[1:]:
            cells = tr.find_all("td")
            if len(cells) >= 4:
                rows_data.append([cell.get_text(strip=True) for cell in cells[:4]])

        df = pd.DataFrame(rows_data, columns=column_names)
        df['data_source'] = year_tag

        # Convert numeric columns
        for col in ("Pollution_Index", "Pollution_Exp_Index"):
            df[col] = pd.to_numeric(df[col], errors='coerce')

        # The City cell is comma-separated: the first part becomes the city,
        # the last part the country. Split once and reuse the result.
        place_parts = df['City'].str.split(',')
        df['Country'] = place_parts.str[-1].str.strip().apply(unidecode.unidecode)
        df['City'] = place_parts.str[0].str.strip().apply(unidecode.unidecode)
        df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

        return df

    except Exception as e:
        # Best-effort scraper: report the problem and hand back an empty frame
        # so the caller can filter it out.
        print(f"Error scraping {year_tag}: {e}")
        return pd.DataFrame()

### Main Execution (Pollution Index)

In [30]:
print("Starting initial scraping...")
# One DataFrame per ranking page; pages that could not be scraped come back
# empty and are dropped here.
df_list = [scrape_numbeo_pollution(url, tag) for tag, url in URLS_POLLUTION.items()]
df_list = [df for df in df_list if not df.empty]

# Columns carried forward to the rest of the notebook
cols_to_keep = ['city', 'country', 'pollution_index', 'pollution_exp_index', 'data_source']

if df_list:
    df_combined = pd.concat(df_list, ignore_index=True)
    # Rows whose tag contains '2025' sort first so they win the de-duplication below.
    df_combined['sort_helper'] = df_combined['data_source'].apply(lambda x: 0 if '2025' in x else 1)
    # mergesort is stable: rows keep their scraped order inside each source, so
    # "first" in drop_duplicates is deterministic (the default quicksort gives
    # no such guarantee for ties).
    df_combined = df_combined.sort_values('sort_helper', kind='mergesort')
    # NOTE(review): de-duplication is by city name only. If the rankings list
    # two cities with the same name in different countries, only the first row
    # survives; confirm the `country` column for such names.
    df_unique = df_combined.drop_duplicates(subset=['city'], keep='first').copy()

    # Filtering: keep only the target cities that are not explicitly excluded
    df_unique = df_unique[~df_unique['city'].isin(CITIES_TO_EXCLUDE)]
    REFINED_CITIES = [city for city in SIGNIFICANT_EUROPEAN_CITIES if city not in CITIES_TO_EXCLUDE]
    df_final = df_unique[df_unique['city'].isin(REFINED_CITIES)].copy()

    # Column selection (also discards the temporary sort_helper column)
    df_final = df_final[cols_to_keep]
    print(f"Initial scraping complete. Found {len(df_final)} cities.")
else:
    # Define df_final even when nothing was scraped so that the following
    # cells do not fail with NameError.
    df_final = pd.DataFrame(columns=cols_to_keep)
    print("No data found.")

Starting initial scraping...
Initial scraping complete. Found 56 cities.


### Identify Missing Cities (Pollution Index)

In [35]:
# --- Work out which target cities the ranking tables did not provide ---
cities_we_wanted = set(SIGNIFICANT_EUROPEAN_CITIES).difference(CITIES_TO_EXCLUDE)
cities_we_got = set(df_final['city'].unique())
missing_cities = cities_we_wanted.difference(cities_we_got)

if not missing_cities:
    print("‚úÖ No missing city!")
else:
    print("\nCities not found:")
    # One line per missing city, in alphabetical order
    for city in sorted(missing_cities):
        print(f"‚ùå {city}")


Cities not found:
‚ùå Dresden
‚ùå Dubrovnik
‚ùå Hanover
‚ùå Malaga
‚ùå Palma de Mallorca
‚ùå Salzburg
‚ùå Seville (Sevilla)
‚ùå Valletta


### Smart Recovery for Missing Cities (Pollution Index)

In [36]:
def scrape_pollution_smart_fallback(city_name, timeout=30):
    """
    Look up one city's pollution metrics on its individual Numbeo page.

    URL slugs are derived from `city_name`: first the part before any
    parentheses, then (if present) the part inside them, each stripped, with
    spaces replaced by hyphens and title-cased. Candidates are tried in that
    order and the first page that yields at least one value wins.

    Parameters
    ----------
    city_name : str
        City label, optionally with an alternative spelling in parentheses,
        e.g. "Seville (Sevilla)".
    timeout : float, optional
        Seconds to wait for each HTTP response (default 30).

    Returns
    -------
    dict or None
        {'pollution_index': float or None, 'pollution_exp_index': float or None}
        with at least one non-None value, or None when no candidate page
        produced a value (including when every request failed).
    """
    def _slug(text):
        # "Palma de Mallorca" -> "Palma-De-Mallorca"
        return text.strip().replace(" ", "-").title()

    # Variant A: name before parentheses
    candidates = [_slug(city_name.split('(')[0])]
    # Variant B: name inside parentheses
    alt_name = re.search(r'\((.*?)\)', city_name)
    if alt_name:
        candidates.append(_slug(alt_name.group(1)))
    # An empty slug would only request the bare ".../in/" address.
    candidates = [slug for slug in candidates if slug]

    headers = {'User-Agent': 'Mozilla/5.0'}

    # NOTE(review): with the single label "Pollution Exp. Index:" every
    # recovered city came back without an exp value, so that label probably
    # does not match the page text. "Pollution Exp Scale:" is accepted as
    # well; confirm the exact wording against a live page.
    label_patterns = {
        'pollution_index': re.compile(r"Pollution Index:"),
        'pollution_exp_index': re.compile(r"Pollution Exp\.? (?:Index|Scale):"),
    }
    # Accept both "33.86" and integer-valued readings such as "40".
    number_pattern = re.compile(r"(\d+(?:\.\d+)?)")

    for candidate in candidates:
        url = f"https://www.numbeo.com/pollution/in/{candidate}"
        try:
            # The timeout keeps an unresponsive server from hanging the loop.
            response = requests.get(url, headers=headers, timeout=timeout)
            if response.status_code != 200:
                continue

            soup = BeautifulSoup(response.text, "html.parser")
            values = {'pollution_index': None, 'pollution_exp_index': None}

            for col_name, pattern in label_patterns.items():
                label_el = soup.find(string=pattern)
                if not label_el:
                    continue
                # The value is read from the first <td> that follows the
                # element holding the label text.
                value_container = label_el.parent.find_next("td")
                if not value_container:
                    continue
                match_num = number_pattern.search(value_container.get_text(strip=True))
                if match_num:
                    values[col_name] = float(match_num.group(1))

            if any(value is not None for value in values.values()):
                return values
        except Exception:
            # Best-effort lookup: a failing candidate falls through to the next one.
            continue
    return None

# Country lookup for the cities handled by the single-page recovery, which
# does not supply a country of its own.
CITY_TO_COUNTRY = dict([
    ("Malaga", "Spain"),
    ("Seville (Sevilla)", "Spain"),
    ("Hanover", "Germany"),
    ("Dubrovnik", "Croatia"),
    ("Valletta", "Malta"),
    ("Salzburg", "Austria"),
    ("Dresden", "Germany"),
    ("Palma de Mallorca", "Spain"),
])

# --- Recover missing cities ---
# Cities already present in df_final are skipped, so re-running this cell
# cannot append the same recovered rows twice. Sorting gives a reproducible
# processing order (iterating a set directly does not).
cities_to_recover = sorted(set(missing_cities) - set(df_final['city']))

if cities_to_recover:
    print(f"\nRecovering {len(cities_to_recover)} missing cities...")
    new_rows = []

    for city in cities_to_recover:
        print(f"üîç Searching: {city}...", end=" ")
        time.sleep(1)  # one-second pause before each page request
        val_dict = scrape_pollution_smart_fallback(city)

        if val_dict:
            print(f"‚úÖ FOUND (Pollution: {val_dict.get('pollution_index')})")
            row = {
                'city': city,
                # None when the city has no entry in CITY_TO_COUNTRY
                'country': CITY_TO_COUNTRY.get(city, None),
                'data_source': 'Single_Page_Recovery'
            }
            row.update(val_dict)
            new_rows.append(row)
        else:
            print("‚ùå Not found.")

    if new_rows:
        df_new = pd.DataFrame(new_rows)

        # Ensure numeric columns are floats (None becomes NaN)
        for col in ['pollution_index', 'pollution_exp_index']:
            if col in df_new.columns:
                df_new[col] = df_new[col].astype(float)

        # Add recovered rows to the main DataFrame
        df_final = pd.concat([df_final, df_new], ignore_index=True)
        print(f"\n‚úÖ Recovered {len(new_rows)} cities added to df_final")
else:
    print("‚úÖ No missing cities to recover.")


Recovering 8 missing cities...
‚úÖ FOUND (Pollution: 33.86)
‚úÖ FOUND (Pollution: 40.54)illa)... 
‚úÖ FOUND (Pollution: 26.17)
‚úÖ FOUND (Pollution: 21.47) 
‚úÖ FOUND (Pollution: 74.23)
‚úÖ FOUND (Pollution: 25.56)
‚úÖ FOUND (Pollution: 22.35)
‚úÖ FOUND (Pollution: 39.01)lorca... 

‚úÖ Recovered 8 cities added to df_final


### Final Check

In [37]:
# Order rows by city name and label them with a 1-based index named 'city_index'
df_final = df_final.sort_values(by='city').reset_index(drop=True)
df_final.index = (df_final.index + 1).rename('city_index')

# Rows that were filled in by the single-page recovery
recovered_cities = df_final.loc[df_final['data_source'].eq('Single_Page_Recovery')]

print("\n--- FINAL VERIFICATION ---")
print(f"Total cities in DataFrame: {len(df_final)}")
print("\n--- Recovered cities details ---")
if recovered_cities.empty:
    print("No recovered cities found.")
else:
    print(recovered_cities[['city', 'pollution_index', 'pollution_exp_index', 'data_source']])

# Any city name appearing more than once (second and later occurrences)
duplicates = df_final.loc[df_final.duplicated(subset=['city'])]
if duplicates.empty:
    print("\n‚úÖ No duplicate cities found.")
else:
    print(f"\n‚ö†Ô∏è WARNING: Found {len(duplicates)} duplicates!")


--- FINAL VERIFICATION ---
Total cities in DataFrame: 64

--- Recovered cities details ---
                         city  pollution_index  pollution_exp_index  \
city_index                                                            
12                    Dresden            22.35                  NaN   
14                  Dubrovnik            21.47                  NaN   
22                    Hanover            26.17                  NaN   
33                     Malaga            33.86                  NaN   
42          Palma de Mallorca            39.01                  NaN   
51                   Salzburg            25.56                  NaN   
52          Seville (Sevilla)            40.54                  NaN   
59                   Valletta            74.23                  NaN   

                     data_source  
city_index                        
12          Single_Page_Recovery  
14          Single_Page_Recovery  
22          Single_Page_Recovery  
33          Single_Pag

### Delete unnecessary column

In [38]:
# Remove the exp-index column when present; report if it is already gone.
if 'pollution_exp_index' not in df_final.columns:
    print("‚ö†Ô∏è Column 'pollution_exp_index' not found.")
else:
    df_final = df_final.drop(columns='pollution_exp_index')
    print("üóëÔ∏è Column 'pollution_exp_index' dropped.")

# Show the first rows of the resulting dataset
print("\n--- FINAL CLEAN DATASET PREVIEW ---")
print(df_final.head())

üóëÔ∏è Column 'pollution_exp_index' dropped.

--- FINAL CLEAN DATASET PREVIEW ---
                 city         country  pollution_index   data_source
city_index                                                          
1           Amsterdam     Netherlands             22.6  2025_Current
2              Athens          Greece             55.2  2025_Current
3           Barcelona           Spain             63.0  2025_Current
4             Belfast  United Kingdom             26.5  2025_Current
5            Belgrade          Serbia             69.2  2025_Current


### Export CSV

In [39]:
# Output file, written to the current working directory
file_name = 'significant_european_cities_pollution_index.csv'

try:
    # The DataFrame index is not written to the file (index=False).
    df_final.to_csv(
        file_name,
        index=False,
        encoding='utf-8',
    )
    print(f" File successfully saved as: {file_name}")
    print(f" Directory: {os.getcwd()}")
except Exception as save_error:
    # Any failure is reported rather than raised.
    print(f" An error occurred while saving: {save_error}")

 File successfully saved as: significant_european_cities_pollution_index.csv
 Directory: C:\Users
