### Imports

In [139]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import unidecode
import re
import requests
import time
import os

### Configuration & City Lists

In [130]:
# Candidate European cities, spelled to line up with Numbeo's naming.
# Order is preserved from the original curated list.
SIGNIFICANT_EUROPEAN_CITIES = [
    "Lisbon", "Barcelona", "Budapest", "Istanbul", "Bucharest",
    "Madrid", "Sofia", "Krakow (Cracow)", "Belgrade", "Prague",
    "Porto", "Valencia", "Kiev (Kyiv)", "Moscow", "Berlin",
    "Vienna", "Malaga", "Seville (Sevilla)", "Rome", "Faro",
    "Athens", "Warsaw", "Minsk", "Paris", "Ljubljana",
    "Florence", "Liverpool", "Tallinn", "Zagreb", "Hamburg",
    "Naples", "Milan", "Split", "Brussels", "Dublin",
    "Riga", "Lyon", "Palma de Mallorca", "Vilnius", "London",
    "Stockholm", "Munich", "Marseille", "Cologne", "Amsterdam",
    "Hvar", "Dusseldorf", "Helsinki", "Bordeaux", "Frankfurt",
    "Stuttgart", "Hanover", "Copenhagen", "Dresden", "Manchester",
    "Rotterdam", "Saint Petersburg", "Edinburgh", "Dubrovnik", "Oslo",
    "Glasgow", "Belfast", "Salzburg", "Zurich", "Geneva",
    "Valletta", "Reykjavik",
]

# Cities left out of the analysis because their data is inconsistent or unreliable.
CITIES_TO_EXCLUDE = [
    "Bordeaux",
    "Faro",
    "Hvar",
]

### Numbeo URLs (Health Care Rankings)

In [131]:
# Numbeo health-care ranking pages to download, keyed by a source tag.
# Insertion order matters to readers only: the "current" page comes first.
URLS_TO_SCRAPE = dict([
    ('2025_Current', 'https://www.numbeo.com/health-care/rankings_current.jsp'),
    ('2023_Integration', 'https://www.numbeo.com/health-care/rankings.jsp?title=2023'),
])

### Scraping Function

In [132]:
def scrape_numbeo_healthcare(url, year_tag, timeout=30):
    """
    Scrape a Numbeo Health Care ranking table and return it as a cleaned DataFrame.

    Parameters
    ----------
    url : str
        Address of the ranking page to download.
    year_tag : str
        Label copied into the ``data_source`` column of every row and used in
        the error message when scraping fails.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``: seconds to wait for the server before
        the request is abandoned. Defaults to 30.

    Returns
    -------
    pandas.DataFrame
        One row per table row that has at least four cells, with columns
        ``rank``, ``city``, ``health_care_index``, ``health_care_exp_index``,
        ``data_source`` and ``country``. A column-less empty DataFrame is
        returned when no table with id ``t2`` is found or when any exception
        is raised (the exception is printed, not re-raised).
    """
    # Names assigned to the first four cells of each table row.
    column_names = ["Rank", "City", "Health_Care_Index", "Health_Care_Exp_Index"]
    request_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

    try:
        # Without an explicit timeout, requests waits indefinitely on a
        # stalled connection and the whole notebook run hangs.
        response = requests.get(url, headers=request_headers, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")

        # The ranking table is looked up by its id attribute, "t2".
        table = soup.find("table", {"id": "t2"})
        if table is None:
            return pd.DataFrame()

        n_cols = len(column_names)
        rows_data = []
        # The first <tr> is skipped (presumably the header row).
        for tr in table.find_all("tr")[1:]:
            cells = tr.find_all("td")
            if len(cells) >= n_cols:
                # Only the leading cells are kept; extra columns are ignored.
                rows_data.append([cell.get_text(strip=True) for cell in cells[:n_cols]])

        df = pd.DataFrame(rows_data, columns=column_names)
        df['data_source'] = year_tag

        # Unparseable values become NaN instead of raising.
        for numeric_col in ("Health_Care_Index", "Health_Care_Exp_Index"):
            df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

        # The raw cell is comma-separated: the last piece is taken as the
        # country and the first piece as the city. Both are transliterated
        # to ASCII with unidecode.
        name_parts = df['City'].str.split(',')
        df['Country'] = name_parts.str[-1].str.strip().apply(unidecode.unidecode)
        df['City'] = name_parts.str[0].str.strip().apply(unidecode.unidecode)

        # Normalize column names to snake_case (e.g. Health_Care_Index -> health_care_index)
        df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

        return df

    except Exception as e:
        # Deliberate best-effort: one failing source must not abort the run,
        # so the error is reported and an empty frame is handed back.
        print(f"Error scraping {year_tag}: {e}")
        return pd.DataFrame()

### Main Execution

In [133]:
# Download every configured ranking page, merge them, and keep one row per
# wanted city, preferring the 2025 source over older ones.
print("Starting scraping process...")
df_list = [scrape_numbeo_healthcare(url, tag) for tag, url in URLS_TO_SCRAPE.items()]
# Sources that failed to scrape come back empty and are discarded here.
df_list = [df for df in df_list if not df.empty]

if df_list:
    df_combined = pd.concat(df_list, ignore_index=True)

    # --- DEBUG ---
    # Row count per source before any filtering.
    print("\n--- STATISTICS (Befor filtering) ---")
    print(df_combined['data_source'].value_counts())
    # ------------------------------------------

    # Sort to prioritize 2025 data. The sort must be stable: the default
    # quicksort may reorder rows that share the same helper value, which would
    # make the row kept by drop_duplicates(keep='first') arbitrary when a city
    # name occurs more than once within the same source.
    df_combined['sort_helper'] = df_combined['data_source'].apply(lambda x: 0 if '2025' in x else 1)
    df_combined = df_combined.sort_values('sort_helper', kind='stable')

    # Drop duplicates keeping the first one found (2025).
    # NOTE(review): duplicates are detected on the city name only, so two
    # same-named cities in different countries collapse into whichever row
    # comes first in the source table — confirm this picks the intended one.
    df_unique = df_combined.drop_duplicates(subset=['city'], keep='first').copy()

    # Filter: remove excluded cities
    df_unique = df_unique[~df_unique['city'].isin(CITIES_TO_EXCLUDE)]

    # Filter: keep only significant cities
    REFINED_CITIES = [city for city in SIGNIFICANT_EUROPEAN_CITIES if city not in CITIES_TO_EXCLUDE]
    df_final = df_unique[df_unique['city'].isin(REFINED_CITIES)].copy()

    # --- FINAL COLUMNS SELECTION ---
    # data_source is kept so each city's origin stays visible.
    cols_to_keep = ['city', 'country', 'health_care_index', 'health_care_exp_index', 'data_source']
    df_final = df_final[cols_to_keep]

    # Sort alphabetically by city and number the rows from 1.
    df_final = df_final.sort_values(by='city').reset_index(drop=True)
    df_final.index = df_final.index + 1
    df_final.index.name = 'city_index'

    print("\n--- Final Scraped Data ---")
    print(df_final)

    # Final check: how many retained rows came from each source.
    print("\nUsed sources for the final result:")
    print(df_final['data_source'].value_counts())

else:
    print("No data scraped.")

Starting scraping process...

--- STATISTICS (Befor filtering) ---
data_source
2025_Current        321
2023_Integration    266
Name: count, dtype: int64

--- Final Scraped Data ---
                        city         country  health_care_index  \
city_index                                                        
1                  Amsterdam     Netherlands               81.5   
2                     Athens          Greece               58.3   
3                  Barcelona           Spain               76.7   
4                    Belfast  United Kingdom               70.7   
5                   Belgrade          Serbia               53.1   
6                     Berlin         Germany               66.5   
7                   Brussels         Belgium               73.6   
8                  Bucharest         Romania               55.3   
9                   Budapest         Hungary               52.2   
10                   Cologne         Germany               72.0   
11             

### Identify Missing Cities

In [134]:
# Compare the wanted city list with what the ranking tables delivered.
cities_we_wanted = {c for c in SIGNIFICANT_EUROPEAN_CITIES if c not in CITIES_TO_EXCLUDE}
cities_we_got = set(df_final['city'].unique())
missing_cities = cities_we_wanted - cities_we_got

print("\n--- REPORT DISCREPANZE ---")
print(f"Number of searched cities: {len(cities_we_wanted)}")
print(f"Found cities: {len(cities_we_got)}")
print(f"Missing cities: {len(missing_cities)}")

if missing_cities:
    print("\nCities not found:")
    # Sorted so the report reads the same on every run.
    for city in sorted(missing_cities):
        # The marker was a mis-decoded UTF-8 cross mark; restored to the real glyph.
        print(f"❌ {city}")
else:
    print("No missing city!")


--- REPORT DISCREPANZE ---
Number of searched cities: 64
Found cities: 56
Missing cities: 8

Cities not found:
‚ùå Dresden
‚ùå Dubrovnik
‚ùå Hanover
‚ùå Marseille
‚ùå Palma de Mallorca
‚ùå Salzburg
‚ùå Seville (Sevilla)
‚ùå Valletta


### Recover Remaining Missing Cities

In [135]:
# --- 1. IDENTIFY REMAINING MISSING CITIES ---
# Wanted = curated list minus exclusions; current = cities already in df_final.
wanted_cities = {name for name in SIGNIFICANT_EUROPEAN_CITIES if name not in CITIES_TO_EXCLUDE}
current_cities = set(df_final['city'].unique())
still_missing = wanted_cities.difference(current_cities)

print(f"Cities still missing: {still_missing}")

# --- 2. NEW "SMART" RECOVERY FUNCTION ---
def scrape_smart_fallback(city_name, timeout=30):
    """
    Look up a city's Health Care System Index on its individual Numbeo page.

    Several URL slugs are derived from ``city_name`` and tried in order until
    one page yields a value.

    Parameters
    ----------
    city_name : str
        City name, optionally with an alternative spelling in parentheses,
        e.g. ``"Seville (Sevilla)"``.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``: seconds to wait for the server before
        the request is abandoned. Defaults to 30.

    Returns
    -------
    float or None
        The first value matched on a page that answered with HTTP 200, or
        ``None`` when no candidate URL produced one. Only numbers written
        with a decimal point are recognised.
    """
    candidates = []

    # Variant A: part before the parentheses (e.g., "Seville")
    candidates.append(city_name.split('(')[0].strip().replace(" ", "-").title())

    # Variant B: part INSIDE the parentheses (e.g., "Sevilla")
    match = re.search(r'\((.*?)\)', city_name)
    if match:
        candidates.append(match.group(1).strip().replace(" ", "-").title())

    # Drop empty slugs and repeats (order preserved) so no pointless or
    # duplicate request is sent.
    candidates = [slug for slug in dict.fromkeys(candidates) if slug]

    headers = {'User-Agent': 'Mozilla/5.0'}

    for candidate in candidates:
        url = f"https://www.numbeo.com/health-care/in/{candidate}"
        try:
            # Without an explicit timeout a stalled connection blocks forever.
            response = requests.get(url, headers=headers, timeout=timeout)
            if response.status_code != 200:
                continue

            soup = BeautifulSoup(response.text, "html.parser")

            # Pattern 1: find the label text, then read the number from the
            # next <td> that follows the label's parent element.
            label = soup.find(string=re.compile("Health Care System Index"))
            if label:
                container = label.parent.find_next("td")
                if container:
                    match_num = re.search(r"(\d+\.\d+)", container.get_text(strip=True))
                    if match_num:
                        return float(match_num.group(1))

            # Pattern 2: fall back to the page's plain text.
            match_text = re.search(r"Health Care System Index:\s*(\d+\.\d+)", soup.get_text())
            if match_text:
                return float(match_text.group(1))

        except Exception as e:
            # Still best-effort (the next candidate is tried), but the failure
            # is reported instead of being swallowed silently.
            print(f"Fallback request failed for {url}: {e}")
            continue

    return None

# --- 3. EXECUTION ---

# Try each still-missing city on its individual Numbeo page and append any
# recovered value to df_final.
if still_missing:
    print(f"\n--- FINAL ATTEMPT FOR {len(still_missing)} CITIES ---")
    new_data = []

    # Sorted: iterating the set directly gives a different request/log order
    # from run to run.
    for city in sorted(still_missing):
        # The marker was a mis-decoded UTF-8 magnifier emoji; restored.
        print(f"🔍 Deep Search for: {city}...", end=" ")
        time.sleep(1)  # Courtesy sleep between requests

        val = scrape_smart_fallback(city)

        # Explicit None check: a plain truthiness test would discard a
        # legitimate value of 0.0 as "not found".
        if val is not None:
            print(f" FOUND: {val}")
            new_data.append({
                'city': city,
                # NOTE(review): placeholder — the city name is stored in the
                # country column because the fallback page is not parsed for
                # the country. Replace with the real country before relying
                # on this column.
                'country': city,
                'health_care_index': val,
                'health_care_exp_index': None,
                'data_source': 'Single_Page_Recovery'
            })
        else:
            print(" Unable to find data even with alternative URLs.")

    # --- DATAFRAME UPDATE ---
    if new_data:
        df_new = pd.DataFrame(new_data)

        # The all-None column is cast to float explicitly so the concat below
        # does not warn about an empty/object column.
        df_new['health_care_exp_index'] = df_new['health_care_exp_index'].astype(float)

        df_final = pd.concat([df_final, df_new], ignore_index=True)

        # Restore alphabetical order and the 1-based city_index.
        df_final = df_final.sort_values(by='city').reset_index(drop=True)
        df_final.index = df_final.index + 1
        df_final.index.name = 'city_index'

        print("\n--- UPDATE COMPLETED ---")
        print(df_final.tail(10))
    else:
        print("No new data added.")

else:
    print("All cities are already present! No action needed.")

Cities still missing: {'Marseille', 'Palma de Mallorca', 'Dresden', 'Salzburg', 'Hanover', 'Valletta', 'Dubrovnik', 'Seville (Sevilla)'}

--- FINAL ATTEMPT FOR 8 CITIES ---
 FOUND: 82.37h for: Marseille... 
 FOUND: 90.23h for: Palma de Mallorca... 
 FOUND: 79.68h for: Dresden... 
 FOUND: 73.25h for: Salzburg... 
 FOUND: 66.96h for: Hanover... 
 FOUND: 55.1ch for: Valletta... 
 FOUND: 58.43h for: Dubrovnik... 
 FOUND: 72.0ch for: Seville (Sevilla)... 

--- UPDATE COMPLETED ---
                 city      country  health_care_index  health_care_exp_index  \
city_index                                                                     
55          Stockholm       Sweden               65.7                  119.7   
56          Stuttgart      Germany               80.8                  148.4   
57            Tallinn      Estonia               73.7                  134.9   
58           Valencia        Spain               81.7                  150.4   
59           Valletta     Valletta     

### Final Check

In [136]:
# Show only the rows that were filled in from individual city pages.
is_recovered = df_final['data_source'].eq('Single_Page_Recovery')
recovered_cities_hc = df_final[is_recovered]
report_columns = ['city', 'health_care_index', 'health_care_exp_index', 'data_source']
print(recovered_cities_hc[report_columns])

                         city  health_care_index  health_care_exp_index  \
city_index                                                                
12                    Dresden              79.68                    NaN   
14                  Dubrovnik              58.43                    NaN   
22                    Hanover              66.96                    NaN   
35                  Marseille              82.37                    NaN   
42          Palma de Mallorca              90.23                    NaN   
51                   Salzburg              73.25                    NaN   
52          Seville (Sevilla)              72.00                    NaN   
59                   Valletta              55.10                    NaN   

                     data_source  
city_index                        
12          Single_Page_Recovery  
14          Single_Page_Recovery  
22          Single_Page_Recovery  
35          Single_Page_Recovery  
42          Single_Page_Recovery  
51  

### Delete Unnecessary Column

In [137]:
# Guarded so the cell can be re-run without raising once the column is gone.
# The status markers were mis-decoded UTF-8 emoji; restored to the real glyphs.
if 'health_care_exp_index' in df_final.columns:
    df_final = df_final.drop(columns=['health_care_exp_index'])
    print("🗑️ Column 'health_care_exp_index' dropped.")
else:
    print("⚠️ Column not found (it might have been removed already).")

üóëÔ∏è Column 'health_care_exp_index' dropped.


### Export CSV

In [141]:
# Name of the exported file; a bare file name, so it lands in the working directory.
file_name = 'significant_european_cities_health_index.csv'

try:
    # index=False: the DataFrame index is left out of the file.
    df_final.to_csv(path_or_buf=file_name, encoding='utf-8', index=False)
    success_message = f" SUCCESS! \n File saved as: {file_name}"
    print(success_message)
    location_message = f" Location: {os.getcwd()}"
    print(location_message)
except Exception as save_error:
    # Any failure while saving or reporting is printed rather than raised.
    print(f" Error saving CSV: {save_error}")

 SUCCESS! 
 File saved as: significant_european_cities_health_index.csv
 Location: C:\Users
