### Imports and Constants

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import unidecode

# ------------------------------
# TARGET CITY LISTS & CONFIGS
# ------------------------------

# Target list: 67 European cities, spelled the way the rest of the notebook
# matches them against the scraped `city` column (ASCII, with the
# parenthesised alternate spellings kept as part of the name).
SIGNIFICANT_EUROPEAN_CITIES = [
    "Lisbon", "Barcelona", "Budapest", "Istanbul", "Bucharest",
    "Madrid", "Sofia", "Krakow (Cracow)", "Belgrade", "Prague",
    "Porto", "Valencia", "Kiev (Kyiv)", "Moscow", "Berlin",
    "Vienna", "Malaga", "Seville (Sevilla)", "Rome", "Faro",
    "Athens", "Warsaw", "Minsk", "Paris", "Ljubljana",
    "Florence", "Liverpool", "Tallinn", "Zagreb", "Hamburg",
    "Naples", "Milan", "Split", "Brussels", "Dublin",
    "Riga", "Lyon", "Palma de Mallorca", "Vilnius", "London",
    "Stockholm", "Munich", "Marseille", "Cologne", "Amsterdam",
    "Hvar", "Dusseldorf", "Helsinki", "Bordeaux", "Frankfurt",
    "Stuttgart", "Hanover", "Copenhagen", "Dresden", "Manchester",
    "Rotterdam", "Saint Petersburg", "Edinburgh", "Dubrovnik", "Oslo",
    "Glasgow", "Belfast", "Salzburg", "Zurich", "Geneva",
    "Valletta", "Reykjavik",
]

# Cities from the list above that are dropped before the final output.
CITIES_TO_EXCLUDE = ["Bordeaux", "Faro", "Hvar"]

# Numbeo ranking pages to scrape, keyed by a "<year>_<scope>" tag.
# The first four characters of each tag are later parsed as the data year.
URLS_TO_SCRAPE = {
    "2025_Global": "https://www.numbeo.com/cost-of-living/rankings.jsp?title=2025",
    "2023_Europe": "https://www.numbeo.com/cost-of-living/region_rankings.jsp?title=2023&region=150",
}

### Scraping Function

In [2]:
def scrape_numbeo(url, year_tag, timeout=30):
    """
    Scrape a Numbeo cost-of-living table and clean the data.

    Parameters
    ----------
    url : str
        Page to fetch; the table with id ``t2`` on that page is parsed.
    year_tag : str
        Label stored in the ``data_year`` column of every returned row.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        One row per table row, with snake_case column names, ASCII-folded
        ``city`` and ``country`` columns and a ``data_year`` column.
        An empty DataFrame is returned when the table is missing, has no
        data rows, or any error occurs (the error is printed, not raised).
    """
    try:
        # A timeout keeps a stalled connection from hanging the notebook, and
        # raise_for_status() reports HTTP errors instead of parsing an error page.
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")
        table = soup.find("table", {"id": "t2"})
        if table is None:
            return pd.DataFrame()

        # Extract headers and rows; rows without any <td> cell are skipped so
        # they cannot cause a column-count mismatch when building the frame.
        headers = [th.text.strip() for th in table.find_all("th")]
        rows = []
        for tr in table.find_all("tr")[1:]:
            cells = [td.text.strip() for td in tr.find_all("td")]
            if cells:
                rows.append(cells)
        if not rows:
            return pd.DataFrame()

        # Create DataFrame
        df = pd.DataFrame(rows, columns=headers)
        df['data_year'] = year_tag

        # The scraped "City" cell is comma-separated: the first part is kept as
        # the city and the last part as the country. Split once, reuse twice.
        location_parts = df['City'].str.split(',')
        df['Country'] = location_parts.str[-1].str.strip().apply(unidecode.unidecode)
        df['City'] = location_parts.str[0].str.strip().apply(unidecode.unidecode)

        # Convert column names to snake_case (spaces -> underscores, then drop
        # every character that is not a letter, digit or underscore).
        df.columns = (
            df.columns.str.strip()
            .str.lower()
            .str.replace(' ', '_')
            .str.replace(r'[^0-9a-zA-Z_]', '', regex=True)
        )

        return df
    except Exception as e:
        # Deliberate best-effort: the caller filters out empty frames.
        print(f"Error scraping {year_tag}: {e}")
        return pd.DataFrame()

### Scrape and Merge Data

In [5]:
# Scrape data from all URLs; scrape_numbeo returns an empty frame on failure,
# so failed sources are simply filtered out here.
df_list = [scrape_numbeo(url, tag) for tag, url in URLS_TO_SCRAPE.items()]
df_list = [df for df in df_list if not df.empty]

if not df_list:
    raise RuntimeError("Could not retrieve any data from Numbeo.")

# Combine datasets and prioritize the most recent year.
# The year is taken from the first four characters of the source tag.
df_combined = pd.concat(df_list, ignore_index=True)
df_combined['sort_year'] = df_combined['data_year'].str[:4].astype(int)
# A stable sort keeps the scraped row order within each year, so the row that
# drop_duplicates(keep='first') retains is deterministic rather than arbitrary.
df_combined = df_combined.sort_values('sort_year', ascending=False, kind='stable')

# Drop duplicates by city, keep the most recent entry.
# NOTE(review): de-duplicating on the city name alone means same-named cities
# in different countries collapse into one row (the first one in the most
# recent source) — confirm the retained row is the intended one.
df_unique = df_combined.drop_duplicates(subset=['city'], keep='first').drop(columns=['sort_year'])

### Filter and Clean Final Data

In [6]:
# Target list without the excluded cities; filtering on it both removes the
# problematic cities and keeps only the refined target cities in one step.
REFINED_CITIES = [city for city in SIGNIFICANT_EUROPEAN_CITIES if city not in CITIES_TO_EXCLUDE]
df_unique = df_unique[df_unique['city'].isin(REFINED_CITIES)].copy()

# Convert numeric columns (scraped values are strings); anything that cannot
# be parsed becomes NaN. Columns absent from the scraped table are skipped.
numeric_cols = [
    "cost_of_living_index", "rent_index", "cost_of_living_plus_rent_index",
    "groceries_index", "restaurant_price_index", "local_purchasing_power_index", "rank"
]
for col in numeric_cols:
    if col in df_unique.columns:
        df_unique[col] = pd.to_numeric(df_unique[col], errors='coerce')

# Reset index and add a 1-based city_index as the first column.
# Any city_index left over from a previous run of this cell is dropped first,
# because DataFrame.insert raises ValueError when the column already exists.
df_unique = df_unique.drop(columns=['city_index'], errors='ignore').reset_index(drop=True)
df_unique.insert(0, 'city_index', df_unique.index + 1)

### Final Check and Preview

In [9]:
# Compare the cities present in the final frame with the refined target list
# and report any target city that did not make it through.
found_cities = {*df_unique['city']}
missing_cities = sorted(set(REFINED_CITIES).difference(found_cities))

print(f"Total target cities: {len(REFINED_CITIES)}")
print(f"Total cities found: {len(found_cities)}")
print(f"Cities still missing: {missing_cities if missing_cities else 'None!'}\n")

Total target cities: 64
Total cities found: 64
Cities still missing: None!



### Export Data to CSV

In [11]:
import os

# Output file, written relative to the current working directory.
file_name = 'significant_european_cities_cost_of_living.csv'

try:
    # Save the filtered DataFrame without the pandas index column.
    df_unique.to_csv(file_name, index=False, encoding='utf-8')
    print(f"File saved as: {file_name}")
    print(f"Directory: {os.getcwd()}")
except NameError:
    # Raised when this cell runs before the cells that build df_unique;
    # the message names the variable this cell actually uses.
    print("ERROR: 'df_unique' not found.")
except Exception as e:
    print(f"An error occurred: {e}")

File saved as: significant_european_cities_cost_of_living.csv
Directory: C:\Users
