### Imports

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import unidecode
import re
import time
import os

### Configuration & City Lists (Crime Index)

In [4]:
# List of significant European cities (aligned with source naming)
SIGNIFICANT_EUROPEAN_CITIES = [
    'Lisbon', 'Barcelona', 'Budapest', 'Istanbul', 'Bucharest', 'Madrid', 'Sofia',
    'Krakow (Cracow)', 'Belgrade', 'Prague', 'Porto', 'Valencia', 'Kiev (Kyiv)',
    'Moscow', 'Berlin', 'Vienna', 'Malaga', 'Seville (Sevilla)', 'Rome', 'Faro',
    'Athens', 'Warsaw', 'Minsk', 'Paris', 'Ljubljana', 'Florence', 'Liverpool',
    'Tallinn', 'Zagreb', 'Hamburg', 'Naples', 'Milan', 'Split', 'Brussels',
    'Dublin', 'Riga', 'Lyon', 'Palma de Mallorca', 'Vilnius', 'London',
    'Stockholm', 'Munich', 'Marseille', 'Cologne', 'Amsterdam', 'Hvar',
    'Dusseldorf', 'Helsinki', 'Bordeaux', 'Frankfurt', 'Stuttgart',
    'Hanover', 'Copenhagen', 'Dresden', 'Manchester', 'Rotterdam',
    'Saint Petersburg', 'Edinburgh', 'Dubrovnik', 'Oslo', 'Glasgow',
    'Belfast', 'Salzburg', 'Zurich', 'Geneva', 'Valletta', 'Reykjavik'
]

# Cities excluded due to inconsistent or missing data
CITIES_TO_EXCLUDE = [
    'Bordeaux',
    'Faro',
    'Hvar'
]

### Numbeo URLs (Crime Index)

In [5]:
URLS_CRIME = {
    '2025_Current': 'https://www.numbeo.com/crime/rankings_current.jsp',
    '2023_Integration': 'https://www.numbeo.com/crime/rankings.jsp?title=2023'
}

### Scraping

In [8]:
def scrape_numbeo_crime(url, year_tag):
    """
    Scrapes crime and safety index data from Numbeo ranking pages
    and applies basic cleaning.
    """
    try:
        headers_ua = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/91.0.4472.124 Safari/537.36'
            )
        }

        response = requests.get(url, headers=headers_ua)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")

        # Numbeo ranking table usually has id="t2"
        table = soup.find("table", {"id": "t2"})
        if not table:
            return pd.DataFrame()

        # Expected table columns
        headers = ["Rank", "City", "Crime_Index", "Safety_Index"]

        rows_data = []
        for tr in table.find_all("tr")[1:]:
            cells = tr.find_all("td")
            if len(cells) >= 4:
                row = [cell.get_text(strip=True) for cell in cells]
                rows_data.append(row[:4])

        df = pd.DataFrame(rows_data, columns=headers)
        df['data_source'] = year_tag

        # Convert numeric columns
        df['Crime_Index'] = pd.to_numeric(df['Crime_Index'], errors='coerce')
        df['Safety_Index'] = pd.to_numeric(df['Safety_Index'], errors='coerce')

        # --------------------
        # Data Cleaning
        # --------------------

        # Extract country from city string
        df['Country'] = (
            df['City']
            .str.split(',')
            .str[-1]
            .str.strip()
            .apply(unidecode.unidecode)
        )

        # Clean city name
        df['City'] = (
            df['City']
            .str.split(',')
            .str[0]
            .str.strip()
            .apply(unidecode.unidecode)
        )

        # Normalize column names
        df.columns = (
            df.columns
            .str.strip()
            .str.lower()
            .str.replace(' ', '_')
        )

        return df

    except Exception as e:
        print(f"Error scraping {year_tag}: {e}")
        return pd.DataFrame()


def scrape_crime_smart_fallback(city_name):
    """
    Tries multiple city name variants to fetch crime data
    from individual Numbeo city pages.
    """
    candidates = []

    # Variant 1: Name before parentheses
    var_a = city_name.split('(')[0].strip().replace(" ", "-").title()
    candidates.append(var_a)

    # Variant 2: Name inside parentheses
    match = re.search(r'\((.*?)\)', city_name)
    if match:
        var_b = match.group(1).strip().replace(" ", "-").title()
        candidates.append(var_b)

    headers = {'User-Agent': 'Mozilla/5.0'}

    for candidate in candidates:
        url = f"https://www.numbeo.com/crime/in/{candidate}"

        try:
            response = requests.get(url, headers=headers)
            if response.status_code != 200:
                continue

            soup = BeautifulSoup(response.text, "html.parser")

            values = {
                'crime_index': None,
                'safety_index': None
            }

            label_map = {
                'crime_index': "Crime Index:",
                'safety_index': "Safety Index:"
            }

            found_data = False

            for col_name, label_text in label_map.items():
                label_el = soup.find(string=re.compile(re.escape(label_text)))
                if label_el:
                    parent = label_el.parent
                    value_cell = parent.find_next("td")
                    if value_cell:
                        raw_text = value_cell.get_text(strip=True)
                        match_num = re.search(r"(\d+\.\d+)", raw_text)
                        if match_num:
                            values[col_name] = float(match_num.group(1))
                            found_data = True

            if found_data:
                return values

        except Exception:
            continue

    return None

# MAIN EXECUTION

print("Starting CRIME scraping process...")

df_list = [
    scrape_numbeo_crime(url, tag)
    for tag, url in URLS_CRIME.items()
]

df_list = [df for df in df_list if not df.empty]

if df_list:
    df_combined = pd.concat(df_list, ignore_index=True)

    # --------------------
    # Statistics (Before filtering)
    # --------------------
    print("\n--- STATISTICS (Before filtering) ---")
    print(df_combined['data_source'].value_counts())

    # Prioritize latest data
    df_combined['sort_helper'] = df_combined['data_source'].apply(
        lambda x: 0 if '2025' in x else 1
    )
    df_combined = df_combined.sort_values('sort_helper')

    # Remove duplicates, keeping latest source
    df_unique = df_combined.drop_duplicates(
        subset=['city'],
        keep='first'
    ).copy()

    # Remove excluded cities
    df_unique = df_unique[~df_unique['city'].isin(CITIES_TO_EXCLUDE)]

    # Keep only significant cities
    REFINED_CITIES = [
        city for city in SIGNIFICANT_EUROPEAN_CITIES
        if city not in CITIES_TO_EXCLUDE
    ]

    df_final = df_unique[df_unique['city'].isin(REFINED_CITIES)].copy()

    # --------------------
    # Final Columns
    # --------------------
    cols_to_keep = [
        'city',
        'country',
        'crime_index',
        'safety_index',
        'data_source'
    ]
    df_final = df_final[cols_to_keep]

    # Sort and index
    df_final = df_final.sort_values('city').reset_index(drop=True)
    df_final.index = df_final.index + 1
    df_final.index.name = 'city_index'

    print("\n--- Final Scraped Data ---")
    print(df_final)

    print("\nUsed sources for the final result:")
    print(df_final['data_source'].value_counts())

else:
    print("No data scraped.")

Starting CRIME scraping process...

--- STATISTICS (Before filtering) ---
data_source
2023_Integration    416
2025_Current        400
Name: count, dtype: int64

--- Final Scraped Data ---
                 city         country  crime_index  safety_index   data_source
city_index                                                                    
1           Amsterdam     Netherlands         25.7          74.3  2025_Current
2              Athens          Greece         55.2          44.8  2025_Current
3           Barcelona           Spain         51.9          48.1  2025_Current
4             Belfast  United Kingdom         48.0          52.0  2025_Current
5            Belgrade          Serbia         37.8          62.2  2025_Current
...               ...             ...          ...           ...           ...
57             Vienna         Austria         28.2          71.8  2025_Current
58            Vilnius       Lithuania         30.1          69.9  2025_Current
59             Warsaw 

### Missing Cities

In [9]:
# Identify Missing Cities

cities_we_wanted = set(REFINED_CITIES)
cities_we_got = set(df_final['city'].unique())

# Cities present in the reference list but missing in final data
missing_cities = cities_we_wanted - cities_we_got

# Report
print("\n--- REPORT DISCREPANCIES ---")
print(f"Number of searched cities: {len(cities_we_wanted)}")
print(f"Found cities: {len(cities_we_got)}")
print(f"Missing cities: {len(missing_cities)}")

if missing_cities:
    print("\nCities not found:")
    for city in sorted(missing_cities):
        print(f" {city}")
else:
    print("No missing city!")


--- REPORT DISCREPANCIES ---
Number of searched cities: 64
Found cities: 61
Missing cities: 3

Cities not found:
 Dubrovnik
 Salzburg
 Valletta


### Recover Missing Cities

In [10]:
# --------------------------------------------------
# Recovery Process
# --------------------------------------------------

if missing_cities:
    print("\n--- RECOVERING MISSING CITIES ---")

    new_rows = []

    for city in missing_cities:
        print(f"üîç Searching: {city}...", end=" ")

        # Courtesy delay to avoid aggressive requests
        time.sleep(1)

        val_dict = scrape_crime_smart_fallback(city)

        if val_dict:
            print(f"FOUND (Crime: {val_dict.get('crime_index')})")

            row = {
                'city': city,
                'country': city,
                'data_source': 'Single_Page_Recovery'
            }
            row.update(val_dict)
            new_rows.append(row)
        else:
            print("Not found.")

    if new_rows:
        df_new = pd.DataFrame(new_rows)
        df_final = pd.concat([df_final, df_new], ignore_index=True)


# --------------------------------------------------
# Final Sorting & Indexing
# --------------------------------------------------

# Sort alphabetically by city
df_final = df_final.sort_values(by='city').reset_index(drop=True)

# Recreate index starting from 1
df_final.index = df_final.index + 1
df_final.index.name = 'city_index'

print("\n--- Final Scraped Data ---")
print(df_final.tail(10))

print("\nUsed sources for the final result:")
print(df_final['data_source'].value_counts())


# ======================
# Final Check
# ======================


# --------------------------------------------------
# Verification Snippet
# --------------------------------------------------

# Cities recovered using the smart fallback method
recovered_cities_crime = df_final[
    df_final['data_source'] == 'Single_Page_Recovery'
]

print("\n--- VERIFICATION: RECOVERED CITIES ---")
if not recovered_cities_crime.empty:
    print(
        recovered_cities_crime[
            ['city', 'crime_index', 'safety_index', 'data_source']
        ]
    )
else:
    print("No cities needed recovery (or none were found).")


--- RECOVERING MISSING CITIES ---
FOUND (Crime: 17.77)nik... 
FOUND (Crime: 36.24)ta... 
FOUND (Crime: 20.5)urg... 

--- Final Scraped Data ---
                 city      country  crime_index  safety_index  \
city_index                                                      
55          Stockholm       Sweden        46.50         53.50   
56          Stuttgart      Germany        30.40         69.60   
57            Tallinn      Estonia        21.70         78.30   
58           Valencia        Spain        34.60         65.40   
59           Valletta     Valletta        36.24         63.76   
60             Vienna      Austria        28.20         71.80   
61            Vilnius    Lithuania        30.10         69.90   
62             Warsaw       Poland        25.40         74.60   
63             Zagreb      Croatia        21.40         78.60   
64             Zurich  Switzerland        23.40         76.60   

                     data_source  
city_index                        
55  

### Export CSV

In [11]:
# --------------------------------------------------
# Save Final Data to CSV
# --------------------------------------------------

file_name = 'significant_european_cities_crime_safety_index.csv'

try:
    df_final.to_csv(file_name, index=False, encoding='utf-8')

    print(f"\nFile saved as: {file_name}")
    print(f"Directory: {os.getcwd()}")

except Exception as e:
    print(f"An error occurred while saving the CSV: {e}")


File saved as: significant_european_cities_crime_index.csv
Directory: C:\Users
