In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from IPython.display import display

In [2]:
TARGET_CITIES = [
    "Paris", "Barcelona", "Tokyo", "New York", "London, United Kingdom",
    "Rome", "Amsterdam", "Sydney", "Bangkok", "Istanbul", 
    "Cairo", "Rio de Janeiro", "Venice", "Los Angeles"
]

MANUAL_URLS = {
    "Venice": "https://www.numbeo.com/quality-of-life/in/Venice",
    "Rio de Janeiro": "https://www.numbeo.com/quality-of-life/in/Rio-De-Janeiro",
    "Los Angeles": "https://www.numbeo.com/quality-of-life/in/Los-Angeles"
}

HEADERS = {"User-Agent": "Mozilla/5.0"}

In [3]:
def get_global_rankings():
    """Scrapes the main Numbeo Quality of Life ranking table."""
    url = "https://www.numbeo.com/quality-of-life/rankings_current.jsp"
    print(f"Fetching global data from {url}...")
    
    try:
        response = requests.get(url, headers=HEADERS)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table', {'id': 't2'})
        
        if not table: return pd.DataFrame()
        
        headers = [th.get_text(strip=True) for th in table.find('tr').find_all('th')]
        rows = []
        for row in table.find_all('tr')[1:]:
            cells = [td.get_text(strip=True) for td in row.find_all('td')]
            if len(cells) == len(headers):
                rows.append(cells)
                
        df = pd.DataFrame(rows, columns=headers)
        
        traffic_col = next((c for c in df.columns if 'Traffic' in c), 'Traffic Commute Time Index')
        return df.rename(columns={
            'City': 'City', 
            'Safety Index': 'Safety', 
            'Pollution Index': 'Pollution', 
            'Climate Index': 'Climate',
            traffic_col: 'Traffic'
        })
    except Exception as e:
        print(f"Global scrape error: {e}")
        return pd.DataFrame()

def scrape_city_details(city_name, url=None):
    """Scrapes specific city details if missing from global list."""
    if not url:
        clean_name = city_name.replace(" ", "-").title()
        url = f"https://www.numbeo.com/quality-of-life/in/{clean_name}"

    print(f"Force-scraping: {city_name}...")
    try:
        response = requests.get(url, headers=HEADERS)
        if response.status_code != 200: return None
            
        soup = BeautifulSoup(response.text, 'html.parser')
        data = {'City': city_name}
        all_cells = soup.find_all('td')
        
        label_map = {
            "Safety Index": "Safety",
            "Pollution Index": "Pollution",
            "Climate Index": "Climate",
            "Traffic Commute Time Index": "Traffic"
        }

        for i, cell in enumerate(all_cells):
            text = cell.get_text(strip=True)
            for label, key in label_map.items():
                if label in text:
                    try:
                        val = all_cells[i+1].get_text(strip=True)
                        if ":" not in val: # Avoid grabbing labels
                            data[key] = val
                    except IndexError: pass
        
        return data if 'Safety' in data else None
    except Exception as e:
        print(f"Error scraping {city_name}: {e}")
        return None

def calculate_satisfaction_index(df):
    """Cleans data and calculates the custom Travel Satisfaction Index."""
    cols = ['Safety', 'Pollution', 'Traffic', 'Climate']
    
    for col in cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    
    df = df.dropna(subset=cols).copy()
    
    scaler = MinMaxScaler()
    df['N_Safety'] = scaler.fit_transform(df[['Safety']])
    df['N_Climate'] = scaler.fit_transform(df[['Climate']])
    
    # Invert negative metrics (Pollution/Traffic) so higher is better
    df['N_Pollution'] = 1 - scaler.fit_transform(df[['Pollution']])
    df['N_Traffic'] = 1 - scaler.fit_transform(df[['Traffic']])
    
    # Weighted Score Calculation
    df['Travel_Score'] = (
        (df['N_Safety'] * 0.4) + 
        (df['N_Climate'] * 0.2) +
        (df['N_Pollution'] * 0.2) + 
        (df['N_Traffic'] * 0.2)
    ) * 100
    
    return df.sort_values('Travel_Score', ascending=False)

In [4]:
df_global = get_global_rankings()

found_data = []
found_cities = set()

for _, row in df_global.iterrows():
    city_lower = row['City'].lower()
    for target in TARGET_CITIES:
        if city_lower.startswith(target.lower() + ",") or city_lower == target.lower():
            row_dict = row.to_dict()
            row_dict['Target_City'] = target
            found_data.append(row_dict)
            found_cities.add(target)
            break

missing_cities = set(TARGET_CITIES) - found_cities
if missing_cities:
    print(f"Missing cities found: {missing_cities}")
    for city in missing_cities:
        city_data = scrape_city_details(city, url=MANUAL_URLS.get(city))
        if city_data:
            city_data['Target_City'] = city
            found_data.append(city_data)

if found_data:
    df_final = pd.DataFrame(found_data)
    
    cols_needed = ['Target_City', 'Safety', 'Pollution', 'Traffic', 'Climate']

    for c in cols_needed:
        if c not in df_final.columns: df_final[c] = None
            
    df_scored = calculate_satisfaction_index(df_final[cols_needed])
    df_scored = df_scored.sort_values('Travel_Score', ascending=False).drop_duplicates(subset=['Target_City'], keep='first')
    df_scored['Target_City'] = df_scored['Target_City'].replace('London, United Kingdom', 'London')
    df_scored.insert(0, 'Rank', range(1, len(df_scored) + 1))
    
    display(df_scored[['Rank', 'Target_City', 'Travel_Score', 'Safety', 'Pollution', 'Traffic', 'Climate']])
else:
    print("No data found.")

Fetching global data from https://www.numbeo.com/quality-of-life/rankings_current.jsp...
Missing cities found: {'Venice'}
Force-scraping: Venice...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = pd.to_numeric(df[col], errors='coerce')


Unnamed: 0,Rank,Target_City,Travel_Score,Safety,Pollution,Traffic,Climate
0,1,Amsterdam,94.559718,74.3,22.6,22.1,87.5
1,2,Sydney,79.129574,66.1,28.6,43.3,97.1
2,3,Tokyo,76.163917,74.9,43.0,42.7,85.3
4,4,Rome,65.291093,52.7,48.8,35.3,93.7
13,5,Venice,62.355293,68.04,64.18,43.75,82.39
3,6,Barcelona,61.495503,48.1,63.0,30.0,95.7
9,7,Istanbul,50.097369,52.0,67.3,50.8,93.0
6,8,London,47.72525,44.4,58.1,44.8,88.3
8,9,New York,47.673184,48.9,57.9,43.5,79.7
5,10,Paris,46.35322,42.0,63.4,41.2,88.4


In [5]:
if 'df_scored' in locals() and not df_scored.empty: 
    # Clean up helper columns (Normalized columns) before saving
    output_cols = ['Rank', 'Target_City', 'Travel_Score', 'Safety', 'Pollution', 'Traffic', 'Climate']
    df_scored[output_cols].to_csv("Travel_Satisfaction_Index.csv", index=False)