### European Cities Data

In [62]:
import pandas as pd

# --------------------------------------------------
# 1. File Path and Data Loading
# --------------------------------------------------
file_path = r"E:\Bicocca\Data Management\best_cities_for_a_workation.csv"

try:
    # Read CSV file into a DataFrame
    df = pd.read_csv(file_path)
    print(f"Successfully loaded file: {file_path}\n")

    # --------------------------------------------------
    # 2. Basic Dataset Overview
    # --------------------------------------------------
    total_cities = len(df)
    print(f"Total number of cities: {total_cities}\n")

    # --------------------------------------------------
    # 3. Filter Cities Located in Europe
    # --------------------------------------------------
    europe_countries = [
        'Germany', 'France', 'United Kingdom', 'Italy', 'Switzerland', 'Poland', 'Ukraine',
        'Netherlands', 'Greece', 'Sweden', 'Belgium', 'Austria', 'Denmark', 'Norway',
        'Ireland', 'Romania', 'Hungary', 'Czechia', 'Czech Republic', 'Luxembourg', 'Finland',
        'Croatia', 'Cyprus', 'Iceland', 'Albania', 'Estonia', 'Bulgaria', 'Malta',
        'Slovenia', 'Latvia', 'Belarus', 'Slovakia', 'Moldova', 'Serbia', 'Montenegro',
        'Bosnia and Herzegovina', 'North Macedonia', 'Portugal', 'Spain', 'Turkey',
        'Russia', 'Georgia', 'Azerbaijan', 'Kazakhstan'
    ]

    european_cities_df = df[df['Country'].isin(europe_countries)].copy()

    europe_cities_count = len(european_cities_df)
    print(f"Number of European cities: {europe_cities_count}\n")
except FileNotFoundError:
    print(f"File not found: {file_path}")

except Exception as e:
    print(f"Unexpected error: {e}")

Successfully loaded file: E:\Bicocca\Data Management\best_cities_for_a_workation.csv

Total number of cities: 147

Number of European cities: 66



### Renaming Columns

In [63]:
# --------------------------------------------------
# Rename Dataset Columns for Better Readability
# --------------------------------------------------
# This mapping converts long, descriptive column names
# into short, clean, and code-friendly names
# --------------------------------------------------
column_mapping = {
    'Ranking': 'rank',
    'City': 'city',
    'Country': 'country',

    # Remote work related
    'Remote connection: Average WiFi speed (Mbps per second)': 'wifi_speed_mbps',
    'Co-working spaces: Number of co-working spaces': 'coworking_count',

    # Cost-related
    'Caffeine: Average price of buying a coffee': 'coffee_price',
    'Travel: Average price of taxi (per km)': 'taxi_price_km',
    'After-work drinks: Average price for 2 beers in a bar': 'two_beers_price',
    'Accommodation: Average price of 1 bedroom apartment per month': 'one_bed_apt_price',
    'Food: Average cost of a meal at a local, mid-level restaurant': 'meal_price',

    # Lifestyle & environment
    'Climate: Average number of sunshine hours': 'sunshine_hours',
    'Tourist attractions: Number of ‘Things to do’ on Tripadvisor': 'attractions_count',
    'Instagramability: Number of photos with #': 'ig_photos_count'
}

# --------------------------------------------------
# Apply the column renaming to the DataFrame
# --------------------------------------------------
# inplace=True modifies the existing DataFrame directly
# --------------------------------------------------
df.rename(columns=column_mapping, inplace=True)

# --------------------------------------------------
# Verify the updated column names
# --------------------------------------------------
print("New Column Names:")
print(df.columns.tolist())

New Column Names:
['rank', 'city', 'country', 'wifi_speed_mbps', 'coworking_count', 'coffee_price', 'taxi_price_km', 'two_beers_price', 'one_bed_apt_price', 'meal_price', 'sunshine_hours', 'attractions_count', 'ig_photos_count']


### Cleaning Data

In [64]:
import unidecode

# --------------------------------------------------
# a. Handle Missing Values
# --------------------------------------------------
print("Missing values before handling:")
print(df.isnull().sum())

# --------------------------------------------------
# b. Correct Data Types
# --------------------------------------------------
# Columns that should contain numeric values
# (use standardized column names)
numeric_cols = [
    'wifi_speed_mbps',
    'coworking_count',
    'coffee_price',
    'taxi_price_km',
    'two_beers_price',
    'one_bed_apt_price',
    'meal_price',
    'sunshine_hours',
    'attractions_count',
    'ig_photos_count'
]

# Keep only columns that actually exist (prevents KeyError)
existing_numeric_cols = [col for col in numeric_cols if col in df.columns]

# Convert numeric columns safely
df[existing_numeric_cols] = df[existing_numeric_cols].astype(float)

# Convert remaining columns
df['rank'] = df['rank'].astype(int)
df['city'] = df['city'].astype(str)
df['country'] = df['country'].astype(str)

# --------------------------------------------------
# c. Remove Leading and Trailing Spaces
# --------------------------------------------------
df['city'] = df['city'].str.strip()
df['country'] = df['country'].str.strip()

# --------------------------------------------------
# d. Normalize City and Country Names
# --------------------------------------------------
# Example: München → Munchen
df['city'] = df['city'].apply(lambda x: unidecode.unidecode(x))
df['country'] = df['country'].apply(lambda x: unidecode.unidecode(x))

# --------------------------------------------------
# e. Preview Cleaned Data
# --------------------------------------------------
print("\nCleaned Data Preview:")
df.head()

print("\nFinal Columns:")
print(df.columns.tolist())


Missing values before handling:
rank                 0
city                 0
country              0
wifi_speed_mbps      0
coworking_count      0
coffee_price         0
taxi_price_km        0
two_beers_price      0
one_bed_apt_price    0
meal_price           0
sunshine_hours       0
attractions_count    0
ig_photos_count      0
dtype: int64

Cleaned Data Preview:

Final Columns:
['rank', 'city', 'country', 'wifi_speed_mbps', 'coworking_count', 'coffee_price', 'taxi_price_km', 'two_beers_price', 'one_bed_apt_price', 'meal_price', 'sunshine_hours', 'attractions_count', 'ig_photos_count']


### Feature Engineering: Beer Price Normalization

In [65]:
# Ensure the column exists
if 'two_beers_price' in df.columns:
    # Convert to numeric (handles strings or bad values)
    df['two_beers_price'] = pd.to_numeric(df['two_beers_price'], errors='coerce')

    # Create single beer price
    df['beer_price'] = df['two_beers_price'] / 2

    # Drop the old column
    df.drop(columns=['two_beers_price'], inplace=True)

### Geographic Overview and Target European Cities

In [66]:
# ======================================================
# GEOGRAPHIC OVERVIEW & TARGET EUROPEAN CITIES
# ======================================================

# --- Unique countries & cities in dataset ---
print("\n===== UNIQUE COUNTRIES =====")
print(sorted(df['country'].dropna().unique()))

print("\n===== UNIQUE CITIES =====")
print(sorted(df['city'].dropna().unique()))


# --- Define European countries (EU + non-EU + transcontinental) ---
europe = [
    # European Union (27)
    'Austria','Belgium','Bulgaria','Croatia','Cyprus','Czech Republic','Denmark',
    'Estonia','Finland','France','Germany','Greece','Hungary','Ireland','Italy',
    'Latvia','Lithuania','Luxembourg','Malta','Netherlands','Poland','Portugal',
    'Romania','Slovakia','Slovenia','Spain','Sweden',

    # Non-EU European countries
    'Albania','Andorra','Iceland','Liechtenstein','Monaco','Montenegro',
    'North Macedonia','Norway','San Marino','Serbia','Switzerland',
    'United Kingdom','Vatican City',

    # Transcontinental / often included
    'Russia','Turkey','Kazakhstan','Georgia','Azerbaijan',

    # Post-Soviet European states
    'Belarus','Moldova','Ukraine',

    # Special / disputed / micro-territories
    'Kosovo (Disputed Territory)',
    'Isle of Man','Guernsey','Jersey','Faroe Islands','Gibraltar'
]


# --- Standardize city names for consistency ---
city_name_mapping = {
    'Krakow (Cracow)': 'Krakow',
    'Kiev (Kyiv)': 'Kyiv',
    'Seville (Sevilla)': 'Seville',
    'Hanover': 'Hannover',
    'Saint Petersburg': 'St. Petersburg'
}
df['city'] = df['city'].replace(city_name_mapping)


# --- Unique European cities in dataset ---
unique_european_cities = sorted(
    df[df['country'].isin(europe)]['city'].dropna().unique()
)
print("\n===== UNIQUE EUROPEAN CITIES IN DATASET =====")
print(unique_european_cities)
print(f"\nTotal unique European cities in dataset: {len(unique_european_cities)}")


# --- Refined target European cities (64 cities) ---
significant_european_cities = [
    'Lisbon','Barcelona','Budapest','Istanbul','Bucharest','Madrid','Sofia',
    'Krakow','Belgrade','Prague','Porto','Valencia','Kyiv','Moscow','Berlin',
    'Vienna','Malaga','Seville','Rome','Athens','Warsaw','Minsk','Paris',
    'Ljubljana','Florence','Liverpool','Tallinn','Zagreb','Hamburg','Naples',
    'Milan','Split','Brussels','Dublin','Riga','Lyon','Palma de Mallorca',
    'Vilnius','London','Stockholm','Munich','Marseille','Cologne','Amsterdam',
    'Dusseldorf','Helsinki','Frankfurt','Stuttgart','Hannover','Copenhagen',
    'Dresden','Manchester','Rotterdam','St. Petersburg','Edinburgh','Dubrovnik',
    'Oslo','Glasgow','Belfast','Salzburg','Zurich','Geneva','Valletta','Reykjavik'
]
cities_to_remove = {'Bordeaux', 'Faro', 'Hvar'}
target_cities = {city for city in significant_european_cities if city not in cities_to_remove}

# --- Filter dataset to refined European cities ---
df_filtered = df[df['city'].isin(target_cities)].drop_duplicates(subset='city').reset_index(drop=True)
df_filtered['city_index'] = df_filtered.index + 1

print(f"\nTotal target European cities (after removing 3): {len(target_cities)}")
print(f"Cities found in dataset: {len(df_filtered)}")
missing_cities = sorted(target_cities - set(df_filtered['city']))
if missing_cities:
    print("\nMissing cities:")
    print(missing_cities)
else:
    print("\nAll target cities are present in dataset.")


===== UNIQUE COUNTRIES =====
['Argentina', 'Australia', 'Austria', 'Belarus', 'Belgium', 'Bolivia', 'Brazil', 'Bulgaria', 'Burma/Myanmar', 'Cambodia', 'Canada', 'Chile', 'China', 'Colombia', 'Costa Rica', 'Croatia', 'Czech Republic', 'Denmark', 'Ecuador', 'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Hawaii', 'Hong Kong', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Ireland', 'Israel', 'Italy', 'Japan', 'Kuwait', 'Laos', 'Latvia', 'Lebanon', 'Lithuania', 'Malaysia', 'Malta', 'Mexico', 'Morocco', 'Nepal', 'Netherlands', 'New Zealand', 'Norway', 'Oman', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Qatar', 'Romania', 'Russia', 'Senegal', 'Serbia', 'Singapore', 'Slovenia', 'South Africa', 'South Korea', 'Spain', 'Sri Lanka', 'Sweden', 'Switzerland', 'Taiwan', 'Thailand', 'Turkey', 'Ukraine', 'United Arab Emirates', 'United Kingdom', 'United States', 'Uruguay', 'Vietnam']

===== UNIQUE CITIES =====
['Abu Dhabi', 'Adelaide', 'Amsterdam', 'Arequipa', 'Athens', 'Auckland', 'Bangkok',

### Export Filtered Data to CSV

In [67]:
import os

# File name
file_name = 'best_significant_european_workation_cities.csv'

try:
    # Save the filtered DataFrame
    df_filtered.to_csv(file_name, index=False, encoding='utf-8')
    print(f"File saved as: {file_name}")
    print(f"Directory: {os.getcwd()}")
except NameError:
    print("ERROR: 'df_filtered' not found.")
except Exception as e:
    print(f"An error occurred: {e}")

File saved as: best_significant_european_workation_cities.csv
Directory: C:\Users
