In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from itertools import combinations
import numpy as np
from scipy.stats import chi2_contingency

In [None]:
# Load the raw TripAdvisor European restaurants export.
# The path is relative, so it resolves against the notebook's working directory.
raw_df = pd.read_csv("../data/raw/tripadvisor_european_restaurants.csv")
# Bare last expression: rendered as the cell's output.
raw_df

In [None]:
# (rows, columns) of the raw data.
raw_df.shape

In [None]:
# Column labels available in the raw data.
raw_df.columns

In [None]:
# Per-column dtype and non-null counts.
raw_df.info()

In [None]:
# Number of distinct country labels (checked before the UK nations are merged below).
raw_df.country.nunique()

#### Combine the UK constituent countries into a single "United Kingdom" value

In [None]:
# England, Northern Ireland, Scotland and Wales are all relabelled as
# 'United Kingdom'; every other country label is left untouched.
uk_nations = ['England', 'Northern Ireland', 'Scotland', 'Wales']
raw_df['country'] = raw_df['country'].replace(
    dict.fromkeys(uk_nations, 'United Kingdom')
)

In [None]:
# Keep only French and UK restaurants and renumber the rows from 0.
filtered_df = (
    raw_df.loc[raw_df["country"].isin(["France", "United Kingdom"])]
    .reset_index(drop=True)
)
filtered_df

#### Dropping Columns

In [None]:
# Columns that the analysis below does not use.
unused_columns = [
    'original_location',
    'open_hours_per_week',
    'popularity_detailed',
    'popularity_generic',
    'restaurant_link',
    'working_shifts_per_week',
    'price_level',
    'region',
    'province',
    'address',
    'atmosphere',
    'keywords',
    'value',
    'very_good',
    'average',
    'poor',
    'food',
    'service',
    'reviews_count_in_default_language',
]
filtered_df = filtered_df.drop(columns=unused_columns)
filtered_df

#### Filling Null values

In [None]:
# Missing-value count per remaining column, before filling.
filtered_df.isna().sum()

In [None]:
# Replacement used for missing values in each column: text columns get a
# sentinel label, avg_rating gets its median, the remaining numeric columns get 0.
fill_values = {
    'claimed': 'Unknown',
    'city': 'Unknown',
    'awards': 'Unknown',
    'top_tags': 'Unknown',
    'price_range': 'Not Available',
    'meals': 'Unknown',
    'cuisines': 'Unknown',
    'special_diets': 'Unknown',
    'features': 'Unknown',
    'avg_rating': filtered_df['avg_rating'].median(),
    'original_open_hours': 'Unknown',
    'open_days_per_week': 0.0,
    'total_reviews_count': 0,
    'default_language': 'Unknown',
    'excellent': 0.0,
    'terrible': 0.0,
}

# assign() returns a new frame in which each listed column is replaced by its filled version.
filtered_df = filtered_df.assign(
    **{column: filtered_df[column].fillna(value) for column, value in fill_values.items()}
)

In [None]:
# Re-check the missing-value counts after filling.
filtered_df.isna().sum()

#### Checking Data Types

In [None]:
# dtype of every column after cleaning.
filtered_df.dtypes

#### Keep Only First Cuisine

In [None]:
# Work on a string version of the cuisines column and store it back.
cuisines_text = filtered_df["cuisines"].astype(str)
filtered_df["cuisines"] = cuisines_text

# The primary cuisine is the text before the first comma, with surrounding whitespace removed.
first_listed = cuisines_text.str.split(",").str[0]
filtered_df["primary_cuisine"] = first_listed.str.strip()

# Show both columns side by side for a quick sanity check.
filtered_df[["cuisines", "primary_cuisine"]].head()

In [None]:
# First 15 rows of the cleaned frame, including the new primary_cuisine column.
filtered_df.head(15)

## EDA

In [None]:
# color = ["#2A5243", "#6aeaac"]

#### Number of restaurants by country and city

In [None]:
# Rows per country, largest first.
country_sizes = filtered_df.groupby("country").size()
restaurants_by_country = country_sizes.reset_index(
    name="number_of_restaurants"
).sort_values(by="number_of_restaurants", ascending=False)
restaurants_by_country.head()

In [None]:
"""
The dataset is dominated by two countries:
United Kingdom: 171,664 restaurants
France: 155,288 restaurants

Insights

Coverage between the UK and France is relatively balanced, with the UK having ~10% more entries.
"""

In [None]:
# Plot 1: one bar per country, height = number of restaurants.
fig, ax = plt.subplots()
ax.bar(
    restaurants_by_country["country"],
    restaurants_by_country["number_of_restaurants"],
    color=["#2A5243", "#6aeaac"],
)
ax.set_title("Number of Restaurants by Country")
ax.set_xlabel("Country")
ax.set_ylabel("Number of Restaurants")
plt.show()

In [None]:
# Count restaurants by (country, city), largest first.
# Rows whose city is the "Unknown" placeholder are excluded so that the
# placeholder cannot rank as if it were a real city.
restaurants_by_city = (
    filtered_df[filtered_df["city"] != "Unknown"]
    .groupby(["country", "city"])
    .size()
    .reset_index(name="number_of_restaurants")
    .sort_values("number_of_restaurants", ascending=False)
)
restaurants_by_city.head(10)

In [None]:
# Horizontal bars for the ten largest cities; the y-axis is inverted so the
# first row of the table is drawn at the top.
top_ten_cities = restaurants_by_city.head(10)

fig, ax = plt.subplots()
ax.barh(
    top_ten_cities["city"],
    top_ten_cities["number_of_restaurants"],
    color=["#2A5243", "#6aeaac"],
)
ax.set_title("Top 10 Cities by Number of Restaurants")
ax.set_xlabel("Number of Restaurants")
ax.set_ylabel("City")
ax.invert_yaxis()
plt.show()

In [None]:
"""
Key takeaways

Paris overwhelmingly dominates, far ahead of all other cities.
Other major cities (Lyon, Marseille, Manchester, Birmingham) form a second tier with much smaller but comparable counts.
The sharp drop after Paris indicates strong centralization of restaurants in the capital.

"""
"""
Key Observations
1. “Unknown” City Category

The largest single “city” is Unknown with over 31k restaurants.
This likely represents:
    Missing city values
    Suburban, rural, or poorly geocoded entries

This should be cleaned or excluded for precise city-level analysis.

2. Paris Dominates France
    Paris alone accounts for ~12% of all French restaurants in the dataset.
    Strong urban concentration → Paris is a prime candidate for:
        Competitive analysis
        Cuisine diversity studies
        Pricing and rating segmentation
"""

#### Identify top-rated and most-reviewed restaurants

In [None]:
# Ensure numeric types (errors="coerce" turns unparseable values into NaN)
filtered_df["avg_rating"] = pd.to_numeric(filtered_df["avg_rating"], errors="coerce")
filtered_df["total_reviews_count"] = pd.to_numeric(filtered_df["total_reviews_count"], errors="coerce")

# Drop rows with missing essentials
# NOTE(review): this rebinds filtered_df, so every later cell works on the reduced frame.
filtered_df = filtered_df.dropna(subset=["avg_rating", "total_reviews_count", "restaurant_name", "city", "country"])

# Top-rated restaurants (minimum 50 reviews, which is the threshold the filter below applies)
# Sorted by rating first; review count breaks ties between equally rated restaurants.
top_rated = (
    filtered_df[filtered_df["total_reviews_count"] >= 50]
    .sort_values(["avg_rating", "total_reviews_count"], ascending=[False, False])
    .head(10)
)

top_rated[["restaurant_name", "city", "country", "avg_rating", "total_reviews_count"]]

In [None]:
# Horizontal bars of average rating for the top-rated table.
# The title states 50 because top_rated is built with total_reviews_count >= 50.
plt.figure()
plt.barh(top_rated["restaurant_name"], top_rated["avg_rating"], color = ["#2A5243", "#6aeaac"])
plt.xlabel("Average Rating")
plt.title("Top-Rated Restaurants (Min 50 Reviews)")
# Invert so the first (best) row of the table is drawn at the top.
plt.gca().invert_yaxis()
plt.show()

In [None]:
"""
What this shows
    Horizontal bar chart of the 10 highest-rated restaurants
    All have perfect or near-perfect ratings
    Filtered to avoid small-sample bias

How to read it
    These restaurants represent consistent quality at scale
    Ideal benchmarks for service and food excellence

Insights (Top-Rated)
    UK dominates the top-rated list, especially England.
    Both fine dining (Epicure, Adam’s) and casual dining (Pizza Union, Elif) can achieve perfect ratings.
    High ratings + high reviews indicate consistent quality at scale, not niche popularity.
"""

In [None]:
# Ten restaurants with the highest total review count.
by_review_volume = filtered_df.sort_values(by="total_reviews_count", ascending=False)
most_reviewed = by_review_volume.head(10)

summary_columns = ["restaurant_name", "city", "country", "avg_rating", "total_reviews_count"]
most_reviewed[summary_columns]

In [None]:
# Label each bar as "<restaurant name> (<country>)".
bar_labels = most_reviewed["restaurant_name"] + " " + "(" + most_reviewed["country"] + ")"

fig, ax = plt.subplots()
ax.barh(bar_labels, most_reviewed["total_reviews_count"], color=["#2A5243", "#6aeaac"])
ax.set_xlabel("Total Reviews")
ax.set_title("Most-Reviewed Restaurants")
# Inverted y-axis puts the most-reviewed restaurant at the top.
ax.invert_yaxis()
plt.show()

In [None]:
"""
What this shows
    Restaurants with the highest total number of reviews
    Indicates footfall, visibility, and tourist appeal

How to read it
    High popularity does not necessarily imply the highest ratings
    Many are iconic, high-volume venues

Insights (Most-Reviewed)
    Paris strongly dominates the most-reviewed category.
    High review volume ≠ highest rating → popularity vs. quality distinction.
    Tourist-heavy and iconic venues (Hard Rock Cafe, Bouillon Chartier) generate massive review counts even with moderate ratings.
"""

In [None]:
# --- Plot 3: one point per restaurant, review count (x) against average rating (y) ---
fig, ax = plt.subplots()
ax.scatter(
    filtered_df["total_reviews_count"],
    filtered_df["avg_rating"],
    color=["#2A5243"],
)
ax.set_xlabel("Total Reviews")
ax.set_ylabel("Average Rating")
ax.set_title("Restaurant Popularity vs Rating")
plt.show()

In [None]:
"""
Plot 3: Rating vs. Total Reviews (Scatter Plot)

What this shows
    X-axis: total reviews (popularity)
    Y-axis: average rating (quality)

Key insights
    Weak correlation between popularity and rating
    Many highly rated restaurants have moderate review counts
    Very popular restaurants cluster around 4.0–4.5 ratings

Confirms two different success models:
⭐ Quality-driven
🔥 Volume-driven

"""

#### Most common cuisines and cuisine combinations

In [None]:
# Clean cuisines column
# .copy() yields an independent frame, so the column assignments below do not
# raise SettingWithCopyWarning on the dropna() result.
filtered_df = filtered_df.dropna(subset=["cuisines", "country"]).copy()
filtered_df["cuisines"] = filtered_df["cuisines"].astype(str)

# Split cuisines into lists of whitespace-stripped names
filtered_df["cuisine_list"] = filtered_df["cuisines"].str.split(",")
filtered_df["cuisine_list"] = filtered_df["cuisine_list"].apply(lambda x: [c.strip() for c in x])

# Placeholder labels produced by missing-value filling / str conversion are not
# cuisines. They are left in cuisine_list but kept out of the rankings, the same
# way "Unknown" is excluded from the city and meal analyses in this notebook.
NON_CUISINE_LABELS = {"", "Unknown", "nan"}

# country -> {"top_cuisines": [(name, count), ...], "top_combinations": [((a, b), count), ...]}
results = {}

for country in filtered_df["country"].unique():
    df_c = filtered_df[filtered_df["country"] == country]

    cuisine_counts = Counter()
    combo_counter = Counter()

    for cuisines in df_c["cuisine_list"]:
        named = [c for c in cuisines if c not in NON_CUISINE_LABELS]

        # ---- Most common single cuisines ----
        cuisine_counts.update(named)

        # ---- Cuisine combinations ----
        # sorted(set(...)) makes each pair order-independent and counted once per restaurant.
        if len(named) > 1:
            combo_counter.update(combinations(sorted(set(named)), 2))

    top_cuisines = cuisine_counts.most_common(10)
    combo_counts = combo_counter.most_common(10)

    results[country] = {
        "top_cuisines": top_cuisines,
        "top_combinations": combo_counts
    }

results

In [None]:
"""
Interpretation (France)
   - French cuisine overwhelmingly dominates, reflecting strong culinary tradition.
   - High presence of European and Mediterranean cuisines suggests:
       - Regional fusion
       - Tourist-oriented menus
   - Italian and Pizza cuisines are the most popular foreign influences.
   - Asian cuisine has a significant but secondary presence.

Key Insight (France)
- French restaurants often pair local identity with broad European positioning, 
    while Italian cuisine integrates strongly through pizza-focused concepts.

Interpretation (UK)
   - British cuisine dominates but is closely tied to:
       - Pub
       - Bar
   - Cafe culture
   - Indian cuisine is the strongest non-European influence, reflecting:
       - Cultural diversity
       - Long-standing demand
   - Fast food has a stronger presence than in France.

Key Insight (UK)

- The UK food scene is strongly shaped by hospitality-driven cuisine pairings, 
    with Indian and South Asian cuisines deeply integrated into mainstream dining.

"""

In [None]:
"""

Cross-Country Comparison

    Aspect	                      France	                     United Kingdom

Dominant Cuisine	              French                           British
Strong Secondary	        European, Italian	                Cafe, Pub, Indian
Cultural Influence	        Traditional & regional	              Multicultural
Fusion Pattern	               European + French	            British + Pub / Bar
Foreign Cuisine Leader	          Italian	                         Indian

"""

In [None]:
# Create comparison data for plotting
# The numbers are hand-assigned ordinal codes for categorical findings, not measurements.

data = {
    "Aspect": [
        "Dominant Cuisine",
        "Strong Secondary",
        "Cultural Influence",
        "Fusion Pattern",
        "Foreign Cuisine Leader"
    ],
    "France": [1, 2, 1, 1, 1],
    "United Kingdom": [1, 3, 2, 2, 1]
}

df_cuisine = pd.DataFrame(data)

# Plot stacked bar chart: both series share the same x positions and the UK
# bars start where the France bars end (bottom=df_cuisine["France"]).
# NOTE(review): if a side-by-side (grouped) comparison was intended, the bars need an x offset instead.
plt.figure()
x = range(len(df_cuisine["Aspect"]))

plt.bar(x, df_cuisine["France"], label="France", color = "#6aeaac")
plt.bar(x, df_cuisine["United Kingdom"], bottom=df_cuisine["France"], label="United Kingdom", color = "#2A5243")

plt.xticks(x, df_cuisine["Aspect"], rotation=30, ha="right")
plt.ylabel("Relative Presence / Complexity")
plt.title("Cross-Country Cuisine Characteristics Comparison")
plt.legend()
plt.show()

In [None]:
"""

The chart uses encoded values to visualize relative complexity and diversity for each aspect. 
Since these variables are categorical, we encoded them to visualize relative complexity —
the chart supports interpretation, not precise measurement.
Each bar represents the relative richness or variety of that aspect

Example:

Strong Secondary
    France → 2 (European, Italian)
    UK → 3 (Cafe, Pub, Indian)

Dominant Cuisine:
    - Both countries show strong national identity
    
Strong Secondary Cuisines
   - UK clearly shows greater variety
   - Indicates a more diverse supporting food culture

Cultural Influence
   - France: traditional & regional
   - UK: multicultural → visually higher

Fusion Pattern
   - UK shows more fusion complexity (British + Pub/Bar)
   - France remains more cuisine-identity driven

Foreign Cuisine Leader
   - Italy (France) vs Indian (UK)
   - Similar importance, different cultural roots

Key Takeaway:
France’s food culture is heritage-driven, while the UK’s is diversity-driven.
"""

#### Indian Cuisine Distribution

In [None]:
# Make sure the two key columns are present and cuisines is text.
filtered_df = filtered_df.dropna(subset=["cuisines", "country"])
filtered_df["cuisines"] = filtered_df["cuisines"].astype(str)

# Flag restaurants whose cuisines text mentions "Indian" (case-insensitive, literal match).
filtered_df["is_Indian"] = filtered_df["cuisines"].str.contains("Indian", case = False, regex=False)

# Indian restaurants located in France or the UK.
in_target_countries = filtered_df["country"].isin(["France", "United Kingdom"])
df_indian = filtered_df[filtered_df["is_Indian"] & in_target_countries]

# Indian restaurants per country.
country_counts = df_indian.groupby("country").size().reset_index(name="indian_restaurant_count")

# Indian restaurants per (country, city), ignoring the "Unknown" city placeholder.
has_known_city = df_indian["city"] != "Unknown"
city_counts = (
    df_indian[has_known_city]
    .groupby(["country", "city"])
    .size()
    .reset_index(name="count")
)
country_counts

In [None]:
# Indian-restaurant counts per (country, city).
city_counts

In [None]:
# Bar per country: number of restaurants whose cuisines mention Indian.
fig, ax = plt.subplots()
ax.bar(
    country_counts["country"],
    country_counts["indian_restaurant_count"],
    color=["#2A5243", "#6aeaac"],
)
ax.set_title("Indian Cuisine Restaurants by Country")
ax.set_ylabel("Number of Restaurants")
ax.set_xlabel("Country")
plt.show()

In [None]:
# Pie of Indian-restaurant counts: each wedge is that country's share of all
# Indian restaurants found in the two countries.
plt.figure()
plt.pie(
    country_counts["indian_restaurant_count"],
    labels=country_counts["country"],
    colors=["#2A5243", "#6aeaac"],
    autopct="%1.1f%%",
    startangle=90
)
# A figure should stand alone when skimmed, so give it a real title instead of "".
plt.title("Share of Indian Cuisine Restaurants by Country")
plt.show()

In [None]:
"""
Indian Restaurants by Country

Key numbers (approximate):
   - üá¨üáß United Kingdom: ~13,000 Indian restaurants
   - üá´üá∑ France: ~1,700 Indian restaurants

Insight:
   - The UK has ~8× more Indian restaurants than France.
   - Indian cuisine is mainstream and deeply embedded in the UK food culture.
   - In France, Indian cuisine remains niche and urban-centered.
"""

In [None]:
# Ten French cities with the most Indian restaurants.
france_city_counts = city_counts[city_counts["country"] == "France"]
top_cities_fr = france_city_counts.sort_values(by="count", ascending=False).head(10)
top_cities_fr

In [None]:
# Horizontal bars for the French cities, largest at the top.
fig, ax = plt.subplots()
ax.barh(top_cities_fr["city"], top_cities_fr["count"], color=["#2A5243", "#6aeaac"])
ax.set_title("Top France Cities for Indian Cuisine")
ax.set_xlabel("Number of Indian Restaurants")
ax.invert_yaxis()
plt.show()

In [None]:
"""
France: City-Level Distribution

   - Paris alone accounts for a large share of Indian restaurants
   - Secondary presence in:
       - Major metropolitan areas
       - Student & tourist cities
   - Very limited penetration beyond big cities

Indian cuisine in France is capital-centric and exploratory, not yet mainstream.
"""

In [None]:
# Ten UK cities with the most Indian restaurants.
uk_city_counts = city_counts[city_counts["country"] == "United Kingdom"]
top_cities_uk = uk_city_counts.sort_values(by="count", ascending=False).head(10)
top_cities_uk

In [None]:
# Horizontal bars for the UK cities, largest at the top.
fig, ax = plt.subplots()
ax.barh(top_cities_uk["city"], top_cities_uk["count"], color=["#2A5243", "#6aeaac"])
ax.set_title("Top UK Cities for Indian Cuisine")
ax.set_xlabel("Number of Indian Restaurants")
ax.invert_yaxis()
plt.show()

In [None]:
"""
United Kingdom: City-Level Distribution
   - Birmingham – clear leader
   - Strong correlation with:
       - Historical South Asian communities
       - Industrial & multicultural cities
   - Birmingham & Leicester are known Indian cuisine hubs
   - Distribution is geographically widespread, not London-only

Unlike France, Indian cuisine in the UK is community-driven, not tourist-driven.

For new investors, the UK will be highly competitive, but they will find more growth opportunities in France.
"""

#### Distribution of Vegetarian Restaurants in France & UK

In [None]:
# Make the vegetarian flag a string column
filtered_df["vegetarian_friendly"] = filtered_df["vegetarian_friendly"].astype(str)

# Filter vegetarian restaurants.
# The comparison is case-insensitive because a later cell lower-cases this
# column; with a plain == "Y" a re-run of this cell would match nothing.
is_veg_friendly = filtered_df["vegetarian_friendly"].str.strip().str.upper() == "Y"
veg_df = filtered_df[is_veg_friendly]

# Country-level counts
veg_country = (
    veg_df.groupby("country")
    .size()
    .reset_index(name="vegetarian_restaurant_count")
)

# Percentage of vegetarian restaurants per country
# (vegetarian-friendly count divided by all restaurants of that country).
total_country = filtered_df.groupby("country").size().reset_index(name="total_restaurants")
veg_country = veg_country.merge(total_country, on="country")
veg_country["percentage"] = (
    veg_country["vegetarian_restaurant_count"] / veg_country["total_restaurants"] * 100
)

# Top cities per country (the "Unknown" city placeholder is excluded)
veg_city = (
    veg_df[veg_df["city"] != "Unknown"]
    .groupby(["country", "city"])
    .size()
    .reset_index(name="count")
)

In [None]:
# Vegetarian-friendly count, total count and percentage per country.
veg_country

In [None]:
# --- Plot 1: share of each country's restaurants that are vegetarian-friendly ---
fig, ax = plt.subplots()
ax.bar(veg_country["country"], veg_country["percentage"], color=["#2A5243", "#6aeaac"])
ax.set_ylabel("% of Vegetarian-Friendly Restaurants")
ax.set_title("Vegetarian Restaurants: UK vs France")
plt.show()

In [None]:
# Pie of vegetarian-friendly restaurant COUNTS.
# plt.pie draws each wedge as x / sum(x). Feeding it the two per-country rates
# (which have different denominators and do not sum to 100) would print
# renormalised numbers that match neither rate, so the counts are used instead:
# each wedge is the country's share of all vegetarian-friendly restaurants.
plt.figure()
plt.pie(
    veg_country["vegetarian_restaurant_count"],
    labels=veg_country["country"],
    colors=["#2A5243", "#6aeaac"],
    autopct="%1.1f%%",
    startangle=90
)
plt.title("Share of Vegetarian-Friendly Restaurants: UK vs France")
plt.show()

In [None]:
"""
Key Insight:
   - The UK has nearly 3× more vegetarian-friendly restaurants than France.
   - Almost 1 in 2 restaurants in the UK offers vegetarian options.
   - In France, vegetarian dining is still niche, with fewer than 1 in 5 restaurants being vegetarian-friendly.
"""

In [None]:
# Ten UK cities with the most vegetarian-friendly restaurants.
uk_veg_cities = veg_city[veg_city["country"] == "United Kingdom"]
top_veg_uk = uk_veg_cities.sort_values(by="count", ascending=False).head(10)
top_veg_uk

In [None]:
# --- Plot 2: Top UK Cities for Vegetarian Restaurants ---
uk_rows = top_veg_uk.head(10)

fig, ax = plt.subplots()
ax.barh(uk_rows["city"], uk_rows["count"], color=["#2A5243", "#6aeaac"])
ax.set_xlabel("Number of Vegetarian Restaurants")
ax.set_title("Top UK Cities for Vegetarian Restaurants")
# Largest city at the top.
ax.invert_yaxis()
plt.show()

In [None]:
"""
Interpretation (UK):
   - Vegetarian options are widely distributed, not limited to London.
   - Strong presence in:
       - Student cities (Leeds, Sheffield, Nottingham)
       - Progressive & lifestyle-driven cities (Brighton)
"""

In [None]:
# Ten French cities with the most vegetarian-friendly restaurants.
fr_veg_cities = veg_city[veg_city["country"] == "France"]
top_veg_fr = fr_veg_cities.sort_values(by="count", ascending=False).head(10)
top_veg_fr

In [None]:
# --- Plot 3: Top France Cities for Vegetarian Restaurants ---
fr_rows = top_veg_fr.head(10)

fig, ax = plt.subplots()
ax.barh(fr_rows["city"], fr_rows["count"], color=["#2A5243", "#6aeaac"])
ax.set_xlabel("Number of Vegetarian Restaurants")
ax.set_title("Top France Cities for Vegetarian Restaurants")
# Largest city at the top.
ax.invert_yaxis()
plt.show()

In [None]:
"""
Interpretation (France):
   - Paris dominates vegetarian dining in France.
   - Outside Paris, vegetarian restaurants are:
       - Concentrated in large cities
       - Often linked to tourism and student populations
"""

In [None]:
"""
For Investors
   - UK vegetarian market = stable, mature
   - France vegetarian market = expansion opportunity
"""

#### Top 10 Vegetarian Restaurants 

In [None]:
# Ensure numeric (errors="coerce" turns unparseable values into NaN)

filtered_df["avg_rating"] = pd.to_numeric(filtered_df["avg_rating"], errors="coerce")
filtered_df["total_reviews_count"] = pd.to_numeric(filtered_df["total_reviews_count"], errors="coerce")

# Normalise the vegetarian flag to lower-case strings; nothing is filtered here.
# NOTE(review): this overwrites the column in place, so code comparing against
# upper-case "Y" stops matching once this cell has run.
filtered_df["vegetarian_friendly"] = filtered_df["vegetarian_friendly"].astype(str).str.lower()

In [None]:
# --- Top 10 Vegetarian in UK ---

# Minimum reviews threshold for reliability
MIN_REVIEWS = 50

# Vegetarian-friendly restaurants located in the UK
in_uk = filtered_df["country"] == "United Kingdom"
flagged_veg = filtered_df["vegetarian_friendly"] == "y"
uk_veg = filtered_df[in_uk & flagged_veg]

# Require both a rating and a review count, then enough reviews
uk_veg = uk_veg.dropna(subset=["avg_rating", "total_reviews_count"])
uk_veg = uk_veg[uk_veg["total_reviews_count"] >= MIN_REVIEWS]

# Best rating first; review count breaks ties
ranking_columns = ["avg_rating", "total_reviews_count"]
top_uk_veg = uk_veg.sort_values(ranking_columns, ascending=[False, False]).head(10)

# Display results
top_uk_veg[["restaurant_name", "city", "cuisines", "avg_rating", "total_reviews_count"]]

In [None]:
# Review counts of the top-rated vegetarian-friendly UK restaurants.
fig, ax = plt.subplots()
ax.barh(
    top_uk_veg["restaurant_name"],
    top_uk_veg["total_reviews_count"],
    color=["#4d372f", "#c18d6e"],
)
ax.set_xlabel("Review Count")
ax.set_title("Top 10 Vegetarian-Friendly Restaurants in UK")
ax.invert_yaxis()
plt.show()

In [None]:
# --- Top 10 Vegetarian in France ---
in_france = filtered_df["country"] == "France"
flagged_veg_fr = filtered_df["vegetarian_friendly"] == "y"
fr_veg = filtered_df[in_france & flagged_veg_fr]

# Require both a rating and a review count
fr_veg = fr_veg.dropna(subset=["avg_rating", "total_reviews_count"])

# Keep restaurants with at least 50 reviews
has_enough_reviews = fr_veg["total_reviews_count"] >= 50
fr_veg = fr_veg[has_enough_reviews]

# Best rating first; review count breaks ties
top_fr_veg = fr_veg.sort_values(
    by=["avg_rating", "total_reviews_count"], ascending=[False, False]
).head(10)

top_fr_veg[["restaurant_name", "city", "primary_cuisine", "avg_rating", "total_reviews_count"]]

In [None]:
# Review counts of the top-rated vegetarian-friendly French restaurants.
fig, ax = plt.subplots()
ax.barh(
    top_fr_veg["restaurant_name"],
    top_fr_veg["total_reviews_count"],
    color=["#4d372f", "#c18d6e"],
)
ax.set_xlabel("Review Count")
ax.set_title("Top 10 Vegetarian-Friendly Restaurants in France")
ax.invert_yaxis()
plt.show()

In [None]:
"""
- 7 out of 10 restaurants list French / European cuisine.
- Indicates that in France, vegetarian excellence is achieved within traditional culinary frameworks, 
    not only in niche vegetarian concepts.
- Indian / Asian cuisine appears in the top 3
"""

#### Top 10 Indian Restaurants in France & UK

In [None]:
# Coerce the two numeric fields (unparseable values become NaN) and make cuisines text.
for numeric_column in ("avg_rating", "total_reviews_count"):
    filtered_df[numeric_column] = pd.to_numeric(filtered_df[numeric_column], errors="coerce")
filtered_df["cuisines"] = filtered_df["cuisines"].astype(str)

# Restaurants whose cuisines text mentions "Indian" (case-insensitive, literal match)
mentions_indian = filtered_df["cuisines"].str.contains("Indian", case=False, regex=False)

# Require a rating and a review count
indian_df = filtered_df[mentions_indian].dropna(subset=["avg_rating", "total_reviews_count"])

# Reliability filter: at least 50 reviews
indian_df = indian_df[indian_df["total_reviews_count"] >= 50]

In [None]:
# Ten best-rated Indian restaurants in France (review count breaks rating ties).
indian_in_france = indian_df[indian_df["country"] == "France"]
top_fr = indian_in_france.sort_values(
    by=["avg_rating", "total_reviews_count"], ascending=[False, False]
).head(10)

top_fr[["restaurant_name","city","avg_rating","total_reviews_count"]]

In [None]:
# Ten best-rated Indian restaurants in the UK (review count breaks rating ties).
indian_in_uk = indian_df[indian_df["country"] == "United Kingdom"]
top_uk = indian_in_uk.sort_values(
    by=["avg_rating", "total_reviews_count"], ascending=[False, False]
).head(10)

top_uk[["restaurant_name","city","avg_rating","total_reviews_count"]]

#### Language Distribution 

In [None]:
# Ensure languages column is string
filtered_df["default_language"] = filtered_df["default_language"].astype(str)

# Remove missing / unknown values.
# Compared case-insensitively: a later cell lower-cases this column, so on a
# re-run the placeholders appear as "unknown" / "none" rather than "Unknown" / "None".
is_placeholder_language = (
    filtered_df["default_language"].str.lower().isin(["unknown", "nan", "none"])
)
# .copy() gives an independent frame; adding a column to a plain boolean-mask
# slice raises SettingWithCopyWarning.
df_lang = filtered_df[~is_placeholder_language].copy()

# Split languages into lists
df_lang["language_list"] = df_lang["default_language"].str.split(",")

# Strip whitespace
df_lang["language_list"] = df_lang["language_list"].apply(
    lambda x: [lang.strip() for lang in x]
)

# Function to get top languages per country
def top_languages_by_country(country, top_n=10):
    """Return the `top_n` most frequent languages for `country`.

    Reads the module-level `df_lang` frame, tallies every entry of the
    `language_list` lists belonging to rows of that country, and returns
    `Counter.most_common(top_n)`: a list of (language, count) pairs.
    """
    country_rows = df_lang[df_lang["country"] == country]
    tally = Counter()
    for languages in country_rows["language_list"]:
        tally.update(languages)
    return tally.most_common(top_n)

In [None]:
# (language, count) pairs for France, most frequent first.
top_lang_france = top_languages_by_country("France")
top_lang_france

In [None]:
# (language, count) pairs for the United Kingdom, most frequent first.
top_lang_uk = top_languages_by_country("United Kingdom")
top_lang_uk

In [None]:
# Clean language column (lower-cased so the substring test below is case-insensitive)
filtered_df["default_language"] = filtered_df["default_language"].astype(str).str.lower()

# Filter France & UK.
# .copy() gives an independent frame, so adding is_english below does not
# raise SettingWithCopyWarning.
df_countries = filtered_df[filtered_df["country"].isin(["France", "United Kingdom"])].copy()

# Identify English-language restaurants
df_countries["is_english"] = df_countries["default_language"].str.contains("english")

# Count English-language restaurants
english_counts = (
    df_countries[df_countries["is_english"]]
    .groupby("country")
    .size()
    .reset_index(name="english_restaurants")
)

# Total restaurants per country
total_counts = (
    df_countries.groupby("country")
    .size()
    .reset_index(name="total_restaurants")
)

# Merge and calculate percentage
# (default inner merge: a country with no English-language rows would not appear)
lang_distribution = english_counts.merge(total_counts, on="country")
lang_distribution["percentage"] = (
    lang_distribution["english_restaurants"] / lang_distribution["total_restaurants"] * 100
)
lang_distribution

In [None]:
# Bar per country: percentage of restaurants whose default language contains "english".
fig, ax = plt.subplots()
ax.bar(
    lang_distribution["country"],
    lang_distribution["percentage"],
    color=["#2A5243", "#6aeaac"],
)
ax.set_ylabel("Percentage of English-language Restaurants")
ax.set_title("English Language Distribution in France vs United Kingdom")
plt.show()

In [None]:
"""
United Kingdom: ~92% of restaurants use English
France: ~57% of restaurants use English

United Kingdom
   - English is the default operating language for restaurants.
   - High percentage reflects:
       - Native language dominance
       - Strong accessibility for international users
       - Consistent platform standardization

France
   - English is widely used but not dominant.
   - Indicates:
       - Strong national language identity (French)
       - English adoption driven by tourism and international audiences
       - More bilingual or localized restaurant communication

While presenting:
“English-language usage highlights how restaurants position themselves for international audiences.
    The UK is nearly universal, while France shows selective adoption driven by tourism.”
"""

#### Meal-Type Distribution

In [None]:
# Ensure meals column is string
filtered_df["meals"] = filtered_df["meals"].astype(str)

# Filter France & UK.
# .copy() gives an independent frame: a new column is assigned to this frame
# right after, which on a plain boolean-mask slice raises SettingWithCopyWarning.
filtered_df = filtered_df[filtered_df["country"].isin(["France", "United Kingdom"])].copy()

# Clean meals: keep unique meals per restaurant
def clean_meals(meals):
    """Split a comma-separated meals value into a list of unique meal names.

    Each piece is whitespace-stripped; empty pieces and the placeholders
    "Unknown" / "nan" are dropped. Names are returned in first-seen order,
    so the result is deterministic (building the list from a set made the
    order depend on string hashing). Non-string input is converted with
    str() first, so a missing value (NaN) yields an empty list.

    Args:
        meals: comma-separated meal names, e.g. "Lunch, Dinner".

    Returns:
        list[str]: unique meal names in order of first appearance.
    """
    placeholders = {"", "Unknown", "nan"}
    names = (piece.strip() for piece in str(meals).split(","))
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(name for name in names if name not in placeholders))

# One list of unique meal names per restaurant.
filtered_df["meals_list"] = filtered_df["meals"].apply(clean_meals)

# Tally meal types per country; placeholder labels are never counted.
placeholder_meals = ["", "Unknown", "nan"]
meal_counts = {}

for country in ["France", "United Kingdom"]:
    country_meal_lists = filtered_df.loc[filtered_df["country"] == country, "meals_list"]
    tally = Counter()
    for restaurant_meals in country_meal_lists:
        tally.update(meal for meal in restaurant_meals if meal not in placeholder_meals)
    meal_counts[country] = tally

# Sorted union of the meal types seen in either country, used to align the two series.
all_meal_types = sorted(set(meal_counts["France"]) | set(meal_counts["United Kingdom"]))

In [None]:
# France counts aligned to all_meal_types (0 where France has no such meal type).
fr_values = [meal_counts["France"].get(meal, 0) for meal in all_meal_types]
fr_values

In [None]:
# UK counts aligned to all_meal_types (0 where the UK has no such meal type).
uk_values = [meal_counts["United Kingdom"].get(meal, 0) for meal in all_meal_types]
uk_values

In [None]:
# Side-by-side bar chart: France bars sit at x, UK bars are shifted right by one
# bar width, and each tick is centred between the pair.
width = 0.4
x = range(len(all_meal_types))
uk_positions = [i + width for i in x]
tick_positions = [i + width / 2 for i in x]

fig, ax = plt.subplots()
ax.bar(x, fr_values, width=width, label="France", color=["#4d372f"])
ax.bar(uk_positions, uk_values, width=width, label="United Kingdom", color=["#c18d6e"])
plt.xticks(tick_positions, all_meal_types, rotation=30)
ax.set_ylabel("Number of Restaurants")
ax.set_title("Meal Type Distribution: France vs United Kingdom")
ax.legend()
plt.show()

In [None]:
"""
Lunch & Dinner:
   - Both countries show very high counts
   - France slightly leads, reflecting:
       - Strong lunch culture
       - Formal dining traditions

Breakfast:
   - UK far exceeds France
   - Indicates:
       - Café & all-day dining culture
       - Grab-and-go and early dining habits

Breakfast is a UK strength and a French opportunity

Brunch:
   - Major difference
   - Brunch is:
       - Highly popular in the UK
       - Still niche in France

This is one of the clearest lifestyle contrasts

After-hours Dining:
   - More common in the UK
   - Reflects:
       - Nightlife culture
       - Flexible eating times

France focuses on structured lunch and dinner dining, 
while the UK shows a much more flexible, all-day eating culture driven by breakfast and brunch.
"""

In [None]:
# France meal tallies as a two-column table, most common meal first.
fr_meal_pairs = meal_counts["France"].items()
fr_meals = pd.DataFrame(fr_meal_pairs, columns=["Meal", "Count"]).sort_values(
    by="Count", ascending=False
)
fr_meals

In [None]:
# One bar per meal type for France.
fig, ax = plt.subplots()
ax.bar(fr_meals["Meal"], fr_meals["Count"], color=["#4d372f", "#c18d6e"])
ax.set_title("Meal Type Distribution in France")
ax.set_ylabel("Number of Restaurants")
plt.xticks(rotation=30)
plt.show()

In [None]:
# UK meal tallies as a two-column table, most common meal first.
uk_meal_pairs = meal_counts["United Kingdom"].items()
uk_meals = pd.DataFrame(uk_meal_pairs, columns=["Meal", "Count"]).sort_values(
    by="Count", ascending=False
)
uk_meals

In [None]:
# One bar per meal type for the United Kingdom.
fig, ax = plt.subplots()
ax.bar(uk_meals["Meal"], uk_meals["Count"], color=["#4d372f", "#c18d6e"])
ax.set_title("Meal Type Distribution in United Kingdom")
ax.set_ylabel("Number of Restaurants")
plt.xticks(rotation=30)
plt.show()

In [None]:
"""
France — Meal Type Distribution

Observed pattern:
   - Lunch and Dinner dominate strongly
   - Breakfast is secondary
   - Brunch and After-hours are niche

Insight:
France follows a structured, time-specific dining culture, centered on lunch and dinner.

United Kingdom — Meal Type Distribution

Observed pattern:
   - Lunch and Dinner still lead
   - Breakfast and Brunch are much stronger than in France
   - After-hours dining is more common

Insight:
The UK shows a flexible, all-day dining culture, driven by cafés and casual dining.

Final Takeaway:
Meal distribution reflects lifestyle:
France eats by tradition, the UK eats by convenience.
"""

## HYPOTHESIS TESTING

### 1. English Language Distribution (France vs UK)

#### Research Question: Is there a statistically significant difference in the proportion of restaurants using English as the default language between France and the UK?

##### Null hypothesis (H₀):
##### There is no association between country (France vs UK) and English-language usage in restaurants.

##### Alternative hypothesis (H₁):
##### There is an association between country and English-language usage.

In [None]:
# Clean language column (lower-cased so the substring test is case-insensitive)
filtered_df["default_language"] = filtered_df["default_language"].astype(str).str.lower()

# Focus on France & UK.
# .copy() gives an independent frame, so adding is_english below does not
# raise SettingWithCopyWarning.
filtered_df = filtered_df[filtered_df["country"].isin(["France", "United Kingdom"])].copy()

# Create English vs Non-English flag
filtered_df["is_english"] = filtered_df["default_language"].str.contains("english")

# Build contingency table (rows: country, columns: is_english False/True)
contingency = pd.crosstab(filtered_df["country"], filtered_df["is_english"])

# Perform Chi-square test of independence on the table
chi2, p_value, dof, expected = chi2_contingency(contingency)

contingency, chi2, p_value

In [None]:
"""
Statistical Test Used:
    Chi-square test of independence

Appropriate because:
   - Variables are categorical (country, language)
   - Large sample size
   - Independence assumption holds

Since p < 0.05, we reject the null hypothesis.

Language choice reflects cultural orientation:
    UK → global accessibility
    France → balance between identity and tourism

Platforms and restaurant owners should:
   - Expect English by default in the UK
   - Use English selectively in France (tourist areas)
"""

### 2. Vegetarian Adoption (Final Project)

#### Research Question

#### Is there a statistically significant difference in the adoption of vegetarian-friendly restaurants between France and the UK?

##### Null hypothesis (H₀):
##### Vegetarian adoption is independent of country (France and UK have similar proportions).

##### Alternative hypothesis (H₁):
##### Vegetarian adoption differs by country.

In [None]:
# Clean vegetarian flag (lower-case strings, so "Y" and "y" are treated alike)
filtered_df["vegetarian_friendly"] = filtered_df["vegetarian_friendly"].astype(str).str.lower()

# Focus on France & UK.
# .copy() gives an independent frame, so adding is_vegetarian below does not
# raise SettingWithCopyWarning.
filtered_df = filtered_df[filtered_df["country"].isin(["France", "United Kingdom"])].copy()

# Vegetarian vs non-vegetarian
filtered_df["is_vegetarian"] = filtered_df["vegetarian_friendly"] == "y"

# Contingency table (rows: country, columns: is_vegetarian False/True)
contingency = pd.crosstab(filtered_df["country"], filtered_df["is_vegetarian"])

# Chi-square test of independence on the table
chi2, p_value, dof, expected = chi2_contingency(contingency)

contingency, chi2, p_value

In [None]:
"""
Statistical Test Used:
    Chi-square test of independence

Suitable because:
    Variables are categorical
    Large sample size
    Independence assumption satisfied

Since p < 0.05, we reject the null hypothesis.

- Vegetarian adoption is strongly associated with country.
- The UK has institutionalized vegetarian dining, while France is in a growth phase.

While presenting:
“I tested whether vegetarian adoption differs by country and found a highly significant association,
    confirming that cultural context strongly influences dietary offerings.”
"""

### 3. Indian Restaurant Presence

#### Research Question
##### Is there a statistically significant difference in the presence of Indian restaurants between France and the UK?

##### Null hypothesis (H₀):
##### The presence of Indian restaurants is independent of country (France and the UK have similar proportions).

##### Alternative hypothesis (H₁):
##### The presence of Indian restaurants depends on country.

In [None]:
# Data cleaning
filtered_df["cuisines"] = filtered_df["cuisines"].astype(str)

# Focus only on France & UK (.copy() keeps the result an independent frame)
filtered_df = filtered_df[filtered_df["country"].isin(["France", "United Kingdom"])].copy()

# Indian cuisine flag, computed here so the test does not depend on a column
# created by a much earlier cell. Same rule as the EDA section: the cuisines
# text mentions "Indian" (case-insensitive, literal match). The Series is named
# "is_Indian" so the contingency table keeps that column label.
is_indian = (
    filtered_df["cuisines"]
    .str.contains("Indian", case=False, regex=False)
    .rename("is_Indian")
)

# Contingency table (rows: country, columns: is_Indian False/True)
contingency_table = pd.crosstab(filtered_df["country"], is_indian)

# Chi-square test of independence on the table
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

# results
contingency_table, chi2, p_value

In [None]:
"""
Statistical Test:
   - Chi-square test of independence
   - Appropriate because:
       - Both variables are categorical
       - Large sample size
       - Independence assumption satisfied

Since p < 0.05, we reject the null hypothesis.

Practical Significance:
   - The UK has ~8× more Indian restaurants than France

Reflects:
   - Migration history
   - Cultural integration
   - Vegetarian compatibility
   - Consumer familiarity

Our hypothesis test confirms that Indian cuisine is deeply embedded in the UK restaurant market, 
while remaining a niche offering in France
"""

## Exporting to csv file

In [None]:
# Show the column labels of the frame that is about to be exported.
filtered_df.columns

In [None]:
# Inspect the city column before exporting.
filtered_df["city"]

In [None]:
# Preview the first ten rows of the frame before exporting.
filtered_df.head(10)

In [None]:
# List the distinct values present in the vegetarian_friendly column.
filtered_df["vegetarian_friendly"].unique()

In [None]:
# Write the frame to clean_data.csv (path is relative to the current working
# directory); index=False leaves the row index out of the file.
filtered_df.to_csv("clean_data.csv", index=False)

## WEB SCRAPING

#### Scraping Menus (Cuisine & Vegetarian Analysis)

In [None]:
# api key: <redacted> — never commit credentials to the notebook. Read the key
# from an environment variable (e.g. os.environ) or prompt with getpass instead.
# NOTE(review): a key was previously pasted here in plain text; treat it as leaked and revoke/rotate it.

In [None]:
import requests
from bs4 import BeautifulSoup

# Page to scrape. Still a placeholder — TODO: set the target URL before running.
url = ""

# Extra HTTP request headers to send with the request (none configured yet).
headers = {}

# Defined up front so later cells can test for None instead of hitting a NameError.
response = None
soup = None

if url:
    # The timeout keeps a stalled server from hanging the kernel indefinitely.
    response = requests.get(url, headers=headers, timeout=30)
    # Fail loudly on 4xx/5xx rather than parsing an error page as if it were the menu.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
else:
    # With an empty URL requests.get() raises, which would break "Run All";
    # skip the request until a real URL is provided.
    print("Scraping skipped: `url` is empty.")

## MACHINE LEARNING

#### Predict Restaurant Rating Category

##### Can we predict whether a restaurant will be high-rated?

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer

def rating_bucket(r):
    """Map a numeric rating to a coarse category label.

    Returns "High" for ratings of 4.5 and above, "Medium" for ratings of
    4.0 up to (but not including) 4.5, and "Low" for everything else.
    A value that satisfies neither ``>=`` comparison (e.g. NaN) also
    falls through to "Low".
    """
    # Floors are checked from highest to lowest; the first one met wins.
    for floor, label in ((4.5, "High"), (4.0, "Medium")):
        if r >= floor:
            return label
    return "Low"

# Label every restaurant with its rating bucket; this column is the prediction target.
filtered_df["rating_category"] = filtered_df["avg_rating"].map(rating_bucket)

In [None]:
# Predictor columns used by the model.
X = filtered_df.loc[:, [
    "country",
    "primary_cuisine",
    "price_range",
    "total_reviews_count",
    "vegetarian_friendly",
    "default_language",
]]

# Target: the rating bucket label.
y = filtered_df.loc[:, "rating_category"]

##### Test Train Split

In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Columns passed to the preprocessor as numeric inputs.
numeric_features = ["total_reviews_count"]

# Columns passed to the preprocessor as categorical inputs.
categorical_features = [
    "country", "primary_cuisine", "price_range",
    "default_language", "vegetarian_friendly",
]

In [None]:
# Column-wise preprocessing: numeric columns are passed through unchanged,
# categorical columns are one-hot encoded (handle_unknown="ignore" so categories
# not seen during fitting do not raise at prediction time).
preprocessor = ColumnTransformer(transformers=[
    ("num", "passthrough", numeric_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
])

##### Random Forest

In [None]:
# Random forest classifier: 200 trees, depth capped at 12, fixed seed, and
# class_weight="balanced" to compensate for uneven class sizes.
model = RandomForestClassifier(
    n_estimators=200, max_depth=12, class_weight="balanced", random_state=42
)

# Chain preprocessing and the classifier so both are fitted and applied together.
pipeline = Pipeline([("preprocessor", preprocessor), ("model", model)])

In [None]:
# Fit the preprocessing + classifier pipeline on the training split only.
pipeline.fit(X_train, y_train)

##### Model Evaluation

In [None]:
# NOTE(review): confusion_matrix is imported but not used in this cell.
from sklearn.metrics import classification_report, confusion_matrix

# Predict labels for the held-out rows.
y_pred = pipeline.predict(X_test)

# Print the per-class metrics report comparing true and predicted labels.
print(classification_report(y_test, y_pred))

In [None]:
"""
Overall accuracy = 49% → low
Model is not balanced across classes

HIGH-rated Restaurants:

Precision: 0.53
When the model predicts High, it’s correct 53% of the time

Recall: 0.56
It identifies 56% of all truly high-rated restaurants

F1-score: 0.55
Moderate, acceptable but not strong

The model has average performance on high-rated restaurants, missing nearly half of them and producing some false positives.

LOW-rated Restaurants

Precision: 0.39
Only 39% of restaurants predicted as Low are truly low

Recall: 0.74
The model correctly finds 74% of all low-rated restaurants

F1-score: 0.51

Interpretation:
The model is very good at catching low-rated restaurants, but it over-predicts “Low”, flagging many restaurants incorrectly.

👉 This is a high-recall, low-precision pattern.

MEDIUM-rated Restaurants (⚠️ Major Issue)

Precision: 1.00
Every time the model predicts Medium, it is correct

Recall: 0.22
But it only finds 22% of all medium-rated restaurants

F1-score: 0.36 (very low)

Most Medium restaurants are being misclassified as High or Low
The model struggles with borderline ratings

Final Conclusion: The model performs reasonably well at identifying low- and high-rated restaurants but 
struggles with medium-rated ones, which tend to overlap in characteristics. 
This reflects the subjective and borderline nature of medium ratings.
"""

In [None]:
# Remove the vegetarian_flag helper column. errors="ignore" makes the cell safe
# to re-run: once the column is gone (or if it was never created) the drop is a
# no-op instead of raising KeyError.
filtered_df = filtered_df.drop(columns=["vegetarian_flag"], errors="ignore")

# Show the remaining columns.
filtered_df.columns