# Task 2: Restaurant Recommendation

## Objective: Create a restaurant recommendation system based on user preferences.

Steps:
* Preprocess the dataset by handling missing
values and encoding categorical variables.
* Determine the criteria for restaurant
recommendations (e.g., cuisine preference,
price range).
* Implement a content-based filtering
approach where users are recommended
restaurants similar to their preferred criteria.
* Test the recommendation system by
providing sample user preferences and
evaluating the quality of recommendations.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv("Dataset .csv")
df.head()

Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,...,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Rating color,Rating text,Votes
0,6317637,Le Petit Souffle,162,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...","Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak...",121.027535,14.565443,"French, Japanese, Desserts",...,Botswana Pula(P),Yes,No,No,No,3,4.8,Dark Green,Excellent,314
1,6304287,Izakaya Kikufuji,162,Makati City,"Little Tokyo, 2277 Chino Roces Avenue, Legaspi...","Little Tokyo, Legaspi Village, Makati City","Little Tokyo, Legaspi Village, Makati City, Ma...",121.014101,14.553708,Japanese,...,Botswana Pula(P),Yes,No,No,No,3,4.5,Dark Green,Excellent,591
2,6300002,Heat - Edsa Shangri-La,162,Mandaluyong City,"Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...","Edsa Shangri-La, Ortigas, Mandaluyong City","Edsa Shangri-La, Ortigas, Mandaluyong City, Ma...",121.056831,14.581404,"Seafood, Asian, Filipino, Indian",...,Botswana Pula(P),Yes,No,No,No,4,4.4,Green,Very Good,270
3,6318506,Ooma,162,Mandaluyong City,"Third Floor, Mega Fashion Hall, SM Megamall, O...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.056475,14.585318,"Japanese, Sushi",...,Botswana Pula(P),No,No,No,No,4,4.9,Dark Green,Excellent,365
4,6314302,Sambo Kojin,162,Mandaluyong City,"Third Floor, Mega Atrium, SM Megamall, Ortigas...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.057508,14.58445,"Japanese, Korean",...,Botswana Pula(P),Yes,No,No,No,4,4.8,Dark Green,Excellent,229


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import linear_kernel

reco_df = df.copy()

# --Preprocessing-- 
reco_df['Cuisines'] = reco_df['Cuisines'].fillna('')

reco_df['features'] = reco_df['Cuisines'] + " " + reco_df['Price range'].astype(str) + " " + reco_df['Aggregate rating'].astype(str)

print("Created a new 'features' column")
reco_df.head(5)

Created a new 'features' column


Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,...,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Rating color,Rating text,Votes,features
0,6317637,Le Petit Souffle,162,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...","Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak...",121.027535,14.565443,"French, Japanese, Desserts",...,Yes,No,No,No,3,4.8,Dark Green,Excellent,314,"French, Japanese, Desserts 3 4.8"
1,6304287,Izakaya Kikufuji,162,Makati City,"Little Tokyo, 2277 Chino Roces Avenue, Legaspi...","Little Tokyo, Legaspi Village, Makati City","Little Tokyo, Legaspi Village, Makati City, Ma...",121.014101,14.553708,Japanese,...,Yes,No,No,No,3,4.5,Dark Green,Excellent,591,Japanese 3 4.5
2,6300002,Heat - Edsa Shangri-La,162,Mandaluyong City,"Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...","Edsa Shangri-La, Ortigas, Mandaluyong City","Edsa Shangri-La, Ortigas, Mandaluyong City, Ma...",121.056831,14.581404,"Seafood, Asian, Filipino, Indian",...,Yes,No,No,No,4,4.4,Green,Very Good,270,"Seafood, Asian, Filipino, Indian 4 4.4"
3,6318506,Ooma,162,Mandaluyong City,"Third Floor, Mega Fashion Hall, SM Megamall, O...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.056475,14.585318,"Japanese, Sushi",...,No,No,No,No,4,4.9,Dark Green,Excellent,365,"Japanese, Sushi 4 4.9"
4,6314302,Sambo Kojin,162,Mandaluyong City,"Third Floor, Mega Atrium, SM Megamall, Ortigas...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.057508,14.58445,"Japanese, Korean",...,Yes,No,No,No,4,4.8,Dark Green,Excellent,229,"Japanese, Korean 4 4.8"


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer 

# Initialize the TF-IDF Vectorizer 
# stop_words = 'english removes common words like 'a', 'the', 'in', that does not help with matching
tfidf = TfidfVectorizer(stop_words='english')

# fit and transform 'features' column
# this learns the vocabulary and created the TF-IDF matrix for the 'features' column
tfidf_matrix = tfidf.fit_transform(reco_df['features'])

tfidf_matrix.shape

(9551, 148)

In [5]:
# compute the cosine similarity matrix 
# This calculates the similarity of every restaurant with every other restaurant 
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

print("Cosine similarity matrix created successfully.")
print(f"The shape of our similarity matrix is: {cosine_sim.shape}")
print("Each cell (i, j) in this matrix contains the similarity score between restaurant i and restaurant j.")
print(cosine_sim)

Cosine similarity matrix created successfully.
The shape of our similarity matrix is: (9551, 9551)
Each cell (i, j) in this matrix contains the similarity score between restaurant i and restaurant j.
[[1.         0.56345605 0.         ... 0.         0.         0.        ]
 [0.56345605 1.         0.         ... 0.         0.         0.        ]
 [0.         0.         1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         1.         0.3880504 ]
 [0.         0.         0.         ... 0.         0.3880504  1.        ]]


In [6]:
# create a reverse map of indices and restaurant names 
# This will let us look up a restaurant's index by its name 
indices = pd.Series(reco_df.index, index = reco_df['Restaurant Name'])
indices = indices[~indices.index.duplicated(keep='first')]

print(indices)

def get_recommendations(title, cosine_sim = cosine_sim):
    """
    This function takes a restaurant name and returns the top 10 most similar restaurants.
    """
    # 1. Get the index of the restaurant that matches the title 
    try: 
        idx = indices[title]
    except KeyError:
        return 'Restaurant not found in the dataset.'
    
    # 2. Get the pairwise similarity scores of all restaurants with that restaurant
    # We use list(enumerate(...)) to keep track of the original index

    sim_scores = list(enumerate(cosine_sim[idx]))
    print("pairwise similarity scores:", sim_scores)
    # 3. Sort the restaurants based on the similarity scores 
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
    # key=lambda x: x[1] tells the sort function: "When you compare two tuples, don't look at the index (x[0]), look at the score (x[1])."
    print("Sorted similarity scores:", sim_scores)

    # 4. Get the scores of the 10 most similar restaurants (the first one is the restaurant itself)
    sim_scores = sim_scores[1:11]
    print("Top 10 similar restaurants:", sim_scores)

    # 5. Get the restaurant indices
    restaurant_indices = [i[0] for i in sim_scores]
    print("Top 10 restaurant indices:", restaurant_indices)

    # 6. Return the top 10 most similar restaurants
    return reco_df['Restaurant Name'].iloc[restaurant_indices]

# --- Let's Test It! ---
# Get recommendations for a restaurant you know or one from the dataset
# Let's try a place that is likely to have distinct features.
# 'Jahanpanah' is a good example as it's known for Mughlai cuisine.

print("--- Restaurant Recommendation System ---")
print("\nRecommendations for 'Jahanpanah':")
print(get_recommendations('Jahanpanah'))

print("\nRecommendations for 'Pizza Hut':")
print(get_recommendations('Pizza Hut'))


Restaurant Name
Le Petit Souffle               0
Izakaya Kikufuji               1
Heat - Edsa Shangri-La         2
Ooma                           3
Sambo Kojin                    4
                            ... 
Naml۱ Gurme                 9546
Ceviz A��ac۱                9547
Huqqa                       9548
A���k Kahve                 9549
Walter's Coffee Roastery    9550
Length: 7446, dtype: int64
--- Restaurant Recommendation System ---

Recommendations for 'Jahanpanah':
pairwise similarity scores: [(0, 0.0), (1, 0.0), (2, 0.07435546617692035), (3, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.10846285820621783), (9, 0.0), (10, 0.0), (11, 0.0), (12, 0.0), (13, 0.0), (14, 0.0), (15, 0.0), (16, 0.0), (17, 0.0), (18, 0.0), (19, 0.0), (20, 0.0), (21, 0.0), (22, 0.0), (23, 0.0), (24, 0.0), (25, 0.0), (26, 0.0), (27, 0.0), (28, 0.0), (29, 0.0), (30, 0.0), (31, 0.0), (32, 0.0), (33, 0.0), (34, 0.0), (35, 0.0), (36, 0.0), (37, 0.0), (38, 0.0), (39, 0.0), (40, 0.0), (41, 0.0), (42, 

In [None]:
# --- 1. Import Libraries ---
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# --- 2. Load Data ---
df = pd.read_csv('Dataset .csv')
reco_df = df.copy()

# --- 3. Advanced Feature Engineering (Weighted Soup) ---
# Select relevant features and handle missing values
reco_df = reco_df[['Restaurant Name', 'Cuisines', 'Locality', 'Average Cost for two']].dropna()

# Give more weight to Cuisines by repeating it in the feature string.
# This makes cuisine matches more influential in the recommendation.
reco_df['features'] = (reco_df['Cuisines'].astype(str) + ' ') * 10 + \
                      reco_df['Locality'].astype(str) + ' ' + \
                      reco_df['Average Cost for two'].astype(str)

# --- 4. Vectorization and Similarity Calculation ---
# Initialize TfidfVectorizer with English stop words
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the 'features' column into a TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(reco_df['features'])

# Compute the cosine similarity matrix using the efficient linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# --- 5. Building the Recommendation Function ---
# Create a reverse map of indices and restaurant names to handle lookups
indices = pd.Series(reco_df.index, index=reco_df['Restaurant Name']).drop_duplicates()

def get_recommendations(title, cosine_sim=cosine_sim, indices=indices):
    """
    This function takes a restaurant title as input and returns the top 10
    most similar restaurants based on the weighted feature soup.
    """
    try:
        # Get the index of the restaurant that matches the title
        idx = indices[title]

        if isinstance(idx, pd.Series): 
            idx = idx.iloc[0] 

        # Get the pairwise similarity scores of all restaurants with that restaurant
        sim_scores = list(enumerate(cosine_sim[idx]))

        # Sort the restaurants based on the similarity scores
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)


        # Get the scores of the 10 most similar restaurants (excluding itself)
        sim_scores = sim_scores[1:31]

        # Get the restaurant indices
        restaurant_indices = [i[0] for i in sim_scores]

        recommended_restaurants = reco_df['Restaurant Name'].iloc[restaurant_indices]
        unique_restaurants = recommended_restaurants.drop_duplicates().head(10)
        # Return the top 10 most similar restaurants
        return unique_restaurants
    except KeyError:
        return f'Restaurant "{title}" not found in the dataset.'

# --- 6. Testing the Advanced Recommender ---
print('--- Recommendations for "Jahanpanah" (Advanced Content-Based) ---')
print(get_recommendations('Jahanpanah'))

print('\n--- Recommendations for "Pizza Hut" (Advanced Content-Based) ---')
print(get_recommendations('Pizza Hut'))

--- Recommendations for "Jahanpanah" (Advanced Content-Based) ---
640                                         Pind Balluchi
7479                                 Frontier - The Ashok
6529    The Great Kabab Factory - Radisson Blu Plaza D...
8943                                      Legacy of Awadh
801                                               Karim's
2031                            DCK- Dana Choga's Kitchen
6067                                     Nawab Restaurant
6255                                             K's Town
6271                                Ruchi's Food Junction
7095                              Ceaser Fast Food Centre
Name: Restaurant Name, dtype: object

--- Recommendations for "Pizza Hut" (Advanced Content-Based) ---
803                            Burgrill
6258                         McDonald's
7015                    High On Burgers
1425                       Burger Point
5056                        Burger Wala
4943                        Burger King
6605    Mil

In [30]:
list1 = []
for i in reco_df:
        list1.append([reco_df['Cuisines']])

print(list1)

[[0             French, Japanese, Desserts
1                               Japanese
2       Seafood, Asian, Filipino, Indian
3                        Japanese, Sushi
4                       Japanese, Korean
                      ...               
9546                             Turkish
9547     World Cuisine, Patisserie, Cafe
9548              Italian, World Cuisine
9549                     Restaurant Cafe
9550                                Cafe
Name: Cuisines, Length: 9551, dtype: object], [0             French, Japanese, Desserts
1                               Japanese
2       Seafood, Asian, Filipino, Indian
3                        Japanese, Sushi
4                       Japanese, Korean
                      ...               
9546                             Turkish
9547     World Cuisine, Patisserie, Cafe
9548              Italian, World Cuisine
9549                     Restaurant Cafe
9550                                Cafe
Name: Cuisines, Length: 9551, dtype: object], [0 

In [32]:
# --- 1. Import Libraries ---
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# --- 2. Load Data ---
df = pd.read_csv('Dataset .csv')
reco_df = df.copy()

# 1. Clean the Cuisines column to remove generic, unhelpful terms.
# We are keeping this step as it is still valuable.
def clean_text(text):
    text = str(text).lower()
    # Remove generic cuisines
    text = text.replace('fast food', '')
    text = text.replace('desserts', '')
    text = text.replace('beverages', '')
    # Remove commas and other punctuation that doesn't add meaning
    text = text.replace(',', ' ')
    return text

reco_df['Cuisines_Cleaned'] = reco_df['Cuisines'].apply(clean_text)

# 2. Create the final feature soup by combining multiple columns.
# We give Cuisines the highest weight (x4).
# We also include the Restaurant Name and Locality to ensure there's always a signal.
# This prevents the "empty feature" problem for places like donut shops.
def create_soup(x):
    return (x['Cuisines_Cleaned'] + ' ') * 4 + \
           (x['Restaurant Name'] + ' ') * 2 + \
           x['Locality']

reco_df['soup'] = reco_df.apply(create_soup, axis=1)

# --- 4. Vectorization and Similarity Calculation ---
# Initialize TfidfVectorizer with English stop words
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the 'features' column into a TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(reco_df['soup'])

# Compute the cosine similarity matrix using the efficient linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# --- 5. Building the Recommendation Function ---
# Create a reverse map of indices and restaurant names to handle lookups
indices = pd.Series(reco_df.index, index=reco_df['Restaurant Name']).drop_duplicates()

def get_recommendations(title, cosine_sim=cosine_sim, indices=indices):
    """
    This function takes a restaurant title as input and returns the top 10
    most similar restaurants based on the weighted feature soup.
    """
    try:
        # Get the index of the restaurant that matches the title
        idx = indices[title]

        if isinstance(idx, pd.Series): 
            idx = idx.iloc[0] 

        # Get the pairwise similarity scores of all restaurants with that restaurant
        sim_scores = list(enumerate(cosine_sim[idx]))

        # Sort the restaurants based on the similarity scores
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)


        # Get the scores of the 10 most similar restaurants (excluding itself)
        sim_scores = sim_scores[1:31]

        # Get the restaurant indices
        restaurant_indices = [i[0] for i in sim_scores]

        recommended_restaurants = reco_df['Restaurant Name'].iloc[restaurant_indices]
        unique_restaurants = recommended_restaurants.drop_duplicates().head(10)
        # Return the top 10 most similar restaurants
        return unique_restaurants
    except KeyError:
        return f'Restaurant "{title}" not found in the dataset.'

# --- 6. Testing the Advanced Recommender ---
print('--- Recommendations for "Jahanpanah" (Advanced Content-Based) ---')
print(get_recommendations('Jahanpanah'))

print('\n--- Recommendations for "Pizza Hut" (Advanced Content-Based) ---')
print(get_recommendations('Desserts'))

--- Recommendations for "Jahanpanah" (Advanced Content-Based) ---
625         Rangrezz Restaurant
4156            Chicken Mughlai
8247                 They�_����
8418                The Kitchen
3054           Mughlai Junction
6255                   K's Town
8662                    Pitstop
1980              Mughlai Treat
1763    Gurgaon Mughlai Chicken
1817                  R S Foods
Name: Restaurant Name, dtype: object

--- Recommendations for "Pizza Hut" (Advanced Content-Based) ---
Restaurant "Desserts" not found in the dataset.


In [None]:
# --- 6. NEW FUNCTION: Recommend by Cuisine with Weighted Ranking ---

# First, we need the original 'df' DataFrame that has the rating and votes columns.
# Let's make sure the required columns are numeric and clean.
df['Votes'] = pd.to_numeric(df['Votes'], errors='coerce')
df['Aggregate rating'] = pd.to_numeric(df['Aggregate rating'], errors='coerce') # errors='coerce' instructs the function to handle any values that cannot be successfully converted by replacing them with a designated "missing" or "invalid" value.
df.dropna(subset=['Votes', 'Aggregate rating'], inplace=True)

# Calculate the components for our weighted rating formula
# C = the mean rating across the whole dataset
C = df['Aggregate rating'].mean()

# m = the minimum votes required to be considered. We'll set this as the 90th percentile.
# This means a restaurant needs more votes than 90% of the others to be considered.
m = df['Votes'].quantile(0.90)

# Filter restaurants that qualify for the chart (must have more votes than m)
q_restaurants = df.copy().loc[df['Votes'] >= m]

def weighted_rating(x, m=m, C=C):
    """Calculates the weighted rating for a restaurant."""
    v = x['Votes']
    R = x['Aggregate rating']
    # The IMDB formula
    return (v / (v + m) * R) + (m / (v + m) * C)

# Apply the function to our qualified restaurants
q_restaurants['score'] = q_restaurants.apply(weighted_rating, axis=1)

# Sort the restaurants based on the score
q_restaurants = q_restaurants.sort_values('score', ascending=False)


def recommend_by_cuisine(cuisine, top_n=10):
    """
    Recommends top N restaurants for a given cuisine based on a weighted score.
    """
    # Filter for restaurants that serve the specified cuisine
    cuisine_df = q_restaurants[q_restaurants['Cuisines'].str.contains(cuisine, case=False, na=False)]

    # Check if any restaurants were found
    if cuisine_df.empty:
        return f"Sorry, no high-rated restaurants found for '{cuisine}'."

    # Return the top N restaurants from the pre-sorted list
    return cuisine_df[['Restaurant Name', 'Cuisines', 'Aggregate rating', 'Votes']].head(top_n)

# --- Example Usage ---
# Now you can get recommendations for a cuisine like this:
print(f"Top recommendations for 'Desserts':")
print(recommend_by_cuisine('Desserts'))

print(f"\nTop recommendations for 'Italian':")
print(recommend_by_cuisine('Italian'))

print(f"\nTop recommendations for 'Pizza':")
print(recommend_by_cuisine('Pizza'))


Top recommendations for 'Italian':
                           Restaurant Name  \
728                                   Toit   
507               Mazzaro's Italian Market   
512         Ella's Americana Folk Art Cafe   
2483                    The Fusion Kitchen   
743                            Big Brewsky   
4638                             Big Chill   
1464      Manhattan Brewery & Bar Exchange   
6997                   Echoes Satyaniketan   
1861  Downtown - Diners & Living Beer Cafe   
4639                             Big Chill   

                                               Cuisines  Aggregate rating  \
728                            Italian, American, Pizza               4.8   
507                                       Italian, Deli               4.9   
512                    International, Italian, Southern               4.8   
2483            North Indian, Italian, Chinese, Mexican               4.7   
743   Finger Food, North Indian, Italian, Continenta...               4.