In [1]:
import numpy as np
import pandas as pd
zomato = pd.read_csv("zomato.csv")
zomato.head()

Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,080 42297555\r\n+91 9743772233,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,080 41714161,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari
2,https://www.zomato.com/SanchurroBangalore?cont...,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8/5,918,+91 9663487993,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari
3,https://www.zomato.com/bangalore/addhuri-udupi...,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7/5,88,+91 9620009302,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari
4,https://www.zomato.com/bangalore/grand-village...,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8/5,166,+91 8026612447\r\n+91 9901210005,Basavanagudi,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari


In [2]:
zomato.describe()

Unnamed: 0,votes
count,51717.0
mean,283.697527
std,803.838853
min,0.0
25%,7.0
50%,41.0
75%,198.0
max,16832.0


In [3]:
zomato.columns

Index(['url', 'address', 'name', 'online_order', 'book_table', 'rate', 'votes',
       'phone', 'location', 'rest_type', 'dish_liked', 'cuisines',
       'approx_cost(for two people)', 'reviews_list', 'menu_item',
       'listed_in(type)', 'listed_in(city)'],
      dtype='object')

In [4]:
# Let's delete unnecessay columns
# here I will drop the column "dish_liked", "phone", "url"
zomato2 = zomato.drop(['url','dish_liked','phone'], axis=1)

In [5]:
# let's check for duplicate values
print(zomato2.duplicated().sum())

43


In [6]:

# now let's drop the duplicate values
zomato2.drop_duplicates(inplace=True)

In [7]:
# Now let's have a look at the null values in the dataset
print(zomato2.isnull().sum())

address                           0
name                              0
online_order                      0
book_table                        0
rate                           7767
votes                             0
location                         21
rest_type                       227
cuisines                         45
approx_cost(for two people)     345
reviews_list                      0
menu_item                         0
listed_in(type)                   0
listed_in(city)                   0
dtype: int64


In [8]:
# let's drop all the null values
zomato2.dropna(how='any', inplace=True)

In [9]:
# Now I will change the names of some columns to make the columns easier to use in the process
zomato = zomato2.rename(columns={'approx_cost(for two people)':'cost','listed_in(type)':'type', 'listed_in(city)':'city'})

In [10]:
#Some Transformations
zomato['cost'] = zomato['cost'].astype(str) #Changing the cost to string
zomato['cost'] = zomato['cost'].apply(lambda x: x.replace(',','.')) #Using lambda function to replace ',' from cost
zomato['cost'] = zomato['cost'].astype(float)

In [11]:
#Removing '/5' from Rates
zomato = zomato.loc[zomato.rate !='NEW']
zomato = zomato.loc[zomato.rate !='-'].reset_index(drop=True)
remove_slash = lambda x: x.replace('/5', '') if type(x) == np.str else x
zomato.rate = zomato.rate.apply(remove_slash).str.strip().astype('float')

In [12]:
# Adjust the column names
zomato.name = zomato.name.apply(lambda x:x.title())
zomato.online_order.replace(('Yes','No'),(True, False),inplace=True)
zomato.book_table.replace(('Yes','No'),(True, False),inplace=True)

In [13]:
## Computing Mean Rating
restaurants = list(zomato['name'].unique())
zomato['Mean Rating'] = 0

for i in range(len(restaurants)):
    zomato['Mean Rating'][zomato['name'] == restaurants[i]] = zomato['rate'][zomato['name'] == restaurants[i]].mean()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  zomato['Mean Rating'][zomato['name'] == restaurants[i]] = zomato['rate'][zomato['name'] == restaurants[i]].mean()


In [14]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range = (1,5))
zomato[['Mean Rating']] = scaler.fit_transform(zomato[['Mean Rating']]).round(2)

In [15]:
## Lower Casing
zomato["reviews_list"] = zomato["reviews_list"].str.lower()

In [16]:
# now I will define a function to remove punctuation from the reviews
import string
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))
zomato["reviews_list"] = zomato["reviews_list"].apply(lambda text: remove_punctuation(text))

In [17]:
# Now let's remove the stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in stop_words])
zomato["reviews_list"] = zomato["reviews_list"].apply(lambda text: remove_stopwords(text))

In [18]:
# Now let's remove the urls from the reviews
import re
def removeurls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

zomato["reviews_list"] = zomato["reviews_list"].apply(lambda text: removeurls(text))

zomato[['reviews_list', 'cuisines']].sample(10)

Unnamed: 0,reviews_list,cuisines
4668,rated 30 ratedn sweet chariot probably 90s kid...,"Bakery, Fast Food"
23899,rated 10 ratedn hi cake received half kg fooli...,Bakery
36171,rated 50 ratedn really nice calm peaceful cafe...,"Bakery, Desserts"
29005,rated 40 ratedn food great price offer quantit...,"Continental, Parsi, Chinese, North Indian, Ita..."
9892,rated 30 ratedn gone shopping lido mall decide...,"Ice Cream, Bakery, Desserts"
484,rated 40 ratedn really like decor ambience wam...,"Asian, Korean, Indonesian, Japanese, Chinese, ..."
7306,rated 30 ratedn ordered chicken biriyani place...,"Mughlai, Chinese, North Indian"
23733,rated 50 ratedn ordering food almost 2 years g...,"North Indian, Chinese"
13382,rated 40 ratedn came primarily coupons free co...,"Cafe, Continental"
37325,rated 40 ratedn best place savour food junctio...,"South Indian, North Indian, Chinese"


In [19]:
# let's process of the names of reataurants
restaurant_names = list(zomato['name'].unique())
def get_top_words(column, top_nu_of_words, nu_of_word):
    vec = CountVectorizer(ngram_range= nu_of_word, stop_words='english')
    bag_of_words = vec.fit_transform(column)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:top_nu_of_words]

In [20]:

# Now let's drop the unnecessary columns
zomato=zomato.drop(['address','rest_type', 'type', 'menu_item', 'votes'],axis=1)

In [21]:
# Now I will randomly sample the data
df_percent = zomato.sample(frac=0.5)

df_percent.set_index('name', inplace=True)
indices = pd.Series(df_percent.index)

In [22]:
# Now Let's build the restaurant recommendation system
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
#Now I will perform tf-idf vectorization on the dataset
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_percent['reviews_list'])
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [23]:
# Now the final step is to create a function to recommend restaurants
def recommend(name, cosine_similarities = cosine_similarities):
    
    # Create a list to put top restaurants
    recommendations = []
    
    # Find the index of the hotel entered
    idx = indices[indices == name].index[0]
    
    # Find the restaurants with a similar cosine-sim value and order them from bigges number
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending=False)
    
    # Extract top 30 restaurant indexes with a similar cosine-sim value
    top30_indexes = list(score_series.iloc[0:31].index)
    
    # Names of the top 30 restaurants
    for each in top30_indexes:
        recommendations.append(list(df_percent.index)[each])
    
    # Creating the new data set to show similar restaurants
    df = pd.DataFrame(columns=['cuisines', 'Mean Rating', 'cost'])
    
    # Create the top 30 similar restaurants with some of their columns
    for each in recommendations:
        df = df.append(pd.DataFrame(df_percent[['cuisines','Mean Rating', 'cost']][df_percent.index == each].sample()))

    # Drop the same named restaurants and sort only the top 10 by the highest rating
    df = df.drop_duplicates(subset=['cuisines','Mean Rating', 'cost'], keep=False)
    df = df.sort_values(by='Mean Rating', ascending=False).head(10)
    print('TOP %s RESTAURANTS LIKE %s WITH SIMILAR REVIEWS: ' % (str(len(df)), name))   
    return df
recommend('Keto Kitchen')

TOP 10 RESTAURANTS LIKE Keto Kitchen WITH SIMILAR REVIEWS: 


Unnamed: 0,cuisines,Mean Rating,cost
Burma Burma,"Asian, Burmese",4.74,1.5
Chianti,Italian,4.59,1.5
Foxtrot - House Of Subculture,"Cafe, American, Asian, North Indian",4.35,1.0
The Lantern Restaurant & Bar - The Ritz-Carlton...,Chinese,4.28,3.5
Mojo Pizza - 2X Toppings,Pizza,4.13,600.0
Nando'S,"Portuguese, Wraps, Burger, Salad",4.13,1.2
Forage,"Healthy Food, Beverages, European, Salad",4.03,1.5
The Grill House,"American, Mexican, Italian, Steak",3.95,1.0
Wow Paratha,North Indian,3.71,400.0
Green Pepper,"Seafood, South Indian, Chinese, Kerala",3.65,600.0


In [24]:
recommend('Pai Vihar')

TOP 10 RESTAURANTS LIKE Pai Vihar WITH SIMILAR REVIEWS: 


Unnamed: 0,cuisines,Mean Rating,cost
Burma Burma,"Asian, Burmese",4.74,1.5
Ilyazsab The House Of Chicken,"Rolls, Kebab",3.84,250.0
Andhra Ruchulu,"Andhra, Biryani, North Indian, Chinese",3.72,1.0
Dum Biryani Hub,Biryani,3.71,700.0
Wow Paratha,North Indian,3.71,400.0
Green Pepper,"Seafood, South Indian, Chinese, Kerala",3.65,600.0
King Of Spices,"South Indian, North Indian, Chinese, Biryani, ...",3.45,500.0
3 Leafs,"North Indian, South Indian, Chinese",3.45,600.0
Cafe @ Elanza,"Chinese, North Indian, Cafe",3.45,1.0
Chowpatty,"North Indian, Fast Food, Street Food",3.43,300.0


In [25]:
recommend('The Grill House')

TOP 10 RESTAURANTS LIKE The Grill House WITH SIMILAR REVIEWS: 


Unnamed: 0,cuisines,Mean Rating,cost
Salvadores,"Mediterranean, Continental, French, Italian",4.35,2.0
The Lantern Restaurant & Bar - The Ritz-Carlton...,Chinese,4.28,3.5
Bang - The Ritz-Carlton,Finger Food,4.14,3.0
Nando'S,"Portuguese, African",4.13,1.2
Marzipan Cafe & Bakery,"Cafe, Mediterranean, Bakery, Greek, Beverages",4.1,700.0
Watson'S,"Finger Food, Salad, Chinese, Continental",3.95,1.3
The Flying Squirrel,"Cafe, Bakery, Beverages, Burger, Continental",3.84,450.0
The Flying Squirrel,Cafe,3.84,450.0
Django,"Cafe, Italian, Pizza, Burger, Beverages",3.84,800.0
Bheema'S,Andhra,3.84,650.0
