In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')
import re
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


In [43]:
train_ds = pd.read_csv('zomato.csv')
train_ds.head()

Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,080 42297555\r\n+91 9743772233,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,080 41714161,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari
2,https://www.zomato.com/SanchurroBangalore?cont...,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8/5,918,+91 9663487993,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari
3,https://www.zomato.com/bangalore/addhuri-udupi...,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7/5,88,+91 9620009302,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari
4,https://www.zomato.com/bangalore/grand-village...,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8/5,166,+91 8026612447\r\n+91 9901210005,Basavanagudi,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari


In [44]:
#Deleting Unnnecessary Columns
new_train = train_ds.drop(['url','dish_liked','phone'],axis=1) 
#Dropping the column "dish_liked", "phone", "url" and saving the new dataset as "new_train"

#Removing the Duplicates
new_train.duplicated().sum()
new_train.drop_duplicates(inplace=True)

#Remove the NaN values from the dataset
new_train.isnull().sum()
new_train.dropna(how='any',inplace=True)

#Changing the column names
new_train = new_train.rename(columns={'approx_cost(for two people)':'cost','listed_in(type)':'type', 'listed_in(city)':'city'})

#Some Transformations
new_train['cost'] = new_train['cost'].astype(str) #Changing the cost to string
new_train['cost'] = new_train['cost'].apply(lambda x: x.replace(',','.')) 
#Using lambda function to replace ',' from cost
new_train['cost'] = new_train['cost'].astype(float)

#Removing '/5' from Rates
new_train = new_train.loc[new_train.rate !='NEW']
new_train = new_train.loc[new_train.rate !='-'].reset_index(drop=True)
remove_slash = lambda x: x.replace('/5', '') if type(x) == np.str else x
new_train.rate = new_train.rate.apply(remove_slash).str.strip().astype('float')

# Adjust the column names
new_train.name = new_train.name.apply(lambda x:x.title())
new_train.online_order.replace(('Yes','No'),(True, False),inplace=True)
new_train.book_table.replace(('Yes','No'),(True, False),inplace=True)

## Computing Mean Rating
restaurants = list(new_train['name'].unique())
new_train['Mean Rating'] = 0

for i in range(len(restaurants)):
    new_train['Mean Rating'][new_train['name'] == restaurants[i]] = new_train['rate'][new_train['name'] == restaurants[i]].mean()
    
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range = (1,5))
new_train[['Mean Rating']] = scaler.fit_transform(new_train[['Mean Rating']]).round(2)


In [45]:
new_train.head()

Unnamed: 0,address,name,online_order,book_table,rate,votes,location,rest_type,cuisines,cost,reviews_list,menu_item,type,city,Mean Rating
0,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,True,True,4.1,775,Banashankari,Casual Dining,"North Indian, Mughlai, Chinese",800.0,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari,3.99
1,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,True,False,4.1,787,Banashankari,Casual Dining,"Chinese, North Indian, Thai",800.0,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari,3.97
2,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,True,False,3.8,918,Banashankari,"Cafe, Casual Dining","Cafe, Mexican, Italian",800.0,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari,3.58
3,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,False,False,3.7,88,Banashankari,Quick Bites,"South Indian, North Indian",300.0,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari,3.45
4,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,False,False,3.8,166,Basavanagudi,Casual Dining,"North Indian, Rajasthani",600.0,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari,3.58


In [46]:
## Lower Casing
new_train["reviews_list"] = new_train["reviews_list"].str.lower()

## Removal of Puctuations
import string
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

new_train["reviews_list"] = new_train["reviews_list"].apply(lambda text: remove_punctuation(text))

In [47]:
new_train.head()

Unnamed: 0,address,name,online_order,book_table,rate,votes,location,rest_type,cuisines,cost,reviews_list,menu_item,type,city,Mean Rating
0,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,True,True,4.1,775,Banashankari,Casual Dining,"North Indian, Mughlai, Chinese",800.0,rated 40 ratedn a beautiful place to dine int...,[],Buffet,Banashankari,3.99
1,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,True,False,4.1,787,Banashankari,Casual Dining,"Chinese, North Indian, Thai",800.0,rated 40 ratedn had been here for dinner with...,[],Buffet,Banashankari,3.97
2,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,True,False,3.8,918,Banashankari,"Cafe, Casual Dining","Cafe, Mexican, Italian",800.0,rated 30 ratedn ambience is not that good eno...,[],Buffet,Banashankari,3.58
3,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,False,False,3.7,88,Banashankari,Quick Bites,"South Indian, North Indian",300.0,rated 40 ratedn great food and proper karnata...,[],Buffet,Banashankari,3.45
4,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,False,False,3.8,166,Basavanagudi,Casual Dining,"North Indian, Rajasthani",600.0,rated 40 ratedn very good restaurant in neigh...,[],Buffet,Banashankari,3.58


In [51]:
## Removal of URLS
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

new_train["reviews_list"] = new_train["reviews_list"].apply(lambda text: remove_urls(text))

new_train[['reviews_list', 'cuisines']].sample(5)


# Randomly sample 60% of your dataframe
new_train_percent= new_train.sample(frac=0.5)

In [49]:
# RESTAURANT NAMES:
restaurant_names = list(new_train['name'].unique())
def get_top_words(column, top_nu_of_words, nu_of_word):
    vec = CountVectorizer(ngram_range= nu_of_word, stop_words='english')
    bag_of_words = vec.fit_transform(column)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:top_nu_of_words]
    
new_train=new_train.drop(['address','rest_type', 'type', 'menu_item', 'votes'],axis=1)






In [52]:
new_train_percent.set_index('name', inplace=True)
indices = pd.Series(new_train_percent.index)

# Creating tf-idf matrix
# tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
# tfidf_matrix = tfidf.fit_transform(new_train_percent['reviews_list'])

# cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [53]:
new_train_percent.head()

Unnamed: 0_level_0,online_order,book_table,rate,location,cuisines,cost,reviews_list,city,Mean Rating
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Grand Spicy Restaurant,False,False,3.6,Cunningham Road,"North Indian, Mughlai, Chinese",700.0,rated 50 ratedn very nice place royale feelin...,Lavelle Road,3.32
Tandoori Paradise,True,False,3.5,BTM,"North Indian, Chinese",600.0,rated 20 ratedn the food could have been bett...,Koramangala 6th Block,3.23
L J Fast Food,False,False,3.3,Malleshwaram,Chinese,300.0,rated 40 ratedn good and well maintained chin...,Rajajinagar,2.94
Fudge,True,False,3.2,Jayanagar,"Desserts, Momos",200.0,rated 30 ratedn looking for some chocolate sh...,Koramangala 5th Block,3.04
Mini Punjabi Dhaba,True,False,4.2,Banashankari,North Indian,350.0,rated 40 ratedn this place was introduced to ...,Banashankari,4.1


In [54]:
# Creating tf-idf matrix
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
# tfidf_matrix = tfidf.fit_transform(new_train_percent['reviews_list'])

In [55]:
tfidf_matrix = tfidf.fit_transform(new_train_percent['reviews_list'])

In [56]:
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [61]:
def recommend(name, cosine_similarities = cosine_similarities):
    
    # Create a list to put top restaurants
    recommend_restaurant = []
    
    # Find the index of the hotel entered
    idx = indices[indices == name].index[0]

    # Find the restaurants with a similar cosine-sim value and order them from bigges number
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending=False)
        
    # Extract top 30 restaurant indexes with a similar cosine-sim value
    top30_indexes = list(score_series.iloc[0:31].index)

     # Names of the top 30 restaurants
    for each in top30_indexes:
        recommend_restaurant.append(list(new_train_percent.index)[each])
    
    # Creating the new data set to show similar restaurants
    df_new = pd.DataFrame(columns=['cuisines', 'Mean Rating', 'cost'])

     # Create the top 30 similar restaurants with some of their columns
    for each in recommend_restaurant:
        df_new = df_new.append(pd.DataFrame(new_train_percent[['cuisines','Mean Rating','cost']][new_train_percent.index == each].sample()))
    
    # Drop the same named restaurants and sort only the top 10 by the highest rating
    df_new = df_new.drop_duplicates(subset=['cuisines','Mean Rating', 'cost'], keep=False)
    df_new = df_new.sort_values(by='Mean Rating', ascending=False).head(10)
    
    print('TOP %s RESTAURANTS LIKE %s WITH SIMILAR REVIEWS: ' % (str(len(df_new)), name))
    
    return df_new
recommend('Pai Vihar')

TOP 10 RESTAURANTS LIKE Pai Vihar WITH SIMILAR REVIEWS: 


Unnamed: 0,cuisines,Mean Rating,cost
Soda Bottle Opener Wala,"Parsi, North Indian",4.36,1.3
Foxtrot - House Of Subculture,"Cafe, American, Asian, North Indian",4.35,1.0
CafãÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂ© Felix,"American, Cafe, Continental",4.35,1.7
Little Green Cafe,"Cafe, Healthy Food, Salad, Continental, Bevera...",4.23,900.0
Church Street Social,"American, North Indian, Chinese, Finger Food, ...",4.23,1.5
Le Jardin - The Oberoi,"European, Mediterranean, North Indian",4.23,3.0
Hotel Tom'S Restaurant,"Mangalorean, Seafood, Chinese, North Indian",4.15,1.0
Nando'S,"Portuguese, African",4.13,1.2
Marzipan Cafe & Bakery,"Cafe, Mediterranean, Bakery, Greek, Beverages",4.1,700.0
Forage,"Healthy Food, Beverages, European, Salad",4.03,1.5
