In [1]:
import numpy as np
import pandas as pd
# Load the raw dataset from "zomato.csv" (path is relative to the working directory).
zomato = pd.read_csv("zomato.csv")
# Show the first rows as this cell's output.
zomato.head()

FileNotFoundError: [Errno 2] No such file or directory: 'zomato.csv'

In [None]:
# Display pandas' default summary statistics for the raw frame.
zomato.describe()

In [None]:
# List the column labels of the raw frame.
zomato.columns

In [None]:
# Remove the columns that are not needed for the rest of the notebook:
# "url", "dish_liked" and "phone". The result is a new frame, `zomato2`.
zomato2 = zomato.drop(columns=['url', 'dish_liked', 'phone'])

In [None]:
# Count the rows of zomato2 that are exact duplicates of an earlier row.
print(zomato2.duplicated().sum())

In [None]:
# Keep only the first occurrence of each fully duplicated row.
zomato2 = zomato2.drop_duplicates()

In [None]:
# Report, per column, how many values are missing.
print(zomato2.isna().sum())

In [None]:
# Discard every row that has at least one missing value.
zomato2 = zomato2.dropna(how='any')

In [None]:
# Give three awkwardly named columns short names: 'cost', 'type' and 'city'.
# The result is bound to `zomato`, so from here on that name refers to the cleaned
# frame rather than the raw frame read from the CSV.
zomato = zomato2.rename(columns={'approx_cost(for two people)':'cost','listed_in(type)':'type', 'listed_in(city)':'city'})

In [None]:
# Convert the 'cost' column from text to float.
# The comma is deleted rather than replaced by '.', because replacing it would turn
# "1,200" into 1.2 instead of 1200.
# NOTE(review): assumes ',' is a thousands separator in the raw data (e.g. "1,200") — confirm.
zomato['cost'] = (
    zomato['cost']
    .astype(str)                          # make sure .str methods apply to every value
    .str.replace(',', '', regex=False)    # strip the separator
    .astype(float)
)

In [None]:
# Turn the 'rate' column (text such as "4.1/5") into a float.
# Rows whose rating is the placeholder 'NEW' or '-' are dropped first because they
# cannot be parsed as numbers; the index is rebuilt afterwards.
zomato = zomato.loc[~zomato['rate'].isin(['NEW', '-'])].reset_index(drop=True)


def remove_slash(x):
    """Strip the '/5' suffix from a rating string; return non-strings unchanged."""
    # The builtin `str` is used because the `np.str` alias no longer exists in
    # current NumPy releases and raises AttributeError.
    return x.replace('/5', '') if isinstance(x, str) else x


zomato['rate'] = zomato['rate'].apply(remove_slash).str.strip().astype('float')

In [None]:
# Normalise column *values*: restaurant names to Title Case, and the 'Yes'/'No'
# flags of online_order / book_table to True / False.
# Results are assigned back with bracket indexing. Calling replace(..., inplace=True)
# on a column reached via attribute access is chained assignment, which pandas does
# not guarantee to write through to the frame (it never does under Copy-on-Write).
zomato['name'] = zomato['name'].apply(lambda x: x.title())
yes_no_to_bool = {'Yes': True, 'No': False}
zomato['online_order'] = zomato['online_order'].replace(yes_no_to_bool)
zomato['book_table'] = zomato['book_table'].replace(yes_no_to_bool)

In [None]:
## Computing Mean Rating
# Distinct restaurant names, in order of first appearance.
restaurants = list(zomato['name'].unique())

# Give every row the mean 'rate' of all rows that share its restaurant name.
# A single grouped transform replaces the per-restaurant loop, which assigned through
# chained indexing (zomato['Mean Rating'][mask] = ...). pandas does not guarantee that
# such an assignment reaches the frame, and the loop rescanned the whole frame once
# per restaurant.
zomato['Mean Rating'] = zomato.groupby('name')['rate'].transform('mean')

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Rescale 'Mean Rating' linearly into the range [1, 5].
scaler = MinMaxScaler(feature_range = (1,5))
# The double brackets pass a one-column frame (2-D input) to the scaler; the scaled
# values are rounded to two decimals before being written back.
zomato[['Mean Rating']] = scaler.fit_transform(zomato[['Mean Rating']]).round(2)

In [None]:
## Lower Casing
# Lower-case every entry of 'reviews_list' in place of the original text.
zomato["reviews_list"] = zomato["reviews_list"].str.lower()

In [None]:
# Helper that strips punctuation characters from review text.
import string
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    """Return `text` with every character listed in PUNCT_TO_REMOVE deleted."""
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(PUNCT_TO_REMOVE))
    return text.translate(deletion_table)
# Run the punctuation stripper over every review.
zomato["reviews_list"] = zomato["reviews_list"].apply(remove_punctuation)

In [None]:
# Now let's remove the stopwords
from nltk.corpus import stopwords
# Hold the English stop-word list as a set; remove_stopwords tests membership in it.
# NOTE(review): presumably needs the NLTK "stopwords" corpus to be downloaded first — confirm.
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    """Return `text` with every whitespace-separated token found in `stop_words` removed.

    The input is converted with str() first; surviving tokens are joined by single spaces.
    """
    kept_tokens = (token for token in str(text).split() if token not in stop_words)
    return " ".join(kept_tokens)
# Run the stop-word filter over every review.
zomato["reviews_list"] = zomato["reviews_list"].apply(remove_stopwords)

In [None]:
# Helper that strips URLs from review text.
import re
def removeurls(text):
    """Return `text` with every run starting with http://, https:// or www. removed.

    Each match extends to the next whitespace character; surrounding whitespace is kept.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

# Run the URL stripper over every review.
# NOTE(review): an earlier cell already deleted all punctuation from 'reviews_list',
# so "://" and "." are gone by the time this runs and the pattern can no longer match
# what used to be URLs. Consider stripping URLs before stripping punctuation.
zomato["reviews_list"] = zomato["reviews_list"].apply(removeurls)

# Spot-check ten randomly chosen rows of cleaned reviews next to their cuisines.
zomato[['reviews_list', 'cuisines']].sample(10)

In [None]:
# Collect the distinct restaurant names, in order of first appearance.
restaurant_names = list(zomato['name'].unique())
def get_top_words(column, top_nu_of_words, nu_of_word):
    """Return the most frequent n-grams found in `column`.

    Parameters
    ----------
    column : iterable of str
        Documents to count n-grams in.
    top_nu_of_words : int
        How many of the most frequent n-grams to return.
    nu_of_word : tuple of (int, int)
        Passed to CountVectorizer as ``ngram_range`` (min_n, max_n).

    Returns
    -------
    list of (str, count) tuples, most frequent first. English stop words are ignored.
    """
    # Imported locally because the notebook's own CountVectorizer import sits in a
    # later cell; without this, calling the function in notebook order raises NameError.
    from sklearn.feature_extraction.text import CountVectorizer

    vec = CountVectorizer(ngram_range=nu_of_word, stop_words='english')
    bag_of_words = vec.fit_transform(column)
    # Column-wise sum gives the total count of each vocabulary entry over all documents.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:top_nu_of_words]

In [None]:
# Discard the columns the recommender does not use.
zomato = zomato.drop(columns=['address', 'rest_type', 'type', 'menu_item', 'votes'])

In [None]:
# Work on a random 50% sample of the rows, indexed by restaurant name.
# A fixed random_state makes the sample identical on every run. Without it, which
# restaurant names are available to recommend() changes each time, so the same
# lookup can succeed on one run and fail on the next.
df_percent = zomato.sample(frac=0.5, random_state=42).set_index('name')

# Position -> restaurant name, in df_percent row order. recommend() uses it to turn a
# name into a row position.
indices = pd.Series(df_percent.index)

In [None]:
# Now Let's build the restaurant recommendation system
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorisation of the sampled reviews, using unigrams and bigrams and
# ignoring English stop words.
# min_df=1 keeps every term, exactly as min_df=0 did, but is accepted by current
# scikit-learn releases, which reject an integer min_df below 1.
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_percent['reviews_list'])
# Pairwise dot products between all review vectors; row/column i corresponds to
# row i of df_percent.
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
# Now the final step is to create a function to recommend restaurants
def recommend(name, cosine_similarities = cosine_similarities):
    """Return up to 10 restaurants whose reviews are most similar to those of `name`.

    Parameters
    ----------
    name : str
        Restaurant name exactly as it appears in the index of ``df_percent``.
    cosine_similarities : 2-D array-like, optional
        Pairwise similarity matrix whose row order matches ``df_percent``. Defaults to
        the module-level matrix that exists when this function is defined.

    Returns
    -------
    pandas.DataFrame
        Indexed by restaurant name, with columns 'cuisines', 'Mean Rating' and 'cost',
        sorted by 'Mean Rating' descending, at most 10 rows. A header line is also
        printed.

    Raises
    ------
    IndexError
        If `name` is not present in ``indices`` (i.e. not in the sampled data).
    """
    columns = ['cuisines', 'Mean Rating', 'cost']

    # Row position of the first row carrying this name. Fail with a readable message
    # instead of the bare "index 0 is out of bounds" when the name is unknown.
    matches = indices[indices == name]
    if matches.empty:
        raise IndexError('%r is not among the restaurants in df_percent' % (name,))
    idx = matches.index[0]

    # Similarity of every row to the query row, highest first.
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending=False)

    # Positions of the 31 highest-scoring rows. The query row is itself a candidate,
    # so this is presumably "the query plus 30 neighbours".
    top30_indexes = list(score_series.iloc[0:31].index)

    # Translate positions back to restaurant names (index looked up once, not per item).
    all_names = df_percent.index
    recommendations = [all_names[each] for each in top30_indexes]

    # For each recommended name take one randomly chosen row carrying that name, then
    # combine them in a single concat. DataFrame.append no longer exists in pandas 2.x
    # and growing a frame row by row is quadratic.
    picked_rows = [
        df_percent.loc[df_percent.index == each, columns].sample()
        for each in recommendations
    ]
    df = pd.concat(picked_rows) if picked_rows else pd.DataFrame(columns=columns)

    # keep=False removes *every* row that shares cuisines, rating and cost with
    # another row, not just the repeats. Then keep the 10 best-rated of what is left.
    df = df.drop_duplicates(subset=columns, keep=False)
    df = df.sort_values(by='Mean Rating', ascending=False).head(10)
    print('TOP %s RESTAURANTS LIKE %s WITH SIMILAR REVIEWS: ' % (str(len(df)), name))   
    return df
# Example query. Raises IndexError if this name is not among the rows sampled into df_percent.
recommend('Keto Kitchen')

In [None]:
# Example query. Raises IndexError if this name is not among the rows sampled into df_percent.
recommend('Pai Vihar')

In [None]:
# Example query. Raises IndexError if this name is not among the rows sampled into df_percent.
recommend('The Grill House')