<a href="https://colab.research.google.com/github/2sneha5/2sneha5.github.io/blob/master/food_recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd 
import numpy as np

df=pd.read_csv('/content/drive/MyDrive/db_food/food.csv')
train=pd.read_csv('/content/drive/MyDrive/db_food/food.csv')
df.columns = ['food_id','title','restaurant_id','price', 'num_orders', 'category', 'avg_rating', 'num_rating', 'tags']

df

Unnamed: 0,food_id,title,restaurant_id,price,num_orders,category,avg_rating,num_rating,tags
0,1,Lala Maggi,1,30,35,maggi,3.9,10,"vegetarian, spicy"
1,2,Cheese Maggi,1,25,40,maggi,3.8,15,vegetarian
2,3,Masala Maggi,1,25,10,maggi,3.0,10,"vegetarian, spicy"
3,4,Veg Maggi,1,30,25,maggi,2.5,5,"vegetarian, healthy"
4,5,Paneer Tikka,1,60,50,meal,4.6,30,"vegetarian, healthy"
...,...,...,...,...,...,...,...,...,...
264,265,Til Pitha,11,30,0,dessert,3.4,10,vegetarian
265,266,Bebinca,3,60,0,dessert,2.3,17,vegetarian
266,267,Shufta,4,50,0,dessert,2.6,3,vegetarian
267,268,Mawa Bati,5,45,0,dessert,2.5,10,vegetarian


In [4]:
# mean of average ratings of all items
C= df['avg_rating'].mean()
D= df['avg_rating'].median()
print("mean of avg rating ->",C)
print("median of avg rating ->",D)
# the minimum number of votes required to appear in recommendation list, i.e, 40% among 'num_rating'
m= df['num_rating'].quantile(0.4)

# items that qualify the criteria of minimum num of votes
qualified_items = df.copy().loc[df['num_rating'] >= m]

# Calculation of weighted rating based on the IMDB formula
def weighted_rating(x, m=m, C=C):
    v = x['num_rating']
    R = x['avg_rating']
    return (v/(v+m) * R) + (m/(m+v) * C)

# Applying weighted_rating to qualified items
qualified_items['score'] = qualified_items.apply(weighted_rating, axis=1)

# Shortlisting the top rated items and popular items
top_rated_items = qualified_items.sort_values('score', ascending=False)
pop_items= df.sort_values('num_orders', ascending=False)

mean of avg rating -> 3.6066914498141265
median of avg rating -> 3.4


In [5]:
# Display results of demographic filtering
top_rated_items[['title', 'num_rating', 'avg_rating', 'score']].head()
pop_items[['title', 'num_orders']].head()

Unnamed: 0,title,num_rating,avg_rating,score
6,samosa,43,5.0,4.696005
178,Upma,42,5.0,4.690376
124,Panjeeri,42,5.0,4.690376
51,Ariselu,42,5.0,4.690376
103,Fara,42,5.0,4.690376


Unnamed: 0,title,num_orders
6,samosa,90
4,Paneer Tikka,50
1,Cheese Maggi,40
5,Chicken Tikka,40
0,Lala Maggi,35


In [6]:
#Define the stopwords to remove and the stemming tool
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import SnowballStemmer
from subprocess import check_output
import nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
stemmer = SnowballStemmer('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
# Creating soup string for each item
def create_soup(x):            
    tags = x['tags'].lower().split(', ')
    tags.extend(x['title'].lower().split())
    tags.extend(x['category'].lower().split())
    return " ".join(sorted(set(tags), key=tags.index))

df['soup'] = df.apply(create_soup, axis=1)
df.head(10)

Unnamed: 0,food_id,title,restaurant_id,price,num_orders,category,avg_rating,num_rating,tags,soup
0,1,Lala Maggi,1,30,35,maggi,3.9,10,"vegetarian, spicy",vegetarian spicy lala maggi
1,2,Cheese Maggi,1,25,40,maggi,3.8,15,vegetarian,vegetarian cheese maggi
2,3,Masala Maggi,1,25,10,maggi,3.0,10,"vegetarian, spicy",vegetarian spicy masala maggi
3,4,Veg Maggi,1,30,25,maggi,2.5,5,"vegetarian, healthy",vegetarian healthy veg maggi
4,5,Paneer Tikka,1,60,50,meal,4.6,30,"vegetarian, healthy",vegetarian healthy paneer tikka meal
5,6,Chicken Tikka,1,80,40,meal,4.2,28,"nonveg, healthy, spicy",nonveg healthy spicy chicken tikka meal
6,7,samosa,2,40,90,snacks,5.0,43,"vegetarian, healthy",vegetarian healthy samosa snacks
7,8,onion kachori,2,35,10,snacks,3.2,56,"vegetarian, healthy",vegetarian healthy onion kachori snacks
8,9,bread pakoda,2,25,10,snacks,3.9,10,"vegetarian, healthy",vegetarian healthy bread pakoda snacks
9,10,onion paratha,2,50,35,snacks,5.0,35,"vegetarian, spicy",vegetarian spicy onion paratha snacks


In [8]:
#Preprocess the text in training and testing
import nltk
nltk.download('punkt')
processed_train = []
X_text_train=df['title'].values
X_text_test=df['tags'].values
for doc in X_text_train:
    tokens = word_tokenize(doc)
    filtered = [word for word in tokens if word not in stop_words]
    stemmed = [stemmer.stem(word) for word in filtered]
    processed_train.append(stemmed)
    
processed_test = []
for doc in X_text_test:
    tokens = word_tokenize(doc)
    filtered = [word for word in tokens if word not in stop_words]
    stemmed = [stemmer.stem(word) for word in filtered]
    processed_test.append(stemmed)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [9]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer(stop_words='english')

# df['soup']
count_matrix = count.fit_transform(df['soup'])
count_matrix
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(count_matrix, count_matrix)

indices_from_title = pd.Series(df.index, index=df['title'])
indices_from_food_id = pd.Series(df.index, index=df['food_id'])

<269x335 sparse matrix of type '<class 'numpy.int64'>'
	with 1146 stored elements in Compressed Sparse Row format>

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
X_train, X_test, y_train, y_test = train_test_split(df['title'],df['soup'],test_size=0.33,random_state=8675309)
X_train

X_test.shape
y_test.shape
y_train.shape
print("--->",X_train.shape)

115                    Misi roti
224                Veg Kolhapuri
262                     Red Rice
10                  idli sambhar
200                       Handwo
                 ...            
18                   Gulab jamun
232    Turiya Patra Vatana sabji
255                  Koldil Duck
140               Bisi bele bath
1                   Cheese Maggi
Name: title, Length: 180, dtype: object

(89,)

(89,)

(180,)

---> (180,)


In [11]:

X_train_cv = count.transform(df['soup'])

In [12]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf.fit(X_train)

X_train_tfidf = tfidf.transform(X_train)
X_test_tfidf = tfidf.transform(X_test)
test_tfidf = tfidf.transform(df['soup'])

rf = RandomForestClassifier()
rf.fit(X_train_tfidf, y_train)
print(rf.score(X_test_tfidf, y_test))
predictions = rf.predict(X_test_tfidf)
# print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

TfidfVectorizer(stop_words='english')

RandomForestClassifier()

0.0
                                                  precision    recall  f1-score   support

           non vegetarian alu pitika main course       0.00      0.00      0.00       0.0
             non vegetarian beef fry main course       0.00      0.00      0.00       0.0
           non vegetarian black rice main course       0.00      0.00      0.00       0.0
           non vegetarian bombil fry main course       0.00      0.00      0.00       1.0
       non vegetarian butter chicken main course       0.00      0.00      0.00       1.0
 non vegetarian chicken tikka masala main course       0.00      0.00      0.00       1.0
            non vegetarian chicken tikka starter       0.00      0.00      0.00       0.0
  non vegetarian chingri malai curry main course       0.00      0.00      0.00       0.0
                non vegetarian galho main course       0.00      0.00      0.00       1.0
          non vegetarian kolim jawla main course       0.00      0.00      0.00       1.0
     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
# Function that takes in food title or food id as input and outputs most similar dishes 
def get_recommendations(title="", cosine_sim=cosine_sim, idx=-1):
    # Get the index of the item that matches the title
    if idx == -1 and title != "":
        idx = indices_from_title[title]

    # Get the pairwsie similarity scores of all dishes with that dish
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the dishes based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the 10 most similar dishes
    sim_scores = sim_scores[1:3]

    # Get the food indices
    food_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar dishes
    return food_indices

In [14]:
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer

my_stop_words = text.ENGLISH_STOP_WORDS.union(df['soup'])

vectorizer = TfidfVectorizer(ngram_range=(1,1), stop_words=my_stop_words)

X = vectorizer.fit_transform(df['soup'])

idf_values = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))

# printing the tfidf vectors
print(X)

# printing the vocabulary
print(vectorizer.vocabulary_)

  (0, 182)	0.5389964527121502
  (0, 171)	0.6379900453000649
  (0, 302)	0.5389964527121502
  (0, 332)	0.10924445077649832
  (1, 45)	0.7574307968428433
  (1, 182)	0.639904204902158
  (1, 332)	0.12969655563102792
  (2, 189)	0.5354618849010933
  (2, 182)	0.5911534360885646
  (2, 302)	0.5911534360885646
  (2, 332)	0.11981569104059384
  (3, 330)	0.6229696677622945
  (3, 106)	0.4916601171769017
  (3, 182)	0.5962988579221642
  (3, 332)	0.1208585713404484
  (4, 194)	0.53758489773278
  (4, 316)	0.5013285394791389
  (4, 223)	0.5013285394791389
  (4, 106)	0.4432493041037016
  (4, 332)	0.10895835511170089
  (5, 49)	0.3747019913607045
  (5, 213)	0.46823385930474504
  (5, 194)	0.42474401736748024
  (5, 316)	0.3960979908055121
  (5, 106)	0.35020978251872176
  :	:
  (262, 270)	0.6272257692529298
  (262, 58)	0.20829444955989568
  (262, 184)	0.20829444955989568
  (262, 332)	0.12168427180751258
  (263, 297)	0.9124353788710969
  (263, 58)	0.26744276242774984
  (263, 184)	0.26744276242774984
  (263, 332)	0.

  % sorted(inconsistent)


In [15]:
df.loc[get_recommendations(title="Paratha")]

Unnamed: 0,food_id,title,restaurant_id,price,num_orders,category,avg_rating,num_rating,tags,soup
98,99,Daal puri,7,30,0,main course,3.8,35,"vegetarian,healthy","vegetarian,healthy daal puri main course"
87,88,Bhatura,4,20,0,main course,2.3,35,vegetarian,vegetarian bhatura main course


In [16]:
# fetch few past orders of a user, based on which personalized recommendations are to be made
def get_latest_user_orders(user_id, orders, num_orders=3):
    counter = num_orders
    order_indices = []
    for index, row in orders[['user_id']].iterrows():
        if row.user_id == user_id:
            counter = counter -1
            order_indices.append(index)
        if counter == 0:
            break
    return order_indices

# utility function that returns a DataFrame given the food_indices to be recommended
def get_recomms_df(food_indices, df1, columns, comment):
    row = 0
    df = pd.DataFrame(columns=columns)
    
    for i in food_indices:
        df.loc[row] = df1[['title', 'restaurant_id', 'price']].loc[i]
        df.loc[row].comment = comment
        row = row+1
    return df

# return food_indices for accomplishing personalized recommendation using Count Vectorizer
def personalised_recomms(orders, df1, user_id, columns, comment="based on your past orders"):
    order_indices = get_latest_user_orders(user_id, orders)
    food_ids = []
    food_indices = []
    recomm_indices = []
    for i in order_indices:
        food_ids.append(orders.loc[i].food_id)
    for i in food_ids:
        food_indices.append(indices_from_food_id[i])
    for i in food_indices:
        recomm_indices.extend(get_recommendations(idx=i))
    return get_recomms_df(set(recomm_indices), df1, columns, comment)

# Simply fetch new items added by vendor or today's special at restaurant
def get_new_and_specials_recomms(new_and_specials, users, df, restaurant_id, columns, comment="new/today's special item  in your restaurant"):
    food_indices = []
    for index, row in new_and_specials[['restaurant_id']].iterrows():
        if row.restaurant_id == restaurant_id:
            food_indices.append(indices_from_food_id[new_and_specials.loc[index].food_id])
    return get_recomms_df(set(food_indices), df, columns, comment)

# utility function to get the restaurant given a user id
def get_user_home_restaurant(users, user_id):
    for index, row in users[['user_id']].iterrows():
        if row.user_id == user_id:
            return users.loc[index].home_parcel
    return -1

# fetch items from previously calculated top_rated_items list
def get_top_rated_items(top_rated_items, df1, columns, comment="top rated items across restaurants"):
    food_indices = []
    for index, row in top_rated_items.iterrows():
        food_indices.append(indices_from_food_id[top_rated_items.loc[index].food_id])
    return get_recomms_df(food_indices, df1, columns, comment)

# fetch items from previously calculated pop_items list
def get_popular_items(pop_items, df1, columns, comment="most popular items across restaurants"):
    food_indices = []
    for index, row in pop_items.iterrows():
        food_indices.append(indices_from_food_id[pop_items.loc[index].food_id])
    return get_recomms_df(food_indices, df1, columns, comment)
    

In [17]:
orders = pd.read_csv('/content/drive/MyDrive/db_food/orders.csv')
new_and_specials = pd.read_csv('/content/drive/MyDrive/db_food/new_and_specials.csv')
users = pd.read_csv('/content/drive/MyDrive/db_food/users.csv')

columns = ['title', 'restaurant_id', 'price', 'comment']
current_user = 2
current_restaurant = get_user_home_restaurant(users, current_user)
print("personalized recommendataions")
personalised_recomms(orders, df, current_user, columns)
print("new and special dishes of restaurant")
get_new_and_specials_recomms(new_and_specials, users, df, current_restaurant, columns)
# print("top rated items in restaurants")
# get_top_rated_items(top_rated_items, df, columns)
print("popular items")
get_popular_items(pop_items, df, columns).head(3)

personalized recommendataions


Unnamed: 0,title,restaurant_id,price,comment
0,Cheese Maggi,1,25,based on your past orders
1,Masala Maggi,1,25,based on your past orders
2,Paneer Tikka,1,60,based on your past orders
3,Chicken Tikka,1,80,based on your past orders
4,biryani with raita,2,100,based on your past orders
5,Paneer tikka masala,8,40,based on your past orders


new and special dishes of restaurant


Unnamed: 0,title,restaurant_id,price,comment
0,Cheese Maggi,1,25,new/today's special item in your restaurant
1,Khichdi,7,20,new/today's special item in your restaurant


top rated items in restaurants


Unnamed: 0,title,restaurant_id,price,comment
0,samosa,2,40,top rated items across restaurants
1,Upma,8,20,top rated items across restaurants
2,Panjeeri,2,25,top rated items across restaurants
3,Ariselu,1,75,top rated items across restaurants
4,Fara,6,60,top rated items across restaurants
...,...,...,...,...
188,Patra,10,40,top rated items across restaurants
189,Bhatura,4,20,top rated items across restaurants
190,Cheera Doi,10,35,top rated items across restaurants
191,bataka poha,3,25,top rated items across restaurants


popular items


Unnamed: 0,title,restaurant_id,price,comment
0,samosa,2,40,most popular items across restaurants
1,Paneer Tikka,1,60,most popular items across restaurants
2,Cheese Maggi,1,25,most popular items across restaurants
