<a href="https://colab.research.google.com/github/C23-PS396/LeftLovers-MachineLearning/blob/main/content_based_filtering_movies.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# kaggle API
!pip install -q kaggle
from google.colab import files
files.upload()
!mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [3]:
# download the data set and unzip
!kaggle datasets download -d ahmedshahriarsakib/uber-eats-usa-restaurants-menus
!unzip /content/uber-eats-usa-restaurants-menus.zip

Downloading uber-eats-usa-restaurants-menus.zip to /content
 91% 121M/132M [00:02<00:00, 70.4MB/s]
100% 132M/132M [00:02<00:00, 63.3MB/s]
Archive:  /content/uber-eats-usa-restaurants-menus.zip
  inflating: restaurant-menus.csv    
  inflating: restaurants.csv         


In [4]:
# libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

In [5]:
# load the dataframe
df_restaurant = pd.read_csv('restaurants.csv')
df_menu = pd.read_csv('restaurant-menus.csv')

# print the columns for each df
print(f"restaurant df cols: {df_restaurant.columns.tolist()} \nmenu df cols: {df_menu.columns.tolist()}")

restaurant df cols: ['id', 'position', 'name', 'score', 'ratings', 'category', 'price_range', 'full_address', 'zip_code', 'lat', 'lng'] 
menu df cols: ['restaurant_id', 'category', 'name', 'description', 'price']


In [6]:
# grab price range and join by id
df_menu.columns = ['id', 'category', 'name', 'description', 'price']
# inspect each col type
df_menu.dtypes

id              int64
category       object
name           object
description    object
price          object
dtype: object

In [7]:
# convert price to float type
df_menu['price'] = df_menu['price'].str.replace(' USD','').astype(float)
df_menu.dtypes

id               int64
category        object
name            object
description     object
price          float64
dtype: object

In [8]:
# get price range
price_range_df = (df_menu.groupby('id')['price'].max() - df_menu.groupby('id')['price'].min()).reset_index().rename(columns={'price':'price_range'})
price_range_df.head()

Unnamed: 0,id,price_range
0,1,15.1
1,2,7.89
2,3,13.78
3,4,14.5
4,5,19.0


In [9]:
# drop cols before joining
df_restaurant.drop(columns=['price_range', 'full_address', 'zip_code', 'position'], inplace=True)

In [10]:
# merged
df = df_restaurant.merge(price_range_df, on='id')
df.head()

Unnamed: 0,id,name,score,ratings,category,lat,lng,price_range
0,1,PJ Fresh (224 Daniel Payne Drive),,,"Burgers, American, Sandwiches",33.562365,-86.830703,15.1
1,2,J' ti`'z Smoothie-N-Coffee Bar,,,"Coffee and Tea, Breakfast and Brunch, Bubble Tea",33.58364,-86.77333,7.89
2,3,Philly Fresh Cheesesteaks (541-B Graymont Ave),,,"American, Cheesesteak, Sandwiches, Alcohol",33.5098,-86.85464,13.78
3,4,Papa Murphy's (1580 Montgomery Highway),,,Pizza,33.404439,-86.806614,14.5
4,5,Nelson Brothers Cafe (17th St N),4.7,22.0,"Breakfast and Brunch, Burgers, Sandwiches",33.51473,-86.8117,19.0


In [11]:
# inspecting statistics
df.describe()

Unnamed: 0,id,score,ratings,lat,lng,price_range
count,40141.0,22194.0,22194.0,40141.0,40141.0,40141.0
mean,20112.976284,4.561476,74.912229,39.926547,-96.562832,31.376105
std,11620.735144,0.297944,72.437955,5.76807,17.957123,44.544915
min,1.0,1.3,10.0,0.0,-123.84124,0.0
25%,10042.0,4.4,25.0,37.078242,-113.59215,12.1
50%,20102.0,4.6,51.0,39.003566,-96.59384,18.39
75%,30186.0,4.8,100.0,45.49364,-77.5302,34.01
max,40227.0,5.0,500.0,48.96395,0.0,1099.51


In [12]:
# inspecting the 0 price range
print(f"amount {df[df.price_range == 0]['id'].count()}")
df[df.price_range == 0].head()

amount 626


Unnamed: 0,id,name,score,ratings,category,lat,lng,price_range
138,139,Dunkin (300 Commons Dr),4.7,18.0,"Breakfast &amp; Brunch, Donuts",33.44203,-86.83068,0.0
209,210,Steel City Pops (2821 Central Ave),,,"Desserts, Ice Cream + Frozen Yogurt, Japanese ...",33.4793,-86.7945,0.0
256,257,Thirstea Tea (Pizitz),,,"Bubble Tea, Juice and Smoothies, Bubble Tea",33.51418,-86.808179,0.0
280,281,Yogurt Street,,,"Ice Cream &amp; Frozen Yogurt, Comfort Food, D...",34.15158,-86.83939,0.0
429,430,Dunkin' (2536 Helena Rd Ste D),,,"Donuts, Breakfast &amp; Brunch, American",33.2824,-86.85145,0.0


In [13]:
# assign it with mean
df.loc[df['price_range'] == 0] = df.price_range.mean()

In [14]:
# count the missing values
total = df.shape[0]
(df.isnull().sum() / total * 100).reset_index().rename(columns={0: 'missing(%)'})

Unnamed: 0,index,missing(%)
0,id,0.0
1,name,0.0
2,score,43.935129
3,ratings,43.935129
4,category,0.042351
5,lat,0.0
6,lng,0.0
7,price_range,0.0


In [15]:
# handling missing values
# numerical value
for i in ['score', 'ratings']:
  df[i].fillna(df[i].mean(), inplace=True)


In [16]:
# category
df.dropna(inplace=True)

In [17]:
# check
print(f"df shape {df.shape}")
df.isnull().sum()

df shape (40124, 8)


id             0
name           0
score          0
ratings        0
category       0
lat            0
lng            0
price_range    0
dtype: int64

In [18]:
df.drop_duplicates(subset='id', inplace=True)

In [19]:
# check 
df.id.duplicated().sum()

0

In [20]:
# sort data frame based on the number of ratings
# picking 90% of quantile
df = df[df.ratings >= df.ratings.quantile(.9)].reset_index(drop=True)
df.shape

(3968, 8)

In [21]:
df.head()

Unnamed: 0,id,name,score,ratings,category,lat,lng,price_range
0,1739.0,Pho Cali Noodle House,4.4,200.0,"Vietnamese, Noodles, Sandwich, Asian",42.958143,-87.94815,28.99
1,1742.0,Buffalo Wild Wings (8171 S Howell Ave),4.4,128.0,wings,42.895949,-87.912668,45.99
2,1790.0,Famous Dave's - Greenfield,4.4,200.0,"American, BBQ, Family Meals",42.952583,-87.949458,83.6
3,1799.0,Portillo’s Hot Dogs (8705 West Sura Lane),4.5,163.0,"American, Sandwiches",42.95969,-88.022154,30.47
4,1811.0,Outback Steakhouse (8625 W. Sura Lane),4.4,132.0,"Burgers, American, Sandwiches, Steak, Seafood,...",42.959611,-88.020757,48.49


In [22]:
# Getting the similarity of the items based on their category
from sklearn.feature_extraction.text import TfidfVectorizer

# matrix which describes each item
tfidf = TfidfVectorizer(stop_words='english')
# construct the matrix
tfidf_matrix = tfidf.fit_transform(df.category)
# check its shape
tfidf_matrix.shape

(3968, 218)

In [23]:
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [24]:
cosine_sim

array([[1.        , 0.        , 0.        , ..., 0.11180652, 0.        ,
        0.23206996],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.08478783,
        0.07425027],
       ...,
       [0.11180652, 0.        , 0.        , ..., 1.        , 0.07373264,
        0.33637438],
       [0.        , 0.        , 0.08478783, ..., 0.07373264, 1.        ,
        0.17335502],
       [0.23206996, 0.        , 0.07425027, ..., 0.33637438, 0.17335502,
        1.        ]])

In [25]:
# create rever mapping restaurant to index
indices = pd.Series(index=df['id'], data=df.index)
indices

id
1739.0        0
1742.0        1
1790.0        2
1799.0        3
1811.0        4
           ... 
40223.0    3963
40224.0    3964
40225.0    3965
40226.0    3966
40227.0    3967
Length: 3968, dtype: int64

In [26]:
df

Unnamed: 0,id,name,score,ratings,category,lat,lng,price_range
0,1739.0,Pho Cali Noodle House,4.4,200.0,"Vietnamese, Noodles, Sandwich, Asian",42.958143,-87.948150,28.99
1,1742.0,Buffalo Wild Wings (8171 S Howell Ave),4.4,128.0,wings,42.895949,-87.912668,45.99
2,1790.0,Famous Dave's - Greenfield,4.4,200.0,"American, BBQ, Family Meals",42.952583,-87.949458,83.60
3,1799.0,Portillo’s Hot Dogs (8705 West Sura Lane),4.5,163.0,"American, Sandwiches",42.959690,-88.022154,30.47
4,1811.0,Outback Steakhouse (8625 W. Sura Lane),4.4,132.0,"Burgers, American, Sandwiches, Steak, Seafood,...",42.959611,-88.020757,48.49
...,...,...,...,...,...,...,...,...
3963,40223.0,Mangia la pasta! (5610 N Interstate Hwy 35),4.8,500.0,"Pasta, Comfort Food, Italian, Group Friendly",30.316248,-97.708441,18.49
3964,40224.0,Wholly Cow Burgers (S Lamar),4.6,245.0,"American, Burgers, Breakfast and Brunch, Aller...",30.242816,-97.783821,12.99
3965,40225.0,EurAsia Ramen 3,4.7,293.0,"Sushi, Asian, Japanese, Exclusive to Eats, Gro...",30.324290,-97.740200,21.60
3966,40226.0,Austin's Habibi (5th St),4.7,208.0,"Mediterranean, Gluten Free Friendly, Allergy F...",30.269580,-97.753110,38.50


In [29]:
def get_recommendations(title, sim_mat=cosine_sim):
  idx = indices[title]

  # Get the pairwsie similarity scores of all movies with that movie
  sim_scores = list(enumerate(cosine_sim[idx]))
  print(f"sim score: {sim_scores}")

  # Sort the restaurants based on the similarity scores
  sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
  print(f"sorted sim score: {sim_scores}")
  # Get the scores of the 10 most similar restaurants
  sim_scores = sim_scores[0:20]

  # Get the movie indices
  restaurant = [i[0] for i in sim_scores if i[0] != idx]

  # Return the top 10 most similar restaurants
  return df.iloc[restaurant].head(10).sort_values(by='score', ascending=False)

In [31]:
get_recommendations(1799.0)

sim score: [(0, 0.0), (1, 0.0), (2, 0.14250936625117114), (3, 1.0), (4, 0.40020726615778657), (5, 0.0), (6, 0.13259250410058912), (7, 0.40020726615778657), (8, 0.0), (9, 0.0), (10, 0.0), (11, 0.0), (12, 0.0), (13, 0.0), (14, 0.0), (15, 0.0), (16, 0.0), (17, 0.0), (18, 0.0), (19, 0.0), (20, 0.0), (21, 0.0), (22, 0.0), (23, 0.15613480184179698), (24, 0.09060009110915522), (25, 0.28403221097035464), (26, 0.8359825928168906), (27, 0.1500205662290849), (28, 0.0), (29, 0.1471487914847295), (30, 0.37003540364794724), (31, 0.4721914299531655), (32, 0.09060009110915522), (33, 0.8359825928168906), (34, 0.15758518946775), (35, 0.13481684090592874), (36, 0.09098083175099742), (37, 0.14675191939307272), (38, 0.08486724121142702), (39, 0.6705290991680251), (40, 0.14683756637326187), (41, 0.0), (42, 0.13584052241424105), (43, 0.15758518946775), (44, 0.16319055280387723), (45, 0.1357104817138022), (46, 0.16319055280387723), (47, 0.08349724301510122), (48, 0.0), (49, 0.0857645745117981), (50, 0.0), (51

Unnamed: 0,id,name,score,ratings,category,lat,lng,price_range
1571,18840.0,CIRCA - Clarendon (Courthouse),4.8,252.0,"American, Sandwiches",38.886999,-77.094316,31.0
146,5954.0,Brunch Cafe Mchenry,4.8,125.0,"Burgers, American, Sandwiches",42.322675,-88.273008,16.0
352,8742.0,Uneeda Burger,4.8,162.0,"Burgers, American, Sandwiches",47.65952,-122.34976,17.0
3063,32699.0,El Meson Sandwiches (Escorial),4.7,370.0,Sandwiches,18.395708,-65.995137,19.97
452,9791.0,The Habit Burger Grill (2831 Duportail St),4.7,147.0,"American, Burgers, Sandwiches",46.25668,-119.31073,43.26
226,8131.0,Burger and Kabob Hut (Seattle),4.5,178.0,"Burgers, American, Sandwiches",47.65815,-122.31301,8.25
26,2110.0,Michael's Family Restaurant,4.4,190.0,"Burgers, American, Sandwiches",43.03907,-87.94116,16.35
524,10601.0,Umami Burger (SEA26-1),4.4,176.0,"American, Burgers, Sandwiches",47.52541,-122.31321,12.46
139,5893.0,Portillo’s Hot Dogs (6102 W Grand Avenue),4.3,200.0,"Burgers, American, Sandwiches",42.391239,-87.954279,30.47
33,2203.0,Mo's Irish Pub (Wauwatosa),4.2,152.0,"Burgers, American, Sandwiches",43.035598,-88.047875,22.0
