# AIM: Recommend Relevant Businesses To Users Based On The Activity Of Users On Yelp 

### Activity = reviews given by users to businesses

In [None]:
# !pip install wordcloud
# !pip install mpl_toolkits
# !pip install surprise
# !pip install gensim
# !pip install vaderSentiment

In [None]:
import pandas as pd
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import operator
import collections
import re, string
import sys
import time
import random
from tqdm import tqdm
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import GridSearchCV
from surprise import Dataset
from surprise import BaselineOnly
#from surprise.model_selection import train_test_split
from sklearn.model_selection import train_test_split 
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
import torch.nn as nn
from sklearn import model_selection,metrics,preprocessing
import torch
import json
from torch.utils.data import Dataset, DataLoader

## PRE-PROCESSING TECHNIQUES

In [None]:
#=================================================================
# Importing Yelp dataset into Dataframe
#=================================================================

def init_ds(json):
    """Build the empty column store for a JSON-lines file.

    ``json`` is one decoded record (a dict). Returns ``(columns, keys)``:
    ``columns`` maps every key of the record to its own new empty list and
    ``keys`` is the record's key view.
    """
    keys = json.keys()
    columns = {name: [] for name in keys}
    return columns, keys

#=================================================================
# Converting Json files to pd.Dataframes
#=================================================================
def read_json(file):
    """Load a JSON-lines file (one JSON object per line) into a DataFrame.

    The columns are the keys of the first record. Every later record must
    provide those keys (a missing key raises ``KeyError``); keys that only
    appear in later records are ignored. Blank lines are skipped instead of
    crashing ``json.loads``. An empty file yields an empty DataFrame.
    """
    dataset = {}
    keys = None
    with open(file, encoding='utf-8') as file_lines:
        for line in file_lines:
            line = line.strip()
            if not line:
                # e.g. an empty line at the end of the file
                continue
            data = json.loads(line)
            if keys is None:
                # the first real record defines the columns
                keys = list(data)
                dataset = {k: [] for k in keys}
            for k in keys:
                dataset[k].append(data[k])

    return pd.DataFrame(dataset)

# NOTE: the paths use forward slashes. The previous backslash form contained
# the invalid escape sequence '\y' and only worked on Windows; forward slashes
# are accepted by Python on every platform.

# Import yelp business
yelp_business = read_json('yelp_dataset/yelp_dataset/yelp_academic_dataset_business.json')

#Import yelp_review
yelp_review= read_json('yelp_dataset/yelp_dataset/yelp_academic_dataset_review.json')

#Import yelp_user
yelp_user = read_json('yelp_dataset/yelp_dataset/yelp_academic_dataset_user.json')

### Average Star Rating Given By Users

In [None]:
#==============================================
# Distribution of the star ratings found in the
# yelp_review dataset, drawn as a 5-bin histogram
#==============================================
list_star_count = yelp_review['stars'].tolist()
plt.hist(list_star_count, bins = 5)
plt.show()

### Yelping Since

In [None]:
#==============================================
# Number of reviews written in each year
# (bar chart, one bar per year of yelp_review['date'])
#==============================================
list_date_count = pd.DatetimeIndex(yelp_review['date']).year.tolist()
# value_counts() orders by frequency; sort_index() puts the bars in year order
counts = pd.Series(list_date_count).value_counts().sort_index()
counts.plot.bar()

### Average Reviews Received By Businesses

In [None]:
#==============================================
# Histogram of the businesses' review counts:
# x axis = review count, y axis = number of businesses.
# Three ranges (0-20, 20-50, 50-100) with 5 bins each
# are drawn onto the same figure.
#==============================================
list_business_review_count = yelp_business['review_count'].tolist()
plt.hist(list_business_review_count, bins = 5,range= (0,20))
plt.hist(list_business_review_count, bins = 5,range= (20,50))
plt.hist(list_business_review_count, bins = 5,range= (50,100))
plt.show()

### Filtering Restaurants Dataset


In [None]:
# Keep the businesses whose category string mentions "Food" or "Restaurants".
# na=False makes rows without a category count as non-matching explicitly,
# instead of relying on how pandas combines NaN values with `|`.
yelp_business = yelp_business.loc[yelp_business['categories'].str.contains('Food|Restaurants', na=False)]

#### Review Count For The Filtered Businesses

In [None]:
# Same three-range histogram of review counts as before, now for the
# businesses that survived the Food/Restaurants filter.
list_business_review_counts = yelp_business['review_count'].tolist()
plt.hist(list_business_review_counts, bins = 5,range= (0,20))
plt.hist(list_business_review_counts, bins = 5,range= (20,50))
plt.hist(list_business_review_counts, bins = 5,range= (50,100))
plt.show()

#### Filter Businesses with 20 or more reviews

In [None]:
# Drop businesses with fewer than 20 reviews, then show the remaining size.
yelp_business = yelp_business.loc[(yelp_business['review_count']>= 20)]
yelp_business.shape

### Filtering User Dataset

#### Average Reviews Given By Users

In [None]:
# Review count of every user as a plain list (input for the histograms below).
list_review_count = yelp_user['review_count'].tolist()

In [None]:
# Minimum, maximum and mean number of reviews per user.
print(yelp_user['review_count'].min(),yelp_user['review_count'].max(),yelp_user['review_count'].mean())

# Histogram of the users' review counts in three ranges (0-20, 20-50, 50-100),
# 5 bins each, drawn onto the same figure.
plt.hist(list_review_count, bins = 5,range= (0,20))
plt.hist(list_review_count, bins = 5,range= (20,50))
plt.hist(list_review_count, bins = 5,range= (50,100))
plt.show()


#### Filtering Users with 20 or more reviews

In [None]:
# Keep only the users with at least 20 reviews and redraw the histograms.
yelp_user = yelp_user[yelp_user.review_count >= 20]
list_review_count = yelp_user['review_count'].tolist()
# After the filter no value lies below 20, so this first range draws no bars;
# it only confirms that the filter worked.
plt.hist(list_review_count, bins = 5,range= (0,19))
plt.hist(list_review_count, bins = 5,range= (20,50))
plt.hist(list_review_count, bins = 5,range= (50,100))

plt.show()

### Filtering yelp_review dataset

In [None]:
# Keep only the reviews whose user_id is in the filtered yelp_user table AND
# whose business_id is in the filtered yelp_business table, i.e. reviews
# written by users with 20 or more reviews about restaurants with 20 or more
# reviews.
yelp_review = yelp_review.loc[(yelp_review['user_id'].isin(yelp_user['user_id'])) & (yelp_review['business_id'].isin(yelp_business['business_id']))]

In [None]:
yelp_review

## Preparing New Dataset to Use for Recommendations

In [None]:
#================================================================
# Getting a smaller sample of dataset so it's easier to handle
# (the first 1% of the filtered reviews)
#================================================================

sample = yelp_review.head(len(yelp_review) // 100)

#================================================================
# Renaming all the user Ids so it's easier to handle:
# every id gets the integer position of its first appearance in the sample
#================================================================

user_id_unique = sample.user_id.unique()

user_dictionary = {original: new for new, original in enumerate(user_id_unique)}

# Series.map does one hash lookup per row; DataFrame.replace with a dict of
# this size is far slower. Every id of the sample is a key of the dictionary.
sample = sample.assign(user_id=sample['user_id'].map(user_dictionary))

# Users outside the sample keep their original id (same result as replace).
yelp_user = yelp_user.assign(
    user_id=yelp_user['user_id'].map(lambda uid: user_dictionary.get(uid, uid)))

#================================================================
# Renaming all the business Ids so it's easier to handle
#================================================================

business_id_unique = sample.business_id.unique()

business_dictionary = {original: new for new, original in enumerate(business_id_unique)}

sample = sample.assign(business_id=sample['business_id'].map(business_dictionary))

# Businesses outside the sample keep their original id.
yelp_business = yelp_business.assign(
    business_id=yelp_business['business_id'].map(lambda bid: business_dictionary.get(bid, bid)))

#================================================================
# Obtaining user and business dataset with users in the sample 
# dataset above
#================================================================

yelp_user_filtered = yelp_user.loc[yelp_user['user_id'].isin(sample['user_id'])]

yelp_business_filtered = yelp_business.loc[yelp_business['business_id'].isin(sample['business_id'])]

#================================================================
# Saving these datasets into a csv file to be used for the summative
#================================================================

sample.to_csv('sample.csv')

yelp_user_filtered.to_csv('yelp_user_filtered.csv')

yelp_business_filtered.to_csv('yelp_business_filtered.csv')

# Building Recommender System -- Run From Here

1. Cold Start Problem Solved - KMeans Clustering Algorithm
2. Non personalised Recommender System
3. Personalised Recommender System - SVD
4. Personalised Recommender System - Neural Network, Deep Learning

## 1. Cold start solved (KMeans Clustering)

In [None]:
#============================================================
# Reading the CSV files written by the pre-processing step:
# the review sample plus the matching user and business tables
#============================================================

sample = pd.read_csv('sample.csv')

yelp_user_filtered = pd.read_csv('yelp_user_filtered.csv')

yelp_business_filtered = pd.read_csv('yelp_business_filtered.csv')


#### a) Visualising the number of restaurants in all the states

In [None]:
#===================================================================================
# scatter plot of all the restaurants
# (latitude on the x axis, longitude on the y axis)
#===================================================================================
plt.scatter(yelp_business_filtered['latitude'],yelp_business_filtered['longitude'])

#### b) Visualising the number of restaurants in a state

In [None]:
#===================================================================================
# scatter plot of the restaurants whose 'state' column equals 'PA'
# (latitude on the x axis, longitude on the y axis)
#===================================================================================
state_business = yelp_business_filtered.loc[yelp_business_filtered['state'] == 'PA']
plt.scatter(state_business['latitude'],state_business['longitude'])

#### c) Visualising the number of restaurants in a city of a state

In [None]:
#===================================================================================
# scatter plot of the restaurants whose 'city' column equals 'Edmonton'
# (latitude on the x axis, longitude on the y axis)
#===================================================================================
city_business = yelp_business_filtered.loc[yelp_business_filtered['city'] == 'Edmonton']
plt.scatter(city_business['latitude'],city_business['longitude'])

#### d) KMeans Clustering Algorithm 

In [None]:
#================================================================
#================ K-Means Clustering Algorithm ==================
#      this is to find the clusters of location
#================================================================
from sklearn.cluster import KMeans

def distortion_counter(min,max,dataset):
    """Draw the elbow plot used to choose the number of KMeans clusters.

    Fits one KMeans model for every k in ``range(min, max)`` on ``dataset``
    and plots the resulting inertia values against k.
    """
    candidate_ks = range(min, max)
    # KMeans.fit returns the fitted estimator, so inertia_ can be read directly
    distortions = [KMeans(n_clusters=k).fit(dataset).inertia_ for k in candidate_ks]

    plt.subplots(figsize=(12, 8))
    plt.plot(candidate_ks, distortions, marker='o')
    plt.xlabel('k')
    plt.ylabel('Distortions')
    plt.title('Elbow Method For Optimal k')
    plt.show()

#==================================================================
# grouping for cities using KMeans
#==================================================================

# One cluster per four distinct cities, but never fewer than one:
# KMeans rejects n_clusters=0, which the plain formula gives for < 4 cities.
m_clus = max(1, len(yelp_business_filtered['city'].unique()) // 4)
kmeans_city = KMeans(n_clusters=m_clus,init='k-means++')
# NOTE: the feature order is (latitude, longitude); every predict() call on
# this model has to pass its coordinates in the same order.
kmeans_city.fit(yelp_business_filtered[['latitude','longitude']])
x = kmeans_city.labels_
# cluster label of every business, used for the location-based recommendation
yelp_business_filtered['city_category'] = x

#==================================================================
# This is to solve the cold start problem
# Recommendation based on location
#==================================================================

def increase_radius(lat,long,number):
    """Collect well-rated restaurants from the clusters around a location.

    Probes the four corners (lat +/- r, long +/- r) of a square around the
    point, growing r by 0.01 degrees per round, and gathers the ids of the
    restaurants with at least 4 stars from every location cluster one of the
    corners falls into.

    Parameters
    ----------
    lat, long : float
        Centre of the search, in degrees.
    number : int
        Number of business ids wanted.

    Returns
    -------
    list
        Business ids in discovery order. The list may hold more than
        ``number`` entries (a whole cluster is added at once) or fewer when
        the search is exhausted.
    """
    business_id = []
    visited_clusters = set()
    total_clusters = kmeans_city.n_clusters
    radius = 0.01
    max_radius = 180  # degrees; a larger square cannot reach anything new

    # Stop when enough ids were found, when every cluster has been inspected,
    # or when the square cannot grow any further (prevents an endless loop
    # if too few restaurants qualify).
    while (len(business_id) < number
           and len(visited_clusters) < total_clusters
           and radius <= max_radius):
        # The corners are always derived from the ORIGINAL centre; dedicated
        # loop variables keep the lat/long parameters intact.
        for corner_lat in (lat + radius, lat - radius):
            for corner_long in (long + radius, long - radius):
                # same feature order and names as used to fit kmeans_city
                point = pd.DataFrame([[corner_lat, corner_long]],
                                     columns=['latitude', 'longitude'])
                cluster = kmeans_city.predict(point)[0]
                if cluster in visited_clusters:
                    # a cluster seen before cannot contribute new ids
                    continue
                visited_clusters.add(cluster)

                in_cluster = yelp_business_filtered[yelp_business_filtered['city_category'] == cluster]
                # int(stars) >= 4 is the same as stars >= 4 for star ratings
                well_rated = in_cluster.loc[in_cluster['stars'] >= 4, 'business_id']
                business_id.extend(well_rated.tolist())
        radius = radius + 0.01

    return business_id

def new_user():
    """Cold-start recommendation: suggest restaurants near a typed-in location.

    Prompts on stdin for longitude, latitude and the number of wanted
    recommendations, finds the location cluster of that point and returns
    restaurant names from it. When the cluster holds fewer restaurants than
    requested, the search is widened with ``increase_radius``.

    Returns
    -------
    list of str
        Up to the requested number of restaurant names.
    """
    longitude = float(input("Your Longitude, ex:-75.2: "))
    latitude = float(input("Your Latitude, ex:39.9: "))
    num_of_recommendations = int(input("How many number of recommendations: "))

    # kmeans_city was fitted on the columns (latitude, longitude), in that
    # order, so the query point has to use the same order.
    location = pd.DataFrame([[latitude, longitude]], columns=['latitude', 'longitude'])
    cluster = kmeans_city.predict(location)[0]

    # all restaurants that belong to the same location cluster
    restaurants = yelp_business_filtered[yelp_business_filtered['city_category']==cluster]
    list_of_new_id = restaurants['business_id'].values.tolist()

    if num_of_recommendations > len(restaurants):
        # Not enough restaurants in this cluster: widen the search. The exact
        # coordinates are passed on; truncating them to integers would move
        # the search centre by up to one degree.
        list_of_new_id = increase_radius(latitude, longitude, num_of_recommendations)

    matches = yelp_business_filtered[yelp_business_filtered['business_id'].isin(list_of_new_id)]
    return matches['name'].head(num_of_recommendations).tolist()
        

In [None]:
# Elbow plot for k = 2 .. 29 on the restaurant coordinates.
d = yelp_business_filtered[['latitude','longitude']]
distortion_counter(2,30,d)

## 2) Non Personalised Recommender System 

In [None]:
# Per-business rating statistics built from the review sample.
df = sample.rename(columns={'business_id': "businessId", 'stars': "rating"})
df = df[["businessId","rating"]]
# plain mean rating per business
ratings_mean = df.groupby(['businessId'])[['rating']].mean().rename(columns = {'rating':'mean_rating'}).reset_index()
# sum of the ratings per business
ratings_sum = df.groupby(['businessId'])[['rating']].sum().rename(columns = {'rating':'sum_rating'}).reset_index()
# damping factor: number of "virtual" ratings with the global mean value
alpha = 1
ratings_sum['sum_rating_factor'] = ratings_sum['sum_rating'] + alpha*(df['rating'].mean())
# number of ratings per business
ratings_count = df.groupby(['businessId'])[['rating']].count().rename(columns = {'rating':'count_rating'}).reset_index()
ratings_count['count_rating_factor'] = ratings_count['count_rating'] + alpha
ratings_damped = pd.merge(ratings_sum,ratings_count[['businessId','count_rating','count_rating_factor']], on=['businessId'], how='left')
# damped mean = (sum + alpha * global mean) / (count + alpha)
ratings_damped['damped_mean'] = ratings_damped['sum_rating_factor']/ratings_damped['count_rating_factor']
rating_mean_dampmean = pd.merge(ratings_mean[['businessId','mean_rating']],ratings_damped[['businessId','damped_mean']], on =['businessId'], how= 'left')
# NOTE(review): the table is ordered by the plain mean, not by damped_mean,
# so the damping does not influence the ranking - confirm this is intended.
rating_mean_dampmean = rating_mean_dampmean.sort_values(['mean_rating'], ascending=False)

#====================================================================
# Call this function to make recommendation 
#====================================================================

def non_personalised(n):
    """Return the names of the first ``n`` businesses of ``rating_mean_dampmean``.

    The ids are taken in the table's current row order; each one is resolved
    to the first matching name in ``yelp_business_filtered``.
    """
    top_ids = rating_mean_dampmean['businessId'][:n].tolist()
    names = yelp_business_filtered['name']
    ids = yelp_business_filtered['business_id']
    return [names.loc[ids == business].tolist()[0] for business in top_ids]

#====================================================================
# MSE between the plain mean rating and the damped mean rating
# of every business
#====================================================================


mse = mean_squared_error(rating_mean_dampmean['mean_rating'], rating_mean_dampmean['damped_mean'])
print("MSE of Non Personalised Recommender System",mse)

#====================================================================
# Coverage
# For every city with at least 90 businesses: sample 10 users per 100
# businesses, give each one the top-10 list and report the number of
# distinct recommended names as a percentage of the city's businesses.
#====================================================================

list_of_cities = yelp_business_filtered['city'].unique()

# number of sampled test users of every qualifying city
num_of_users_to_pick = []

# coverage percentage of every qualifying city
val = []

# The non-personalised list is the same for every user: compute it once.
top_10 = non_personalised(10)

for city in list_of_cities:

    in_city = yelp_business_filtered['city'] == city

    # getting the number of business in a city
    num_business = int(in_city.sum())

    print("num business:", num_business)
    if num_business < 90:
        continue

    test_user = int(10*(num_business/100))
    num_of_users_to_pick.append(test_user)

    # business ids of the city
    business_id = yelp_business_filtered['business_id'][in_city].tolist()

    # users who reviewed one of these businesses, in random order
    user_id_all = (sample['user_id'][sample['business_id'].isin(business_id)]).tolist()
    random.shuffle(user_id_all)
    user_to_be_predictd = user_id_all[:test_user]

    list_of_restaurant_prediction = []

    for _ in user_to_be_predictd:
        list_of_restaurant_prediction.extend(top_10)

    num_of_recommended = len(np.unique(list_of_restaurant_prediction))

    num = (num_of_recommended/num_business)*100

    val.append(num)

from statistics import mean

# mean() raises on an empty list; report NaN when no city qualified
coverage = mean(val) if val else float('nan')

print(coverage)



In [None]:
def serendipity():
    """Report, per user, whether the first reviewed restaurant is a top-10 pick.

    For every distinct user of ``sample`` this prints the reviewed business
    ids, "True" when the name of the first reviewed restaurant appears in
    ``non_personalised(10)``, the names of all reviewed restaurants and the
    top-10 list. Nothing is returned.
    """
    # identical for every user, so it is computed once
    top_10 = non_personalised(10)

    # unique(): one report per user instead of one per review
    for user in sample['user_id'].unique():
        business_id = sample['business_id'][sample['user_id'] == user].tolist()
        print(business_id)

        # flat list of names; extending (not appending) keeps the elements
        # strings, so the membership test below can actually succeed
        restaurant = []
        for business in business_id:
            restaurant.extend(yelp_business_filtered['name'][yelp_business_filtered['business_id']== business].tolist())

        if restaurant and restaurant[0] in top_10:
            print("True")

        print("restaurant", restaurant)
        print('non_personalised(10)',top_10)

serendipity()

## 3) Personalised Recommender System - SVD

In [None]:
from surprise import Dataset
from surprise import Reader
#=======================================================================================
# split the train test by date

# .copy(): the column assignments below must modify an independent frame,
# not a slice of `sample`
user_item_rating = sample[["user_id", 'business_id', 'date', 'stars']].copy()

user_item_rating["date"] = pd.to_datetime(user_item_rating["date"])

user_item_rating["date"] = user_item_rating["date"].astype('datetime64[ns]')

# sort_values returns a NEW frame; the result has to be kept, otherwise the
# unshuffled splits below are not chronological. A stable sort keeps the file
# order of reviews with the same timestamp.
user_item_rating = user_item_rating.sort_values(by='date', kind='stable')

#=======================================================================================
# oldest 60% -> train, next 20% -> validation, newest 20% -> test

train, test_val = train_test_split(user_item_rating, test_size=0.4,random_state=None, shuffle=False, stratify=None)

val, test = train_test_split(test_val, test_size=0.5,random_state=None, shuffle=False, stratify=None)

#=======================================================================================
# surprise expects the three columns user, item, rating (in this order)

reader = Reader(rating_scale = (0.0, 5.0))

trainset = train.loc[:,['user_id', 'business_id', 'stars']]

trainset.columns = ['userID', 'itemID','rating']

valset = val.loc[:,['user_id', 'business_id', 'stars']]

valset.columns = ['userID', 'itemID','rating']

testset = test.loc[:,['user_id', 'business_id', 'stars']]

testset.columns = ['userID', 'itemID','rating']

user_item_rating = user_item_rating.loc[:,['user_id', 'business_id', 'stars']]

user_item_rating.columns = ['userID', 'itemID','rating']

#=======================================================================================

train_data = Dataset.load_from_df(trainset[['userID', 'itemID','rating']], reader)

val_data = Dataset.load_from_df(valset[['userID', 'itemID','rating']], reader)

test_data = Dataset.load_from_df(testset[['userID', 'itemID','rating']], reader)

user_item_rating = Dataset.load_from_df(user_item_rating[['userID', 'itemID','rating']], reader)

#=======================================================================================
# trainset object for fitting; validation/test data as lists of
# (user, item, rating) tuples for algo.test()

train_sr = train_data.build_full_trainset()

val_sr_before = val_data.build_full_trainset()

val_sr = val_sr_before.build_testset()

test_sr_before = test_data.build_full_trainset()

test_sr = test_sr_before.build_testset()

user_item_rating = user_item_rating.build_full_trainset()

#=====================================================================================
# Hyperparameter Tuning with SVD model
#=====================================================================================

def tuning(train_sr,val_sr):
    """Grid-search the SVD hyper-parameters on the validation set.

    Fits one SVD model on ``train_sr`` for every combination of the candidate
    values below, scores it on ``val_sr`` and returns the
    ``(n_epochs, lr_all, reg_all)`` tuple with the lowest RMSE (the first one
    found in case of a tie).
    """
    epoch_options = [ 30, 40, 50, 60]  # the number of iteration of the SGD procedure
    learning_rates = [0.001, 0.003, 0.005, 0.01] # the learning rate for all parameters
    regularisations = [0.05, 0.1, 0.4, 0.5] # the regularization term for all parameters

    best_params = None
    best_rmse = None
    for n in epoch_options:
        for l in learning_rates:
            for r in regularisations:
                candidate = SVD(n_epochs = n, lr_all = l, reg_all = r)
                candidate.fit(train_sr)
                # RMSE of this candidate on the validation data
                rmse = accuracy.rmse(candidate.test(val_sr))
                if best_rmse is None or rmse < best_rmse:
                    best_params, best_rmse = (n, l, r), rmse
    return best_params

#================================================================
# Getting the best parameter
#================================================================

# Run the grid search; the result is the (n_epochs, lr_all, reg_all) tuple.
parameters = tuning(train_sr,val_sr)
print(parameters)

best_number_of_epochs, best_learning_rate, best_reg_all = parameters

print(best_number_of_epochs,best_learning_rate,best_reg_all)

#===================================================================================================
# training the SVD algorithm with the best parameters found by the grid search
#====================================================================================================

algo_real = SVD(n_epochs = best_number_of_epochs, lr_all = best_learning_rate, reg_all = best_reg_all)

algo_real.fit(train_sr)

# predictions for the held-out test data
predictions = algo_real.test(test_sr)

#================================================================
# Evaluation
#================================================================

print("RMSE for SVD",accuracy.rmse(predictions))

print("MAE for SVD", accuracy.mae(predictions))

# in a surprise prediction tuple, index 2 is the true rating and index 3 the estimate
print("R2 Score for SVD",r2_score([t[2] for t in predictions], [t[3] for t in predictions]))

#================================================================
# SVD Recommendation
#================================================================

def SVD_recommend(user,num_of_recommendations):
    """Print and return the top SVD recommendations for one user.

    Estimates the user's rating for every business of ``sample`` with
    ``algo_real`` and keeps the ``num_of_recommendations`` best ones.

    Parameters
    ----------
    user : user id as used in ``sample['user_id']``
    num_of_recommendations : int

    Returns
    -------
    list of str
        Business names, best estimate first. Returning the list (it used to
        be printed only) is required by callers that collect the
        recommendations, e.g. the coverage computation.
    """
    scored = [
        (business, algo_real.predict(user, business).est)
        for business in sample.business_id.unique()
    ]
    # stable sort: businesses with equal estimates keep their sample order
    scored.sort(key=lambda pair: pair[1], reverse=True)

    business_ = []
    for business, _ in scored[:num_of_recommendations]:
        names = yelp_business_filtered['name'].loc[yelp_business_filtered['business_id'] == business]
        business_.append(names.tolist()[0])

    for rank, name in enumerate(business_, start=1):
        print("Your number",rank,"recommendation is",name)

    return business_

#====================================================================
# Calculating the coverage
# For every city with at least 90 businesses: sample 10 users per 100
# businesses, recommend 10 restaurants to each and report the number of
# distinct recommended names as a percentage of the city's businesses.
#====================================================================
list_of_cities = yelp_business_filtered['city'].unique()


# number of sampled test users of every qualifying city
num_of_users_to_pick = []

# coverage percentage of every qualifying city
val = []

for city in list_of_cities:

    in_city = yelp_business_filtered['city'] == city

    # getting the number of business in a city
    num_business = int(in_city.sum())

    if num_business < 90:
        continue

    test_user = int(10*(num_business/100))
    num_of_users_to_pick.append(test_user)

    # business ids of the city
    business_id = yelp_business_filtered['business_id'][in_city].tolist()

    # users who reviewed one of these businesses, in random order
    user_id_all = (sample['user_id'][sample['business_id'].isin(business_id)]).tolist()
    random.shuffle(user_id_all)
    user_to_be_predictd = user_id_all[:test_user]

    list_of_restaurant_prediction = []

    # dedicated loop variable: the outer loop variable is not reused
    for picked_user in user_to_be_predictd:
        prediction_10 = SVD_recommend(picked_user,10)
        print(prediction_10)
        list_of_restaurant_prediction.extend(prediction_10)

    num_of_recommended = len(np.unique(list_of_restaurant_prediction))

    num = (num_of_recommended/num_business)*100

    val.append(num)

from statistics import mean

# mean() raises on an empty list; report NaN when no city qualified
coverage = mean(val) if val else float('nan')

print(coverage)


## 4) Personalised Recommender System - Deep Learning

In [None]:
from sklearn import model_selection,metrics, preprocessing
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader

#====================================================================
# Getting the dataset ready
#====================================================================

df = sample[['user_id','business_id','stars']]

df = df.rename(columns={'user_id':'userId','business_id':'businessId','stars':'rating'})

# Keep the first 28214 rows. .copy() makes df an independent frame: its id
# columns are overwritten with encoded values further down, which would
# otherwise write into a slice (SettingWithCopyWarning).
df = df[:28214].copy()

#====================================================================
# Training Dataset Class Wrapper 
#====================================================================

class YelpDataset:
    """Map-style dataset over parallel user / business / rating sequences.

    ``ds[i]`` is a dict with the keys "users", "business" and "ratings", each
    holding the i-th value as a ``torch.long`` tensor.
    """

    def __init__(self, users, business, ratings):
        self.users, self.business, self.ratings = users, business, ratings

    def __len__(self):
        # one sample per user entry
        return len(self.users)

    def __getitem__(self,item):
        def as_long(value):
            return torch.tensor(value, dtype=torch.long)

        return {
            "users": as_long(self.users[item]),
            "business": as_long(self.business[item]),
            "ratings": as_long(self.ratings[item]),
        }
    
#====================================================================
# Model Definition
#====================================================================

class RecSysModel(nn.Module):
    """Embedding-based rating predictor.

    A user embedding and a business embedding are concatenated and mapped to
    a single predicted rating by one linear layer.

    Parameters
    ----------
    n_users : int
        Number of distinct (encoded) user ids.
    n_business : int
        Number of distinct (encoded) business ids.
    embedding_dim : int, optional
        Size of each embedding vector (default 32, the previously hard-coded
        value).
    """

    def __init__(self, n_users, n_business, embedding_dim=32):
        super().__init__()
        # trainable lookup matrices for shallow embedding vectors
        self.user_embed = nn.Embedding(n_users, embedding_dim)
        self.business_embed = nn.Embedding(n_business, embedding_dim)
        # input = user embedding and business embedding side by side
        self.out = nn.Linear(2 * embedding_dim, 1)

    def forward(self, users, business, rating=None):
        """Return a (batch, 1) tensor of predicted ratings.

        ``users`` and ``business`` are 1-D long tensors of equal length;
        ``rating`` is accepted but not used.
        """
        user_embeds = self.user_embed(users)
        business_embeds = self.business_embed(business)
        features = torch.cat([user_embeds, business_embeds], dim=1)

        return self.out(features)

#====================================================================
# Label encoders map every distinct id to an integer in
# 0 .. n_classes-1, so the ids can be used as rows of the
# embedding tables
#====================================================================

lbl_user = preprocessing.LabelEncoder()
lbl_business = preprocessing.LabelEncoder()

#====================================================================
# fit_transform() learns the set of distinct ids and replaces every id
# by its integer code in one step (this is an encoding, not a scaling).
#====================================================================

df.userId = lbl_user.fit_transform(df.userId.values)
df.businessId = lbl_business.fit_transform(df.businessId.values)

#====================================================================
# Getting the training and validation sets:
# 90% / 10% split, stratified by rating value
#====================================================================

df_train, df_valid = model_selection.train_test_split(
    df, test_size=0.1, random_state=None, stratify=df.rating.values)

# wrap the numpy columns of each split in the dataset class
train_dataset = YelpDataset(
    users = df_train.userId.values,
    business = df_train.businessId.values,
    ratings = df_train.rating.values)

valid_dataset = YelpDataset(
    users = df_valid.userId.values,
    business = df_valid.businessId.values,
    ratings = df_valid.rating.values)

#====================================================================
# Batch Processing in batches of 4
#====================================================================

train_loader = DataLoader(dataset = train_dataset,
                          batch_size = 4,
                          shuffle = True,
                          num_workers = 0)

validation_loader = DataLoader(dataset = valid_dataset,
                          batch_size = 4,
                          shuffle = True,
                          num_workers = 0)

# One batch for the sanity checks below. The builtin next() is the supported
# way to advance a DataLoader iterator; it has no .next() method on current
# PyTorch versions.
dataiter = iter(train_loader)
dataloader_data = next(dataiter)

#====================================================================
# Processing in GPU (falls back to the CPU when CUDA is not available)
#====================================================================

# 'cuda' is the device name PyTorch expects; the former 'cude' raised a
# RuntimeError as soon as a GPU was actually available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = RecSysModel(
    n_users = len(lbl_user.classes_),
    n_business = len(lbl_business.classes_),
).to(device)

#====================================================================
# Model's optimizer
#====================================================================

optimizer = torch.optim.Adam(model.parameters())
# multiplies the learning rate by 0.7 every 3 scheduler steps
sch = torch.optim.lr_scheduler.StepLR(optimizer, step_size =3, gamma=0.7)

#====================================================================
# Model's loss function (MSE loss)
#====================================================================
loss_func = nn.MSELoss()

# Stand-alone walk through the forward pass on one batch (CPU only): two
# fresh embeddings, concatenated and fed through a linear layer.
user_embed = nn.Embedding(len(lbl_user.classes_),32)
business_embed = nn.Embedding(len(lbl_business.classes_),32)

out = nn.Linear(64,1)

user_embeds = user_embed(dataloader_data['users'])
business_embeds = business_embed(dataloader_data['business'])

output = torch.cat([user_embeds,business_embeds], dim=1)
output = out(output)

# Same batch through the real model; the inputs have to live on the device
# of the model (a no-op on the CPU).
with torch.no_grad():
    model_output = model(dataloader_data['users'].to(device),
                        dataloader_data['business'].to(device))

rating = dataloader_data['ratings']

#====================================================================
# Model Training 
#====================================================================
epochs = 1
total_loss = 0
plot_steps, print_steps = 100,100
step_cnt = 0
all_losses_list = []
# number of batch losses summed into total_loss since the last log entry
batches_since_log = 0

model.train()

for epoch_i in range(epochs):

    # `batch` instead of `train_data`: the old loop variable overwrote the
    # surprise dataset of the same name from the SVD section
    for i, batch in enumerate(train_loader):

        users = batch["users"].to(device)
        business = batch["business"].to(device)
        # (batch, 1) like the model output; -1 also fits a last batch that
        # holds fewer than 4 samples, where view(4, -1) fails
        rating = batch["ratings"].view(-1, 1).to(torch.float32).to(device)

        output = model(users, business)

        loss = loss_func(output, rating)
        total_loss = total_loss + loss.item()
        batches_since_log = batches_since_log + 1
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        step_cnt = step_cnt + len(users)

        if(step_cnt % plot_steps == 0):
            # loss_func already averages inside a batch, so the logged value
            # is the mean of the batch losses collected since the last entry
            avg_loss = total_loss/batches_since_log
            all_losses_list.append(avg_loss)
            total_loss = 0
            batches_since_log = 0

    # advance the learning-rate schedule once per epoch
    sch.step()

# log the remaining batches so that the loss list is never left empty
if batches_since_log:
    all_losses_list.append(total_loss/batches_since_log)
    total_loss = 0
    batches_since_log = 0

#====================================================================
# Saving the weights (the model's state_dict) into a pth file
#====================================================================

PATH = "model_weights.pth"

torch.save(model.state_dict(), PATH)

#====================================================================
# Loading the weights from the file back into the model
#====================================================================

model.load_state_dict(torch.load(PATH))

#====================================================================
# Switching the model to evaluation mode before making predictions
#====================================================================

model.eval()

#====================================================================
# Call this function to make recommendation to a specific user
#====================================================================

def predict_best_restaurants(user,num_of_rec):
    """Print and return the best-scoring business names for one user.

    Every distinct ``df.businessId`` is scored for ``user`` with the global
    ``model``; the ``num_of_rec`` highest scores are kept and each business id
    is looked up in ``yelp_business_filtered`` to obtain its name.

    Args:
        user: Id of the user to score; converted to a ``torch.long`` tensor.
        num_of_rec: Maximum number of recommendations to return.

    Returns:
        List of business names ordered from highest to lowest predicted score.

    Raises:
        IndexError: If a selected business id has no matching row in
            ``yelp_business_filtered``.
    """
    user_count = df.userId.nunique()
    print("There are",user_count,"number of users and their ids range between 0 and",user_count-1) 
    print('Recommendation for user_id:')
    print('How many recommendations would you like:')
    user_tensor = torch.tensor([user], dtype=torch.long)

    scored = []
    # Inference only: no_grad avoids building an autograd graph for every call.
    with torch.no_grad():
        for business_id in df.businessId.unique().tolist():
            business = torch.tensor([business_id], dtype=torch.long)
            # .item() turns the single predicted value into a plain float so
            # the ranking below compares scalars rather than one-element arrays.
            score = model(user_tensor, business).item()
            scored.append((business_id, score))

    # Highest predicted score first; list.sort is stable, so ties keep the
    # order in which the businesses appear in df.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    top_scored = scored[:num_of_rec]

    business_ = []
    for business_id, _score in top_scored:
        matches = yelp_business_filtered['name'].loc[yelp_business_filtered['business_id'] == business_id]
        business_.append(matches.tolist()[0])

    for rank, business_name in enumerate(business_, start=1):
        print("Your number",rank,"recommendation is",business_name)

    return business_

In [None]:
#====================================================================
# Model's loss function (MSE loss) in a graph
#====================================================================
# min() raises ValueError on an empty list, which is the case when training
# never logged a value into all_losses_list; report that instead of crashing.
# The minimum is printed once (the second, unlabeled print was a duplicate).
if all_losses_list:
    print("The minimum MSE score for the model is:",min(all_losses_list) )
else:
    print("No loss values were recorded during training")
plt.figure()
plt.plot(all_losses_list)
plt.show()


In [None]:
#====================================================================
# Calculating the coverage
#====================================================================
# For every city with at least 90 businesses, pick 10 test users per 100
# businesses among the users who reviewed a business in that city, request 10
# recommendations for each of them, and record the number of distinct
# recommended names as a percentage of the city's business count.
# The final coverage is the mean of those per-city percentages.
list_of_cities = yelp_business_filtered['city'].unique()


# number of test users picked for each qualifying city
num_of_users_to_pick = []

# per-city coverage percentages
val = []

for city in list_of_cities:

    # business ids located in this city; their count is the city's business total
    business_id = yelp_business_filtered['business_id'][yelp_business_filtered['city'] == city]
    business_id = business_id.tolist()
    num_business = len(business_id)

    print("num business:", num_business)
    if num_business < 90:
        continue

    test_user = int(10*(num_business/100))
    num_of_users_to_pick.append(test_user)
    print("City",city, "has", num_business, "businesses and their target test user",test_user )

    # distinct users who reviewed at least one business of this city;
    # de-duplicated so the same user is not picked (and predicted) twice
    user_id_all = sample['user_id'][sample['business_id'].isin(business_id)].unique().tolist()
    random.shuffle(user_id_all)
    user_to_be_predictd = user_id_all[:test_user]

    list_of_restaurant_prediction = []

    # a separate loop variable keeps the outer city iteration untouched
    for picked_user in user_to_be_predictd:
        prediction_10 = predict_best_restaurants(picked_user,10)
        list_of_restaurant_prediction.extend(prediction_10)

    # NOTE(review): predict_best_restaurants ranks every business in df, not
    # only this city's, so recommended names may come from other cities --
    # confirm this is the intended coverage definition.
    num_of_recommended = len(set(list_of_restaurant_prediction))

    num = (num_of_recommended/num_business)*100

    val.append(num)

from statistics import mean

# mean() raises StatisticsError on an empty list (no city met the threshold);
# coverage is undefined in that case, so report NaN instead of crashing.
if val:
    coverage = mean(val)
else:
    coverage = float("nan")

print(coverage)