In [298]:
import math
import pymysql
import pandas as pd


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import wikipedia
import spacy
from textblob import TextBlob
from IPython.display import Image

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from scipy import sparse
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity

In [299]:
restaurants_df = pd.read_csv('https://raw.githubusercontent.com/Ajinth/Recommender-System/master/Research/Sample_Restaurants.csv', index_col='Users')

In [300]:
restaurants_df.fillna(value=0, inplace=True)

In [301]:
restaurants_df.head()

Unnamed: 0_level_0,Little_Donkey,Caf__Nero,India_Pavilion,Desi_Dhaba,Thai_Sensation,Dumpling_House,Caf__Luna
Users,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Ajinth,4.0,0.0,5.0,0.0,3.0,0.0,2.0
Tess,3.0,3.0,3.0,0.0,4.0,0.0,0.0
Jessica,4.0,3.0,0.0,0.0,2.0,0.0,1.0
Mark,0.0,2.0,0.0,2.0,0.0,2.0,4.0
Joe,5.0,0.0,5.0,0.0,8.0,0.0,8.0


# Collaborative Item to Item Filtering

In [302]:
'''List of All Restaurants from the Dataframe'''
all_restaurants = list(restaurants_df.columns)
all_restaurants

['Little_Donkey',
 'Caf__Nero',
 'India_Pavilion',
 'Desi_Dhaba',
 'Thai_Sensation',
 'Dumpling_House ',
 'Caf__Luna']

In [303]:
'''List of Restaurants for Ajinth'''
ajinth_restaurant_list = list(restaurants_df.loc['Ajinth', :] ==0)
ajinth_restaurant_list

[False, True, False, True, False, True, False]

In [304]:
'''List of Restaurants that Ajinth has Rated'''
user_rated=[]
for key, a_res in enumerate(ajinth_restaurant_list):
    if a_res == False:
        user_rated.append((restaurants_df.loc['Ajinth', :] ==0).index[key])
    else: 
        continue
print (user_rated)  

['Little_Donkey', 'India_Pavilion', 'Thai_Sensation', 'Caf__Luna']


In [305]:
'''List of Restaurants that Ajinth has Not Rated'''
user_not_rated=[]
for key, a_res in enumerate(ajinth_restaurant_list):
    if a_res == True:
        user_not_rated.append((restaurants_df.loc['Ajinth', :] ==0).index[key])
    else: 
        continue
print (user_not_rated) 

['Caf__Nero', 'Desi_Dhaba', 'Dumpling_House ']


In [306]:
# (restaurants_df.loc['Ajinth', :] ==0).index[0]

In [307]:
'''Custom Function to calculate the Co-Similarity'''
def cosim(v1, v2):
    dot_product = sum(n1 * n2 for n1, n2 in zip(v1, v2) )
    magnitude1 = math.sqrt(sum(n ** 2 for n in v1))
    magnitude2 = math.sqrt(sum(n ** 2 for n in v2))
    return dot_product / (magnitude1 * magnitude2)

In [308]:
'''
Looping through the User Restaurants and all restaurants to calculate the Matrix.
The results are stored in a dictionary
'''
ur_rest_dict = {}
for ur in user_rated: 
    ur_cosim=[]
    for ar in all_restaurants: 
        similarity = cosim(restaurants_df[ur],\
                                        restaurants_df[ar])
        ur_cosim.append(similarity)
    ur_rest_dict[ur] = ur_cosim

In [309]:
'''Dataframe for the co-occurence matrix'''
co_occurence_matrix = pd.DataFrame.from_dict(ur_rest_dict, orient='index')
co_occurence_matrix.columns=all_restaurants
co_occurence_matrix

Unnamed: 0,Little_Donkey,Caf__Nero,India_Pavilion,Desi_Dhaba,Thai_Sensation,Dumpling_House,Caf__Luna
Little_Donkey,1.0,0.606349,0.811778,0.0,0.862105,0.309839,0.651273
India_Pavilion,0.811778,0.17091,1.0,0.0,0.904497,0.0,0.706049
Thai_Sensation,0.862105,0.272259,0.904497,0.0,1.0,0.0,0.809807
Caf__Luna,0.651273,0.174034,0.706049,0.433861,0.809807,0.194029,1.0


In [310]:
user_rated

['Little_Donkey', 'India_Pavilion', 'Thai_Sensation', 'Caf__Luna']

In [311]:
user_not_rated

['Caf__Nero', 'Desi_Dhaba', 'Dumpling_House ']

In [312]:
co_occurence_matrix

Unnamed: 0,Little_Donkey,Caf__Nero,India_Pavilion,Desi_Dhaba,Thai_Sensation,Dumpling_House,Caf__Luna
Little_Donkey,1.0,0.606349,0.811778,0.0,0.862105,0.309839,0.651273
India_Pavilion,0.811778,0.17091,1.0,0.0,0.904497,0.0,0.706049
Thai_Sensation,0.862105,0.272259,0.904497,0.0,1.0,0.0,0.809807
Caf__Luna,0.651273,0.174034,0.706049,0.433861,0.809807,0.194029,1.0


In [313]:
restaurants_df

Unnamed: 0_level_0,Little_Donkey,Caf__Nero,India_Pavilion,Desi_Dhaba,Thai_Sensation,Dumpling_House,Caf__Luna
Users,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Ajinth,4.0,0.0,5.0,0.0,3.0,0.0,2.0
Tess,3.0,3.0,3.0,0.0,4.0,0.0,0.0
Jessica,4.0,3.0,0.0,0.0,2.0,0.0,1.0
Mark,0.0,2.0,0.0,2.0,0.0,2.0,4.0
Joe,5.0,0.0,5.0,0.0,8.0,0.0,8.0
Chris,3.0,5.0,0.0,0.0,0.0,4.0,0.0


In [314]:
type(restaurants_df.loc['Ajinth', : ])

pandas.core.series.Series

In [315]:
restaurants_to_rate = co_occurence_matrix[user_not_rated]
restaurants_to_rate

Unnamed: 0,Caf__Nero,Desi_Dhaba,Dumpling_House
Little_Donkey,0.606349,0.0,0.309839
India_Pavilion,0.17091,0.0,0.0
Thai_Sensation,0.272259,0.0,0.0
Caf__Luna,0.174034,0.433861,0.194029


In [316]:
'''Calculating Predicted Score for Cafe Nero - Manual Method'''
((0.64*4)+(.17*5) + (0.27*3) + (.17*2))/(0.64+0.17+0.27+0.17)

3.6480000000000006

In [320]:
'''Calculating Predicted Score for Dumpling House - Manual Method'''
((0.309839*4)+(2*0.194029))/(0.309839+0.194029)

3.2298419427310328

In [321]:
'''Calculating the Predicted Score'''

predicted_rating_dict = {}
for not_rated in restaurants_to_rate.columns: 
    column_df = pd.DataFrame(restaurants_to_rate[not_rated].sort_values(ascending=False))
    predicted_rating = []
    top_sim_list = []
    for top_n in range(2):   # Just getting the top two rated items 
        
        top_sim = column_df[not_rated][top_n]
        top_sim_list.append(top_sim)
        
        given_rr = column_df[not_rated].index[top_n]
        given_rank_df = pd.DataFrame(restaurants_df.loc['Ajinth', :]) # While converting this to function the name should be an argument to the function
        given_rank = given_rank_df.loc[given_rr,:]
        
        predicted_rating_item = float(given_rank*top_sim)
        predicted_rating.append(predicted_rating_item)
    total_sim_weight = sum(top_sim_list)
    adjusted_predicted_rating = (sum(predicted_rating))/total_sim_weight
    predicted_rating_dict[not_rated] = adjusted_predicted_rating
print (predicted_rating_dict)
    



{'Caf__Nero': 3.6901248118353513, 'Desi_Dhaba': 2.0, 'Dumpling_House ': 3.2298426551391857}


In [322]:
predicted_df = pd.DataFrame.from_dict(predicted_rating_dict, orient='index')
predicted_df.columns=['Ajinth']
predicted_df

Unnamed: 0,Ajinth
Caf__Nero,3.690125
Desi_Dhaba,2.0
Dumpling_House,3.229843
