In [133]:
import pandas as pd
import numpy as np
from numpy.linalg import svd
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial

REVIEW_SOURCE = r'C:\Users\alexanderh\Documents\Kaggle\vivino-recommenderpy\sample_data\review_data_2020-06-21.csv'

reviews = pd.read_csv(REVIEW_SOURCE).dropna()
reviews['FullName'] = reviews['WineName'] + "_" + reviews['Winery']
reviews.head()

Unnamed: 0.1,Unnamed: 0,Username,WineName,Winery,Rating,FullName
0,0,Joel Miller,Clos de Tafall 2017,Clos Berenguer,4.5,Clos de Tafall 2017_Clos Berenguer
1,1,Frank Webster,Clos de Tafall 2017,Clos Berenguer,4.5,Clos de Tafall 2017_Clos Berenguer
2,2,Emil Bojar,Clos de Tafall 2017,Clos Berenguer,4.5,Clos de Tafall 2017_Clos Berenguer
3,3,Mike Bor,Clos de Tafall 2017,Clos Berenguer,4.5,Clos de Tafall 2017_Clos Berenguer
4,4,! DALLAS !,Clos de Tafall 2017,Clos Berenguer,4.5,Clos de Tafall 2017_Clos Berenguer


In [134]:
# Only keep users with more than 10 reviews, and wines with more than 100

nr_reviews = reviews.groupby("Username").size()#count().sort_values("WineName",ascending=False)
nr_reviews.reset_index().head()

nr_reviews = nr_reviews[nr_reviews > 10]

nr_reviews_wine = reviews.groupby("FullName").size()
nr_reviews_wine = nr_reviews_wine[nr_reviews_wine > 100]

In [135]:
reviews = reviews[reviews['Username'].isin(nr_reviews.index)]
reviews = reviews[reviews['FullName'].isin(nr_reviews_wine.index)]

In [136]:
# Drop duplicate wine-user pairs

dupliclates = reviews[reviews.duplicated(subset=['FullName','Username'], keep=False)]
#dupliclates
reviews = reviews.drop(dupliclates.index).reset_index(drop=True).drop(reviews.columns[0], axis=1)
reviews

Unnamed: 0,Username,WineName,Winery,Rating,FullName
0,Joel Miller,Clos de Tafall 2017,Clos Berenguer,4.5,Clos de Tafall 2017_Clos Berenguer
1,Warren K,Clos de Tafall 2017,Clos Berenguer,4.5,Clos de Tafall 2017_Clos Berenguer
2,David McLeish,Clos de Tafall 2017,Clos Berenguer,4.5,Clos de Tafall 2017_Clos Berenguer
3,Gonzalo Cabrera,Clos de Tafall 2017,Clos Berenguer,4.0,Clos de Tafall 2017_Clos Berenguer
4,Marlin,Clos de Tafall 2017,Clos Berenguer,4.0,Clos de Tafall 2017_Clos Berenguer
...,...,...,...,...,...
99550,G F,Campo Arriba Old Vines 2017,Barahonda,4.0,Campo Arriba Old Vines 2017_Barahonda
99551,Jesus Rodriguez,Campo Arriba Old Vines 2017,Barahonda,4.0,Campo Arriba Old Vines 2017_Barahonda
99552,B P,Campo Arriba Old Vines 2017,Barahonda,3.5,Campo Arriba Old Vines 2017_Barahonda
99553,A G,Campo Arriba Old Vines 2017,Barahonda,3.5,Campo Arriba Old Vines 2017_Barahonda


In [137]:
# Turn this into a users x wines matrix
review_matrix = reviews.pivot(index='Username', columns='FullName', values='Rating').fillna(0)

# Use these lists so we can convert back from indexes to wines and usernames
wine_list = review_matrix.columns.to_numpy()
user_list = review_matrix.index.to_numpy()

review_matrix_np = review_matrix.to_numpy()
u, s, vh = svd(review_matrix_np, full_matrices=False) # Rows of u corresponds to users, columns of vh to wines

In [138]:
max_similarity = 0
closest_index = None

similarity_matrix = np.zeros((vh.shape[1],vh.shape[1]))
for i in range(vh.shape[1]):
    for j in range(vh.shape[1]):
        similarity = 1 - spatial.distance.cosine(vh[:,i],vh[:,j]) # this calculates distance, so subtract from 1 to get similarity
        if i != j:
            similarity_matrix[i,j] = similarity

In [139]:
# Get top rated wine for each user, in case of a tie pick randomly
top_rating_index = reviews.groupby("Username")["Rating"].transform(max) == reviews["Rating"]
top_wines = reviews[top_rating_index].drop_duplicates(subset=["Username"]).reset_index(drop=True)
top_wines

Unnamed: 0,Username,WineName,Winery,Rating,FullName
0,Joel Miller,Clos de Tafall 2017,Clos Berenguer,4.5,Clos de Tafall 2017_Clos Berenguer
1,Warren K,Clos de Tafall 2017,Clos Berenguer,4.5,Clos de Tafall 2017_Clos Berenguer
2,Gonzalo Cabrera,Clos de Tafall 2017,Clos Berenguer,4.0,Clos de Tafall 2017_Clos Berenguer
3,Marlin,Clos de Tafall 2017,Clos Berenguer,4.0,Clos de Tafall 2017_Clos Berenguer
4,Eugene Ustinov,Clos de Tafall 2017,Clos Berenguer,4.0,Clos de Tafall 2017_Clos Berenguer
...,...,...,...,...,...
5612,Giorgio Codias,Barbera d'Alba Vignota 2018,Conterno Fantino,5.0,Barbera d'Alba Vignota 2018_Conterno Fantino
5613,Nicolò,Barbera d'Alba Vignota 2018,Conterno Fantino,4.5,Barbera d'Alba Vignota 2018_Conterno Fantino
5614,Yves Ramon,Barbera d'Alba Vignota 2018,Conterno Fantino,4.5,Barbera d'Alba Vignota 2018_Conterno Fantino
5615,Kurt Christiansen,Barbera d'Alba Vignota 2018,Conterno Fantino,4.5,Barbera d'Alba Vignota 2018_Conterno Fantino


In [140]:
review_validation = np.zeros((len(top_wines["Username"]), 3)) # Matrix of user, review of most recommended, average review of user


for i, user in enumerate(top_wines["Username"]):
    wine_id = np.where(wine_list == top_wines.iloc[i]["FullName"])[0][0]
    user_id = np.where(user_list == user)[0][0]

    best_match = similarity_matrix[wine_id].argmax()
    review_validation[i,0] = user_id
    review_validation[i,1] = review_matrix_np[user_id,best_match]
    review_validation[i,2] = np.mean(review_matrix_np[user_id, :][review_matrix_np[user_id, :] != 0])



In [141]:
# Hopefully the score of the recommended wines is higher than that of a user's average
print(np.mean(review_validation[:,2]))
print(np.mean(review_validation[:,1][review_validation[:,1] != 0]))
print(len(review_validation[:,1][review_validation[:,1] != 0]), len(review_validation[:,1][review_validation[:,1] == 0]))

3.570040215396135
3.5852842809364547
299 5318
