In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from surprise import Reader, Dataset, SVD, evaluate  # special library for reco systems
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm # Track progress 

In [2]:
# Normal train and test set (filtered and split)
df_train= pd.read_csv('../../data/generated/inter_train.csv')
df_train = df_train.dropna()
df_test= pd.read_csv('../../data/generated/inter_test.csv') 
# df_test = df_test.dropna()


# Read as matrix format of train and test set
X_train = pd.read_csv('../../data/generated/inter_test.csv').pivot_table(
                            index='u', columns='i', values='rating', dropna=False)
X_test = pd.read_csv('../../data/generated/inter_train.csv').pivot_table(
                            index='u', columns='i', values='rating', dropna=False)


# Full data - no train/test split 
full_data = pd.read_csv('../../data/generated/full_data_filtered.csv')
full_data_matrix = full_data.pivot_table(index='u', columns='i', values='rating', dropna=False)


# Create a dataframe giving the recipe's name along with its id 'i'
names = pd.read_csv('../../data/generated/names.csv')
names.columns=['i','name']
names = names.set_index('i') # set 'i' as index, useful later. 

## CF

In [3]:
# User we consider for recommendation
user_index = 0

# Number of similar users for recommendation
n_recommendation = 25

# Plot top n recommendations
n_plot = 10

# Fill in missing values with mean value 
# train = X_train.T.fillna(X_train.mean(axis=1)).T

# Fill in missing values with -1 and add 1 to each rating (want NaN to 0)
train = X_train.fillna(-1)
train = train.add(1)
train2 = train.T  # make us save a few seconds in similarity computation 

In [4]:
# Compute similarity metric (only for user 0 and all other users)
import time
start_time = time.time()

l = train.T.columns.tolist()
mean_user_x = X_train.iloc[user_index,:].mean()
similarity = []

for i in range(len(l)):
    m = X_train.iloc[i,:].mean()
    sim = np.sum( (train2[l[i]] * train.iloc[user_index,:]) / ((m * mean_user_x) * (abs(train2[l[i]] - train.iloc[user_index,:]) +1 )) )
    similarity.append(sim)  

# Watch time it takes
end_time = time.time()
total_time = end_time - start_time
print("Time: ", total_time)

# Remove similarity of user with itself 
similarity.remove(similarity[user_index])

Time:  51.99180817604065


In [5]:
# Sort similar users by index
similar_user_index = np.argsort(similarity)[::-1]

# Sort similar users by score
similar_user_score = np.sort(similarity)[::-1]

In [7]:
# Store unrated recipes (for the user considered)
unrated_recipes = X_train.iloc[user_index][X_train.iloc[user_index].isna()].index 

# Weight ratings of the top n most similar users via similarity score and compute the mean predicted rating for each recipe
# Score associated with each recipe indicates how likely user X is to like this recipe wrt to others. It is not a predicted rating. 
mean_recipe_recommendations = (train.iloc[similar_user_index[:n_recommendation]].T * similar_user_score[:n_recommendation]).T.mean(axis=0)

# Filter already rated recipes for user regarded and sort results
# Also join the corresponding name of each recipe 
best_recipe_recommendations = mean_recipe_recommendations[unrated_recipes].sort_values(ascending=False).to_frame().join(names)

# Print 10 recommendations 
best_recipe_recommendations[:n_plot]

Unnamed: 0_level_0,0,name
i,Unnamed: 1_level_1,Unnamed: 2_level_1
160154,0.883578,lemon baked cod
57030,0.816411,spicy potato bumps
33680,0.700062,pesto chicken
118670,0.667233,grilled chicken breast with barbecue glaze
97450,0.646995,roasted parmesan garlic cauliflower
44474,0.454737,pacific rim chicken burgers with ginger mayonn...
137554,0.454737,chicken bites with spice
14789,0.454737,amanda s thai peanut
164837,0.454737,sweet salsa dump chicken oamc
177367,0.454737,cashew chicken curry


In [15]:
# We want to estimate the rating a user would give to these recipes
# Create trace
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
trace = go.Bar(x = best_recipe_recommendations.iloc[:n_plot, 0],
               text = best_recipe_recommendations['name'],
               textposition = 'inside',
               textfont = dict(color = '#000000'),
               orientation = 'h',
               y = list(range(1, n_plot+1)),
               marker = dict(color = '#db0000'))
# Create layout
layout = dict(title = 'Ranking Of Top {} Recommended Recipes For A User Based On Similarity'.format(n_plot),
              xaxis = dict(title = 'Recommendation-Rating',
                           range = (0, best_recipe_recommendations.iloc[0,0])),
              yaxis = dict(title = 'Recipe'))
# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [12]:
# Let's see what would be the predicted rating of each recommended recipe. 
# The rating is computed as the average of the most similar users' ratings, weighted by their similarity score. 

prediction = []
brr = best_recipe_recommendations[:n_plot]
list_brr = brr.index.values.tolist()

# Regard all recipes 
for recipe_id in list_brr:
    
    # Compute predicted score (weighted average of similar user score)
    a = X_train.iloc[similar_user_index[:n_recommendation]][recipe_id].notnull()
    a = pd.DataFrame(a.ravel()) 
    score = (X_train.iloc[similar_user_index[a[a[0]==True].index.tolist()]][recipe_id] * similar_user_score[a[a[0]==True].index.tolist()]).sum() / similar_user_score[a[a[0]==True].index.tolist()].sum()
    prediction.append([recipe_id, score])
    
# Create prediction DataFrame - predict ratings of the recipes in test sets (for each user)
df_pred = pd.DataFrame(prediction, columns=['i', 'Prediction']).set_index(['i'])

In [16]:
# Create trace
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
trace = go.Bar(x = df_pred.iloc[:,0],
               text = best_recipe_recommendations['name'],
               textposition = 'inside',
               textfont = dict(color = '#000000'),
               orientation = 'h',
               y = list(range(1, n_plot+1)),
               marker = dict(color = '#db0000'))
# Create layout
layout = dict(title = 'Ranking Of Top {} Recommended Recipes For A User Based On Similarity'.format(n_plot),
              xaxis = dict(title = 'Recommendation-Rating',
                           range = (0, 5)),
              yaxis = dict(title = 'Recipe'))
# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [24]:
# Let's just check what would happen if we don't weight ratings using similarity score - and take the mean of only exisiting ratings 
mean_reci_reco = (X_train.iloc[similar_user_index[:n_recommendation]].T).T.mean(axis=0)
best_rec_reco= mean_reci_reco[unrated_recipes].sort_values(ascending=False).to_frame().join(names)
best_rec_reco[:n_plot]
# Recipes with only one rating, but a 5.0 are favoured with this technique. Also, too many recipes could be proposed (score of 5)

Unnamed: 0_level_0,0,name
i,Unnamed: 1_level_1,Unnamed: 2_level_1
177367,5.0,cashew chicken curry
82309,5.0,sirloin soup italiano
92110,5.0,tater tot cups with cheese and eggs
91525,5.0,kittencal s rich homemade beef stock crock po...
90467,5.0,pammy s crock pot chicken breast and gravy
85481,5.0,chocolate chip mexican wedding cakes
85206,5.0,almost apple pie
29746,5.0,roasted cauliflower 16 roasted cloves of garlic
31418,5.0,easy grilled cajun chicken
82257,5.0,the best peanut butter oatmeal cookies


## Test accuracy of predictions on test set 

In [30]:
# Look at our predictions for test set recipes, rated by our targeted user. 
# Predictions are a weighted average of simimar user ratings

# Create user-id mapping
user_id = l[user_index]
# print('for user'+str(user_id))
# user_id_mapping = {id:i for i, id in enumerate(train.index)}

prediction = []

# Regard only recipes that are present in the test set - careful there are NaN values
for recipe_id in df_test[df_test['u']==user_id]['i'].values:
    
    # Compute predicted score (weighted average of similar user score)
    a = X_test.iloc[similar_user_index[:n_recommendation]][recipe_id].notnull()
    a = pd.DataFrame(a.ravel()) 
    score = (X_test.iloc[similar_user_index[a[a[0]==True].index.tolist()]][recipe_id] * similar_user_score[a[a[0]==True].index.tolist()]).sum() / similar_user_score[a[a[0]==True].index.tolist()].sum()
    prediction.append([user_id, recipe_id, score])
 


invalid value encountered in double_scalars



In [51]:
# Create prediction DataFrame - predict ratings of the recipes in test sets (for each user)
df_pred = pd.DataFrame(prediction, columns=['u', 'i', 'Prediction']).set_index(['i'])

# Add rating of each recipe for this user to the prediction dataframe 
df_i = pd.DataFrame(df_test[df_test['u']==user_id]['i'])
df_r = pd.DataFrame(df_test[df_test['u']==user_id]['rating'])
df_test_user = df_i.join(df_r).set_index(['i'])
df_pred = df_pred.join(df_test_user )

# Drop NaN values (observations without ratings for each user)
df_pred = df_pred.dropna()

In [57]:
# Get labels and predictions
y_true = df_pred['rating'].values
y_pred = df_pred['Prediction'].values

# Compute RMSE
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred))

## REDO same as above but with the whole dataset (no train/test split)
This yields more accurate predicted ratings but withdraw the possibility to test our findings. 

In [69]:
# Simply use and rerun code above 
X_train = full_data_matrix 
df_train = full_data

KeyboardInterrupt: 

In [102]:
"""
# Sparse matrix 
# Read doc, there are several types 
import scipy.sparse as sp
from scipy.sparse import csr_matrix, find 
train_s = csr_matrix (train)
test = X_test.fillna(-1)
test = test.add(1)
test_s = csr_matrix(test)
df_full_data = sp.vstack((train_s, test_s))
"""

## Compute this similarity measure for all users - use sparse matrix

In [16]:
"""

# Compute mean rating of each user
l = train.T.columns.tolist() # store all user id
    
m =[]
for i in range(len(l)):
    m.append(X_train.iloc[i,:].mean())


# Compute similarity metric (only for user 0 and all other users)
# Find a way to use mean above (speed up computations)

l = train.columns.tolist()
dico={}
for j in tqdm(range(X_train.shape[1])): 

    mean_user_x = X_train.iloc[:,j].mean()
    sim = []
    for i in range(len(l)):
        m = X_train.iloc[:,i].mean()
        similarity = np.sum( (train[l[i]] * train.iloc[:,j]) / ((m * mean_user_x) * (abs(train[l[i]] - train.iloc[:,j])+1 )) )
        sim.append(similarity)  
    dico[j]= sim
    

# Convert dico into dataframe 
# pd.DataFrame.from_dict(data) 

# Maybe need to delete similarity with respect to itself
# Only 10 non neg values. They are quite different, which is pretty nice. 

"""

KeyboardInterrupt: 

## Extension of CF to all users with cosine similarity 

In [64]:
# Redefine train
train = X_train.T.fillna(-1).T
train = train.add(1)

# Compute similarity between all users
similarity = cosine_similarity(train.values)

# Remove self-similarity from similarity-matrix
similarity -= np.eye(similarity.shape[0])

# Sort similar users by index
similar_user_index = np.argsort(similarity[user_index])[::-1]

# Sort similar users by score
similar_user_score = np.sort(similarity[user_index])[::-1]

# Store unrated recipes (for the user considered)
unrated_recipes = X_train.iloc[user_index][X_train.iloc[user_index].isna()].index 

# Weight ratings of the top n most similar users with their rating and compute the mean for each recipe
mean_recipe_recommendations = (train.iloc[similar_user_index[:n_recommendation]].T * similar_user_score[:n_recommendation]).T.mean(axis=0)

# Filter already rated recipes for user regarded and sort results
best_recipe_recommendations = mean_recipe_recommendations[unrated_recipes].sort_values(ascending=False).to_frame().join(names)

# Create user-id mapping
user_id_mapping = {id:i for i, id in enumerate(train.index)}

KeyboardInterrupt: 

In [None]:
prediction = []
# Iterate over all testset users
for user_id in df_test['u'].unique():
    
    # Sort similar users by index 
    # We previously need to compute the similarity of each user wrt to all others
    similar_user_index = np.argsort(similarity[user_id_mapping[user_id]])[::-1]
    # Sort similar users by score
    similar_user_score = np.sort(similarity[user_id_mapping[user_id]])[::-1]
    
    # Regard only recipes that are present in the test set - there are NaN values
    for recipe_id in df_test[df_test['u']==user_id]['i'].values:

        # Compute predicted score (weighted average of similar user score)
        score = (train.iloc[similar_user_index[:n_recommendation]][recipe_id] * similar_user_score[:n_recommendation]).values.sum() / similar_user_score[:n_recommendation].sum()
        prediction.append([user_id, recipe_id, score])

In [None]:
# Create prediction DataFrame - predict ratings of the recipes in test sets (for each user)
df_pred = pd.DataFrame(prediction, columns=['u', 'i', 'Prediction']).set_index(['u', 'i'])
df_pred = df_test.set_index(['u', 'i']).join(df_pred)

In [None]:
# Drop NaN values (observations without ratings for each user)
df_pred = df_pred.dropna()

In [None]:
# Get labels and predictions
y_true = df_pred['rating'].values
y_pred = df_pred['Prediction'].values

In [None]:
# Compute RMSE
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred))

In [None]:
# Create trace
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
trace = go.Bar(x = best_recipe_recommendations.iloc[:n_plot, 0],
               text = best_recipe_recommendations['name'],
               textposition = 'inside',
               textfont = dict(color = '#000000'),
               orientation = 'h',
               y = list(range(1, n_plot+1)),
               marker = dict(color = '#db0000'))
# Create layout
layout = dict(title = 'Ranking Of Top {} Recommended Recipes For A User Based On Similarity: {:.4f} RMSE'.format(n_plot, rmse),
              xaxis = dict(title = 'Recommendation-Rating',
                           range = (4.1, 4.5)),
              yaxis = dict(title = 'Movie'))
# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)