In [30]:
# Data 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Visualization
from IPython.display import display, HTML

import warnings
warnings.filterwarnings('ignore')

# Create Data Pipeline

In [31]:
# Load all items
items = pd.read_csv('scenario_profitability\\items.csv', sep = ';', index_col=False)
# Drop recency column
items = items.drop(columns=['recency'])
# Show dataframe
items

Unnamed: 0,item,meal,profitability,sustainability,nutritionalvalue
0,1,A,1,2,3
1,2,B,3,2,1
2,3,C,3,1,3
3,4,D,5,1,3


In [32]:
# Load all users
users = pd.read_csv('scenario_profitability\\users.csv', sep = ';', index_col=False)
# Show dataframe
users

Unnamed: 0,user,profitability,sustainability,nutritionalvalue,w_recency,recency
0,User_1,3,4,3,0,0
1,User_2,3,4,3,0,0
2,User_3,3,4,3,0,0
3,User_4,3,4,3,0,0
4,User_5,3,4,3,0,0
...,...,...,...,...,...,...
95,User_96,3,4,3,0,0
96,User_97,3,4,3,0,0
97,User_98,3,4,3,0,0
98,User_99,3,4,3,0,0


In [33]:
print("Enter a userid to get recommendations:")
selected_user = 'User_10' #'User_'+input()

print('You have selected',format(selected_user))

Enter a userid to get recommendations:
You have selected User_10


# Create User Item Matrix

## 1. Create a user x item matrix based on predefined ratings

In [34]:
# Load all training ratings
training_ratings = pd.read_csv('scenario_profitability\\training_ratings.csv', sep = ';', index_col=False)
# Remove duplicate ratings for user - item combinations
training_ratings = training_ratings.drop_duplicates(subset=['user', 'item'], keep='last')

# request info on the dataframe
training_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 390 entries, 0 to 389
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   user    390 non-null    object
 1   item    390 non-null    int64 
 2   rating  390 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 12.2+ KB


In [35]:
# Create user Item Matrix to Dataframe
training_matrix = training_ratings.pivot(index='item', columns='user', values='rating')
# Replace NaN with 0
training_matrix = training_matrix.replace(np.nan, 0)
# Print the matrix
display(HTML(training_matrix.to_html()))

user,User_1,User_10,User_100,User_11,User_12,User_13,User_14,User_15,User_16,User_17,User_18,User_19,User_2,User_20,User_21,User_22,User_23,User_24,User_25,User_26,User_27,User_28,User_29,User_3,User_30,User_31,User_32,User_33,User_34,User_35,User_36,User_37,User_38,User_39,User_4,User_40,User_41,User_42,User_43,User_44,User_45,User_46,User_47,User_48,User_49,User_5,User_50,User_51,User_52,User_53,User_54,User_55,User_56,User_57,User_58,User_59,User_6,User_60,User_61,User_62,User_63,User_64,User_65,User_66,User_67,User_68,User_69,User_7,User_70,User_71,User_72,User_73,User_74,User_75,User_76,User_77,User_78,User_79,User_8,User_80,User_81,User_82,User_83,User_84,User_85,User_86,User_87,User_88,User_89,User_9,User_90,User_91,User_92,User_93,User_94,User_95,User_96,User_97,User_98,User_99
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1
1,3.0,3.0,3.0,3.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0
2,5.0,5.0,5.0,4.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0
3,4.0,4.0,4.0,4.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0
4,0.0,0.0,4.0,4.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,0.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,0.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,0.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,0.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,0.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,0.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,0.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,0.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0,4.0,5.0,3.0


In [36]:
training_ratings[training_ratings['user'] == selected_user].sort_values('item', ascending=True)

Unnamed: 0,user,item,rating
9,User_10,1,3
109,User_10,2,5
209,User_10,3,4


In [37]:
# Number of items and users and ratings
print('This dataframe has', items.shape[0], 'items')
print('This dataframe has', users.shape[0], 'users')
print('This dataframe has', training_ratings.shape[0], 'ratings')

This dataframe has 4 items
This dataframe has 100 users
This dataframe has 390 ratings


## 2. Recreate the User - Item Matrix with the Weighted Linear Combination Method

In [38]:
# Load all user preferences
user_preferences = pd.read_csv('scenario_profitability\\users.csv', sep = ';')

user_preferences['w_profitability'] = 0.9 #user_preferences['profitability'] / 10
user_preferences['w_sustainability'] = user_preferences['sustainability'] / 10
user_preferences['w_nutritionalvalue'] = user_preferences['nutritionalvalue'] / 10
user_preferences['w_recency'] =  user_preferences['w_recency']  / 10
user_preferences['recency'] = user_preferences['recency'] / 10

user_preferences = user_preferences[["user", "w_profitability", "w_sustainability", "w_nutritionalvalue", "w_recency", "recency"]]

user_preferences.head(11)

Unnamed: 0,user,w_profitability,w_sustainability,w_nutritionalvalue,w_recency,recency
0,User_1,0.9,0.4,0.3,0.0,0.0
1,User_2,0.9,0.4,0.3,0.0,0.0
2,User_3,0.9,0.4,0.3,0.0,0.0
3,User_4,0.9,0.4,0.3,0.0,0.0
4,User_5,0.9,0.4,0.3,0.0,0.0
5,User_6,0.9,0.4,0.3,0.0,0.0
6,User_7,0.9,0.4,0.3,0.0,0.0
7,User_8,0.9,0.4,0.3,0.0,0.0
8,User_9,0.9,0.4,0.3,0.0,0.0
9,User_10,0.9,0.4,0.3,0.0,0.0


In [39]:
# add item characteristics
combined_frame = pd.merge(training_ratings,items,on='item',how='left')
# add user preferences
combined_frame = pd.merge(combined_frame,user_preferences,on='user',how='left')
# drop the meal name
combined_frame = combined_frame.drop(columns=['meal'])

# update recency, to only show 1 for item 3
combined_frame['w_recency'] = combined_frame['item'].apply(lambda x: 1 if x == 3 else 0)

# create several columns and fill them with some data
combined_frame['updated_rating'] = combined_frame['rating']
combined_frame['updated_recency'] = combined_frame['rating']
combined_frame['updated_nutritionalvalue'] = combined_frame['rating']
combined_frame['updated_sustainability'] = combined_frame['rating']
combined_frame['updated_profitability'] = combined_frame['rating']

# show the user item matrix for our selected user
combined_frame[combined_frame['user'] == selected_user]

Unnamed: 0,user,item,rating,profitability,sustainability,nutritionalvalue,w_profitability,w_sustainability,w_nutritionalvalue,w_recency,recency,updated_rating,updated_recency,updated_nutritionalvalue,updated_sustainability,updated_profitability
9,User_10,1,3,1,2,3,0.9,0.4,0.3,0,0.0,3,3,3,3,3
109,User_10,2,5,3,2,1,0.9,0.4,0.3,0,0.0,5,5,5,5,5
209,User_10,3,4,3,1,3,0.9,0.4,0.3,1,0.0,4,4,4,4,4


In [40]:
# iterate through each row and update the score
for ind in combined_frame.index:
    combined_frame['updated_rating'][ind] = combined_frame['rating'][ind] + combined_frame['w_recency'][ind] * combined_frame['recency'][ind] + combined_frame['w_nutritionalvalue'][ind] * combined_frame['nutritionalvalue'][ind] + combined_frame['w_sustainability'][ind] * combined_frame['sustainability'][ind] + combined_frame['w_profitability'][ind] * combined_frame['profitability'][ind] 
    
    combined_frame['updated_recency'][ind] = combined_frame['w_recency'][ind] * combined_frame['recency'][ind]
    combined_frame['updated_nutritionalvalue'][ind] = combined_frame['w_nutritionalvalue'][ind] * combined_frame['nutritionalvalue'][ind]
    combined_frame['updated_sustainability'][ind] = combined_frame['w_sustainability'][ind] * combined_frame['sustainability'][ind]
    combined_frame['updated_profitability'][ind] = combined_frame['w_profitability'][ind] * combined_frame['profitability'][ind]

# show the user item matrix for our selected user
combined_frame[combined_frame['user'] == selected_user]

# r(u,i) + w_recency * recency(u,i) + w_nutVal * nutritionalValue(i) + w_sustainability * sustainability(i) + w_profitability * profitability(i)

Unnamed: 0,user,item,rating,profitability,sustainability,nutritionalvalue,w_profitability,w_sustainability,w_nutritionalvalue,w_recency,recency,updated_rating,updated_recency,updated_nutritionalvalue,updated_sustainability,updated_profitability
9,User_10,1,3,1,2,3,0.9,0.4,0.3,0,0.0,5.6,0,0.9,0.8,0.9
109,User_10,2,5,3,2,1,0.9,0.4,0.3,0,0.0,8.8,0,0.3,0.8,2.7
209,User_10,3,4,3,1,3,0.9,0.4,0.3,1,0.0,8.0,0,0.9,0.4,2.7


In [41]:
updated_ratings = combined_frame[["user","item","updated_rating"]]
updated_ratings[updated_ratings['user'] == selected_user]

Unnamed: 0,user,item,updated_rating
9,User_10,1,5.6
109,User_10,2,8.8
209,User_10,3,8.0


In [42]:
# Create user Item Matrix to Dataframe
updated_matrix = updated_ratings.pivot(index='item', columns='user', values='updated_rating')
# Replace NaN with 0
updated_matrix = updated_matrix.replace(np.nan, 0)
# Print the matrix
display(HTML(updated_matrix.to_html()))

user,User_1,User_10,User_100,User_11,User_12,User_13,User_14,User_15,User_16,User_17,User_18,User_19,User_2,User_20,User_21,User_22,User_23,User_24,User_25,User_26,User_27,User_28,User_29,User_3,User_30,User_31,User_32,User_33,User_34,User_35,User_36,User_37,User_38,User_39,User_4,User_40,User_41,User_42,User_43,User_44,User_45,User_46,User_47,User_48,User_49,User_5,User_50,User_51,User_52,User_53,User_54,User_55,User_56,User_57,User_58,User_59,User_6,User_60,User_61,User_62,User_63,User_64,User_65,User_66,User_67,User_68,User_69,User_7,User_70,User_71,User_72,User_73,User_74,User_75,User_76,User_77,User_78,User_79,User_8,User_80,User_81,User_82,User_83,User_84,User_85,User_86,User_87,User_88,User_89,User_9,User_90,User_91,User_92,User_93,User_94,User_95,User_96,User_97,User_98,User_99
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1
1,5.6,5.6,5.6,5.6,7.6,5.6,6.6,7.6,5.6,6.6,7.6,5.6,6.6,6.6,7.6,5.6,6.6,7.6,5.6,6.6,7.6,5.6,6.6,7.6,7.6,5.6,6.6,7.6,5.6,6.6,7.6,5.6,6.6,7.6,5.6,5.6,6.6,7.6,5.6,6.6,7.6,5.6,6.6,7.6,5.6,6.6,6.6,7.6,5.6,6.6,7.6,5.6,6.6,7.6,5.6,6.6,7.6,7.6,5.6,6.6,7.6,5.6,6.6,7.6,5.6,6.6,7.6,5.6,5.6,6.6,7.6,5.6,6.6,7.6,5.6,6.6,7.6,5.6,6.6,6.6,7.6,5.6,6.6,7.6,5.6,6.6,7.6,5.6,6.6,7.6,7.6,5.6,6.6,7.6,5.6,6.6,7.6,5.6,6.6,7.6
2,8.8,8.8,8.8,7.8,7.8,8.8,6.8,7.8,8.8,6.8,7.8,8.8,6.8,6.8,7.8,8.8,6.8,7.8,8.8,6.8,7.8,8.8,6.8,7.8,7.8,8.8,6.8,7.8,8.8,6.8,7.8,8.8,6.8,7.8,8.8,8.8,6.8,7.8,8.8,6.8,7.8,8.8,6.8,7.8,8.8,6.8,6.8,7.8,8.8,6.8,7.8,8.8,6.8,7.8,8.8,6.8,7.8,7.8,8.8,6.8,7.8,8.8,6.8,7.8,8.8,6.8,7.8,8.8,8.8,6.8,7.8,8.8,6.8,7.8,8.8,6.8,7.8,8.8,6.8,6.8,7.8,8.8,6.8,7.8,8.8,6.8,7.8,8.8,6.8,7.8,7.8,8.8,6.8,7.8,8.8,6.8,7.8,8.8,6.8,7.8
3,8.0,8.0,8.0,8.0,7.0,8.0,9.0,7.0,8.0,9.0,7.0,8.0,9.0,9.0,7.0,8.0,9.0,7.0,8.0,9.0,7.0,8.0,9.0,7.0,7.0,8.0,9.0,7.0,8.0,9.0,7.0,8.0,9.0,7.0,8.0,8.0,9.0,7.0,8.0,9.0,7.0,8.0,9.0,7.0,8.0,9.0,9.0,7.0,8.0,9.0,7.0,8.0,9.0,7.0,8.0,9.0,7.0,7.0,8.0,9.0,7.0,8.0,9.0,7.0,8.0,9.0,7.0,8.0,8.0,9.0,7.0,8.0,9.0,7.0,8.0,9.0,7.0,8.0,9.0,9.0,7.0,8.0,9.0,7.0,8.0,9.0,7.0,8.0,9.0,7.0,7.0,8.0,9.0,7.0,8.0,9.0,7.0,8.0,9.0,7.0
4,0.0,0.0,9.8,9.8,8.8,9.8,10.8,8.8,9.8,10.8,8.8,9.8,0.0,10.8,8.8,9.8,10.8,8.8,9.8,10.8,8.8,9.8,10.8,0.0,8.8,9.8,10.8,8.8,9.8,10.8,8.8,9.8,10.8,8.8,0.0,9.8,10.8,8.8,9.8,10.8,8.8,9.8,10.8,8.8,9.8,0.0,10.8,8.8,9.8,10.8,8.8,9.8,10.8,8.8,9.8,10.8,0.0,8.8,9.8,10.8,8.8,9.8,10.8,8.8,9.8,10.8,8.8,0.0,9.8,10.8,8.8,9.8,10.8,8.8,9.8,10.8,8.8,9.8,0.0,10.8,8.8,9.8,10.8,8.8,9.8,10.8,8.8,9.8,10.8,0.0,8.8,9.8,10.8,8.8,9.8,10.8,8.8,9.8,10.8,8.8


In [43]:
updated_ratings['updated_rating'].max()

10.8

In [44]:
updated_ratings['updated_rating'].min()

5.6000000000000005

## 3. Predict the missing ratings to create a complete user - item matrix

# Now we can apply a model based approach

In [45]:
import surprise
from surprise import Dataset
from surprise import Reader
from surprise import BaselineOnly
from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import SVD
from surprise import model_selection
from surprise import accuracy
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split
from collections import defaultdict
import missingno as msno

In [46]:
# Load surprise dataset
# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(updated_ratings[['user', 'item', 'updated_rating']], Reader(rating_scale=(5,8.5)))

In [47]:
# 80-20 Train test split
trainset, testset = surprise.model_selection.split.train_test_split(data, test_size=0.2, train_size=None, random_state=42, shuffle=False)
train_full = data.build_full_trainset()

In [48]:
#user_filter = ['User_1', 'User_2','User_3','User_4','User_5','User_6','User_7','User_8','User_9','User_10']
#
#trainset_filtered = updated_ratings[~updated_ratings.user.isin(user_filter)]
#trainset = Dataset.load_from_df(trainset_filtered, Reader(rating_scale=(4,7)))
#trainset = trainset.build_full_trainset()
#
#testset_filtered = updated_ratings[updated_ratings.user.isin(user_filter)]
#testset = Dataset.load_from_df(testset_filtered, Reader(rating_scale=(4,7)))
#testset = testset.build_full_trainset()

In [49]:
#trainset._raw2inner_id_users
#trainset._raw2inner_id_items

## Ensembling

In [50]:
# load all algorithms
algorithms = [
 'CoClustering(n_cltr_u=3, n_cltr_i=3, n_epochs=20, random_state=42)'
, 'SlopeOne()'
, 'NMF(n_factors=15, n_epochs=50, random_state=42)'
#, 'SVDpp'
, 'SVD(n_factors=50, n_epochs=20, biased=True, random_state=42)'
#, 'KNNBaseline'
#, 'KNNWithZScore'
#, 'KNNWithMeans'
, 'KNNBasic(k=3)'
#, 'BaselineOnly'
]
#, 'NormalPredictor']

In [51]:
# create empty dataframe
final_predictions = pd.DataFrame(columns=["iid", "est", "model"])
sim_options = {'name': 'cosine'}

for method in algorithms:

    print(method)
    recommendation_method = "surprise."+method#+"()"
    print(recommendation_method)
    
    algo = eval(recommendation_method)

    # Train the algorithm on the trainset, and predict ratings for the testset
    algo.fit(train_full)

    # make predictions for on user
    uid = selected_user
    items = updated_ratings["item"].unique()

    predictions = [
        algo.predict(uid, item) # (uid, iid, r_ui=None, clip=True, verbose=False)
        for item in items
    ]
    
    df_pred = pd.DataFrame(predictions)
    df_pred["model"] = method

    df_pred = df_pred[["uid", "iid", "est", "model"]]

    final_predictions = pd.concat([final_predictions,df_pred])

CoClustering(n_cltr_u=3, n_cltr_i=3, n_epochs=20, random_state=42)
surprise.CoClustering(n_cltr_u=3, n_cltr_i=3, n_epochs=20, random_state=42)
SlopeOne()
surprise.SlopeOne()
NMF(n_factors=15, n_epochs=50, random_state=42)
surprise.NMF(n_factors=15, n_epochs=50, random_state=42)
SVD(n_factors=50, n_epochs=20, biased=True, random_state=42)
surprise.SVD(n_factors=50, n_epochs=20, biased=True, random_state=42)
KNNBasic(k=3)
surprise.KNNBasic(k=3)
Computing the msd similarity matrix...
Done computing similarity matrix.


## Evaluate Performance

In [52]:
benchmark = []
# Iterate over all algorithms
for algorithm in algorithms:
    recommendation_method = "surprise."+algorithm#+"()"
    print(recommendation_method)
    
    algorithm = eval(recommendation_method)
    #[surprise.CoClustering(), surprise.SlopeOne(), surprise.NMF(), surprise.SVD(), surprise.KNNBasic()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['MAE','RMSE','MSE'], cv=5, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
perf_evaluation = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_mae')
perf_evaluation

surprise.CoClustering(n_cltr_u=3, n_cltr_i=3, n_epochs=20, random_state=42)
surprise.SlopeOne()
surprise.NMF(n_factors=15, n_epochs=50, random_state=42)
surprise.SVD(n_factors=50, n_epochs=20, biased=True, random_state=42)
surprise.KNNBasic(k=3)
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


Unnamed: 0_level_0,test_mae,test_rmse,test_mse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
KNNBasic,0.36812,0.744745,0.56537,0.00042,0.006552
SVD,0.864529,1.055547,1.120188,0.009386,0.0
SlopeOne,1.111349,1.281769,1.643681,0.001588,0.00101
CoClustering,1.148687,1.45708,2.147416,0.021667,0.003533
NMF,2.192171,2.460014,6.074065,0.020887,0.000743


In [53]:
perf_evaluation.mean()

test_mae     1.136971
test_rmse    1.399831
test_mse     2.310144
fit_time     0.010789
test_time    0.002368
dtype: float64

In [54]:
final_predictions

Unnamed: 0,iid,est,model,uid
0,1,5.0,"CoClustering(n_cltr_u=3, n_cltr_i=3, n_epochs=...",User_10
1,2,6.462801,"CoClustering(n_cltr_u=3, n_cltr_i=3, n_epochs=...",User_10
2,3,7.912548,"CoClustering(n_cltr_u=3, n_cltr_i=3, n_epochs=...",User_10
3,4,8.5,"CoClustering(n_cltr_u=3, n_cltr_i=3, n_epochs=...",User_10
0,1,6.45,SlopeOne(),User_10
1,2,7.49,SlopeOne(),User_10
2,3,8.46,SlopeOne(),User_10
3,4,8.5,SlopeOne(),User_10
0,1,5.0,"NMF(n_factors=15, n_epochs=50, random_state=42)",User_10
1,2,6.563756,"NMF(n_factors=15, n_epochs=50, random_state=42)",User_10


In [55]:
# groupby columns Col1 and estimate the mean of column Col2
ensembled = final_predictions.groupby(['iid','uid'])['est'].mean()
ensembled = pd.DataFrame(ensembled)
ensembled.reset_index(inplace=True)
ensembled

Unnamed: 0,iid,uid,est
0,1,User_10,5.715123
1,2,User_10,7.391952
2,3,User_10,7.670108
3,4,User_10,8.218487


In [56]:
pred_df = pd.DataFrame(ensembled).merge(updated_ratings[updated_ratings['user'] == selected_user] , how = 'left', left_on = ['iid','uid'], right_on = ['item', 'user'])
pred_df

Unnamed: 0,iid,uid,est,user,item,updated_rating
0,1,User_10,5.715123,User_10,1.0,5.6
1,2,User_10,7.391952,User_10,2.0,8.8
2,3,User_10,7.670108,User_10,3.0,8.0
3,4,User_10,8.218487,,,


## Baseline Only

In [57]:
# Load surprise dataset
# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(training_ratings[['user', 'item', 'rating']], Reader(rating_scale=(1,5)))

# 80-20 Train test split
trainset, testset = surprise.model_selection.split.train_test_split(data, test_size=0.2, train_size=None, random_state=42, shuffle=False)
train_full = data.build_full_trainset()

In [58]:
# create empty dataframe
baseline_predictions = pd.DataFrame(columns=["iid", "est", "model"])

algo = BaselineOnly()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(train_full)

# make predictions for on user
uid = selected_user
items = updated_ratings["item"].unique()

predictions = [
    algo.predict(uid, item)
    for item in items
]

df_baseline_pred = pd.DataFrame(predictions)
df_baseline_pred["model"] = 'BaselineOnly'

df_baseline_pred = df_baseline_pred[["iid", "est", "model"]]

baseline_predictions = df_baseline_pred
baseline_predictions

Estimating biases using als...


Unnamed: 0,iid,est,model
0,1,3.981937,BaselineOnly
1,2,4.0183,BaselineOnly
2,3,3.991028,BaselineOnly
3,4,3.990131,BaselineOnly
