In [1]:
##### IMPORT LIBRARIES 

# Main library 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm # Track progress 
from sklearn.metrics import mean_squared_error

# Similarity metric
from sklearn.metrics.pairwise import cosine_similarity 

# Recommender systems
import surprise as sp
from surprise import Reader, Dataset, SVD, evaluate, accuracy  
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV

# To create deep learning models
from keras.layers import Input, Embedding, Reshape, Dot, Concatenate, Dense, Dropout
from keras.models import Model

# To create sparse matrices
from scipy.sparse import coo_matrix
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from scipy.sparse import vstack # To stack sparse matrices
import scipy.sparse as sp

# To light fm
# from lightfm import LightFM
# from lightfm.evaluation import precision_at_k

Using TensorFlow backend.


In [27]:
##### DATASETS 

# Normal train and test set (filtered and split)
df_train= pd.read_csv('../../data/generated/inter_train.csv')
df_train = df_train.dropna()
df_test= pd.read_csv('../../data/generated/inter_test.csv') 
df_test = df_test.dropna()


# Read as matrix format of train and test set
X_train = pd.read_csv('../../data/generated/inter_train.csv').pivot_table(
                            index='u', columns='i', values='rating', dropna=False)
X_test = pd.read_csv('../../data/generated/inter_test.csv').pivot_table(
                            index='u', columns='i', values='rating', dropna=False)

# Fill in missing values with -1 and add 1 to each rating (want NaN to 0)
train = X_train.fillna(-1)
train = train.add(1)
train2 = train.T  # make us save a few seconds in similarity computation 


# Full data - no train/test split 
full_data = pd.read_csv('../../data/generated/full_data_filtered.csv')
full_data_matrix = full_data.pivot_table(index='u', columns='i', values='rating', dropna=False)


# Create a dataframe giving the recipe's name along with its id 'i'
names = pd.read_csv('../../data/generated/names.csv')
names.columns=['i','name']
names = names.set_index('i') # set 'i' as index, useful later. 

## Hybrid reco with names of recipes

In [28]:
# Work directly on names that we need to tokenise 
list_names = names.name.tolist()   # do it directly on full_data to have the right dimension 
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(list_names)
# print(vectorizer.get_feature_names())
print(X.shape)

# Add sparse compact vectors of 0 and 1 in a list. 
tok_names = []
d = {'sparse_vectors':[]}
for i in range(names.shape[0]):
    tok_names.append(X[i]) 
    d['sparse_vectors'].append(X[i])

# Reimport dataset to get a normal index, not i as index
names = pd.read_csv('../../data/generated/names.csv')
names.columns=['i','name']

# Join this to vectors created above
data = pd.DataFrame(d)
data = data.join(names)
df_train = df_train.merge(data, on='i')
df_test = df_test.merge(data, on='i')
full_data = full_data.merge(data, on='i')  

# Store those vectors
# tokenised_names = full_data['sparse_vectors']
tokenised_names = df_train['sparse_vectors']
tokenised_names_test = df_test['sparse_vectors']

# Stack the sparse matrices
tokenised_names = vstack(tokenised_names)
tokenised_names_test =  vstack(tokenised_names_test)

# Leave as array for neural network implementation ease
#tokenised_name = tokenised_names.toarray() 
#tokenised_name_test =  tokenised_names_test.toarray()

(11120, 4378)


In [4]:
# Create user- & recipe-id mapping
df_filter= pd.read_csv('../../data/generated/inter_train.csv')
user_id_mapping = {id:i for i, id in enumerate(df_filter['u'].unique())}
recipe_id_mapping = {id:i for i, id in enumerate(df_filter['i'].unique())}

# Create correctly mapped train & testset
train_user_data = df_train['u'].map(user_id_mapping)
train_recipe_data = df_train['i'].map(recipe_id_mapping)
test_user_data = df_test['u'].map(user_id_mapping)
test_recipe_data = df_test['i'].map(recipe_id_mapping)

# Get input variable-sizes
users = len(user_id_mapping)  #10007
recipes = len(recipe_id_mapping) #11120
embed_size_user = 10
embed_size_recipe = 10


# Set input layers. We input respectively 10007 and 11120 items of size 1 (u and i) 
user_id_input = Input(shape=[1], name='user')  # shape (None,1)
recipe_id_input = Input(shape=[1], name='recipe') # shape (None,1)
name_input = Input(shape=[4378], name='tokenised_names') # shape (None, 4378) #sparse = True


# Create embedding layers for users and recipes
user_embedding = Embedding(output_dim=embed_size_user, 
                           input_dim=users,
                           input_length=1, 
                           name='user_embedding')(user_id_input)  # shape (None, 1, embedding_size)
recipe_embedding = Embedding(output_dim=embed_size_recipe, 
                            input_dim=recipes,
                            input_length=1, 
                            name='item_embedding')(recipe_id_input) # shape (None, 1, embedding_size)


# Dimensionality reduction with Dense layers
name_vectors = Dense(128, activation='relu')(name_input)
name_vectors = Dense(32, activation='relu')(name_vectors)

In [5]:
# Reshape the embedding layers (- like flattening)
user_vector = Reshape([embed_size_user])(user_embedding) # shape (None, embedding_size)
recipe_vector = Reshape([embed_size_recipe])(recipe_embedding) # shape (None, embedding_size)


# Concatenate the reshaped embedding layers
concat = Concatenate()([user_vector, recipe_vector, name_vectors])


# Combine with dense layers
dense = Dense(512, activation='relu')(concat)
dense = Dropout(0.2)(dense)
dense = Dense(128)(dense)
y = Dense(1)(dense)


# Instantiate a model given input and output layers. 
model3 = Model(inputs=[user_id_input, recipe_id_input, name_input], outputs=y)
model3.compile(loss='mse', optimizer='adam')


# Fit model
model3.fit([train_user_data, train_recipe_data, tokenised_names],
          df_train['rating'],
          batch_size= 1200, 
          epochs=5,
           validation_data = ([test_user_data, test_recipe_data, tokenised_names_test], df_test['rating']),
          shuffle=True)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 225338 samples, validate on 55925 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x1a44437160>

In [6]:
### Investigate predictions on test set (unseen data)

# Compute predictions and compare them to the true value
y_pred = model3.predict([test_user_data, test_recipe_data, tokenised_names_test])

# Add names
# df_test = df_test.merge(names,on='i')
y_pred = pd.DataFrame(y_pred)
df_test_copy = df_test.reset_index(drop=True)
y_pred = y_pred.join(df_test_copy)
y_pred = y_pred.rename(columns={0:'pred_rating'})

# Look at a user in particular
y_pred[y_pred['u']==87]

Unnamed: 0.1,pred_rating,Unnamed: 0,rating,u,i,sparse_vectors,name
32,4.50111,174738,4.0,87,78551,"(0, 3063)\t0.4182780190955211\n (0, 990)\t0...",crock pot potato chowder
8800,4.600294,292278,5.0,87,172148,"(0, 2759)\t0.7896216758775201\n (0, 3117)\t...",oreo pudding
16572,4.566111,238734,5.0,87,142774,"(0, 1156)\t0.6097664458543058\n (0, 3548)\t...",dirty shrimp in butter beer sauce
16597,5.015951,268995,5.0,87,103884,"(0, 727)\t0.71564958063136\n (0, 3894)\t0.6...",taco cheesecake
16604,4.830988,275263,3.0,87,149422,"(0, 591)\t0.2883718187858596\n (0, 1598)\t0...",gingerbread gingerbread cake
16608,4.747349,281036,5.0,87,114081,"(0, 1486)\t0.7381545937583465\n (0, 2991)\t...",black forest pizza
16610,4.788479,281094,3.0,87,148228,"(0, 3486)\t0.7041787357709044\n (0, 635)\t0...",best seller caramel corn
16623,5.066853,349418,5.0,87,112084,"(0, 1016)\t0.5719906303358219\n (0, 3307)\t...",pie crust cinnamon rolls
16629,4.864528,367410,5.0,87,72975,"(0, 2426)\t0.6109239677447547\n (0, 2208)\t...",lil cheddar meatloaves
16643,4.801425,371050,5.0,87,115658,"(0, 247)\t0.609799794291513\n (0, 860)\t0.4...",kittencal s bakery coconut cream pie


In [21]:
### Print best recommendation 

user_index = 87 
# List of unrated recipes for a user - include train and test
unrated_recipes = full_data_matrix.iloc[user_index][full_data_matrix.iloc[user_index].isna()].index.tolist()

# Get tokenised names for each recipe, in the right order
t_n = data['sparse_vectors']
t_n = vstack(t_n)
t_n =t_n.toarray()

#len(lst)
unrated_recipes = pd.Series(unrated_recipes)
ur = unrated_recipes.map(recipe_id_mapping)
lst = [87] * len(unrated_recipes)
y_pred = model3.predict([lst, ur, t_n])

y_pred = pd.DataFrame(y_pred)
y_pred = y_pred.join(names)
y_pred = y_pred.rename(columns ={0:'score'})
y_pred.sort_values('score', ascending=False)[:10]

Unnamed: 0,score,i,name
6250,5.639419,98499,broiled lobster tails for 2
4340,5.578915,67254,fried apples stekte epler
4478,5.545162,69528,applewood farmhouse apple fritters
5847,5.53309,91978,julia child s cherry clafouti
2681,5.509263,40667,fruity grilled cheese sandwich
10350,5.506562,165260,pizza bagel bites oamc
2246,5.50239,34399,spooktacular halloween graveyard cake
4477,5.501045,69483,not your ordinary chocolate chip cookies liqu...
7199,5.498896,113704,nana s chocolate frosting
6569,5.495666,103571,my moms lemon bars are better than your moms l...


## Hybrid with additional metadata in the middle layer

In [22]:
# Create a dataframe with only tokenised names and recipe_id
pp_recipes = pd.read_csv('../../data/pp_recipes.csv')
raw_recipes = pd.read_csv('../../data/raw_recipes.csv')
pp_recip = pp_recipes[['calorie_level','id']]
raw_recip = raw_recipes[['minutes','n_steps','n_ingredients','id']]
pp_recip = pp_recip.merge(raw_recip,on='id')
# Normalise data pp_recip

# Update my entire dataset 
full_data = full_data.rename(columns={"recipe_id": "id"})
full_data = full_data.merge(pp_recip, on='id')


In [23]:
# Create user- & recipe-id mapping
user_id_mapping = {id:i for i, id in enumerate(full_data['u'].unique())}
recipe_id_mapping = {id:i for i, id in enumerate(full_data['i'].unique())}

# Create correctly mapped train & testset
train_user_data = full_data['u'].map(user_id_mapping)
train_recipe_data = full_data['i'].map(recipe_id_mapping)

# Get input variable-sizes
users = len(user_id_mapping)  #10007
recipes = len(recipe_id_mapping) #11120
embed_size_user = 10
embed_size_recipe = 10


# Set input layers. We input respectively 10007 and 11120 items of size 1 (u and i) 
user_id_input = Input(shape=[1], name='user')  # shape (None,1)
recipe_id_input = Input(shape=[1], name='recipe') # shape (None,1)
calorie_input = Input(shape=[1], name='calorie') # shape (None, 1) 
time_input =  Input(shape=[1], name='minutes')
steps_input=  Input(shape=[1], name='steps')
ing_input =  Input(shape=[1], name='ingredients')



# Create embedding layers for users and recipes
user_embedding = Embedding(output_dim=embed_size_user, 
                           input_dim=users,
                           input_length=1, 
                           name='user_embedding')(user_id_input)  # shape (None, 1, embedding_size)
recipe_embedding = Embedding(output_dim=embed_size_recipe, 
                            input_dim=recipes,
                            input_length=1, 
                            name='item_embedding')(recipe_id_input) # shape (None, 1, embedding_size)

# Augment it 
# name_vectors = Dense(16, activation='relu')(name_input)


# Reshape the embedding layers (- like flattening)
user_vector = Reshape([embed_size_user])(user_embedding) # shape (None, embedding_size)
recipe_vector = Reshape([embed_size_recipe])(recipe_embedding) # shape (None, embedding_size)


# Concatenate the reshaped embedding layers
concat = Concatenate()([user_vector, recipe_vector, calorie_input, time_input, steps_input, ing_input])

# Combine with dense layers
dense = Dense(56)(concat)
y = Dense(1)(dense)


# Instantiate a model given input and output layers. 
model4 = Model(inputs=[user_id_input, recipe_id_input, calorie_input, time_input, steps_input, ing_input], outputs=y)
model4.compile(loss='mse', optimizer='adam')


# Fit model
model4.fit([train_user_data, train_recipe_data, tokenised_names],
          df_train['rating'],
          batch_size= 1200, 
          epochs=8,
           validation_data = ([test_user_data, test_recipe_data, tokenised_names_test], df_test['rating']),
          shuffle=True)

ValueError: Error when checking model input: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 6 array(s), but instead got the following list of 3 arrays: [array([[    0],
       [    1],
       [    2],
       ...,
       [10005],
       [ 9977],
       [ 9982]]), array([[    0],
       [    0],
       [    0],
       ...,
       [11119],
       [11119...

In [None]:
### Print best recommendation 

user_index = 87 
# List of unrated recipes for a user - include train and test
unrated_recipes = full_data_matrix.iloc[user_index][full_data_matrix.iloc[user_index].isna()].index.tolist()

# Get tokenised names for each recipe, in the right order
t_n = data['sparse_vectors']
t_n = vstack(t_n)
t_n =t_n.toarray()

#len(lst)
unrated_recipes = pd.Series(unrated_recipes)
ur = unrated_recipes.map(recipe_id_mapping)
lst = [87] * len(unrated_recipes)
y_pred = model3.predict([lst, ur, t_n])

y_pred = pd.DataFrame(y_pred)
y_pred = y_pred.join(names)
y_pred = y_pred.rename(columns ={0:'score'})
y_pred.sort_values('score', ascending=False)[:10]