# Demo | Distributed representations of sentences & documents

In [1]:
# import packages
import pandas as pd
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import re
from sklearn.metrics.pairwise import cosine_similarity
from gensim import corpora
import random

## Prepare Data

In [2]:
random.seed(0)

In [3]:
# load data
with np.load('../data/raw/simplified-recipes-1M.npz', allow_pickle=True) as data:
    ingredients = data['ingredients']
    recipes = data['recipes']

In [4]:
# load data into list
recipes_list = []
for i in range(len(recipes)):
    try:
        recipes_list.append(list(ingredients[recipes[i]]))
    except:
        print(i)
        continue

727892


In [5]:
# shuffle array randomly
random.shuffle(recipes_list)

# split into test and train (85/15)
split = round(len(recipes_list)*.85)

# X_train and X_test
X_train = recipes_list[:split]
X_test = recipes_list[split:]

## Doc2Vec Classification

In [6]:
# create doc2vec vocab training
train_corpus = []
for i, sentence in enumerate(X_train):
    train_corpus.append(TaggedDocument(words=sentence, tags=str(i)))

In [7]:
# build and train doc2vec model
model = Doc2Vec(vector_size=50, min_count=2, epochs=25, dm=1)
model.build_vocab(train_corpus)
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [8]:
# build d2v test dataset
X_test_d2v = []
labels = []
for recipe in X_test:
    random.shuffle(recipe)
    labels.append(recipe.pop())
    X_test_d2v.append(model.infer_vector(recipe))
X_test_d2v = pd.DataFrame(X_test_d2v)

In [9]:
# get vectors for each ingredient
ingredients_d2v = []
for ingredient in ingredients:
    ingredients_d2v.append(model.infer_vector([ingredient]))
ingredients_d2v = pd.DataFrame(ingredients_d2v)

In [10]:
# matrix of pairwise cosine results
pw_cosine_results = pd.DataFrame(cosine_similarity(ingredients_d2v, X_test_d2v))

In [11]:
# calculate rank of prediction
rank = []
for col in pw_cosine_results:
    vals = pw_cosine_results[col]
    ingredients = list(ingredients)
    results = pd.DataFrame()
    results['cs'] = vals
    results['ingredients'] = ingredients
    results = results.sort_values(by='cs', ascending=False)
    
    for i, item in enumerate(list(results.ingredients)):
        if labels[int(col)] == item:
            rank.append(i+1)

In [12]:
mrr = np.mean(1/np.array(rank))
print('Mean reciprocal rank:', mrr)

Mean reciprocal rank: 0.010080604037852428


In [13]:
random_guess = np.random.randint(low=1, high=len(ingredients), size=len(ingredients), dtype=int)
mrr_random = np.mean(1/random_guess)
print('Random guess:', mrr_random)

Random guess: 0.0018642590181902833
