In [17]:
from gensim.models import Word2Vec
import pymongo
import nltk
import copy
from nltk.tokenize import word_tokenize
from scipy.spatial import distance

nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/filippo/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [7]:
db = pymongo.MongoClient()['cousine']
recipes = db['foodcom']

q = {}
recipe_corpus = []
size = recipes.count_documents(q)
limit = 50_000

for recipe in recipes.find(q).limit(limit):
    try:
        recipe_corpus.append(word_tokenize(recipe['description'].lower()))
    except TypeError:
        pass
    except AttributeError:
        pass

In [8]:
print(recipe_corpus[0])

['from', 'cooking', 'light', ',', 'with', 'slight', 'midifications', '.', 'vegetarian', 'meal', 'hubby', 'will', 'actually', 'eat', '!']


In [9]:
recipe_model = Word2Vec(
    sentences=recipe_corpus,
    vector_size=300,
    window=5,
    min_count=1,
    workers=8,
    epochs=25
)

In [10]:
recipe_model.wv.most_similar('dinner')

[('supper', 0.7136813998222351),
 ('meal', 0.6285492181777954),
 ('dinners', 0.5337151885032654),
 ('brunch', 0.480575293302536),
 ('entree', 0.4634782075881958),
 ('lunch', 0.461673378944397),
 ('snack', 0.4466129541397095),
 ('entertaining', 0.4406588077545166),
 ('tailgate', 0.43861398100852966),
 ('buffet', 0.42506498098373413)]

In [12]:
dm = recipe_model.wv.doesnt_match(['pasta', 'spaghetti', 'noodles', 'apple'])
common = recipe_model.wv.get_mean_vector(['pasta', 'spaghetti', 'noodles', 'risotto'])
common_word = recipe_model.wv.similar_by_vector(common)
analogy = recipe_model.wv.most_similar(positive=['pizza', 'steak'], negative=['tomato'])

print(f"Doesn't match: {dm}")
print(f"Common terms: {common_word}")
print(f"Analogy: {analogy}")

Doesn't match: apple
Common terms: [('pasta', 0.8408735990524292), ('spaghetti', 0.8084529042243958), ('noodles', 0.6921340823173523), ('risotto', 0.6604927182197571), ('linguine', 0.6206898093223572), ('penne', 0.6200934648513794), ('fettuccine', 0.5846065878868103), ('lasagna', 0.5527559518814087), ('polenta', 0.552187442779541), ('rigatoni', 0.5521164536476135)]
Analogy: [('steaks', 0.4115698039531708), ('skirt', 0.40858206152915955), ('flank', 0.4015762209892273), ('sirloin', 0.37781035900115967), ('grill', 0.35197216272354126), ('crust', 0.34695467352867126), ('souvlaki', 0.34689754247665405), ('fabuloso', 0.34486350417137146), ('hamburgers', 0.3444518446922302), ('rib', 0.3326165974140167)]


In [25]:
# Semantic Shift: moving from one corpus to another, how does the meaning of words change?

# WE CAN'T COMPARE TWO MODELS TRAINED SEPARATELY

# Options:
# 1. Use math to transpose one space on to the other
# 2. Compute relative similarity within each model and compare similarities
# 3. Tune a model on both corpuses, then fine-tune two versions of the model, one for each corpus
#       It works because vectors are already defined, we are just tweaking weights

In [14]:
italian_q = {'search_terms': 'italian'}
chinese_q = {'search_terms': 'chinese'}
limit = 5_000
italian_corpus = []
chinese_corpus = []

for q, c in [(italian_q, italian_corpus), (chinese_q, chinese_corpus)]:
    for doc in recipes.find(q).limit(limit):
        try:
            tokens = word_tokenize(doc['description'].lower())
            c.append(tokens)
        except AttributeError:
            pass

print(f"Italian corpus: {len(italian_corpus)}, Chinese corpus: {len(chinese_corpus)}")

Italian corpus: 4922, Chinese corpus: 4173


In [15]:
main_corpus = italian_corpus + chinese_corpus
m0 = Word2Vec(
    sentences=main_corpus,
    vector_size=100,
    window=5,
    min_count=1,
    workers=8,
    epochs=50
)

In [16]:
m0.wv.most_similar('dinner')

[('meal', 0.6197167038917542),
 ('supper', 0.6117156147956848),
 ('lunch', 0.6020562648773193),
 ('guests', 0.520651638507843),
 ('informal', 0.499549001455307),
 ('picnics', 0.484683096408844),
 ('crowd', 0.48251673579216003),
 ('brunch', 0.4791308343410492),
 ('occasion', 0.47841671109199524),
 ('snack', 0.47509321570396423)]

In [19]:
m_it = copy.deepcopy(m0)
m_ch = copy.deepcopy(m0)

# Fine tuning
m_it.train(italian_corpus, total_examples=m0.corpus_count, epochs=m0.epochs)
m_ch.train(chinese_corpus, total_examples=m0.corpus_count, epochs=m0.epochs)

(6120375, 8695900)

In [23]:
word = 'spaghetti'
v0, vit, vch = m0.wv.get_vector(word), m_it.wv.get_vector(word), m_ch.wv.get_vector(word)

print(f"Moving to IT: {distance.cosine(vit, v0)}")
print(f"Moving to CH: {distance.cosine(vch, v0)}")
print(f"Moving from IT to CH: {distance.cosine(vch, vit)}")

Moving to IT: 0.04938506047944213
Moving to CH: 0.0430384014382722
Moving from IT to CH: 0.09096499077947129
