# Word2Vec using GenSim

In [1]:
from gensim.models import Word2Vec
import pymongo
import nltk
import copy
from nltk.tokenize import word_tokenize
from scipy.spatial import distance

nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/filippo/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
db = pymongo.MongoClient()['cousine']
recipes = db['foodcom']

q = {}
recipe_corpus = []
size = recipes.count_documents(q)
limit = 50_000

for recipe in recipes.find(q).limit(limit):
    try:
        recipe_corpus.append(word_tokenize(recipe['description'].lower()))
    except TypeError:
        pass
    except AttributeError:
        pass

In [3]:
print(recipe_corpus[0])

['from', 'cooking', 'light', ',', 'with', 'slight', 'midifications', '.', 'vegetarian', 'meal', 'hubby', 'will', 'actually', 'eat', '!']


In [4]:
recipe_model = Word2Vec(
    sentences=recipe_corpus,
    vector_size=300,
    window=5,
    min_count=1,
    workers=8,
    epochs=25
)

### Similarity with GenSim

In [5]:
recipe_model.wv.most_similar('dinner')

[('supper', 0.6975187063217163),
 ('meal', 0.6011375188827515),
 ('dinners', 0.5321587920188904),
 ('brunch', 0.48907995223999023),
 ('entree', 0.4524165987968445),
 ('entertaining', 0.4512987434864044),
 ('lunch', 0.44626790285110474),
 ('superbowl', 0.44285741448402405),
 ('snack', 0.4113234281539917),
 ('buffet', 0.4104422330856323)]

### Compositionality with GenSim

In [6]:
dm = recipe_model.wv.doesnt_match(['pasta', 'spaghetti', 'noodles', 'apple'])
common = recipe_model.wv.get_mean_vector(['pasta', 'spaghetti', 'noodles', 'risotto'])
common_word = recipe_model.wv.similar_by_vector(common)
analogy = recipe_model.wv.most_similar(positive=['pizza', 'steak'], negative=['tomato'])

print(f"Doesn't match: {dm}")
print(f"Common terms: {common_word}")
print(f"Analogy: {analogy}")

Doesn't match: apple
Common terms: [('pasta', 0.8309046030044556), ('spaghetti', 0.8067542314529419), ('noodles', 0.6823669075965881), ('risotto', 0.6756052374839783), ('linguine', 0.6583754420280457), ('penne', 0.5989437699317932), ('fettuccine', 0.5860188007354736), ('lasagna', 0.5693743228912354), ('ravioli', 0.5428127646446228), ('lasagne', 0.5396863222122192)]
Analogy: [('flank', 0.44398728013038635), ('skirt', 0.43080782890319824), ('sirloin', 0.40152326226234436), ('steaks', 0.3841758370399475), ('grill', 0.37974080443382263), ('souvlaki', 0.3479692041873932), ('hamburgers', 0.3448116183280945), ('crust', 0.3362443149089813), ('pie', 0.3314985930919647), ('veal', 0.3248113691806793)]


## Semantic Shift

Moving from one corpus to another, how does the meaning of the words change?

To do so, we can't just train two separate models and compare the embeddings. This is simply because dimensions do not necessarily match across models, thus the same word is like orthogonal across the two models.

Instead, we have 3 options:
1. Use math to transpose one space on to the other, to avoid the above explained issue.
2. Compute relative similarity within each model and compare similarities. In this case, similarities work as a sort of normalization to avoid orthogonality issues.
3. Tune a model on both corpuses, then fine-tune two versions of the model, one for each corpus (this is what is implemented below).


In [14]:
italian_q = {'search_terms': 'italian'}
chinese_q = {'search_terms': 'chinese'}
limit = 5_000
italian_corpus = []
chinese_corpus = []

for q, c in [(italian_q, italian_corpus), (chinese_q, chinese_corpus)]:
    for doc in recipes.find(q).limit(limit):
        try:
            tokens = word_tokenize(doc['description'].lower())
            c.append(tokens)
        except AttributeError:
            pass

print(f"Italian corpus: {len(italian_corpus)}, Chinese corpus: {len(chinese_corpus)}")

Italian corpus: 4922, Chinese corpus: 4173


In [15]:
main_corpus = italian_corpus + chinese_corpus
m0 = Word2Vec(
    sentences=main_corpus,
    vector_size=100,
    window=5,
    min_count=1,
    workers=8,
    epochs=50
)

In [16]:
m0.wv.most_similar('dinner')

[('meal', 0.6197167038917542),
 ('supper', 0.6117156147956848),
 ('lunch', 0.6020562648773193),
 ('guests', 0.520651638507843),
 ('informal', 0.499549001455307),
 ('picnics', 0.484683096408844),
 ('crowd', 0.48251673579216003),
 ('brunch', 0.4791308343410492),
 ('occasion', 0.47841671109199524),
 ('snack', 0.47509321570396423)]

In [19]:
m_it = copy.deepcopy(m0)
m_ch = copy.deepcopy(m0)

# Fine tuning
m_it.train(italian_corpus, total_examples=m0.corpus_count, epochs=m0.epochs)
m_ch.train(chinese_corpus, total_examples=m0.corpus_count, epochs=m0.epochs)

(6120375, 8695900)

In [23]:
word = 'spaghetti'
v0, vit, vch = m0.wv.get_vector(word), m_it.wv.get_vector(word), m_ch.wv.get_vector(word)

print(f"Moving to IT: {distance.cosine(vit, v0)}")
print(f"Moving to CH: {distance.cosine(vch, v0)}")
print(f"Moving from IT to CH: {distance.cosine(vch, vit)}")

Moving to IT: 0.04938506047944213
Moving to CH: 0.0430384014382722
Moving from IT to CH: 0.09096499077947129
