In [12]:
from gensim.models import Word2Vec
import multiprocessing
import logging  # Setting up the loggings to monitor gensim
# Show INFO-level log records (gensim reports its progress through logging).
logging.basicConfig(
    level=logging.INFO,
    datefmt='%H:%M:%S',
    format="%(levelname)s - %(asctime)s: %(message)s",
)

# CPU core count reported for this machine; the model cell sizes its worker pool from it.
cores = multiprocessing.cpu_count()
print(cores)

8


In [13]:
# Untrained skip-gram Word2Vec model; the vocabulary is built and the model
# trained in the following cells.
w2v_model = Word2Vec(min_count=0,   # keep every word, however rare
                     window=7,      # context window size
                     vector_size=5,        # embedding dimension
                     sample=5e-5,   # threshold for configuring which higher-frequency words are randomly downsampled (apparently highly important)
                     alpha=0.03,    # initial learning rate
                    #  min_alpha=0.0007,
                     sg=1,          # 1 for skip-gram; otherwise CBOW
                     negative=10,    # for negative sampling
                     # Leave one core free, but never go below one worker:
                     # cores-1 is 0 on a single-core machine, which would
                     # leave gensim without any training thread.
                     workers=max(1, cores - 1))

INFO - 23:20:24: Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=5, alpha=0.03>', 'datetime': '2023-10-24T23:20:24.175851', 'gensim': '4.3.2', 'python': '3.11.4 (main, Jul  5 2023, 13:45:01) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-87-generic-x86_64-with-glibc2.31', 'event': 'created'}


In [14]:
def _tokenize_sentence(sentence):
    """Split one sentence into cleaned, lower-cased words.

    The sentence is split on single spaces; the characters , ? ; " ! are
    stripped from both ends of every piece, the piece is lower-cased, and
    pieces that end up empty are dropped.

    Returns:
        list[str]: the remaining words, in their original order.
    """
    words = (piece.strip(',?;"!').lower() for piece in sentence.split(' '))
    return [word for word in words if word]


# Step 1: Load the text corpus
# A line may hold several sentences, so every '.'-separated fragment is kept
# (using only fragment [0] silently discarded the rest of the line).
# The encoding is pinned so the result does not depend on the platform default.
# NOTE(review): assumes file.txt is UTF-8 encoded -- confirm.
raw_sentences = []
with open('file.txt', 'r', encoding='utf-8') as f:
    for line in f:
        raw_sentences.extend(line.strip().split('.'))

# Step 2: Preprocess the text corpus
# Break every sentence into a list of cleaned words, then keep only the
# sentences that still contain at least one word.
sentences = [_tokenize_sentence(raw) for raw in raw_sentences]
corpus = [words for words in sentences if words]

print(corpus)

[['food', 'is', 'an', 'integral', 'part', 'of', 'human', 'existence', 'sustaining', 'life', 'nourishing', 'our', 'bodies', 'and', 'delighting', 'our', 'senses'], ['fruits', 'and', 'vegetables', 'are', "nature's", 'bounty', 'offering', 'an', 'array', 'of', 'flavors', 'textures', 'and', 'colors'], ['protein-rich', 'foods', 'like', 'meat', 'fish', 'eggs', 'and', 'legumes', 'are', 'essential', 'for', 'growth', 'repair', 'and', 'overall', 'health'], ['dairy', 'products', 'including', 'milk', 'cheese', 'and', 'yogurt', 'are', 'valued', 'for', 'their', 'calcium', 'content', 'and', 'contribution', 'to', 'strong', 'bones', 'and', 'teeth'], ['spices', 'and', 'herbs', 'add', 'depth', 'and', 'complexity', 'to', 'dishes'], ['street', 'food', 'is', 'an', 'essential', 'part', 'of', 'culinary', 'exploration', 'allowing', 'us', 'to', 'taste', 'the', 'essence', 'of', 'local', 'culture'], ['exploring', 'exotic', 'and', 'unusual', 'foods', 'is', 'an', 'adventure', 'in', 'itself'], ['food', 'is', 'not', 'm

In [15]:
# Step 3: Build the vocabulary
# Scan the tokenised corpus once so the model knows its words before training;
# progress is logged every 100 sentences.
print("Building the vocabulary")
w2v_model.build_vocab(
    corpus_iterable=corpus,
    progress_per=100,
)

INFO - 23:20:24: collecting all words and their counts
INFO - 23:20:24: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 23:20:24: collected 319 word types from a corpus of 558 raw words and 38 sentences
INFO - 23:20:24: Creating a fresh vocabulary
INFO - 23:20:24: Word2Vec lifecycle event {'msg': 'effective_min_count=0 retains 319 unique words (100.00% of original 319, drops 0)', 'datetime': '2023-10-24T23:20:24.191423', 'gensim': '4.3.2', 'python': '3.11.4 (main, Jul  5 2023, 13:45:01) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-87-generic-x86_64-with-glibc2.31', 'event': 'prepare_vocab'}
INFO - 23:20:24: Word2Vec lifecycle event {'msg': 'effective_min_count=0 leaves 558 word corpus (100.00% of original 558, drops 0)', 'datetime': '2023-10-24T23:20:24.191746', 'gensim': '4.3.2', 'python': '3.11.4 (main, Jul  5 2023, 13:45:01) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-87-generic-x86_64-with-glibc2.31', 'event': 'prepare_vocab'}
INFO - 23:20:24: deleting the raw cou

Building the vocabulary


In [16]:
# Step 4: Train the model
# Run 30 passes over the corpus. The call is left as the last expression so
# that its return value is shown as the cell output.
print("Training the model")
w2v_model.train(
    corpus_iterable=corpus,
    total_examples=w2v_model.corpus_count,  # sentence count recorded by build_vocab
    epochs=30,
    report_delay=1,
)

INFO - 23:20:24: Word2Vec lifecycle event {'msg': 'training model with 7 workers on 319 vocabulary and 5 features, using sg=1 hs=0 sample=5e-05 negative=10 window=7 shrink_windows=True', 'datetime': '2023-10-24T23:20:24.207110', 'gensim': '4.3.2', 'python': '3.11.4 (main, Jul  5 2023, 13:45:01) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-87-generic-x86_64-with-glibc2.31', 'event': 'train'}
INFO - 23:20:24: EPOCH 0: training on 558 raw words (71 effective words) took 0.0s, 155301 effective words/s
INFO - 23:20:24: EPOCH 1: training on 558 raw words (79 effective words) took 0.0s, 171976 effective words/s
INFO - 23:20:24: EPOCH 2: training on 558 raw words (75 effective words) took 0.0s, 203703 effective words/s
INFO - 23:20:24: EPOCH 3: training on 558 raw words (77 effective words) took 0.0s, 261136 effective words/s
INFO - 23:20:24: EPOCH 4: training on 558 raw words (74 effective words) took 0.0s, 175956 effective words/s
INFO - 23:20:24: EPOCH 5: training on 558 raw words (69 effective

Training the model


(2185, 16740)

In [22]:
# Ten words whose vectors are most similar to "today", with their scores.
w2v_model.wv.most_similar(positive=["today"], topn=10)

[('potential', 0.8829987645149231),
 ('events', 0.8751197457313538),
 ('lactose', 0.8699023723602295),
 ('action', 0.8617751002311707),
 ('crucial', 0.8562429547309875),
 ('through', 0.8442476391792297),
 ('solutions', 0.8418565392494202),
 ("it's", 0.8273929357528687),
 ('like', 0.8253291845321655),
 ('of', 0.7943481206893921)]

In [18]:
# Similarity score between the vectors of two words.
w2v_model.wv.similarity(w1='dishes', w2='climate')

0.71894246

In [19]:
# Ask the model which of the three words fits least with the other two.
odd_one_out_candidates = ["climate", "local", "dishes"]
w2v_model.wv.doesnt_match(odd_one_out_candidates)

'dishes'

In [None]:
# Look up the learned embedding of a single word.
w2v_model.wv.get_vector('dishes')