In [16]:
from gensim.models import Word2Vec
import multiprocessing
import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

cores = multiprocessing.cpu_count() # Count the number of cores in a computer
print(cores)

8


In [17]:
w2v_model = Word2Vec(min_count=0,
                     window=2,      # context window size
                     vector_size=5,        # embedding dimension
                    #  sample=6e-5,   # threshold for configuring which higher-frequency words are randomly downsampled (apparently highly important)
                     alpha=0.03, 
                    #  min_alpha=0.0007, 
                     sg=1,          # 1 for skip-gram; otherwise CBOW
                     negative=5,    # for negative sampling
                     workers=cores-1)

INFO - 17:58:56: Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=5, alpha=0.03>', 'datetime': '2023-10-24T17:58:56.093641', 'gensim': '4.3.2', 'python': '3.11.4 (main, Jul  5 2023, 13:45:01) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-87-generic-x86_64-with-glibc2.31', 'event': 'created'}


In [18]:
# Step 1: Load the text corpus
sentences = []
with open('file.txt', 'r') as f:
    for line in f:
        sentences.append(line.strip().split('.'))

# Step 2: Preprocess the text corpus
corpus=[]
# Iterate through the list of sentences, break them into lists of words and remove the empty strings
for i in range(len(sentences)):
    sentences[i] = sentences[i][0].split(' ')

    # strip the words of any special characters
    sentences[i] = [word.strip(',?;"!').lower() for word in sentences[i]]

    sentences[i] = list(filter(None, sentences[i]))
    if(len(sentences[i])>0):
        corpus.append(sentences[i])

print(corpus)

[['once', 'upon', 'a', 'time', 'two', 'little', 'fairies', 'from', 'magic', 'land', 'ventured', 'out', 'into', 'the', 'wide', 'world'], ['his', 'friend', 'lily-bell', 'was', 'completely', 'different', 'from', 'him', 'for', 'she', 'was', 'so', 'kind', 'and', 'friendly', 'that', 'everyone', 'loved', 'her'], ['“i’m', 'tired', 'and', 'thirsty,”', 'said', 'thistledown'], ['“dear', 'thistledown', 'be', 'kind', 'to', 'these', 'flowers', 'and', 'don’t', 'tease', 'them'], ['thistledown', 'laughed', 'and', 'then', 'he', 'took', 'the', 'honey', 'from', 'the', 'violets', 'and', 'shook', 'the', 'purple', 'bells', 'vigorously', 'to', 'get', 'all', 'their', 'dew', 'for', 'his', 'bath'], ['finally', 'he', 'came', 'to', 'a', 'very', 'lovely', 'rose', 'bush', 'with', 'one', 'rose', 'in', 'full', 'bloom', 'and', 'a', 'bud'], ['“little', 'rosebud', 'why', 'do', 'you', 'grow', 'so', 'slowly', 'you', 'are', 'now', 'too', 'old', 'to', 'stay', 'rocking', 'in', 'your', 'green', 'cradle'], ['“no', 'my', 'bud', 

In [19]:
# Step 3: Build the vocabulary
print("Building the vocabulary")
w2v_model.build_vocab(corpus, progress_per=100)

INFO - 17:58:56: collecting all words and their counts
INFO - 17:58:56: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 17:58:56: collected 645 word types from a corpus of 1924 raw words and 85 sentences
INFO - 17:58:56: Creating a fresh vocabulary
INFO - 17:58:56: Word2Vec lifecycle event {'msg': 'effective_min_count=0 retains 645 unique words (100.00% of original 645, drops 0)', 'datetime': '2023-10-24T17:58:56.112865', 'gensim': '4.3.2', 'python': '3.11.4 (main, Jul  5 2023, 13:45:01) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-87-generic-x86_64-with-glibc2.31', 'event': 'prepare_vocab'}
INFO - 17:58:56: Word2Vec lifecycle event {'msg': 'effective_min_count=0 leaves 1924 word corpus (100.00% of original 1924, drops 0)', 'datetime': '2023-10-24T17:58:56.113209', 'gensim': '4.3.2', 'python': '3.11.4 (main, Jul  5 2023, 13:45:01) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-87-generic-x86_64-with-glibc2.31', 'event': 'prepare_vocab'}
INFO - 17:58:56: deleting the raw 

Building the vocabulary


In [20]:
# Step 4: Train the model
print("Training the model")
w2v_model.train(corpus, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

INFO - 17:58:56: Word2Vec lifecycle event {'msg': 'training model with 7 workers on 645 vocabulary and 5 features, using sg=1 hs=0 sample=0.001 negative=5 window=2 shrink_windows=True', 'datetime': '2023-10-24T17:58:56.129474', 'gensim': '4.3.2', 'python': '3.11.4 (main, Jul  5 2023, 13:45:01) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-87-generic-x86_64-with-glibc2.31', 'event': 'train'}
INFO - 17:58:56: EPOCH 0: training on 1924 raw words (1348 effective words) took 0.0s, 393201 effective words/s
INFO - 17:58:56: EPOCH 1: training on 1924 raw words (1366 effective words) took 0.0s, 591312 effective words/s
INFO - 17:58:56: EPOCH 2: training on 1924 raw words (1354 effective words) took 0.0s, 446151 effective words/s
INFO - 17:58:56: EPOCH 3: training on 1924 raw words (1344 effective words) took 0.0s, 402522 effective words/s
INFO - 17:58:56: EPOCH 4: training on 1924 raw words (1372 effective words) took 0.0s, 410035 effective words/s
INFO - 17:58:56: EPOCH 5: training on 1924 raw word

Training the model


(41007, 57720)

In [26]:
w2v_model.wv.most_similar(positive=["all"])

[('i', 0.9990049600601196),
 ('became', 0.9982714653015137),
 ('berries', 0.9975578784942627),
 ('bring', 0.9973838329315186),
 ('they', 0.9971966743469238),
 ('violets', 0.9967678785324097),
 ('went', 0.9967342615127563),
 ('bells', 0.99602872133255),
 ('secret', 0.9958593845367432),
 ('nymph', 0.9958404302597046)]