In [4]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to local source are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2
# Install the project located in the current working directory into the
# active environment using uv's pip interface.
!uv pip install .

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
[2mUsing Python 3.12.10 environment at: /home/m/priv/opencv/4.11.0-Release/venv-py3[0m
[2K[2mResolved [1m1 package[0m [2min 3ms[0m[0m                                            [0m
[2K[2mPrepared [1m1 package[0m [2min 953ms[0m[0m                                              
[2mUninstalled [1m1 package[0m [2min 0.99ms[0m[0m
[2K[2mInstalled [1m1 package[0m [2min 0.86ms[0m[0m:///home/m/priv/opencv/ml)     [0m
 [33m~[39m [1mlib8[0m[2m==0.0.1 (from file:///home/m/priv/opencv/ml)[0m


In [None]:

from collections import Counter
from lib8 import stream_sentences, Word2Vec, create_skipgram_generator, create_skipgram_batch_generator
# import lib8
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import os
from tqdm import tqdm # For a nice progress bar

# --- Devices ---
# GPU indices passed to nn.DataParallel when more than one GPU is visible;
# the first entry is also the primary device that batches are moved to.
device_ids = [0, 1]
if torch.cuda.is_available():
    device = torch.device(f"cuda:{device_ids[0]}")
else:
    device = torch.device("cpu")
print(f"Primary device: {device}")

# --- Corpus ---
file_path = "text8"
# Words occurring fewer than `min_count` times are left out of the vocabulary.
min_count = 5

# --- Training hyperparameters ---
# Number of (target, context) examples per optimisation step; tune to available VRAM.
batch_size = 32 * 1024
embedding_dim = 50
learning_rate = 1e-3
# Upper bound on passes over the corpus.
epochs = 200
# Context window handed to the skip-gram batch generator.
window_size = 5

num_cpu_cores = os.cpu_count()
print(f"Number of available CPU cores: {num_cpu_cores}")

# --- Model: build from the corpus, or resume from an existing checkpoint ---
MODEL_PATH = f"word2vec_pytorch_{embedding_dim}.pth"
if not os.path.exists(MODEL_PATH):
    # No checkpoint yet: derive the vocabulary with one full pass over the corpus.
    print("Building vocabulary...")
    word_counts = Counter()
    word_counts.update(word for sentence in stream_sentences(file_path) for word in sentence)

    # Keep only sufficiently frequent words; list position becomes the word index.
    vocab = [word for word, count in word_counts.items() if count >= min_count]
    ix_to_word = dict(enumerate(vocab))
    word_to_ix = {word: i for i, word in ix_to_word.items()}
    vocab_size = len(vocab)

    print(f"Vocabulary size: {vocab_size}")
    model = Word2Vec(vocab_size, embedding_dim, word_to_ix, ix_to_word)
else:
    # A checkpoint for this embedding size exists: reuse it instead of rebuilding.
    model = Word2Vec.load_from_checkpoint(MODEL_PATH)

# Replicate across the configured GPUs only when more than one is visible;
# otherwise the bare model is used as-is.
if torch.cuda.device_count() > 1:
    print(f"Using {len(device_ids)} GPUs: {device_ids}")
    model = nn.DataParallel(model, device_ids=device_ids)
model.to(device)


# --- Loss and optimizer ---
# CrossEntropyLoss interprets the model output as per-class logits and the
# context batch as target class indices.
# NOTE(review): assumes the model emits one logit per vocabulary word — confirm in lib8.
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# nn.DataParallel exposes the wrapped network's own attributes only through
# `.module`. The wrap is conditional on the GPU count, so resolve the underlying
# model once here; using `model.module` directly fails with AttributeError on a
# single-GPU or CPU-only machine.
base_model = model.module if isinstance(model, nn.DataParallel) else model

# --- Training loop using the generator ---
print("Starting training...")
for epoch in range(epochs):
    # A new generator is created per epoch (presumably it is exhausted after one
    # pass over the corpus — verify against lib8).
    batch_generator = create_skipgram_batch_generator(
        file_path, base_model.word_to_ix, window_size, batch_size
    )
    progress_bar = tqdm(batch_generator, desc=f"Epoch {epoch+1}/{epochs}")

    total_loss = 0
    num_pairs = 0  # counts optimisation steps (batches) seen this epoch, not word pairs

    for target_batch, context_batch in progress_bar:
        target_batch = target_batch.to(device)
        context_batch = context_batch.to(device)

        optimizer.zero_grad()
        scores = model(target_batch)
        # When DataParallel is active it gathers the per-replica outputs onto the
        # primary device, so the loss below is computed once on the full batch and
        # is already a mean-reduced scalar; no extra .mean() is required.
        loss = loss_function(scores, context_batch)

        loss.backward()
        optimizer.step()

        # Read the scalar once: every .item() call synchronises with the device.
        batch_loss = loss.item()
        total_loss += batch_loss
        num_pairs += 1
        progress_bar.set_postfix({"loss": f"{batch_loss:.4f} Average Loss: {total_loss/num_pairs:.4f}"})

    print(f"End of Epoch {epoch+1}")
    # Record the epoch that just finished (1-based, matching the message above),
    # not the planned total number of epochs.
    base_model.save_checkpoint(MODEL_PATH, optimizer=optimizer, epoch=epoch + 1)


print("Model training complete.")


Primary device: cuda:0
Number of available CPU cores: 96
Using 2 GPUs: [0, 1]
Starting training...


Epoch 1/200: 3014it [26:04,  1.93it/s, loss=6.9610 Average Loss: 7.0446]


End of Epoch 1
Model checkpoint saved successfully to 'word2vec_pytorch_50.pth'.


Epoch 2/200: 3015it [26:03,  1.93it/s, loss=6.9553 Average Loss: 7.0241]


End of Epoch 2
Model checkpoint saved successfully to 'word2vec_pytorch_50.pth'.


Epoch 3/200: 3014it [26:03,  1.93it/s, loss=6.9452 Average Loss: 7.0148]


End of Epoch 3
Model checkpoint saved successfully to 'word2vec_pytorch_50.pth'.


Epoch 4/200: 3014it [26:03,  1.93it/s, loss=7.0282 Average Loss: 7.0081]


End of Epoch 4
Model checkpoint saved successfully to 'word2vec_pytorch_50.pth'.


Epoch 5/200: 3014it [26:03,  1.93it/s, loss=7.0270 Average Loss: 7.0029]


End of Epoch 5
Model checkpoint saved successfully to 'word2vec_pytorch_50.pth'.


Epoch 6/200: 3014it [26:03,  1.93it/s, loss=6.9725 Average Loss: 6.9984]


End of Epoch 6
Model checkpoint saved successfully to 'word2vec_pytorch_50.pth'.


Epoch 7/200: 3014it [26:03,  1.93it/s, loss=6.9065 Average Loss: 6.9944]


End of Epoch 7
Model checkpoint saved successfully to 'word2vec_pytorch_50.pth'.


Epoch 8/200: 3014it [26:03,  1.93it/s, loss=6.9834 Average Loss: 6.9909]


End of Epoch 8
Model checkpoint saved successfully to 'word2vec_pytorch_50.pth'.


Epoch 9/200: 3015it [26:03,  1.93it/s, loss=6.9979 Average Loss: 6.9879]


End of Epoch 9
Model checkpoint saved successfully to 'word2vec_pytorch_50.pth'.


Epoch 10/200: 3015it [26:03,  1.93it/s, loss=6.9845 Average Loss: 6.9849]


End of Epoch 10
Model checkpoint saved successfully to 'word2vec_pytorch_50.pth'.


Epoch 11/200: 3014it [26:04,  1.93it/s, loss=6.9568 Average Loss: 6.9820]


End of Epoch 11
Model checkpoint saved successfully to 'word2vec_pytorch_50.pth'.


Epoch 12/200: 3014it [26:02,  1.93it/s, loss=7.0109 Average Loss: 6.9795]


End of Epoch 12
Model checkpoint saved successfully to 'word2vec_pytorch_50.pth'.


Epoch 13/200: 3014it [26:02,  1.93it/s, loss=6.9489 Average Loss: 6.9770]


End of Epoch 13
Model checkpoint saved successfully to 'word2vec_pytorch_50.pth'.


Epoch 14/200: 3015it [26:03,  1.93it/s, loss=7.0500 Average Loss: 6.9748]


End of Epoch 14
Model checkpoint saved successfully to 'word2vec_pytorch_50.pth'.


Epoch 15/200: 3014it [26:02,  1.93it/s, loss=6.9910 Average Loss: 6.9726]


End of Epoch 15
Model checkpoint saved successfully to 'word2vec_pytorch_50.pth'.


Epoch 16/200: 3014it [26:02,  1.93it/s, loss=6.9593 Average Loss: 6.9707]


End of Epoch 16
Model checkpoint saved successfully to 'word2vec_pytorch_50.pth'.


Epoch 17/200: 3014it [26:03,  1.93it/s, loss=6.9753 Average Loss: 6.9685]


End of Epoch 17
Model checkpoint saved successfully to 'word2vec_pytorch_50.pth'.


Epoch 18/200: 3014it [26:03,  1.93it/s, loss=6.9017 Average Loss: 6.9666]


End of Epoch 18
Model checkpoint saved successfully to 'word2vec_pytorch_50.pth'.


Epoch 19/200: 3014it [26:02,  1.93it/s, loss=6.9295 Average Loss: 6.9647]


End of Epoch 19
Model checkpoint saved successfully to 'word2vec_pytorch_50.pth'.


Epoch 20/200: 3015it [26:03,  1.93it/s, loss=6.9923 Average Loss: 6.9634]


End of Epoch 20
Model checkpoint saved successfully to 'word2vec_pytorch_50.pth'.


Epoch 21/200: 3014it [26:02,  1.93it/s, loss=7.0277 Average Loss: 6.9615]


End of Epoch 21
Model checkpoint saved successfully to 'word2vec_pytorch_50.pth'.


Epoch 22/200: 3014it [26:04,  1.93it/s, loss=6.9879 Average Loss: 6.9598]


End of Epoch 22
Model checkpoint saved successfully to 'word2vec_pytorch_50.pth'.


Epoch 23/200: 3014it [26:04,  1.93it/s, loss=6.9825 Average Loss: 6.9581]


End of Epoch 23
Model checkpoint saved successfully to 'word2vec_pytorch_50.pth'.


Epoch 24/200: 939it [08:08,  1.93it/s, loss=7.1712 Average Loss: 6.9632]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Epoch 28/200: 3014it [26:04,  1.93it/s, loss=7.0406 Average Loss: 6.9511]


End of Epoch 28
Model checkpoint saved successfully to 'word2vec_pytorch_50.pth'.


Epoch 29/200: 170it [01:29,  1.93it/s, loss=7.1527 Average Loss: 6.9315]

In [7]:
# !uv pip install tqdm
# List the working directory with sizes and timestamps (e.g. to check that the
# model checkpoint file was written).
!ls -l

total 2434896
drwxrwxr-x 4 m m      4096 Jun 11 15:57 build
drwxrwxr-x 4 m m      4096 Jun 11 15:58 lib8
drwxrwxr-x 2 m m      4096 Jun 11 15:58 lib8.egg-info
drwxrwxr-x 2 m m      4096 Jun 11 11:08 __pycache__
-rw-rw-r-- 1 m m       206 Jun 11 12:09 pyproject.toml
-rw-rw-r-- 1 m m     17866 Jun 11 16:04 pytorch-world2vec.ipynb
-rw-rw-r-- 1 m m 572338061 Jun 10 16:28 RelevantDataScrape.csv
-rw-rw-r-- 1 m m 753076320 Jun  9 22:26 Result_23.csv
-rw-rw-r-- 1 m m 801608149 Jun 10 12:10 Result_by.csv
-rw-rw-r-- 1 m m 100000000 Jun  9 22:38 text8
-rw-rw-r-- 1 m m     53166 Jun 11 11:08 Untitled.ipynb
-rw-rw-r-- 1 m m     20884 Jun 10 16:03 Week1-day1.ipynb
-rw-rw-r-- 1 m m     48939 Jun 10 21:16 Week1MLX.ipynb
-rw-rw-r-- 1 m m  88456919 Jun 11 16:04 word2vec_pytorch_50.pth
-rw-rw-r-- 1 m m  59217970 Jun 10 15:05 word2vec_text8_cbow.model
-rw-rw-r-- 1 m m  59217963 Jun 10 14:49 word2vec_text8_sg.model
-rw-rw-r-- 1 m m  59217971 Jun 10 15:12 word2vec_text8_skip.model


In [None]:
# print("Saving the model state dictionary...")
# MODEL_PATH = "word2vec_pytorch_skipgram.pth"
# torch.save(model.state_dict(), MODEL_PATH)
# print(f"Model state_dict saved to {MODEL_PATH}")