In [1]:
import logging
import os
from gensim.models import Word2Vec
import multiprocessing

# --- Configuration ---\
# *** IMPORTANT: Point this to the output file from the BPE preprocessing step ***
input_corpus_file = 'processed_darija_v2/darija_bpe_tokenized_for_w2v.txt' # <<< Use the BPE tokenized file

# Output file for the Word2Vec model
output_model_file = 'darija_word2vec_bpe_sg_ns.model' # <<< Updated model name

# Word2Vec Parameters (adjust as needed, especially min_count for subwords)
vector_size = 200
window = 5
min_count = 2      # <<< You might lower min_count for subwords (e.g., 2-5)
sg = 1
negative = 3
workers = multiprocessing.cpu_count()
epochs = 15        # <<< Consider more epochs for subwords if corpus isn't huge

# --- Setup Logging ---
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# --- Sentence Iterator (No change needed, reads space-separated tokens) ---
class SentenceIterator:
    def __init__(self, filepath):
        self.filepath = filepath

    def __iter__(self):
        try:
            with open(self.filepath, 'r', encoding='utf-8') as f:
                for line in f:
                    tokens = line.split() # Reads the space-separated BPE tokens
                    if tokens:
                        yield tokens
        except FileNotFoundError:
            logging.error(f"Error: Input corpus file not found at {self.filepath}")
            raise

# --- Train the Model ---
print(f"Starting Word2Vec training on BPE tokens...")
print(f"Parameters: vector_size={vector_size}, window={window}, min_count={min_count}, sg={sg}, negative={negative}, workers={workers}, epochs={epochs}")
print(f"Reading corpus from: {input_corpus_file}")

sentences = SentenceIterator(input_corpus_file)

model = Word2Vec(
    sentences=sentences,
    vector_size=vector_size,
    window=window,
    min_count=min_count,
    sg=sg,
    negative=negative,
    workers=workers,
    epochs=epochs
)

print("Training complete.")

# --- Save the Model ---
print(f"Saving model to: {output_model_file}")
model.save(output_model_file)
print("Model saved successfully.")

# --- Basic Model Testing (Optional) ---
vocab_size = len(model.wv.key_to_index)
print(f"\nVocabulary size: {vocab_size} unique BPE tokens (after applying min_count={min_count})")

# Example: Find similar BPE tokens
# Note: Similarity will be between subwords now.
# You might need to look for subwords of words you are interested in.
# Example: If 'استغلال' was tokenized into 'است', 'غلال', you'd test those.
example_tokens = ['استغلال', 'bzaf', 'غلال', 'tomobil', 'rasso'] # Adjust based on actual BPE tokens

print("\nTesting similarity for some example BPE tokens:")
for token in example_tokens:
    if token in model.wv: # Use __contains__ or check key_to_index
        try:
            similar_tokens = model.wv.most_similar(token, topn=10)
            print(f"Tokens most similar to '{token}': {similar_tokens}")
        except Exception as e:
            print(f"Could not get similar tokens for '{token}': {e}")
    else:
        print(f"Token '{token}' not found in the vocabulary.")

2025-05-06 11:20:24,891 : INFO : collecting all words and their counts


Starting Word2Vec training on BPE tokens...
Parameters: vector_size=200, window=5, min_count=2, sg=1, negative=3, workers=4, epochs=15
Reading corpus from: processed_darija_v2/darija_bpe_tokenized_for_w2v.txt


2025-05-06 11:20:25,242 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2025-05-06 11:20:25,682 : INFO : collected 24936 word types from a corpus of 2167536 raw words and 1 sentences
2025-05-06 11:20:25,682 : INFO : Creating a fresh vocabulary
2025-05-06 11:20:25,757 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 retains 24877 unique words (99.76% of original 24936, drops 59)', 'datetime': '2025-05-06T11:20:25.757783', 'gensim': '4.3.3', 'python': '3.10.10 (main, Mar 21 2023, 18:45:11) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-1082-aws-x86_64-with-glibc2.31', 'event': 'prepare_vocab'}
2025-05-06 11:20:25,758 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 leaves 2167477 word corpus (100.00% of original 2167536, drops 59)', 'datetime': '2025-05-06T11:20:25.758758', 'gensim': '4.3.3', 'python': '3.10.10 (main, Mar 21 2023, 18:45:11) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-1082-aws-x86_64-with-glibc2.31', 'event': 'prepare_voca

Training complete.
Saving model to: darija_word2vec_bpe_sg_ns.model
Model saved successfully.

Vocabulary size: 24877 unique BPE tokens (after applying min_count=2)

Testing similarity for some example BPE tokens:
Tokens most similar to 'استغلال': [('الزراعيه', 0.9633772969245911), ('الاراضي', 0.9632007479667664), ('يد', 0.9529932737350464), ('سحاب', 0.9514985680580139), ('في', 0.950337290763855), ('نا', 0.9501468539237976), ('و', 0.9501280784606934), ('كل', 0.9474674463272095), ('ينا', 0.9467081427574158), ('علي', 0.9457512497901917)]
Token 'bzaf' not found in the vocabulary.
Tokens most similar to 'غلال': [('لقاسسيح', 0.28510451316833496), ('نستاهل', 0.25825485587120056), ('لمرا', 0.2532854974269867), ('كاتبغيني', 0.25202497839927673), ('تصوتوا', 0.248075932264328), ('فبيت', 0.2440420687198639), ('سيرهاني', 0.24261215329170227), ('تيره', 0.23698653280735016), ('غيبت', 0.23527854681015015), ('دالاع', 0.22881074249744415)]
Token 'tomobil' not found in the vocabulary.
Token 'rasso' not 