In [1]:
import logging
import os
from gensim.models import Word2Vec
import multiprocessing

# --- Configuration ---\
# *** IMPORTANT: Point this to the output file from the BPE preprocessing step ***
# input_corpus_file = 'processed_darija_v2/darija_processed_words_for_w2v.txt' # <<< Use the BPE tokenized file
input_corpus_file = 'processed_darija_v2/darija_bpe_tokenized_for_w2v.txt'
# Output file for the Word2Vec model
output_model_file = 'darija_word2vec_bpe_sg_ns.model' # <<< Updated model name

# Word2Vec Parameters (adjust as needed, especially min_count for subwords)
vector_size = 200
window = 5
min_count = 2      # <<< You might lower min_count for subwords (e.g., 2-5)
sg = 1
negative = 10
workers = multiprocessing.cpu_count()
epochs = 15        # <<< Consider more epochs for subwords if corpus isn't huge

# --- Setup Logging ---
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# --- Sentence Iterator (No change needed, reads space-separated tokens) ---
class SentenceIterator:
    def __init__(self, filepath):
        self.filepath = filepath

    def __iter__(self):
        try:
            with open(self.filepath, 'r', encoding='utf-8') as f:
                for line in f:
                    tokens = line.split() # Reads the space-separated BPE tokens
                    if tokens:
                        yield tokens
        except FileNotFoundError:
            logging.error(f"Error: Input corpus file not found at {self.filepath}")
            raise

# --- Train the Model ---
print(f"Starting Word2Vec training on BPE tokens...")
print(f"Parameters: vector_size={vector_size}, window={window}, min_count={min_count}, sg={sg}, negative={negative}, workers={workers}, epochs={epochs}")
print(f"Reading corpus from: {input_corpus_file}")

sentences = SentenceIterator(input_corpus_file)

model = Word2Vec(
    sentences=sentences,
    vector_size=vector_size,
    window=window,
    min_count=min_count,
    sg=sg,
    negative=negative,
    workers=workers,
    epochs=epochs
)

print("Training complete.")

# --- Save the Model ---
print(f"Saving model to: {output_model_file}")
model.save(output_model_file)
print("Model saved successfully.")

# --- Basic Model Testing (Optional) ---
vocab_size = len(model.wv.key_to_index)
print(f"\nVocabulary size: {vocab_size} unique BPE tokens (after applying min_count={min_count})")

# Example: Find similar BPE tokens
# Note: Similarity will be between subwords now.
# You might need to look for subwords of words you are interested in.
# Example: If 'استغلال' was tokenized into 'است', 'غلال', you'd test those.
example_tokens = ['غابه', 'الخدمة', 'كود', 'سياره', 'سيارة'] # Adjust based on actual BPE tokens

print("\nTesting similarity for some example BPE tokens:")
for token in example_tokens:
    if token in model.wv: # Use __contains__ or check key_to_index
        try:
            similar_tokens = model.wv.most_similar(token, topn=10)
            print(f"Tokens most similar to '{token}': {similar_tokens}")
        except Exception as e:
            print(f"Could not get similar tokens for '{token}': {e}")
    else:
        print(f"Token '{token}' not found in the vocabulary.")

2025-05-06 19:42:46,735 : INFO : collecting all words and their counts


Starting Word2Vec training on BPE tokens...
Parameters: vector_size=200, window=5, min_count=2, sg=1, negative=10, workers=4, epochs=15
Reading corpus from: processed_darija_v2/darija_bpe_tokenized_for_w2v.txt


2025-05-06 19:43:02,371 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2025-05-06 19:43:30,695 : INFO : collected 49786 word types from a corpus of 112366414 raw words and 1 sentences
2025-05-06 19:43:30,696 : INFO : Creating a fresh vocabulary
2025-05-06 19:43:30,853 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 retains 49668 unique words (99.76% of original 49786, drops 118)', 'datetime': '2025-05-06T19:43:30.853006', 'gensim': '4.3.3', 'python': '3.10.10 (main, Mar 21 2023, 18:45:11) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-1082-aws-x86_64-with-glibc2.31', 'event': 'prepare_vocab'}
2025-05-06 19:43:30,853 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 leaves 112366296 word corpus (100.00% of original 112366414, drops 118)', 'datetime': '2025-05-06T19:43:30.853829', 'gensim': '4.3.3', 'python': '3.10.10 (main, Mar 21 2023, 18:45:11) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-1082-aws-x86_64-with-glibc2.31', 'event': 'prep

Training complete.
Saving model to: darija_word2vec_bpe_sg_ns.model
Model saved successfully.

Vocabulary size: 49668 unique BPE tokens (after applying min_count=2)

Testing similarity for some example BPE tokens:
Tokens most similar to 'غابه': [('تحنات', 0.27721866965293884), ('سابتيون', 0.2733318507671356), ('لحر', 0.26950228214263916), ('تورنا', 0.2614783048629761), ('دوير', 0.2575022578239441), ('واديي', 0.25627487897872925), ('شادويسك', 0.2517807185649872), ('سلاسيس', 0.24846717715263367), ('سوفيا', 0.2473089098930359), ('باللير', 0.24609500169754028)]
Token 'الخدمة' not found in the vocabulary.
Tokens most similar to 'كود': [('ئشه', 0.3136468529701233), ('بريسينت', 0.29064151644706726), ('كاراشي', 0.27888602018356323), ('خطر', 0.27258092164993286), ('تلوات', 0.26717591285705566), ('نعطيها', 0.26664310693740845), ('عتق', 0.2652406096458435), ('راشد', 0.2650744915008545), ('سدا', 0.2580049932003021), ('تاين', 0.25442740321159363)]
Tokens most similar to 'سياره': [('قفط', 0.28444251