In [2]:
import logging
import os
from gensim.models import Word2Vec
import multiprocessing




input_corpus_file = 'processed_darija_v2/darija_bpe_tokenized_for_w2v.txt'

output_model_file = 'darija_word2vec_bpe_sg_ns.model'


vector_size = 300
window = 7
min_count = 2
sg = 1
negative = 15
workers = multiprocessing.cpu_count()
epochs = 15


logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


class SentenceIterator:
    def __init__(self, filepath):
        self.filepath = filepath

    def __iter__(self):
        try:
            with open(self.filepath, 'r', encoding='utf-8') as f:
                for line in f:
                    tokens = line.split()
                    if tokens:
                        yield tokens
        except FileNotFoundError:
            logging.error(f"Error: Input corpus file not found at {self.filepath}")
            raise


print(f"Starting Word2Vec training on BPE tokens...")
print(f"Parameters: vector_size={vector_size}, window={window}, min_count={min_count}, sg={sg}, negative={negative}, workers={workers}, epochs={epochs}")
print(f"Reading corpus from: {input_corpus_file}")

sentences = SentenceIterator(input_corpus_file)

model = Word2Vec(
    sentences=sentences,
    vector_size=vector_size,
    window=window,
    min_count=min_count,
    sg=sg,
    negative=negative,
    workers=workers,
    epochs=epochs
)

print("Training complete.")


print(f"Saving model to: {output_model_file}")
model.save(output_model_file)
print("Model saved successfully.")


vocab_size = len(model.wv.key_to_index)
print(f"\nVocabulary size: {vocab_size} unique BPE tokens (after applying min_count={min_count})")





example_tokens = ['غابه', 'الخدمة', 'كود', 'سياره', 'سيارة']

print("\nTesting similarity for some example BPE tokens:")
for token in example_tokens:
    if token in model.wv:
        try:
            similar_tokens = model.wv.most_similar(token, topn=10)
            print(f"Tokens most similar to '{token}': {similar_tokens}")
        except Exception as e:
            print(f"Could not get similar tokens for '{token}': {e}")
    else:
        print(f"Token '{token}' not found in the vocabulary.")

2025-05-06 22:47:19,950 : INFO : collecting all words and their counts


Starting Word2Vec training on BPE tokens...
Parameters: vector_size=300, window=7, min_count=2, sg=1, negative=15, workers=32, epochs=15
Reading corpus from: processed_darija_v2/darija_bpe_tokenized_for_w2v.txt


2025-05-06 22:47:30,555 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2025-05-06 22:47:48,960 : INFO : collected 34918 word types from a corpus of 117711433 raw words and 1 sentences
2025-05-06 22:47:48,961 : INFO : Creating a fresh vocabulary
2025-05-06 22:47:49,039 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 retains 34865 unique words (99.85% of original 34918, drops 53)', 'datetime': '2025-05-06T22:47:49.039361', 'gensim': '4.3.3', 'python': '3.10.10 (main, Mar 21 2023, 18:45:11) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-1082-aws-x86_64-with-glibc2.31', 'event': 'prepare_vocab'}
2025-05-06 22:47:49,039 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 leaves 117711380 word corpus (100.00% of original 117711433, drops 53)', 'datetime': '2025-05-06T22:47:49.039890', 'gensim': '4.3.3', 'python': '3.10.10 (main, Mar 21 2023, 18:45:11) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-1082-aws-x86_64-with-glibc2.31', 'event': 'prepar

Training complete.
Saving model to: darija_word2vec_bpe_sg_ns.model
Model saved successfully.

Vocabulary size: 34865 unique BPE tokens (after applying min_count=2)

Testing similarity for some example BPE tokens:
Tokens most similar to 'غابه': [('مهرس', 0.24660369753837585), ('بونسيس', 0.21888893842697144), ('شاشينك', 0.21241189539432526), ('غيق', 0.21021458506584167), ('كتوره', 0.2086116373538971), ('بحا', 0.20843039453029633), ('مايي', 0.20755049586296082), ('دوب', 0.2046542912721634), ('رشا', 0.20246878266334534), ('سيروم', 0.20173980295658112)]
Token 'الخدمة' not found in the vocabulary.
Tokens most similar to 'كود': [('نهايه', 0.21241676807403564), ('فيرري', 0.20696209371089935), ('شديد', 0.1996350884437561), ('وناضت', 0.1964312493801117), ('كيزدح', 0.19475223124027252), ('اااار', 0.19466352462768555), ('غانس', 0.19454215466976166), ('يرين', 0.1923326849937439), ('فومينت', 0.18977361917495728), ('هيراك', 0.18859979510307312)]
Tokens most similar to 'سياره': [('شيلو', 0.2574664950