In [2]:
import logging
import os
from gensim.models import Word2Vec
# Optional: If you want to use all available cores for faster training
import multiprocessing

# --- Configuration ---
# Input file: Contains the preprocessed text (one sentence/document per line, tokens separated by space)
# This file was generated by your previous preprocessing script.
input_corpus_file = '/teamspace/studios/this_studio/preprocessed_darija_corpus.txt'

# Output file: Where to save the trained Word2Vec model
output_model_file = 'darija_word2vec_sg_ns.model'

# Word2Vec Parameters (as requested)
vector_size = 100  # Dimensionality of the word vectors (common: 100-300)
window = 5         # Max distance between current and predicted word within a sentence
min_count = 8      # Ignores all words with total frequency lower than this
sg = 1             # Use Skip-Gram algorithm (0 for CBOW)
negative = 5       # Number of negative samples (common: 5-20)
workers = multiprocessing.cpu_count() # Number of worker threads to train the model (use all available cores)
epochs = 5        # Number of iterations (epochs) over the corpus (common: 5-10)

# --- Setup Logging ---
# Gensim uses Python's logging module to report progress
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# --- Create Sentence Iterator ---
# Word2Vec requires the input corpus to be an iterable of lists of tokens (sentences).
# We create a simple iterator class that reads your preprocessed file line-by-line.
class SentenceIterator:
    def __init__(self, filepath):
        self.filepath = filepath

    def __iter__(self):
        try:
            with open(self.filepath, 'r', encoding='utf-8') as f:
                for line in f:
                    # Split the line into tokens based on whitespace
                    tokens = line.split()
                    if tokens: # Avoid yielding empty lists if there are blank lines
                        yield tokens
        except FileNotFoundError:
            logging.error(f"Error: Input corpus file not found at {self.filepath}")
            raise # Re-raise the exception to stop execution


# --- Train the Model ---
print(f"Starting Word2Vec training...")
print(f"Parameters: vector_size={vector_size}, window={window}, min_count={min_count}, sg={sg}, negative={negative}, workers={workers}, epochs={epochs}")
print(f"Reading corpus from: {input_corpus_file}")

# Instantiate the sentence iterator
sentences = SentenceIterator(input_corpus_file)

# Instantiate and train the Word2Vec model
model = Word2Vec(
    sentences=sentences,
    vector_size=vector_size,
    window=window,
    min_count=min_count,
    sg=sg,
    negative=negative,
    workers=workers,
    epochs=epochs # Use 'iter' in older gensim versions
)

print("Training complete.")

# --- Save the Model ---
print(f"Saving model to: {output_model_file}")
model.save(output_model_file)
print("Model saved successfully.")

# --- Basic Model Testing (Optional) ---
# You can now load the model later using: model = Word2Vec.load(output_model_file)

# Check vocabulary size
vocab_size = len(model.wv.key_to_index) # Use model.wv.index_to_key in newer gensim
print(f"\nVocabulary size: {vocab_size} unique tokens (after applying min_count={min_count})")

# Example: Find words similar to a given word (if it's in the vocabulary)
# Try some common words from your romanized corpus analysis
example_words = ['ana', 'bzaf', 'mzn', 'maroc'] # Adjust based on your actual frequent tokens
print("\nTesting similarity for some example words:")

for word in example_words:
    if word in model.wv.key_to_index: # Use model.wv.__contains__ in newer gensim
        try:
            similar_words = model.wv.most_similar(word, topn=5)
            print(f"Words most similar to '{word}': {similar_words}")
        except Exception as e:
            print(f"Could not get similar words for '{word}': {e}")
    else:
        print(f"Word '{word}' not found in the vocabulary (likely below min_count or not in corpus).")

2025-05-05 14:09:45,870 : INFO : collecting all words and their counts


Starting Word2Vec training...
Parameters: vector_size=100, window=5, min_count=8, sg=1, negative=5, workers=4, epochs=5
Reading corpus from: /teamspace/studios/this_studio/preprocessed_darija_corpus.txt


2025-05-05 14:09:58,970 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2025-05-05 14:10:29,437 : INFO : collected 4975067 word types from a corpus of 101229938 raw words and 1 sentences
2025-05-05 14:10:29,437 : INFO : Creating a fresh vocabulary
2025-05-05 14:10:32,700 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=8 retains 485571 unique words (9.76% of original 4975067, drops 4489496)', 'datetime': '2025-05-05T14:10:32.700308', 'gensim': '4.3.3', 'python': '3.10.10 (main, Mar 21 2023, 18:45:11) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-1082-aws-x86_64-with-glibc2.31', 'event': 'prepare_vocab'}
2025-05-05 14:10:32,701 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=8 leaves 94248266 word corpus (93.10% of original 101229938, drops 6981672)', 'datetime': '2025-05-05T14:10:32.701352', 'gensim': '4.3.3', 'python': '3.10.10 (main, Mar 21 2023, 18:45:11) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-1082-aws-x86_64-with-glibc2.31', 'eve

Training complete.
Saving model to: darija_word2vec_sg_ns.model


2025-05-05 14:11:30,384 : INFO : not storing attribute cum_table
2025-05-05 14:11:30,808 : INFO : saved darija_word2vec_sg_ns.model


Model saved successfully.

Vocabulary size: 485571 unique tokens (after applying min_count=8)

Testing similarity for some example words:
Words most similar to 'ana': [('wana', 0.9797440767288208), ('allh', 0.974860668182373), ('raj3wn', 0.9737260937690735), ('mn', 0.9695346355438232), ('alyh', 0.9680131673812866)]
Words most similar to 'bzaf': [('mn', 0.8952898979187012), ('dyal', 0.8940796256065369), ('alb7r', 0.8913525938987732), ('3ly', 0.8908437490463257), ('fy', 0.8885353207588196)]
Words most similar to 'mzn': [('wanted', 0.4465177357196808), ('israelis', 0.4374731779098511), ('alzmrdytyn', 0.43694591522216797), ('wt7naa', 0.4257921278476715), ('sharly', 0.4121866524219513)]
Words most similar to 'maroc': [('timei', 0.44330522418022156), ('httpstcolwsh7pznen', 0.4180850684642792), ('wnghwt', 0.4113446772098541), ('mextiby83', 0.4098399877548218), ('imaanne', 0.40979671478271484)]
