In [None]:
import logging
import os
from gensim.models import Word2Vec
# Optional: If you want to use all available cores for faster training
import multiprocessing

# --- Configuration ---
# Preprocessed corpus produced by the earlier preprocessing script:
# one sentence/document per line, tokens separated by spaces.
input_corpus_file = '/teamspace/studios/this_studio/preprocessed_darija_corpus_arabic.txt'

# Destination path for the trained Word2Vec model.
output_model_file = 'darija_word2vec_sg_ns.model'

# Word2Vec hyperparameters.
# Dimensionality of the word vectors (common: 100-300).
vector_size = 100
# Maximum distance between the current and predicted word within a sentence.
window = 5
# Words whose total frequency is below this threshold are ignored.
min_count = 8
# 1 selects the Skip-Gram algorithm, 0 selects CBOW.
sg = 1
# Number of negative samples (common: 5-20).
negative = 5
# One training worker thread per available CPU core.
workers = multiprocessing.cpu_count()
# Number of passes (epochs) over the corpus (common: 5-10).
epochs = 5

# --- Setup Logging ---
# Gensim reports training progress through the standard logging module,
# so INFO-level output has to be enabled to see it.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# --- Create Sentence Iterator ---
# Word2Vec requires the input corpus to be an iterable of lists of tokens (sentences).
# We create a simple iterator class that reads your preprocessed file line-by-line.
class SentenceIterator:
    """Restartable iterable over a whitespace-tokenised corpus file.

    Every call to ``iter()`` reopens ``filepath`` and yields one list of
    tokens per non-blank line, so the corpus can be traversed repeatedly
    without being held in memory.
    """

    def __init__(self, filepath):
        # Only the path is stored; the file is opened lazily on iteration.
        self.filepath = filepath

    def __iter__(self):
        """Yield the token list of each non-blank line of the corpus.

        Raises:
            FileNotFoundError: if the corpus file does not exist; the error
                is logged and then re-raised so execution stops.
        """
        try:
            with open(self.filepath, 'r', encoding='utf-8') as corpus:
                # str.split() with no argument splits on any whitespace and
                # returns [] for blank lines; filter(None, ...) drops those
                # empty lists so only real sentences are yielded.
                yield from filter(None, map(str.split, corpus))
        except FileNotFoundError:
            logging.error(f"Error: Input corpus file not found at {self.filepath}")
            raise  # Propagate so the caller sees the failure


# --- Train the Model ---
# Echo the run configuration first so the output records what was trained.
print(
    "Starting Word2Vec training...",
    f"Parameters: vector_size={vector_size}, window={window}, "
    f"min_count={min_count}, sg={sg}, negative={negative}, "
    f"workers={workers}, epochs={epochs}",
    f"Reading corpus from: {input_corpus_file}",
    sep="\n",
)

# Iterable of token lists; it reopens the corpus file on every pass.
sentences = SentenceIterator(input_corpus_file)

# Constructing Word2Vec with `sentences` instantiates and trains the model.
model = Word2Vec(
    sentences=sentences,
    vector_size=vector_size,
    window=window,
    min_count=min_count,
    sg=sg,
    negative=negative,
    workers=workers,
    epochs=epochs,  # This argument is called 'iter' in older gensim versions
)

print("Training complete.")

# --- Save the Model ---
# Persist the trained model to disk so it can be reloaded later.
print(f"Saving model to: {output_model_file}")
model.save(output_model_file)
print("Model saved successfully.")

# --- Basic Model Testing (Optional) ---
# The saved model can be reloaded later with: model = Word2Vec.load(output_model_file)

# Vocabulary size after min_count pruning. `key_to_index` is the gensim 4.x
# token -> index mapping (gensim 3.x exposed the vocabulary as `model.wv.vocab`).
vocab_size = len(model.wv.key_to_index)
print(f"\nVocabulary size: {vocab_size} unique tokens (after applying min_count={min_count})")

# Sample tokens to probe the embedding space; replace them with tokens that
# are actually frequent in your corpus.
example_words = ['عيقتو', 'بزاف', 'سكواتش', 'استغلال']
print("\nTesting similarity for some example words:")

for word in example_words:
    # EAFP: most_similar raises KeyError for an out-of-vocabulary token, so a
    # single lookup replaces the membership test plus a catch-all handler.
    try:
        similar_words = model.wv.most_similar(word, topn=5)
    except KeyError:
        print(f"Word '{word}' not found in the vocabulary (likely below min_count or not in corpus).")
    else:
        print(f"Words most similar to '{word}': {similar_words}")