In [8]:
import os
import regex as re # Make sure regex is imported if not already
from gensim.models import Word2Vec
from tokenizers import Tokenizer
import unicodedata
# Optional: For RTL display in terminal/output
try:
    import arabic_reshaper
    from bidi.algorithm import get_display
    RTL_DISPLAY_AVAILABLE = True
    print("arabic_reshaper and python-bidi found. RTL display enabled.")
except ImportError:
    RTL_DISPLAY_AVAILABLE = False
    print("Warning: arabic_reshaper or python-bidi not found. RTL display might not be optimal.")

arabic_reshaper and python-bidi found. RTL display enabled.


In [9]:
# Filesystem location of the saved gensim Word2Vec model that the similarity
# script loads with Word2Vec.load().
# NOTE(review): this is an absolute, machine-specific path; adjust it when the
# notebook is run in another environment.
WORD2VEC_MODEL_PATH = '/teamspace/studios/this_studio/darija_word2vec_bpe_sg_ns.model'

# ==============================================================================
# PREPROCESSING PREREQUISITES
# The similarity script relies on the names listed below. In this notebook they
# are defined in the cells that follow this one, and those cells must be run
# before the script cell:
#
# 1. ARABIZI_TO_ARABIC_MAP = { ... }
# 2. def clean_text_initial(text: str) -> str: ...
# 3. def arabize_text(text: str) -> str: ...
# 4. def normalize_arabic_text(text: str) -> str: ...
#
# (The script performs its own simplified final clean, so clean_text_final is
# not required here unless one of the functions above depends on it.)
# ==============================================================================

In [10]:
# Arabizi (Latin letters and digits) -> Arabic script lookup table.
#
# arabize_text() lowercases its input and applies the keys longest-first, so
# the two-letter sequences always win over their single-letter parts. Keys of
# equal length are applied in the order they are listed here.
ARABIZI_TO_ARABIC_MAP = {
    # --- Digits used as letters in Arabizi ---
    '2': 'ء',
    '3': 'ع',
    '4': 'غ',
    '6': 'ط',
    '8': 'ق',
    '7': 'ح',
    '5': 'خ',
    '9': 'ق',   # 'ص' is also a common reading of 9; 'ق' is the chosen one
    # --- Two-letter sequences ---
    'ch': 'ش',
    'sh': 'ش',
    'kh': 'خ',
    'gh': 'غ',
    'th': 'ث',
    'dh': 'ذ',
    'ou': 'و',
    'oo': 'و',
    # --- Single Latin letters ---
    'a': 'ا',
    'b': 'ب',
    'c': 'س',   # ambiguous letter; 'س' is used as the default
    'd': 'د',
    'e': 'ي',   # placeholder choice; 'e' could also be rendered as 'ا'
    'f': 'ف',
    'g': 'ڭ',   # Moroccan gaf; normalize_arabic_text() later folds it to 'ك'
    'h': 'ه',
    'i': 'ي',
    'j': 'ج',
    'k': 'ك',
    'l': 'ل',
    'm': 'م',
    'n': 'ن',
    'o': 'و',
    'p': 'ب',   # 'پ' is not standard Arabic, so 'ب' is used
    'q': 'ق',
    'r': 'ر',
    's': 'س',
    't': 'ت',
    'u': 'و',
    'v': 'ف',   # 'ڤ' is not standard Arabic, so 'ف' is used
    'w': 'و',
    'x': 'كس',
    'y': 'ي',
    'z': 'ز',
}
# # Add user's specific request for 9->ق
# ARABIZI_TO_ARABIC_MAP['9'] = 'ق'

In [11]:
def clean_text_initial(text: str) -> str:
    """Strip non-linguistic noise from raw text and normalise its form.

    Each of the following is replaced by a single space, in this order:
    e-mail addresses, URLs, ``[[File:...]]`` links, any ``[[...]]`` markup
    whose body contains a colon or a pipe, any ``[[...]]`` markup whose body
    is 20 or more characters long, HTML tags, and ``<digits>px`` size tokens.
    The result is then NFKC-normalised and runs of whitespace are collapsed.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string, with single spaces between tokens and no leading
        or trailing whitespace. Short plain links such as ``[[word]]`` are
        left untouched.
    """
    # Remove email addresses
    text = re.sub(r'\S+@\S+', ' ', text)
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    # Remove file markup like [[File:...]]
    text = re.sub(r'\[\[File:[^\]]*\]\]', ' ', text)
    # Remove metadata-style wiki markup: any [[...]] containing a colon or a
    # pipe anywhere in its body (e.g. [[Category:Foo]], [[Page|label]]).
    # The previous pattern only matched a colon sitting directly before the
    # closing brackets, so namespace links such as [[Category:Foo]] survived.
    text = re.sub(r'\[\[[^\]]*[:|][^\]]*\]\]', ' ', text)
    # Remove [[markup]] whose body is too long to be a single word
    text = re.sub(r'\[\[([^\]]{20,})\]\]', ' ', text)
    # Remove HTML tags
    text = re.sub(r'<.*?>', ' ', text)
    # Remove leftover size tokens such as "100px"
    text = re.sub(r'\b\d+px\b', ' ', text)
    # Normalize Unicode to NFKC form for consistency
    text = unicodedata.normalize('NFKC', text)
    # Collapse whitespace runs and trim the ends
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [12]:
def arabize_text(text: str) -> str:
    """Transliterate Arabizi (Latin letters and digits) into Arabic script.

    The input is lowercased, then every key of ``ARABIZI_TO_ARABIC_MAP`` is
    substituted by its value, longest keys first, so that two-letter
    sequences are handled before their single-letter components. Characters
    that are not keys of the map are left as they are.

    Note that 'g' becomes 'ڭ' here; the map decides that, and
    ``normalize_arabic_text`` folds it further.
    """
    converted = text.lower()  # the map only has lowercase keys

    # Negated length as sort key: longest keys first, ties keep dict order.
    replacements = sorted(
        ARABIZI_TO_ARABIC_MAP.items(),
        key=lambda pair: -len(pair[0]),
    )
    for latin, arabic in replacements:
        converted = converted.replace(latin, arabic)

    return converted

In [13]:
def normalize_arabic_text(text: str) -> str:
    """Fold Arabic-script text onto a smaller, canonical set of letters.

    Diacritics (U+064B-U+065F and U+0670) and the tatweel (U+0640) are
    removed. Letter variants are then replaced one after another: the Alef
    forms become plain Alef, Ta marbuta becomes Ha, Alef maksura becomes Ya,
    and the Perso-Arabic / Moroccan letters Gaf, Cheh, Peh and Veh become
    Kaf, Shin, Ba and Fa respectively. All other characters pass through.
    """
    # Strip tashkeel marks in one regex pass.
    result = re.sub(r'[\u064B-\u065F\u0670]', '', text)

    # (variant, replacement) pairs, applied in the listed order.
    substitutions = (
        ('\u0640', ''),   # tatweel (elongation stroke)
        ('أ', 'ا'),       # Alef forms -> plain Alef
        ('إ', 'ا'),
        ('آ', 'ا'),
        ('ٱ', 'ا'),
        ('ة', 'ه'),       # Ta marbuta -> Ha
        ('ى', 'ي'),       # Alef maksura -> Ya
        ('گ', 'ك'),       # Persian Gaf -> Kaf
        ('ڭ', 'ك'),       # Moroccan Gaf (as produced by arabize_text) -> Kaf
        ('چ', 'ش'),       # Cheh -> Shin
        ('پ', 'ب'),       # Peh -> Ba
        ('ڤ', 'ف'),       # Veh -> Fa
    )
    for variant, replacement in substitutions:
        result = result.replace(variant, replacement)

    return result

In [14]:
# Load the saved Word2Vec model once. `model_loaded_successfully` stays False
# unless the load completes, and the interactive loop below checks that flag.
word2vec_model, model_loaded_successfully = None, False

try:
    if not os.path.exists(WORD2VEC_MODEL_PATH):
        print(f"Error: Word2Vec model file not found at {WORD2VEC_MODEL_PATH}")
    else:
        print(f"Loading Word2Vec model from: {WORD2VEC_MODEL_PATH}")
        word2vec_model = Word2Vec.load(WORD2VEC_MODEL_PATH)
        print("Word2Vec model loaded successfully.")
        model_loaded_successfully = True
except Exception as e:
    # Best effort: report the problem and leave the flag at False.
    print(f"An error occurred during model loading: {e}")


def preprocess_input_phrase_no_bpe(text: str) -> list[str]:
    """
    Turn one raw word or phrase into a list of cleaned Arabic-script words.

    Pipeline: ``clean_text_initial`` -> ``arabize_text`` (only when the
    cleaned text contains an ASCII letter or digit) ->
    ``normalize_arabic_text`` -> removal of every character that is neither
    Arabic script nor whitespace -> split on whitespace.

    If any of the three helper calls raises ``NameError``, an error message
    is printed and an empty list is returned. An empty list is also returned
    when nothing is left after cleaning.
    """
    # Stage 1: generic clean-up.
    try:
        phrase = clean_text_initial(text)
    except NameError:
        print("Error: `clean_text_initial` function is not defined. Please define it in a cell above.")
        return []

    # Stage 2: transliterate only when Latin letters or digits are present.
    has_latin_or_digit = re.search(r'[a-zA-Z0-9]', phrase) is not None
    if has_latin_or_digit:
        try:
            phrase = arabize_text(phrase.lower())
        except NameError:
            print("Error: `arabize_text` function (and `ARABIZI_TO_ARABIC_MAP`) is not defined. Please define it.")
            return []

    # Stage 3: fold Arabic letter variants.
    try:
        phrase = normalize_arabic_text(phrase)
    except NameError:
        print("Error: `normalize_arabic_text` function is not defined. Please define it.")
        return []

    # Stage 4: keep only Arabic script and whitespace, then tidy the spacing.
    # \p{Arabic} is a `regex`-module feature; this file imports `regex as re`.
    arabic_only = re.sub(r'[^\p{Arabic}\s]', '', phrase, flags=re.UNICODE)
    arabic_only = re.sub(r'\s+', ' ', arabic_only).strip()

    # Stage 5: split into words; an empty string splits to an empty list.
    return arabic_only.split()


def display_arabic(text: str) -> str:
    """Return ``text`` as it should be printed.

    Currently a pass-through: the reshaping step for RTL output is switched
    off, so the input is returned unchanged whether or not the optional RTL
    packages were imported.
    """
    if not RTL_DISPLAY_AVAILABLE:
        return text
    # Reshaping is disabled for now; the call that would apply it is:
    # return get_display(arabic_reshaper.reshape(text))
    return text


# --- Main Interaction Loop ---
# Prompts for a word or phrase, preprocesses it into Arabic-script words,
# drops the words the model does not know, and prints the five nearest
# neighbours of the remaining words. Runs only when the model was loaded.
if model_loaded_successfully:
    print("\n--- Darija Word Similarity Finder (No BPE) ---")
    print("Type a word or phrase (Darija in Arabic/Latin script, or French) and press Enter.")
    print("Type 'exit' or 'quit' to close.")

    while True:
        # Read the input outside the general handler below. A failure of
        # input() itself (closed stdin, an interrupt, or a front end that
        # cannot provide stdin) would repeat on every iteration, so letting
        # the generic `except Exception` swallow it would turn this loop into
        # an endless stream of error messages.
        try:
            user_input = input("\nEnter word or phrase: ").strip()
        except (EOFError, KeyboardInterrupt):
            print("\nInput closed or interrupted. Exiting...")
            break
        except Exception as e:
            print(f"Could not read input ({e}). Exiting...")
            break

        try:
            if not user_input:
                continue
            if user_input.lower() in ['exit', 'quit']:
                print("Exiting...")
                break

            print(f"Original input: '{user_input}'")

            # Preprocess the input to get a list of words
            input_words = preprocess_input_phrase_no_bpe(user_input)

            if not input_words:
                print("Input became empty after preprocessing or resulted in no words.")
                continue

            print(f"Processed words: {', '.join([display_arabic(w) for w in input_words])}")

            # Keep only the words the model has a vector for; out-of-vocabulary
            # words are simply absent from `word2vec_model.wv`.
            valid_words_for_model = [
                word for word in input_words
                if word in word2vec_model.wv
            ]

            if not valid_words_for_model:
                print(f"None of the processed words ({', '.join([display_arabic(w) for w in input_words])}) are in the Word2Vec model's vocabulary.")
                print("This might happen if the word(s) are very rare or out-of-domain.")
                continue

            if len(valid_words_for_model) < len(input_words):
                print("Note: Some words were Out-Of-Vocabulary (OOV) and were excluded from similarity search.")
                print(f"Using valid words for similarity: {', '.join([display_arabic(w) for w in valid_words_for_model])}")

            # Query the model with all valid words at once as positive examples.
            similar_items = word2vec_model.wv.most_similar(positive=valid_words_for_model, topn=5)

            print(f"\nTop 5 most similar words to '{display_arabic(' '.join(valid_words_for_model))}':")
            if similar_items:
                for i, (word, score) in enumerate(similar_items):
                    print(f"{i+1}. {display_arabic(word):<15} (Score: {score:.4f})")
            else:
                print("No similar words found.")

        except KeyError as e:  # Should be rare since membership is checked above
            print(f"Error: One of the processed words '{e}' was not found in the model's vocabulary (unexpected).")
        except NameError as e:
            print(f"Error: A preprocessing function might be missing. {e}. Please define it in a cell above.")
            break  # Stop the loop if core functions are missing
        except Exception as e:
            # Report and keep the session alive for the next query.
            print(f"An unexpected error occurred: {e}")
            import traceback
            traceback.print_exc()

else:
    print("\nCannot start similarity finder: Word2Vec model not loaded or core preprocessing functions are missing.")
    print("Please ensure the path to the model is correct and preprocessing functions are defined in cells above.")

Loading Word2Vec model from: /teamspace/studios/this_studio/darija_word2vec_bpe_sg_ns.model
Word2Vec model loaded successfully.

--- Darija Word Similarity Finder (No BPE) ---
Type a word or phrase (Darija in Arabic/Latin script, or French) and press Enter.
Type 'exit' or 'quit' to close.
Original input: 'syara'
Processed words: سيارا

Top 5 most similar words to 'سيارا':
1. ياهياتين        (Score: 0.3318)
2. سيوجيبايليي     (Score: 0.3273)
3. خطيريين         (Score: 0.3188)
4. ووتفففبي        (Score: 0.3104)
5. زومم            (Score: 0.3089)
Original input: '7rb'
Processed words: حرب

Top 5 most similar words to 'حرب':
1. عليك            (Score: 0.9970)
2. بارك            (Score: 0.9970)
3. عليهم           (Score: 0.9970)
4. تشوفو           (Score: 0.9970)
5. الحق            (Score: 0.9969)
Original input: 'bark'
Processed words: بارك

Top 5 most similar words to 'بارك':
1. عليك            (Score: 0.9997)
2. مبروك           (Score: 0.9996)
3. شاء             (Score: 0.9996)
4. فيكم  