In [None]:
import os
import regex as re # Make sure regex is imported if not already
from gensim.models import Word2Vec
from tokenizers import Tokenizer
import unicodedata
# Optional: For RTL display in terminal/output
try:
    import arabic_reshaper
    from bidi.algorithm import get_display
    RTL_DISPLAY_AVAILABLE = True
    print("arabic_reshaper and python-bidi found. RTL display enabled.")
except ImportError:
    RTL_DISPLAY_AVAILABLE = False
    print("Warning: arabic_reshaper or python-bidi not found. RTL display might not be optimal.")

In [2]:
# --- Configuration ---
# Both locations are specific to your machine: edit them before running the
# cells below.

# File holding the trained Word2Vec model.
WORD2VEC_MODEL_PATH = "darija_word2vec_bpe_sg_ns.model"

# JSON file holding the trained BPE tokenizer.
BPE_TOKENIZER_PATH = "processed_darija_v2/darija_bpe_tokenizer.json"

# ==============================================================================
# PREPROCESSING DEFINITIONS REQUIRED BY THE SIMILARITY SCRIPT
# The following names must be defined, and their cells executed, BEFORE the
# cell that loads the models and starts the interactive loop:
#
# 1. ARABIZI_TO_ARABIC_MAP = { ... }
# 2. def clean_text_initial(text: str) -> str: ...
# 3. def arabize_text(text: str) -> str: ...
# 4. def normalize_arabic_text(text: str) -> str: ...
#
# (The script applies its own simplified final clean, so clean_text_final is
# not strictly required here unless one of the functions above depends on it.)
# ==============================================================================

In [3]:
# Arabizi (Latin letters and digits) -> Arabic script substitution table.
# Keys are lowercase; arabize_text lowercases its input and applies the longer
# keys before the shorter ones, so the digraphs win over their single letters.
ARABIZI_TO_ARABIC_MAP = {
    # --- Digits standing in for Arabic letters ---
    # These must be substituted before any step that strips digits.
    '2': 'ء',
    '3': 'ع',
    '4': 'غ',
    '6': 'ط',
    '8': 'ق',
    '7': 'ح',
    '5': 'خ',
    '9': 'ق',  # '9' is also commonly read as 'ص'; 'ق' is the chosen reading here.

    # --- Two-letter sequences ---
    'ch': 'ش',
    'sh': 'ش',
    'kh': 'خ',
    'gh': 'غ',
    'th': 'ث',
    'dh': 'ذ',
    'ou': 'و',
    'oo': 'و',

    # --- Single letters ---
    'a': 'ا',
    'b': 'ب',
    'c': 'س',  # ambiguous letter; 'س' is used as the default
    'd': 'د',
    'e': 'ي',  # placeholder choice: 'e' often sounds like 'i'/kasra, could also be 'ا'
    'f': 'ف',
    'g': 'ڭ',  # Moroccan Gaf; normalize_arabic_text later folds 'ڭ' into 'ك'
    'h': 'ه',
    'i': 'ي',
    'j': 'ج',
    'k': 'ك',
    'l': 'ل',
    'm': 'م',
    'n': 'ن',
    'o': 'و',
    'p': 'ب',  # 'پ' is not standard, so 'ب'
    'q': 'ق',
    'r': 'ر',
    's': 'س',
    't': 'ت',
    'u': 'و',
    'v': 'ف',  # 'ڤ' is not standard, so 'ف'
    'w': 'و',
    'x': 'كس',
    'y': 'ي',
    'z': 'ز',
}

In [4]:
def clean_text_initial(text: str) -> str:
    """Strip non-linguistic noise from raw text and tidy its whitespace.

    Every match of the patterns below is replaced by a single space, in the
    listed order; the result is then NFKC-normalized and runs of whitespace
    are collapsed to one space with leading/trailing whitespace removed.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string (possibly empty).
    """
    noise_patterns = (
        # Email addresses.
        r'\S+@\S+',
        # URLs starting with http(s):// or www.
        r'https?://\S+|www\.\S+',
        # File markup such as [[File:...]].
        r'\[\[File:[^\]]*\]\]',
        # Wiki-style [[...]] whose content ends with a colon, or contains a
        # pipe anywhere. A plain [[word]] is left alone.
        # NOTE(review): a colon in the middle (e.g. [[Category:Foo]]) is NOT
        # matched by this pattern - confirm that is intended.
        r'\[\[(?:[^\]]*:|[^\]]*\|[^\]]*)\]\]',
        # Any remaining [[...]] whose content is 20 or more characters long.
        r'\[\[([^\]]{20,})\]\]',
        # HTML tags.
        r'<.*?>',
        # Pixel sizes such as "100px".
        r'\b\d+px\b',
    )

    cleaned = text
    for pattern in noise_patterns:
        cleaned = re.sub(pattern, ' ', cleaned)

    # Fold compatibility characters (ligatures, width variants, ...) to NFKC.
    cleaned = unicodedata.normalize('NFKC', cleaned)

    # Collapse whitespace last so the spaces inserted above do not pile up.
    return re.sub(r'\s+', ' ', cleaned).strip()

In [5]:
def arabize_text(text: str) -> str:
    """Transliterate Arabizi (Latin letters and digits) into Arabic script.

    The input is lowercased, then every key of ``ARABIZI_TO_ARABIC_MAP`` is
    substituted by its value. Longer keys are applied first so that digraphs
    such as 'ch' are consumed before their individual letters. Characters
    that have no entry in the map are left as they are.

    Args:
        text: Text that may contain Latin letters and Arabizi digits.

    Returns:
        The lowercased text with every mapped sequence replaced.
    """
    converted = text.lower()  # the map only has lowercase keys

    # Stable sort on key length, longest first; ties keep the map's own order.
    longest_first = sorted(
        ARABIZI_TO_ARABIC_MAP.items(),
        key=lambda entry: len(entry[0]),
        reverse=True,
    )

    for latin_seq, arabic_seq in longest_first:
        converted = converted.replace(latin_seq, arabic_seq)

    # 'g' comes out as 'ڭ' here; it is left for normalize_arabic_text to fold.
    return converted

In [6]:
def normalize_arabic_text(text: str) -> str:
    """Fold Arabic-script text into a reduced, canonical set of letters.

    Steps: remove diacritics (U+064B-U+065F and U+0670), remove tatweel
    (U+0640), then replace each letter variant in the table below with its
    canonical letter.

    Args:
        text: Text in Arabic script (other characters pass through untouched).

    Returns:
        The normalized text.
    """
    # Drop tashkeel first, then the tatweel elongation character.
    folded = re.sub(r'[\u064B-\u065F\u0670]', '', text).replace('\u0640', '')

    # (variant, canonical) pairs, applied in order.
    letter_folds = (
        # Alef forms -> plain Alef.
        ('أ', 'ا'),
        ('إ', 'ا'),
        ('آ', 'ا'),
        ('ٱ', 'ا'),
        # Ta marbuta -> Ha.
        ('ة', 'ه'),
        # Alef maksura -> Ya.
        ('ى', 'ي'),
        # Persian Gaf and Moroccan Gaf (the latter is what arabize_text emits
        # for 'g') -> Kaf.
        ('گ', 'ك'),
        ('ڭ', 'ك'),
        # Cheh -> Shin, Peh -> Ba, Veh -> Fa.
        ('چ', 'ش'),
        ('پ', 'ب'),
        ('ڤ', 'ف'),
    )
    for variant, canonical in letter_folds:
        folded = folded.replace(variant, canonical)

    return folded

In [None]:





# --- Load Models ---
# Both handles start out as None; the success flag is raised only after the
# Word2Vec model and the BPE tokenizer have both been read from disk.
word2vec_model = None
bpe_tokenizer = None
models_loaded_successfully = False

try:
    # Word2Vec model.
    # NOTE: gensim model files are pickle-based, so only load files that come
    # from a source you trust.
    if not os.path.exists(WORD2VEC_MODEL_PATH):
        print(f"Error: Word2Vec model file not found at {WORD2VEC_MODEL_PATH}")
    else:
        print(f"Loading Word2Vec model from: {WORD2VEC_MODEL_PATH}")
        word2vec_model = Word2Vec.load(WORD2VEC_MODEL_PATH)
        print("Word2Vec model loaded successfully.")

    # BPE tokenizer.
    if not os.path.exists(BPE_TOKENIZER_PATH):
        print(f"Error: BPE tokenizer file not found at {BPE_TOKENIZER_PATH}")
    else:
        print(f"Loading BPE tokenizer from: {BPE_TOKENIZER_PATH}")
        bpe_tokenizer = Tokenizer.from_file(BPE_TOKENIZER_PATH)
        print("BPE tokenizer loaded successfully.")

    # The interactive loop further down is gated on this flag.
    if word2vec_model and bpe_tokenizer:
        models_loaded_successfully = True

except Exception as load_error:
    # A failure in either loader lands here; the flag stays False so the
    # interactive loop is skipped instead of running with a missing model.
    print(f"An error occurred during model loading: {load_error}")


def preprocess_input_word_for_similarity(text: str, tokenizer: Tokenizer) -> list[str]:
    """
    Turn one user-supplied word or phrase into the BPE tokens used for lookup.

    Pipeline, in order:
      1. ``clean_text_initial``
      2. ``arabize_text`` - only when the cleaned text contains an ASCII
         letter or digit
      3. ``normalize_arabic_text``
      4. drop every character that is neither Arabic script nor whitespace,
         then collapse whitespace
      5. ``tokenizer.encode(...).tokens``

    The three helpers are expected to be defined at notebook level. If calling
    one of them raises ``NameError``, a message is printed and an empty list
    is returned.

    Args:
        text: Raw user input.
        tokenizer: Tokenizer whose ``encode`` result exposes ``.tokens``.

    Returns:
        The BPE tokens, or ``[]`` when nothing is left after cleaning or a
        helper is missing.
    """
    def _report_missing(message: str) -> list[str]:
        # Tell the user which helper is undefined and yield "no tokens".
        print(message)
        return []

    # Stage 1: generic cleanup (mostly for consistency on single words).
    try:
        staged = clean_text_initial(text)
    except NameError:
        return _report_missing("Error: `clean_text_initial` function is not defined. Please define it in a cell above.")

    # Stage 2: transliterate only if an ASCII letter or digit is present.
    if re.search(r'[a-zA-Z0-9]', staged):
        try:
            staged = arabize_text(staged.lower())
        except NameError:
            return _report_missing("Error: `arabize_text` function (and `ARABIZI_TO_ARABIC_MAP`) is not defined. Please define it.")

    # Stage 3: fold Arabic letter variants.
    try:
        staged = normalize_arabic_text(staged)
    except NameError:
        return _report_missing("Error: `normalize_arabic_text` function is not defined. Please define it.")

    # Stage 4: keep only Arabic-script characters and whitespace.
    # `\p{Arabic}` is a feature of the `regex` module, which this notebook
    # imports under the name `re`.
    arabic_only = re.sub(r'[^\p{Arabic}\s]', '', staged, flags=re.UNICODE)
    arabic_only = re.sub(r'\s+', ' ', arabic_only).strip()

    # Stage 5: tokenize, unless nothing survived the cleanup.
    return tokenizer.encode(arabic_only).tokens if arabic_only else []


def display_arabic(text: str, reshape: bool = False) -> str:
    """Return ``text`` for printing, optionally reshaped for RTL-unaware outputs.

    By default the text is returned unchanged, which is what this helper has
    always done (presumably because notebook front-ends already lay out
    right-to-left text themselves - TODO confirm). The previous body checked
    ``RTL_DISPLAY_AVAILABLE`` but then did nothing; that dead branch is now an
    explicit opt-in.

    Args:
        text: The string to display.
        reshape: When True and the optional ``arabic_reshaper`` / ``bidi``
            packages were imported successfully (``RTL_DISPLAY_AVAILABLE``),
            join the Arabic letter forms and reorder the string for output
            devices that do not handle RTL text. Defaults to False.

    Returns:
        ``text`` itself, or its reshaped and reordered form when requested
        and available.
    """
    # `reshape` is tested first so the default path never touches the
    # optional-dependency names.
    if reshape and RTL_DISPLAY_AVAILABLE:
        return get_display(arabic_reshaper.reshape(text))
    return text


# --- Main Interaction Loop ---
# Reads words from the user until 'exit'/'quit' (or end of input), converts
# each one to BPE tokens and prints the nearest tokens in the Word2Vec space.

# Number of nearest neighbours requested from the Word2Vec model per query.
TOP_N_SIMILAR = 5

if models_loaded_successfully:
    print("\n--- Darija Word Similarity Finder ---")
    print("Type a word (Darija in Arabic/Latin script, or French) and press Enter.")
    print("Type 'exit' or 'quit' to close.")

    # The tokenizer's unknown-token string does not change between queries, so
    # it is looked up once. A model without that attribute yields None, in
    # which case only the vocabulary check below filters tokens.
    unk_token_string = getattr(bpe_tokenizer.model, "unk_token", None)

    while True:
        # Input is read OUTSIDE the broad handler further down. Previously an
        # EOFError (or any "stdin not available" error in a non-interactive
        # run) was swallowed by `except Exception` and the `while True` loop
        # retried forever.
        try:
            user_input = input("\nEnter word: ").strip()
        except (EOFError, KeyboardInterrupt):
            print("\nInput closed or interrupted. Exiting...")
            break
        except Exception as input_error:
            print(f"Cannot read user input ({input_error}). Exiting...")
            break

        if not user_input:
            continue
        if user_input.lower() in ['exit', 'quit']:
            print("Exiting...")
            break

        try:
            print(f"Original input: '{user_input}'")

            # Preprocess the input word to get BPE tokens
            input_bpe_tokens = preprocess_input_word_for_similarity(user_input, bpe_tokenizer)

            if not input_bpe_tokens:
                print("Input word became empty after preprocessing or resulted in no BPE tokens.")
                continue

            all_tokens_display = ', '.join([display_arabic(t) for t in input_bpe_tokens])
            print(f"Processed BPE tokens: {all_tokens_display}")

            # Keep only tokens the Word2Vec model knows, and never the
            # tokenizer's unknown-token placeholder.
            valid_bpe_tokens_for_model = [
                token for token in input_bpe_tokens
                if token in word2vec_model.wv and token != unk_token_string
            ]

            if not valid_bpe_tokens_for_model:
                print(f"None of the BPE tokens ({all_tokens_display}) are in the Word2Vec model's vocabulary or they are all [UNK].")
                print("This might happen if the word is very rare, out-of-domain, or results only in [UNK] tokens.")
                continue

            if len(valid_bpe_tokens_for_model) < len(input_bpe_tokens):
                print("Note: Some BPE sub-tokens were Out-Of-Vocabulary or [UNK] and were excluded from similarity search.")
                print(f"Using valid BPE tokens for similarity: {', '.join([display_arabic(t) for t in valid_bpe_tokens_for_model])}")

            # All valid sub-tokens are passed together as the positive set.
            similar_items = word2vec_model.wv.most_similar(
                positive=valid_bpe_tokens_for_model, topn=TOP_N_SIMILAR
            )

            print(f"\nTop {TOP_N_SIMILAR} most similar BPE tokens to '{display_arabic(' '.join(valid_bpe_tokens_for_model))}':")
            if similar_items:
                for rank, (token, score) in enumerate(similar_items, start=1):
                    print(f"{rank}. {display_arabic(token):<15} (Score: {score:.4f})")
            else:
                print("No similar tokens found.")

        except KeyError as e:
            print(f"Error: One of the processed BPE tokens '{e}' was not found in the model's vocabulary.")
        except NameError as e:
            print(f"Error: A preprocessing function might be missing. {e}. Please define it in a cell above.")
            break # Stop the loop if core functions are missing
        except Exception as e:
            # Per-query failures are reported and the loop moves on to the
            # next word; reading input can no longer raise into this handler.
            print(f"An unexpected error occurred: {e}")
            import traceback
            traceback.print_exc() # For detailed debugging during development

else:
    print("\nCannot start similarity finder: Word2Vec model or BPE tokenizer not loaded, or core preprocessing functions are missing.")
    print("Please ensure the paths to models are correct and preprocessing functions are defined in cells above.")