In [None]:
import os
import regex as re 
from gensim.models import Word2Vec
from tokenizers import Tokenizer
import unicodedata

# Optional dependencies: record whether the RTL helper packages can be
# imported so later cells can check RTL_DISPLAY_AVAILABLE.
try:
    import arabic_reshaper
    from bidi.algorithm import get_display
except ImportError:
    RTL_DISPLAY_AVAILABLE = False
    print("Warning: arabic_reshaper or python-bidi not found. RTL display might not be optimal.")
else:
    RTL_DISPLAY_AVAILABLE = True
    print("arabic_reshaper and python-bidi found. RTL display enabled.")

In [2]:


# Both paths are relative, so they resolve against the kernel's working directory.
# Saved gensim Word2Vec model; read with Word2Vec.load() in the loading cell.
# NOTE(review): the file name suggests BPE tokens + skip-gram + negative sampling — confirm.
WORD2VEC_MODEL_PATH = 'darija_word2vec_bpe_sg_ns.model' 

# Tokenizer definition (JSON); read with Tokenizer.from_file() in the loading cell.
BPE_TOKENIZER_PATH = 'processed_darija_v2/darija_bpe_tokenizer.json'      















In [3]:
# Transliteration table from Arabizi (Latin letters and digits) to Arabic script.
# arabize_text() applies the longest keys first, so the two-letter digraphs
# below are substituted before their single-letter components.
ARABIZI_TO_ARABIC_MAP = {
    # Digits used as stand-ins for Arabic letters.
    '2': 'ء', '3': 'ع', '4': 'غ', '6': 'ط', '8': 'ق', 
    '7': 'ح', '5': 'خ', '9': 'ق', 
    # NOTE(review): '8' and '9' both map to 'ق' — confirm this is intended.
    
    # Two-letter digraphs ('ch'/'sh' share one target, as do 'ou'/'oo').
    'ch': 'ش', 'sh': 'ش', 'kh': 'خ', 'gh': 'غ',
    'th': 'ث', 'dh': 'ذ', 'ou': 'و', 'oo': 'و',
    
    # Single Latin letters. Several letters share a target: 'e'/'i'/'y' -> 'ي',
    # 'o'/'u'/'w' -> 'و', 'c'/'s' -> 'س', 'b'/'p' -> 'ب', 'f'/'v' -> 'ف'.
    'a': 'ا', 'b': 'ب', 'c': 'س', 
    'd': 'د', 'e': 'ي', 
    # 'g' maps to 'ڭ', which normalize_arabic_text() later folds into 'ك'.
    'f': 'ف', 'g': 'ڭ', 
    'h': 'ه', 'i': 'ي', 'j': 'ج', 'k': 'ك', 'l': 'ل', 'm': 'م',
    'n': 'ن', 'o': 'و', 'p': 'ب', 
    'q': 'ق', 'r': 'ر', 's': 'س',
    't': 'ت', 'u': 'و', 'v': 'ف', 
    # 'x' is the only key that expands to two Arabic letters.
    'w': 'و', 'x': 'كس', 'y': 'ي', 'z': 'ز',
}



In [4]:
def clean_text_initial(text: str) -> str:
    """Strip markup-like noise from raw text before script-specific processing.

    Each of the following is replaced by a single space, in this order:
    e-mail-like tokens, URLs, ``[[File:...]]`` wiki embeds, ``[[...]]`` links
    whose inner text contains a pipe or ends in a colon, any ``[[...]]`` whose
    inner text is 20 or more characters, ``<...>`` tags, and ``NNpx`` tokens.
    The result is then NFKC-normalized, whitespace runs are collapsed to one
    space, and leading/trailing whitespace is removed.
    """
    # Order matters: the specific wiki patterns run before the generic ones.
    noise_patterns = (
        r'\S+@\S+',
        r'https?://\S+|www\.\S+',
        r'\[\[File:[^\]]*\]\]',
        r'\[\[(?:[^\]]*:|[^\]]*\|[^\]]*)\]\]',
        r'\[\[([^\]]{20,})\]\]',
        r'<.*?>',
        r'\b\d+px\b',
    )

    cleaned = text
    for pattern in noise_patterns:
        cleaned = re.sub(pattern, ' ', cleaned)

    # Compatibility normalization happens after the removals, as a final pass
    # before whitespace is tidied up.
    cleaned = unicodedata.normalize('NFKC', cleaned)
    return re.sub(r'\s+', ' ', cleaned).strip()

In [5]:
def arabize_text(text: str) -> str:
    """Transliterate Arabizi (Latin letters and digits) into Arabic script.

    The input is lower-cased, then every key of ARABIZI_TO_ARABIC_MAP is
    substituted by its Arabic value. Longer keys are applied first so that
    digraphs such as 'ch' are consumed before the single letters 'c' and 'h'.
    """
    converted = text.lower()

    # Stable sort: keys of equal length keep the order they have in the map.
    longest_first = list(ARABIZI_TO_ARABIC_MAP.keys())
    longest_first.sort(key=len, reverse=True)

    for latin_key in longest_first:
        converted = converted.replace(latin_key, ARABIZI_TO_ARABIC_MAP[latin_key])

    return converted

In [6]:
def normalize_arabic_text(text: str) -> str:
    """Fold Arabic-script spelling variants into one canonical form.

    Removes combining marks in U+064B-U+065F plus U+0670, drops the tatweel
    (U+0640), and then maps letter variants onto a single base letter each.
    Characters not covered by these rules pass through unchanged.
    """
    # Combining marks go first so that only base letters remain.
    folded = re.sub(r'[\u064B-\u065F\u0670]', '', text)

    # (variant, replacement) pairs, applied in order. No replacement is itself
    # a variant in a later pair, so the rules do not chain.
    folding_rules = (
        ('\u0640', ''),   # tatweel / elongation character
        ('أ', 'ا'),
        ('إ', 'ا'),
        ('آ', 'ا'),
        ('ٱ', 'ا'),
        ('ة', 'ه'),
        ('ى', 'ي'),
        ('گ', 'ك'),
        ('ڭ', 'ك'),
        ('چ', 'ش'),
        ('پ', 'ب'),
        ('ڤ', 'ف'),
    )
    for variant, replacement in folding_rules:
        folded = folded.replace(variant, replacement)

    return folded

In [None]:






# Module-level handles used by the interactive loop further down. They stay
# None (and the flag stays False) unless both artifacts load.
word2vec_model = None
bpe_tokenizer = None
models_loaded_successfully = False

try:
    if os.path.exists(WORD2VEC_MODEL_PATH):
        print(f"Loading Word2Vec model from: {WORD2VEC_MODEL_PATH}")
        # NOTE: Word2Vec.load() unpickles the file, so only point this at
        # model files that come from a trusted source.
        word2vec_model = Word2Vec.load(WORD2VEC_MODEL_PATH)
        print("Word2Vec model loaded successfully.")
    else:
        print(f"Error: Word2Vec model file not found at {WORD2VEC_MODEL_PATH}")

    if os.path.exists(BPE_TOKENIZER_PATH):
        print(f"Loading BPE tokenizer from: {BPE_TOKENIZER_PATH}")
        bpe_tokenizer = Tokenizer.from_file(BPE_TOKENIZER_PATH)
        print("BPE tokenizer loaded successfully.")
    else:
        print(f"Error: BPE tokenizer file not found at {BPE_TOKENIZER_PATH}")

    # Compare against None explicitly instead of relying on the truthiness of
    # the loaded objects, so the readiness flag reflects "was loaded" only.
    if word2vec_model is not None and bpe_tokenizer is not None:
        models_loaded_successfully = True

except Exception as e:
    # Best effort: report the failure and leave models_loaded_successfully
    # False so the interactive cell refuses to start.
    print(f"An error occurred during model loading: {e}")


def preprocess_input_word_for_similarity(text: str, tokenizer: Tokenizer) -> list[str]:
    """Turn one user query into the BPE tokens used for the similarity lookup.

    Pipeline: clean_text_initial -> arabize_text (only when the cleaned text
    contains an ASCII letter or digit) -> normalize_arabic_text -> drop every
    character that is neither Arabic script nor whitespace -> collapse
    whitespace -> ``tokenizer.encode(...).tokens``.

    Returns an empty list when nothing is left after filtering, or when one of
    the preprocessing helpers raises NameError (a message is printed then).
    """
    # Step 1: generic cleanup.
    try:
        cleaned = clean_text_initial(text)
    except NameError:
        print("Error: `clean_text_initial` function is not defined. Please define it in a cell above.")
        return []

    # Step 2: transliterate only when Latin letters or digits are present.
    has_latin_or_digit = re.search(r'[a-zA-Z0-9]', cleaned) is not None
    if has_latin_or_digit:
        try:
            cleaned = arabize_text(cleaned.lower())
        except NameError:
            print("Error: `arabize_text` function (and `ARABIZI_TO_ARABIC_MAP`) is not defined. Please define it.")
            return []

    # Step 3: fold Arabic spelling variants.
    try:
        normalized = normalize_arabic_text(cleaned)
    except NameError:
        print("Error: `normalize_arabic_text` function is not defined. Please define it.")
        return []

    # Step 4: keep Arabic-script characters and whitespace only, then tidy up.
    arabic_only = re.sub(r'[^\p{Arabic}\s]', '', normalized, flags=re.UNICODE)
    arabic_only = re.sub(r'\s+', ' ', arabic_only).strip()

    # Step 5: tokenize, unless the filters removed everything.
    return tokenizer.encode(arabic_only).tokens if arabic_only else []


def display_arabic(text: str) -> str:
    """Return ``text`` for printing; currently an identity pass-through.

    RTL_DISPLAY_AVAILABLE is consulted, but no reshaping or bidi reordering
    is applied in either case, so the input comes back unchanged.
    """
    if not RTL_DISPLAY_AVAILABLE:
        return text
    # The RTL helpers are importable, but no transformation is wired in here.
    return text



# Interactive lookup loop: read a word, preprocess it into BPE tokens, and
# print the five nearest tokens from the Word2Vec model. Runs only when both
# the model and the tokenizer were loaded.
if models_loaded_successfully:
    print("\n--- Darija Word Similarity Finder ---")
    print("Type a word (Darija in Arabic/Latin script, or French) and press Enter.")
    print("Type 'exit' or 'quit' to close.")

    while True:
        # Read the query outside the general handler below. If stdin is closed
        # or the frontend cannot prompt (e.g. a non-interactive "Run All"),
        # input() fails on every call; letting the catch-all handler swallow
        # that would spin forever printing tracebacks.
        try:
            user_input = input("\nEnter word: ").strip()
        except (EOFError, KeyboardInterrupt):
            print("\nInput closed or interrupted. Exiting...")
            break
        except Exception as e:
            print(f"Cannot read input in this environment: {e}")
            break

        if not user_input:
            continue
        if user_input.lower() in ['exit', 'quit']:
            print("Exiting...")
            break

        try:
            print(f"Original input: '{user_input}'")

            input_bpe_tokens = preprocess_input_word_for_similarity(user_input, bpe_tokenizer)

            if not input_bpe_tokens:
                print("Input word became empty after preprocessing or resulted in no BPE tokens.")
                continue

            print(f"Processed BPE tokens: {', '.join([display_arabic(t) for t in input_bpe_tokens])}")

            # Only tokens the Word2Vec model knows, and that are not the
            # tokenizer's unknown-token marker, can take part in the query.
            unk_token_string = bpe_tokenizer.model.unk_token

            valid_bpe_tokens_for_model = [
                token for token in input_bpe_tokens
                if token in word2vec_model.wv and token != unk_token_string
            ]

            if not valid_bpe_tokens_for_model:
                print(f"None of the BPE tokens ({', '.join([display_arabic(t) for t in input_bpe_tokens])}) are in the Word2Vec model's vocabulary or they are all [UNK].")
                print("This might happen if the word is very rare, out-of-domain, or results only in [UNK] tokens.")
                continue

            if len(valid_bpe_tokens_for_model) < len(input_bpe_tokens):
                print(f"Note: Some BPE sub-tokens were Out-Of-Vocabulary or [UNK] and were excluded from similarity search.")
                print(f"Using valid BPE tokens for similarity: {', '.join([display_arabic(t) for t in valid_bpe_tokens_for_model])}")

            # All valid sub-tokens are passed together as the positive set.
            similar_items = word2vec_model.wv.most_similar(positive=valid_bpe_tokens_for_model, topn=5)

            print(f"\nTop 5 most similar BPE tokens to '{display_arabic(' '.join(valid_bpe_tokens_for_model))}':")
            if similar_items:
                for i, (token, score) in enumerate(similar_items):
                    print(f"{i+1}. {display_arabic(token):<15} (Score: {score:.4f})")
            else:
                print("No similar tokens found.")

        except KeyError as e:
            print(f"Error: One of the processed BPE tokens '{e}' was not found in the model's vocabulary.")
        except NameError as e:
            # A missing helper will not fix itself on the next query: stop.
            print(f"Error: A preprocessing function might be missing. {e}. Please define it in a cell above.")
            break
        except Exception as e:
            # Per-query failures are reported and the loop keeps running.
            print(f"An unexpected error occurred: {e}")
            import traceback
            traceback.print_exc()

else:
    print("\nCannot start similarity finder: Word2Vec model or BPE tokenizer not loaded, or core preprocessing functions are missing.")
    print("Please ensure the paths to models are correct and preprocessing functions are defined in cells above.")