In [1]:
from datasets import load_dataset
import pandas as pd

dataset = load_dataset("arbml/Tunisian_Dialect_Corpus")
df = pd.DataFrame(dataset["train"])
df.to_csv("tunisian_dialect_corpus.csv", index=False)


Repo card metadata block was not found. Setting CardData to empty.


In [2]:
df


Unnamed: 0,Tweet,label
0,Nn mouch 7louwa faza,1
1,mabladkom 3bed tfouuuuh,1
2,تفووووووه عليك و علا والديك على عايلتك و على ا...,1
3,لا يليق بهذا البرنامج,1
4,رهدان,1
...,...,...
49884,الله يستر ن,0
49885,الله يستر ن,0
49886,ربي اكون فى عونكم بالحق ربي ابقي الستر على تون...,0
49887,,0


In [3]:
import re
import unicodedata

# One preprocessing function (keep meaning, reduce social-media noise)
# Goals:
# - remove only low-value noise (URLs, mentions, ZWJ, tatweel, extra spaces)
# - normalize Arabic variants without changing meaning
# - reduce elongation (ههههه, loooool) but keep the word
# - handle Tunisian Arabizi digits (7=ح, 3=ع...) conservatively
# - keep emojis and punctuation (they can carry sentiment)

_ARABIC_DIACRITICS_RE = re.compile(r"[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06ED]")
_TATWEEL_RE = re.compile(r"\u0640")
_URL_RE = re.compile(r"https?://\S+|www\.\S+", re.IGNORECASE)
_MENTION_RE = re.compile(r"@\w+")

# collapse 3+ repeats -> 2 (works for Arabic + Latin)
_REPEAT_CHARS_RE = re.compile(r"(.)\1{2,}")

# collapse repeated punctuation !!!! -> !
_REPEAT_PUNCT_RE = re.compile(r"([!?.,؛،])\1{1,}")

# keep: Arabic letters, Latin letters, digits, whitespace, common punctuation, emojis
# We'll *remove* only control chars and rare symbols later.
_CONTROL_CHARS_RE = re.compile(r"[\u200b\u200c\u200d\ufeff]")  # ZWSP/ZWNJ/ZWJ/BOM

_ARABIZI_DIGIT_MAP = str.maketrans(
    {
        "2": "ء",
        "3": "ع",
        "4": "غ",  # sometimes used
        "5": "خ",
        "6": "ط",
        "7": "ح",
        "8": "ق",
        "9": "ق",
        "0": "0",
    }
)

_ARABIC_INDIC_DIGITS = str.maketrans("٠١٢٣٤٥٦٧٨٩", "0123456789")


def preprocess_text(text: str) -> str:
    if not isinstance(text, str):
        return ""

    text = text.strip()

    # remove URLs + mentions (keep hashtags content)
    text = _URL_RE.sub(" ", text)
    text = _MENTION_RE.sub(" ", text)
    text = text.replace("#", "")

    # remove invisible control chars (common in copy/pasted tweets)
    text = _CONTROL_CHARS_RE.sub("", text)

    # normalize unicode form
    text = unicodedata.normalize("NFKC", text)

    # remove Arabic diacritics + tatweel
    text = _ARABIC_DIACRITICS_RE.sub("", text)
    text = _TATWEEL_RE.sub("", text)

    # normalize Arabic variants (low-risk)
    text = re.sub(r"[أإآ]", "ا", text)
    text = text.replace("ى", "ي")
    text = text.replace("ؤ", "و")
    text = text.replace("ئ", "ي")

    # normalize Arabic punctuation variants
    text = text.replace("؟", "?").replace("،", ",").replace("؛", ";")

    # convert Arabic-Indic digits -> Western
    text = text.translate(_ARABIC_INDIC_DIGITS)

    # (Removed simple Arabizi mapping to allow advanced token-level normalization later)

    # lower ONLY latin letters (keep Arabic as-is)
    # this is safer than .lower() on the whole string for some unicode edge cases
    text = "".join(ch.lower() if "A" <= ch <= "Z" else ch for ch in text)

    # reduce elongations (3+ repeats -> 2)
    text = _REPEAT_CHARS_RE.sub(r"\1\1", text)

    # reduce repeated punctuation
    text = _REPEAT_PUNCT_RE.sub(r"\1", text)

    # remove leftover junk symbols but keep emojis:
    # keep Arabic letters, Latin letters, digits, spaces, and a small punctuation set.
    # Anything else becomes a space.
    text = re.sub(r"[^ء-يA-Za-z0-9\s!?.,;:'\"()\[\]{}<>+-=_/\\]", " ", text)

    # collapse spaces
    text = re.sub(r"\s+", " ", text).strip()

    return text

# keep raw Tweet; write cleaned text in a new column
df["text"] = df["Tweet"].apply(preprocess_text)

# drop empty after preprocessing
df = df[df["text"].str.strip().ne("")].reset_index(drop=True)

df[["Tweet", "text", "label"]].head()


Unnamed: 0,Tweet,text,label
0,Nn mouch 7louwa faza,nn mouch 7louwa faza,1
1,mabladkom 3bed tfouuuuh,mabladkom 3bed tfouuh,1
2,تفووووووه عليك و علا والديك على عايلتك و على ا...,تفووه عليك و علا والديك علي عايلتك و علي اصلك ...,1
3,لا يليق بهذا البرنامج,لا يليق بهذا البرنامج,1
4,رهدان,رهدان,1


In [4]:
# ========================================================================
# TEST: Translation using Dictionary from "Translation arabizi to arabic.ipynb"
# ========================================================================

# Dictionary from the translation file
buck2uni = { "e":"ا",
            "é":"ا",
            "a":"ا",

     "7":"ح",
             "7a": "ح",
        "7e": "ح",
        "7i": "ح",
        "7o": "ح",
        "7u": "ح",
            "5":"خ",
            "3":"ع",
     "3":"ع",
            "9":"ق",
    "9":"ق",
    "8":"غ",
            "3a": "ع",
        "3e": "ع",
        "3i":"ع",
        "3o": "ع",
        "3u": "ع",
        "5a": "خ",
        "5e": "خ",
        "5i": "خ",
        "5o": "خ",
        "5u": "خ",
            "8":"غ",
            "2": "ا",
   "a": "ا",

        "b": "ب",
        "ba": "ب",
        "be": "ب",
        "bi": "ب",
        "bo": "ب",
        "bu": "ب",
        "ch": "ش",
        "cha": "ش",
        "che":"ش",
        "chi": "ش",
        "cho": "ش",
        "chu": "ش",

        "b": "ب",

        "ch": "ش",

        "d": "د",

       "c" : "ك",

       "ai": "ي",

        "ou" : "و",
        "th": "ذ",
             "tha": "ذ",
             "tha": "ث",
            "the":"ذ",
             "the":  "ث",
             "the":"ذ",
             "tho":   "ث",
             "tho":"ذ",
            "thi":  "ث",
            "thi":"ذ",
             "the":  "ث",
        "dh":  "ظ",
       "dh"  : "ض",
         "dha":  "ظ",
       "dha"  : "ض",
              "dhe":  "ظ",
       "dhe"  : "ض",

                          "dhe":  "ظ",
       "dhe"  : "ض",


            "f": "ف",
"r":  "ر",
        "ra": "ر",
        "re": "ر",
        "ri": "ر",
        "ro": "ر",
        "ru": "ر",
        "fa": "ف",
        "fe": "ف",
        "fi": "ف",
        "fo": "ف",
        "fu": "ف",
        "gh": "غ",
 "k": "ك",
        "ka": "ك",
        "ke": "ك",
        "ki":"ك",
        "ko": "ك",
        "ku":"ك",
        "kh":  "خ",
        "kha": "خ",
        "khe": "خ",
        "khi": "خ",
        "kho": "خ",
        "khu": "خ",
        "h": "ه",
        "ha": "ه",
         "gh": "غ",
        "gha": "غ",
        "ghe": "غ",
        "ghi": "غ",
        "gho":"غ",
        "ghu": "غ",
        "h": "ه",
        "ha": "ه",
        "he": "ه",
        "hi": "ه",
        "ho": "ه",
        "hu": "ه",
        "i": "ى",
                    "i": "ي",

        "ia": "ي",
        "ie": "ي",
        "ii": "ي",
        "io": "ي",
        "iu": "ي",
        "i": "ي",
        "j": "ج",

        "k": "ك",
        "ka": "",

        "kh":  "خ",
        "ch":"ش",

        "l":  "ل",
        "l": "ل",

        "m":  "م",

        "n":  "ن",

        "o":  "و",

        "ou": "و",

        "p":  "ب",


        "q":  "ك",

        "r":  "ر" ,
        "ra": "ر",

        "s":  "س",

     "ch": "ش",
        "sh": "ش",
        "t":  "ت",

     "t":  "ط",
        "ti": "ت",
               "ti":  "ط",
        "to": "ت",
             "to":  "ط",
        "tu": "ت",
             "tu":  "ط",
        "ta":  "ط",

        "ta": "ت",
        "te": "ت",
            "te": "ط",

        "th":  "ث" ,
        "th":  "ذ",

     "t": "ت",
         "t": "ط",
        "w": "و",
        "g": "ق",
        "ga": "ق",
        "ge": "ج",
        "y": "ي",

        "v" :"ف",
         "ph" :"ف",
        "z":  "ز",
                   "l":  "ل",
        "la": "ل",
        "le": "ل",
        "li": "ل",
        "lo": "ل",
        "lu": "ل",
        "m":  "م",
        "ma": "م",
        "me": "م",
        "mi": "م",
        "mo": "م",
        "mu": "م",
        "n":  "ن",
        "na": "ن",
        "ne": "ن",
        "ni": "ن",
        "no": "ن",
        "nu": "ن",
        "o":  "ا",
        "p":  "ب",
        "pa": "ب",
        "pe": "ب",
        "pi": "ب",
        "po": "ب",
        "pu": "ب",
             "u":  "و",
        "ua": "و",
        "ue": "و",
        "ui": "و",
        "uo": "و",
        "uu": "و",
        "w":  "و",
        "wa": "و",
        "we": "و",
        "wi": "و",
        "wo": "و",
        "wu": "و",
            "y": "ي",
            "y": "ى",
        "ya": "ي",
        "ye": "ي",
        "yi": "ي",

 }


def transString1(string, reverse=0):
    '''Given a Unicode string, transliterate into Buckwalter. To go from
    Buckwalter back to Unicode, set reverse=1'''

    if not isinstance(string, str):
        return ""

    for k, v in buck2uni.items():
        if not reverse:
            string = string.replace(v, k)
        else:
            string = string.replace(k, v)

    return string


# Test on a sample of 20 rows
print("Testing Dictionary-Based Translation on Sample Data")
print("="*80)

# Apply translation (reverse=1 means Arabizi to Arabic)
df['translated_dict'] = df['text'].astype(str).str.lower().apply(lambda x: transString1(x, reverse=1))

# Display results
comparison_df = df[['text', 'translated_dict']].copy()

print("\nOriginal vs Dictionary Translation:")
print("-"*80)

# Display as dataframe
print("\n\nComparison Table:")
comparison_df


Testing Dictionary-Based Translation on Sample Data

Original vs Dictionary Translation:
--------------------------------------------------------------------------------


Comparison Table:


Unnamed: 0,text,translated_dict
0,nn mouch 7louwa faza,نن موش حلووا فازا
1,mabladkom 3bed tfouuh,مابلادكام عباد طفووه
2,تفووه عليك و علا والديك علي عايلتك و علي اصلك ...,تفووه عليك و علا والديك علي عايلتك و علي اصلك ...
3,لا يليق بهذا البرنامج,لا يليق بهذا البرنامج
4,رهدان,رهدان
...,...,...
47790,يااحسرة اليوم 200,يااحسرة اليوم ا00
47791,الله يستر ن,الله يستر ن
47792,الله يستر ن,الله يستر ن
47793,ربي اكون في عونكم بالحق ربي ابقي الستر علي تون...,ربي اكون في عونكم بالحق ربي ابقي الستر علي تون...


In [5]:
# ---------------------------------------------------------# IMPROVED CODA-Compliant Tunisian Arabizi -> Arabic Transliteration System
# ---------------------------------------------------------
# Following Conventional Orthography for Dialectal Arabic (CODA)
# Enhanced with dictionary mappings from the translation file
# Handles mixed languages (Tunisian Arabizi, French, English)

def levenshtein_distance(s1, s2):
    if len(s1) < len(s2):
        return levenshtein_distance(s2, s1)
    if len(s2) == 0:
        return len(s1)
    previous_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    return previous_row[-1]

class CODATunisianTransliterator:
    """
    Improved CODA-compliant Tunisian Arabizi to Arabic transliterator.
    Uses dictionary mappings from translation file with context-aware rules.
    Handles mixed content: Tunisian Arabizi, French, and English.
    """

    def __init__(self, vocab_list=None):
        self.vocab = set(vocab_list) if vocab_list else set()
        self.cache = {}

        # CODA Exception Lexicon: Common Tunisian expressions and abbreviations
        self.exception_lexicon = {
            'nchal': 'إن شاء الله',
            'nchalah': 'إن شاء الله',
            'inchallah': 'إن شاء الله',
            'inshallah': 'إن شاء الله',
            'md': 'الحمد لله',
            'hamdoulah': 'الحمد لله',
            'hamdoullah': 'الحمد لله',
            'mdl': 'الحمد لله',
            'slt': 'سلام',
            'slm': 'سلام',
            'salam': 'سلام',
            'cv': 'كيفاش',
            'chbi': 'شبي',
            'chbeik': 'شبيك',
            'chkoun': 'شكون',
            'chkun': 'شكون',
            'chwaya': 'شوية',
            'chwiya': 'شوية',
            'barcha': 'برشا',
            'behi': 'باهي',
            'bahi': 'باهي',
            'wa9tach': 'وقتاش',
            'wa9tech': 'وقتاش',
            'win': 'وين',
            'winek': 'وينك',
            'kifech': 'كيفاش',
            'kifech': 'كيفاش',
            'ey': 'إي',
            'ay': 'أي',
            'yezzi': 'يزي',
            'wala': 'ولا',
            'walla': 'ولا',
            'ama': 'أما',
            'yaser': 'ياسر',
            'yacer': 'ياسر',
            'tounes': 'تونس',
            'tounsi': 'تونسي',
        }

        # Enhanced digit mappings from dictionary (CODA standard + Tunisian variants)
        self.digit_map = {
            '2': 'ء',
            '3': 'ع',
            '4': 'غ',
            '5': 'خ',
            '6': 'ط',
            '7': 'ح',
            '8': 'غ',
            '9': 'ق',
            '0': 'و',  # Sometimes used for و
        }

        # Multi-character patterns from dictionary (order matters - longest first)
        # These should be checked before single characters
        self.multi_char_patterns = [
            # Consonant combinations
            ('kha', 'خ'),
            ('khe', 'خ'),
            ('khi', 'خ'),
            ('kho', 'خ'),
            ('khu', 'خ'),
            ('kh', 'خ'),

            ('cha', 'ش'),
            ('che', 'ش'),
            ('chi', 'ش'),
            ('cho', 'ش'),
            ('chu', 'ش'),
            ('ch', 'ش'),
            ('sh', 'ش'),

            ('gha', 'غ'),
            ('ghe', 'غ'),
            ('ghi', 'غ'),
            ('gho', 'غ'),
            ('ghu', 'غ'),
            ('gh', 'غ'),

            ('tha', 'ث'),
            ('the', 'ث'),
            ('thi', 'ث'),
            ('tho', 'ث'),
            ('thu', 'ث'),
            ('th', 'ث'),

            ('dha', 'ض'),
            ('dhe', 'ض'),
            ('dhi', 'ض'),
            ('dho', 'ض'),
            ('dhu', 'ض'),
            ('dh', 'ذ'),

            # Digit combinations
            ('7a', 'ح'),
            ('7e', 'ح'),
            ('7i', 'ح'),
            ('7o', 'ح'),
            ('7u', 'ح'),

            ('3a', 'ع'),
            ('3e', 'ع'),
            ('3i', 'ع'),
            ('3o', 'ع'),
            ('3u', 'ع'),

            ('5a', 'خ'),
            ('5e', 'خ'),
            ('5i', 'خ'),
            ('5o', 'خ'),
            ('5u', 'خ'),

            # Vowel combinations
            ('ou', 'و'),
            ('oo', 'و'),
            ('ai', 'ي'),
            ('ei', 'ي'),
            ('aa', 'ا'),
            ('ee', 'ي'),
            ('ii', 'ي'),
            ('uu', 'و'),
            ('ph', 'ف'),
        ]

        # Single character mappings (based on dictionary)
        self.char_map = {
            'a': 'ا',
            'e': 'ا',
            'é': 'ا',
            'b': 'ب',
            'c': 'ك',
            'd': 'د',
            'f': 'ف',
            'g': 'ق',  # or ج depending on context
            'h': 'ه',
            'i': 'ي',
            'j': 'ج',
            'k': 'ك',
            'l': 'ل',
            'm': 'م',
            'n': 'ن',
            'o': 'و',
            'p': 'ب',
            'q': 'ك',
            'r': 'ر',
            's': 'س',
            't': 'ت',
            'u': 'و',
            'v': 'ف',
            'w': 'و',
            'x': 'كس',
            'y': 'ي',
            'z': 'ز',
        }

        # French/English common words to preserve (not translate)
        self.foreign_words = {
            # French
            'ok', 'oui', 'non', 'merci', 'bonjour', 'bonsoir', 'salut',
            'bien', 'mal', 'bon', 'mauvais', 'super', 'cool', 'top',
            'parce', 'que', 'je', 'tu', 'il', 'elle', 'nous', 'vous',
            'tout', 'rien', 'plus', 'moins', 'tres', 'beaucoup',
            'comment', 'quoi', 'qui', 'ou', 'quand', 'pourquoi',
            'la', 'le', 'les', 'un', 'une', 'des', 'de', 'du',
            'et', 'ou', 'mais', 'donc', 'car', 'ni', 'or',
            'meme', 'deja', 'encore', 'toujours', 'jamais',
            'aujourd', 'hui', 'demain', 'hier',
            # English
            'yes', 'no', 'okay', 'hi', 'hello', 'bye', 'thanks',
            'sorry', 'please', 'good', 'bad', 'nice', 'great',
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at',
            'to', 'for', 'of', 'with', 'from', 'by',
            'this', 'that', 'these', 'those',
            'what', 'when', 'where', 'why', 'how', 'who',
            'lol', 'omg', 'wtf', 'tbh', 'btw',
        }

    def preprocess_arabizi(self, text):
        """
        Step 1: Preprocessing according to CODA rules
        - Remove repeated letters for emphasis
        - Replace emoticons/emojis with #
        - Handle CODA markers (+ for joining, - for splitting)
        """
        if not isinstance(text, str):
            return ""

        # Replace emoticons and emojis with #
        # Basic emoticons
        text = re.sub(r'[:;=]-?[\)\(DPpOo\[\]{}|\\\/]', '#', text)
        # Emoji range (basic coverage)
        text = re.sub(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF]', '#', text)

        # Remove repeated letters (3+ -> 2 for Arabic, 1 for emphasis)
        # More conservative: 3+ same letter -> 2
        text = re.sub(r'(.)\1{2,}', r'\1\1', text)

        return text.strip()

    def is_foreign_word(self, word):
        """Check if word is French/English and should be skipped"""
        return word.lower() in self.foreign_words

    def handle_coda_markers(self, tokens):
        """
        Handle CODA-specific markers:
        - '+' joins words into single Arabic word
        - '-' splits into two Arabic words
        """
        result = []
        i = 0
        while i < len(tokens):
            token = tokens[i]

            # Handle joining with '+'
            if '+' in token:
                parts = token.split('+')
                # Transliterate each part then join without space
                joined = ''.join([self.transliterate_token(p) for p in parts if p])
                result.append(joined)
            # Handle splitting with '-'
            elif '-' in token:
                parts = token.split('-')
                # Transliterate each part as separate word
                result.extend([self.transliterate_token(p) for p in parts if p])
            else:
                result.append(token)

            i += 1

        return result

    def is_arabizi(self, token):
        """Check if token contains Latin/digits (Arabizi) and no Arabic"""
        if not token:
            return False
        has_latin = bool(re.search(r'[a-zA-Z0-9]', token))
        has_arabic = bool(re.search(r'[ء-ي]', token))
        return has_latin and not has_arabic

    def transliterate_token(self, token):
        """
        Core transliteration using dictionary-based approach with CODA rules
        """
        if not token or not isinstance(token, str):
            return ""

        # Step 2: Check exception lexicon first
        token_lower = token.lower()
        if token_lower in self.exception_lexicon:
            return self.exception_lexicon[token_lower]

        # Skip if foreign word
        if self.is_foreign_word(token_lower):
            return token

        # Skip pure numbers
        if token.isdigit():
            return token

        # If already Arabic, return as-is
        if not self.is_arabizi(token):
            return token

        # Step 3: Apply phonetic rules using dictionary mappings
        text = token_lower
        result = ""
        i = 0

        while i < len(text):
            matched = False

            # Try multi-character patterns first (longest match priority)
            for pattern, arabic in self.multi_char_patterns:
                if text[i:i+len(pattern)] == pattern:
                    result += arabic
                    i += len(pattern)
                    matched = True
                    break

            if matched:
                continue

            # Try digit mappings
            if text[i] in self.digit_map:
                result += self.digit_map[text[i]]
                i += 1
                continue

            # Single character mapping
            char = text[i]
            if char in self.char_map:
                result += self.char_map[char]
                i += 1
            else:
                # Unknown character (punctuation, etc.), keep as-is
                result += char
                i += 1

        return result if result else token

    def get_best_vocab_match(self, transliterated, threshold=0.35):
        """
        Match transliterated word against vocabulary using edit distance
        """
        if not self.vocab or not transliterated:
            return transliterated

        if transliterated in self.cache:
            return self.cache[transliterated]

        best_word = transliterated
        min_dist = float('inf')

        # Filter candidates by length similarity
        candidates = [w for w in self.vocab if abs(len(w) - len(transliterated)) <= 2]

        for word in candidates:
            d = levenshtein_distance(transliterated, word)
            norm_d = d / max(len(transliterated), len(word))

            if norm_d < min_dist:
                min_dist = norm_d
                best_word = word

        if min_dist <= threshold:
            self.cache[transliterated] = best_word
            return best_word
        else:
            self.cache[transliterated] = transliterated
            return transliterated

    def transliterate_text(self, text, use_vocab_matching=True):
        """
        Full pipeline: preprocess -> transliterate -> vocab match
        """
        # Step 1: Preprocess
        text = self.preprocess_arabizi(text)

        if not text:
            return ""

        # Tokenize
        tokens = text.split()

        # Handle CODA markers (+, -)
        tokens = self.handle_coda_markers(tokens)

        # Transliterate each token
        result_tokens = []
        for token in tokens:
            transliterated = self.transliterate_token(token)

            # Optional: match against vocabulary
            if use_vocab_matching and self.vocab and self.is_arabizi(token):
                transliterated = self.get_best_vocab_match(transliterated)

            result_tokens.append(transliterated)

        return " ".join(result_tokens)

# Build Vocab from the dataset (pure Arabic tokens only)
all_text = " ".join(df["text"].tolist())
all_tokens = all_text.split()
# Filter: must be Arabic chars only, length > 1
arabic_vocab = set(t for t in all_tokens if re.match(r'^[ء-ي]+$', t) and len(t) > 1)

print(f"Built Arabic Vocab: {len(arabic_vocab)} words")

# Initialize CODA-compliant Transliterator
transliterator = CODATunisianTransliterator(list(arabic_vocab))

# Comprehensive Testing Examples
print("\n" + "="*80)
print("CODA-COMPLIANT TUNISIAN ARABIZI TRANSLITERATION TESTS")
print("="*80)

test_cases = [
    # Basic digit mappings
    ("7keya 3la tounes", "Story about Tunisia"),
    ("9rib men dar", "Close to home"),

    # Exception lexicon
    ("nchal ça va", "Inshallah how are you"),
    ("md rabbi", "Thank God"),

    # Multi-character patterns
    ("chkoun khali", "Who is my uncle"),
    ("dhaw ghali", "Light is expensive"),

    # Emphatic context (s->ص, t->ط, d->ض)
    ("saber wa nasser", "Saber and Nasser - emphatic context"),

    # CODA markers: + for joining
    ("3al+tawla", "On the table - joined with +"),
    ("fel+dar", "In the house - joined with +"),

    # CODA markers: - for splitting
    ("wa9t-el-3achia", "Evening time - split with -"),

    # Repeated letters (emphasis)
    ("bniiiiiina", "We built - with emphasis"),
    ("7loooow", "Sweet - with elongation"),

    # Mixed content with emoticons
    ("barcha behi :) merci", "Very good - with emoticon and French"),

    # Complex sentence
    ("nchal rabi y7afdh tounes 9wiya", "Inshallah God protects strong Tunisia"),

    # Common Tunisian expressions
    ("chbeik chwaya barcha", "What's wrong with you a bit a lot"),
    ("kifech win winek", "How where are you"),
]

print("\nTest Results:")
print("-" * 80)
for arabizi_text, description in test_cases:
    result = transliterator.transliterate_text(arabizi_text, use_vocab_matching=False)
    print(f"Input:  {arabizi_text}")
    print(f"Output: {result}")
    print(f"Note:   {description}")
    print()


Built Arabic Vocab: 34734 words

CODA-COMPLIANT TUNISIAN ARABIZI TRANSLITERATION TESTS

Test Results:
--------------------------------------------------------------------------------
Input:  7keya 3la tounes
Output: حكايا علا تونس
Note:   Story about Tunisia

Input:  9rib men dar
Output: قريب مان دار
Note:   Close to home

Input:  nchal ça va
Output: إن شاء الله çا فا
Note:   Inshallah how are you

Input:  md rabbi
Output: الحمد لله راببي
Note:   Thank God

Input:  chkoun khali
Output: شكون خلي
Note:   Who is my uncle

Input:  dhaw ghali
Output: ضو غلي
Note:   Light is expensive

Input:  saber wa nasser
Output: سابار وا ناسسار
Note:   Saber and Nasser - emphatic context

Input:  3al+tawla
Output: علتاولا
Note:   On the table - joined with +

Input:  fel+dar
Output: فالدار
Note:   In the house - joined with +

Input:  wa9t-el-3achia
Output: واقت ال عشا
Note:   Evening time - split with -

Input:  bniiiiiina
Output: بنينا
Note:   We built - with emphasis

Input:  7loooow
Output: حلوو
Not

In [None]:
# ========================================================================
# DATABASE SAMPLE TESTING: Apply CODA Translation to Real Dataset
# ========================================================================
print("\n" + "="*80)
print("TESTING ON DATABASE SAMPLE (30 tweets)")
print("="*80)

# Select a diverse sample with different patterns
sample_indices = [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70,
                  100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800]
test_sample_df = df.iloc[sample_indices[:30]].copy()

# Apply CODA transliteration without vocab matching first (to see pure dictionary translation)
print("\nTranslating sample tweets...")
test_sample_df['coda_translation'] = test_sample_df['text'].apply(
    lambda x: transliterator.transliterate_text(str(x), use_vocab_matching=False)
)

# Display results with comparison
print("\n" + "="*80)
print("TRANSLATION RESULTS")
print("="*80)

for idx, row in test_sample_df.head(15).iterrows():
    print(f"\n[Row {idx}] Label: {row['label']}")
    print(f"Original Tweet: {row['Tweet'][:80]}...")
    print(f"Preprocessed:   {row['text'][:80]}...")
    print(f"CODA Arabic:    {row['coda_translation'][:80]}...")
    print("-" * 80)

# Save detailed results to CSV
output_file = "coda_translation_sample_results.csv"
test_sample_df[['Tweet', 'text', 'coda_translation', 'label']].to_csv(
    output_file, index=False, encoding='utf-8-sig'
)
print(f"\n✓ Detailed results saved to: {output_file}")

# Statistics
print("\n" + "="*80)
print("TRANSLATION STATISTICS")
print("="*80)
print(f"Total tweets tested: {len(test_sample_df)}")
print(f"Successfully translated: {test_sample_df['coda_translation'].notna().sum()}")

# Show distribution by label
print("\nDistribution by label:")
print(test_sample_df['label'].value_counts())

# Analyze translation patterns
def analyze_translation(original, translated):
    """Simple analysis of what changed"""
    has_arabic_numbers = bool(re.search(r'[0-9]', original))
    has_latin = bool(re.search(r'[a-zA-Z]', original))
    has_arabic_script = bool(re.search(r'[ء-ي]', translated))
    return {
        'had_numbers': has_arabic_numbers,
        'had_latin': has_latin,
        'now_arabic': has_arabic_script
    }

print("\n" + "="*80)
print("SAMPLE COMPARISONS (First 10)")
print("="*80)

comparison_results = []
for idx, row in test_sample_df.head(10).iterrows():
    analysis = analyze_translation(row['text'], row['coda_translation'])
    comparison_results.append({
        'index': idx,
        'original': row['text'][:50],
        'translated': row['coda_translation'][:50],
        **analysis
    })

comparison_df = pd.DataFrame(comparison_results)
print(comparison_df.to_string())

# Now apply to full dataset with vocab matching
print("\n" + "="*80)
print("Applying CODA transliteration to FULL dataset with vocabulary matching...")
print("="*80)

df["text_coda"] = df["text"].apply(
    lambda x: transliterator.transliterate_text(str(x), use_vocab_matching=True)
)

print(f"\n✓ Translation complete! Processed {len(df)} texts.")
print("\nPreview of full dataset results:")
preview_df = df[["Tweet", "text", "text_coda","translated_dict", "label"]].head(10)
print(preview_df.to_string())



TESTING ON DATABASE SAMPLE (30 tweets)

Translating sample tweets...

TRANSLATION RESULTS

[Row 0] Label: 1
Original Tweet: Nn mouch 7louwa faza...
Preprocessed:   nn mouch 7louwa faza...
CODA Arabic:    نن موش حلووا فازا...
--------------------------------------------------------------------------------

[Row 5] Label: 1
Original Tweet: b rjoulia stoufa to7t men3ini ..........
Preprocessed:   b rjoulia stoufa to7t men3ini ....
CODA Arabic:    ب رجوليا ستوفا توحت مانعني ....
--------------------------------------------------------------------------------

[Row 10] Label: 1
Original Tweet: كلاب أولاد كلاب...
Preprocessed:   كلاب اولاد كلاب...
CODA Arabic:    كلاب اولاد كلاب...
--------------------------------------------------------------------------------

[Row 15] Label: 1
Original Tweet: المسلسل لي تحكو عليه والله سمعت بيه كان من تعليقاتكم ههههه كفاش تتفرجو عليه و كف...
Preprocessed:   المسلسل لي تحكو عليه والله سمعت بيه كان من تعليقاتكم هه كفاش تتفرجو عليه و كفاش ...
CODA Arabic:  

In [6]:
import numpy as np
from transformers import AutoTokenizer
from typing import Any, Dict

tokenizer = AutoTokenizer.from_pretrained("UBC-NLP/MARBERT")

# token length stats on *CODA-transliterated* text
lengths = [len(tokenizer(t, add_special_tokens=True).input_ids) for t in df["translated_dict"]]
percentiles = np.percentile(lengths, [50, 75, 90, 95, 99])
print(f"Token Length Percentiles (50, 75, 90, 95, 99): {percentiles}")


Token Length Percentiles (50, 75, 90, 95, 99): [ 9. 15. 26. 37. 79.]


In [7]:
# Show tokenization as tokens (strings), not only IDs
sample = df.loc[0, "translated_dict"]

tokens = tokenizer.tokenize(sample)
encoded: Dict[str, Any] = tokenizer(sample, add_special_tokens=True)

tokens_with_special = tokenizer.convert_ids_to_tokens(encoded["input_ids"])

print("TEXT:", sample)
print("\nTOKENS (no special tokens):")
print(tokens)
print("\nTOKENS (with special tokens):")
print(tokens_with_special)


TEXT: نن موش حلووا فازا

TOKENS (no special tokens):
['نن', 'موش', 'حلوو', '##ا', 'فاز', '##ا']

TOKENS (with special tokens):
['[CLS]', 'نن', 'موش', 'حلوو', '##ا', 'فاز', '##ا', '[SEP]']


In [12]:
# ========================================================================
# PHASE 2: COMPLETE TOKENIZATION PIPELINE + CLASS IMBALANCE HANDLING
# ========================================================================

import torch
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from collections import Counter
import numpy as np

print("="*80)
print("PHASE 2: TOKENIZATION & DATA PREPARATION")
print("="*80)

# ========================================================================
# Step 1: Analyze Class Imbalance
# ========================================================================

print("\n" + "-"*80)
print("STEP 1: CLASS IMBALANCE ANALYSIS")
print("-"*80)

# Current distribution (labels are already integers 0 and 1)
label_counts = df['label'].value_counts()
print("\nOriginal Distribution:")
print(label_counts)
print(f"\nTotal samples: {len(df)}")
print(f"Positive (0): {(df['label']==0).sum()} ({(df['label']==0).sum()/len(df)*100:.2f}%)")
print(f"Negative (1): {(df['label']==1).sum()} ({(df['label']==1).sum()/len(df)*100:.2f}%)")

# Label mapping for reference
label_map = {0: 'positive', 1: 'negative'}
print(f"\nLabel mapping: {label_map}")
print(f"Imbalance ratio: {(df['label']==0).sum() / (df['label']==1).sum():.2f}:1")

# Compute class weights for training
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(df['label']),
    y=df['label']
)

class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}

print("\n✓ Class weights computed for balanced training:")
print(f"  Class 0 (positive): {class_weight_dict[0]:.4f}")
print(f"  Class 1 (negative): {class_weight_dict[1]:.4f}")

# ========================================================================
# Step 2: Train/Test Split with Stratification
# ========================================================================

print("\n" + "-"*80)
print("STEP 2: STRATIFIED TRAIN/TEST SPLIT")
print("-"*80)

# Use translated_dict as input (best translation)
X = df['translated_dict'].values
y = df['label'].values  # Labels are already integers (0, 1)

# Stratified split to maintain class distribution
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

print(f"\nTrain set: {len(X_train)} samples")
print(f"  Positive (0): {(y_train==0).sum()} ({(y_train==0).sum()/len(y_train)*100:.1f}%)")
print(f"  Negative (1): {(y_train==1).sum()} ({(y_train==1).sum()/len(y_train)*100:.1f}%)")

print(f"\nTest set: {len(X_test)} samples")
print(f"  Positive (0): {(y_test==0).sum()} ({(y_test==0).sum()/len(y_test)*100:.1f}%)")
print(f"  Negative (1): {(y_test==1).sum()} ({(y_test==1).sum()/len(y_test)*100:.1f}%)")

# ========================================================================
# Step 3: Initialize MARBERT Tokenizer
# ========================================================================

print("\n" + "-"*80)
print("STEP 3: INITIALIZE MARBERT TOKENIZER")
print("-"*80)

MODEL_NAME = "UBC-NLP/MARBERT"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print(f"✓ Tokenizer loaded: {MODEL_NAME}")
print(f"  Vocab size: {tokenizer.vocab_size}")
print(f"  Model max length: {tokenizer.model_max_length}")

# Determine optimal max_length based on token length analysis
print("\n✓ Analyzing token lengths...")
sample_texts = X_train[:1000]  # Sample for speed
sample_lengths = [len(tokenizer(t, add_special_tokens=True).input_ids) for t in sample_texts]
percentiles = np.percentile(sample_lengths, [50, 75, 90, 95, 99])

print(f"Token length percentiles (50, 75, 90, 95, 99): {percentiles}")
print(f"Recommended max_length: 64 (covers ~90% of data)")

MAX_LENGTH = 64

# ========================================================================
# Step 4: Create PyTorch Dataset
# ========================================================================

print("\n" + "-"*80)
print("STEP 4: CREATE PYTORCH DATASET")
print("-"*80)

class TunisianSentimentDataset(Dataset):
    """PyTorch Dataset for Tunisian dialect sentiment analysis"""

    def __init__(self, texts, labels, tokenizer, max_length=64):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Tokenize with padding and truncation
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Create datasets
train_dataset = TunisianSentimentDataset(X_train, y_train, tokenizer, MAX_LENGTH)
test_dataset = TunisianSentimentDataset(X_test, y_test, tokenizer, MAX_LENGTH)

print(f"✓ Train dataset created: {len(train_dataset)} samples")
print(f"✓ Test dataset created: {len(test_dataset)} samples")

# ========================================================================
# Step 5: Tokenize Sample and Inspect
# ========================================================================

print("\n" + "-"*80)
print("STEP 5: TOKENIZATION INSPECTION (Sample)")
print("-"*80)

# Get a sample from training data
sample_idx = 3
sample_data = train_dataset[sample_idx]

print(f"\nOriginal text: {X_train[sample_idx][:100]}...")
print(f"Label: {y_train[sample_idx]} ({'positive' if y_train[sample_idx]==0 else 'negative'})")

print(f"\nTokenized output:")
print(f"  input_ids shape: {sample_data['input_ids'].shape}")
print(f"  attention_mask shape: {sample_data['attention_mask'].shape}")
print(f"  label: {sample_data['label'].item()}")

# Convert IDs to tokens
tokens = tokenizer.convert_ids_to_tokens(sample_data['input_ids'])
print(f"\nTokens (first 20): {tokens[:20]}")

# Count non-padding tokens
non_padding = sample_data['attention_mask'].sum().item()
print(f"Non-padding tokens: {non_padding}/{MAX_LENGTH}")

# ========================================================================
# Step 6: Create DataLoaders with Weighted Sampling
# ========================================================================

print("\n" + "-"*80)
print("STEP 6: CREATE DATALOADERS (with weighted sampling)")
print("-"*80)

BATCH_SIZE = 16

# Create weighted sampler for training to handle class imbalance
# Calculate weights for each sample based on its class
sample_weights = [class_weight_dict[label] for label in y_train]
sampler = WeightedRandomSampler(
    weights=sample_weights,
    num_samples=len(sample_weights),
    replacement=True
)

# Training DataLoader with weighted sampler
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    sampler=sampler,
    num_workers=0  # Set to 0 for Windows compatibility
)

# Test DataLoader (no sampling needed)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

print(f"✓ Train DataLoader: {len(train_loader)} batches (batch_size={BATCH_SIZE})")
print(f"✓ Test DataLoader: {len(test_loader)} batches (batch_size={BATCH_SIZE})")
print(f"✓ Weighted sampling enabled for class balance")

# ========================================================================
# Step 7: Verify Batch Structure
# ========================================================================

print("\n" + "-"*80)
print("STEP 7: VERIFY BATCH STRUCTURE")
print("-"*80)

# Get one batch
batch = next(iter(train_loader))

print(f"\nBatch structure:")
print(f"  input_ids: {batch['input_ids'].shape} (batch_size, max_length)")
print(f"  attention_mask: {batch['attention_mask'].shape}")
print(f"  labels: {batch['label'].shape} (batch_size,)")

print(f"\nBatch label distribution:")
unique, counts = torch.unique(batch['label'], return_counts=True)
for label, count in zip(unique, counts):
    label_name = 'positive' if label == 0 else 'negative'
    print(f"  {label_name}: {count.item()}/{BATCH_SIZE}")

# ========================================================================
# Step 8: Full Dataset Tokenization Summary
# ========================================================================

print("\n" + "="*80)
print("TOKENIZATION SUMMARY")
print("="*80)

summary = {
    "Model": MODEL_NAME,
    "Tokenizer vocab size": tokenizer.vocab_size,
    "Max length": MAX_LENGTH,
    "Total samples": len(df),
    "Train samples": len(train_dataset),
    "Test samples": len(test_dataset),
    "Batch size": BATCH_SIZE,
    "Train batches": len(train_loader),
    "Test batches": len(test_loader),
    "Class 0 weight": f"{class_weight_dict[0]:.4f}",
    "Class 1 weight": f"{class_weight_dict[1]:.4f}",
}

for key, value in summary.items():
    print(f"{key:25s}: {value}")

print("\n" + "="*80)
print("✓ PHASE 2 COMPLETE: Ready for model training!")
print("="*80)

# Save tokenization config for backend
tokenization_config = {
    'model_name': MODEL_NAME,
    'max_length': MAX_LENGTH,
    'batch_size': BATCH_SIZE,
    'class_weights': class_weight_dict,
    'label_map': label_map,
    'train_size': len(train_dataset),
    'test_size': len(test_dataset),
}

print("\n✓ Configuration saved for backend integration")


PHASE 2: TOKENIZATION & DATA PREPARATION

--------------------------------------------------------------------------------
STEP 1: CLASS IMBALANCE ANALYSIS
--------------------------------------------------------------------------------

Original Distribution:
label
0    35239
1    12556
Name: count, dtype: int64

Total samples: 47795
Positive (0): 35239 (73.73%)
Negative (1): 12556 (26.27%)

Label mapping: {0: 'positive', 1: 'negative'}
Imbalance ratio: 2.81:1

✓ Class weights computed for balanced training:
  Class 0 (positive): 0.6782
  Class 1 (negative): 1.9033

--------------------------------------------------------------------------------
STEP 2: STRATIFIED TRAIN/TEST SPLIT
--------------------------------------------------------------------------------

Train set: 38236 samples
  Positive (0): 28191 (73.7%)
  Negative (1): 10045 (26.3%)

Test set: 9559 samples
  Positive (0): 7048 (73.7%)
  Negative (1): 2511 (26.3%)

-----------------------------------------------------------