In [1]:
!pip install datasets tqdm pandas numpy uuid datetime requests groq transformers torch sentence-transformers groq camel-tools

Collecting uuid
  Downloading uuid-1.30.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datetime
  Downloading DateTime-5.5-py3-none-any.whl.metadata (33 kB)
Collecting groq
  Downloading groq-0.31.0-py3-none-any.whl.metadata (16 kB)
Collecting camel-tools
  Downloading camel_tools-1.5.6-py3-none-any.whl.metadata (10 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting zope.interface (from datetime)
  Downloading zope.interface-7.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-run

In [8]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
GROQ_API_KEY = user_secrets.get_secret("GROQ_API_KEY")
HF_TOKEN = user_secrets.get_secret("HF_TOKEN")

In [9]:
from huggingface_hub import login
login(new_session=False,token=HF_TOKEN)

## Imports

In [10]:
# parallel_corpus_generator.py
# --- For Scoring (Examples - you'll need to implement these) ---
from sentence_transformers import SentenceTransformer
import torch
import os
import uuid
import json
import datetime
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd
import numpy as np
import re
# --- For Groq API ---
# pip install groq
try:
    from groq import Groq
    GROQ_AVAILABLE = True
except ImportError:
    print("Groq library not found. Install with 'pip install groq' if you plan to use Groq API.")
    GROQ_AVAILABLE = False



### --- Configuration ---

In [11]:
DATASET_NAME = "hamzabouajila/tunisian-derja-unified-raw-corpus"
OUTPUT_FILE = "tunisian_msa_parallel_corpus.jsonl"
MAX_ATTEMPTS = 3
SCORE_THRESHOLD = 0.7 # Composite score threshold for acceptance
USE_GROQ = True # Set to False to use local LLM function
LOCAL_LLM_NAME = "your_local_model_name_or_path" # Specify if not using Groq
GROQ_MODEL = "llama3-70b-8192" # Example Groq model
# GROQ_MODEL = "mixtral-8x7b-32768" # Another option
BATCH_SIZE = 10 # Process sentences in batches for efficiency (adjust as needed)
SPLIT_RATIO = {"train": 0.8, "validation": 0.1, "test": 0.1} # Approximate split ratio
# --- Initialize Groq Client (if using) ---
if USE_GROQ and GROQ_AVAILABLE:
    try:
        groq_client = Groq(api_key=GROQ_API_KEY)
        if not groq_client.api_key:
            raise ValueError("GROQ_API_KEY environment variable not set.")
    except Exception as e:
        print(f"Error initializing Groq client: {e}")
        USE_GROQ = False
elif USE_GROQ and not GROQ_AVAILABLE:
    USE_GROQ = False



### --- Translation ---

In [10]:

def translate_tn_to_msa_local(tn_sentence):
    """Placeholder for translating using a local LLM."""
    # Implement your local LLM translation logic here.
    # Example using transformers pipeline:
    try:
        messages = [
            {"role": "user", "content": f"ترجمة الجملة التونسية إلى العربية الفصحى: {tn_sentence}"},
                ]
        result = pipe(messages, temperature=0.5)
        return result[0]['generated_text'][-1]["content"].split("\n\n")[-1].split("الترجمة هي:")[-1].strip().replace("**","")
    # .split("MSA Translation:")[-1].strip()
    except Exception as e:
         print(f"Error translating '{tn_sentence}' with local LLM: {e}")
         return None
    #Example using llama-cpp-python (requires setup):
    # from llama_cpp import Llama
    # llm = Llama(model_path="path/to/your/gguf/model")
    # prompt = f"Translate the following Tunisian Arabic text to Modern Standard Arabic:\n\n{tn_sentence}\n\nMSA Translation:"
    # output = llm(prompt, max_tokens=512, stop=["\n"], echo=True)
    # return output['choices'][0]['text'].split("MSA Translation:")[-1].strip()

    # Placeholder return
    # print(f"Local LLM translation not implemented for: {tn_sentence}")
    # return f"[LOCAL_LLM_TRANSLATION_PLACEHOLDER] {tn_sentence}" # Indicate it's a placeholder


# --- Translation Functions ---
def translate_tn_to_msa_groq(tn_sentence):
    """Translate Tunisian Arabic to MSA using Groq API."""
    if not USE_GROQ or not GROQ_AVAILABLE:
        raise ValueError("Groq API not configured or available.")
    try:
        prompt = f"Translate the following Tunisian Arabic text to Modern Standard Arabic:\n\n{tn_sentence}\n\nMSA Translation:"
        chat_completion = groq_client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            model=GROQ_MODEL,
            temperature=0.2, # Lower temperature for more deterministic output
            max_tokens=512, # Adjust based on expected sentence length
            top_p=1,
            stop=None,
            stream=False,
        )
        msa_translation = chat_completion.choices[0].message.content.strip()
        return msa_translation
    except Exception as e:
        print(f"Error translating '{tn_sentence}' with Groq: {e}")
        return None




def translate_tn_to_msa(tn_sentence):
    """Wrapper to select translation method."""
    if USE_GROQ and GROQ_AVAILABLE:
        return translate_tn_to_msa_groq(tn_sentence)
    else:
        return translate_tn_to_msa_local(tn_sentence)


In [11]:
translate_tn_to_msa("شنوة أحوالك؟")

'The Tunisian Arabic text "شنوة أحوالك؟" can be translated to Modern Standard Arabic as:\n\nكيف أحوالك؟\n\n(Kayf aḥwāluk?)\n\nWhich means "How are you?"'

## Preprocessing Functions

### preprocess_sentence

In [None]:
def preprocess_sentence(text):
    """Basic preprocessing for Tunisian Arabic text."""
    if not isinstance(text, str):
        return None
    # Basic cleaning: strip whitespace, remove extra newlines
    text = text.strip()
    text = re.sub(r'\s+', ' ', text) # Replace multiple whitespaces with single space
    # Add more sophisticated cleaning if needed (e.g., normalize specific characters)
    # Filter out very short sentences or sentences with only numbers/punctuation?
    if len(text) < 5: # Example filter
        return None
    return text

### is_valid_tunisian_arabic

In [None]:
model_di = pipeline('text-classification', model='Ammar-alhaj-ali/arabic-MARBERT-dialect-identification-city')


def is_valid_tunisian_arabic(text):
    """
    Determines if the input text is Tunisian Arabic using a pre-trained model.
    
    Args:
        text (str): The text to classify.
        
    Returns:
        bool: True if the text is identified as Tunisian Arabic, False otherwise.
    """
    if not text or not isinstance(text, str):
        return False
        
    try:
        predictions = model_di([text])
        if not predictions or 'label' not in predictions[0]:
            return False
        predictions = predictions[0]['label']
        return predictions in ["Tunis","Sfax"]
        
    except Exception as e:
        # Handle any potential errors during prediction
        print(f"Error in dialect identification: {e}")
        return False

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [16]:
is_valid_tunisian_arabic( "أنا راجل تونسي")

predincting أنا راجل تونسي


True

### calculate_semantic_similarity

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
# Load the pre-trained Arabic embedding model once, outside the function, for efficiency.
# 'UBC-NLP/MARBERT' or 'UBC-NLP/AraT5-base' are strong choices for Arabic.
# As per the search results, newer models like the Arabic Matryoshka Embedding Models 
# or OMER NACAR models are also excellent options.
model = SentenceTransformer('aubmindlab/bert-base-arabertv02') # Replace with your chosen model



def calculate_semantic_similarity(tn_sentence, msa_candidate, model=model):
    """
    Calculate the cosine similarity between the embeddings of a Tunisian Arabic sentence 
    and an MSA candidate sentence.

    Args:
        tn_sentence (str): The Tunisian Arabic sentence.
        msa_candidate (str): The Modern Standard Arabic sentence.
        model: A pre-trained SentenceTransformer model.

    Returns:
        float: The cosine similarity score between 0.0 (no similarity) and 1.0 (identical).
    """
    if not tn_sentence or not msa_candidate or not isinstance(tn_sentence, str) or not isinstance(msa_candidate, str):
        return 0.0

    try:
        # Encode both sentences into dense vector embeddings.
        embeddings = model.encode([tn_sentence, msa_candidate])
        
        # Extract the embeddings for each sentence.
        emb1 = embeddings[0]  # Embedding for Tunisian sentence
        emb2 = embeddings[1]  # Embedding for MSA sentence

        # Calculate cosine similarity.
        cos_sim = np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))
        
        # Ensure the result is a float and clip to valid range due to potential floating-point errors.
        return float(np.clip(cos_sim, 0.0, 1.0))

    except Exception as e:
        print(f"Error calculating semantic similarity: {e}")
        return 0.0


In [None]:
calculate_semantic_similarity("أنا راجل تونسي", "أنا رجل تونسي",model)

### calculate_backtranslation_score

In [None]:
def calculate_backtranslation_score(original_tn, backtranslated_tn, model=None):
    """Calculate similarity between original TN and back-translated TN."""
    # Requires a TN -> MSA -> TN pipeline or a dedicated TN-MSA-TN model
    # Example similarity calculation (e.g., using embedding similarity again)
    try:
        similarity = calculate_semantic_similarity(original_tn, backtranslated_tn, model=model) # Reuse semantic sim function
        return similarity
    except Exception as e:
        print(f"Error calculating backtranslation score: {e}")
        return 0.0


In [None]:
calculate_backtranslation_score("أنا راجل تونسي", "أنا رجل تونسي",model)

### calculate_lm_fluency

In [46]:

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch



# Load the tokenizer and model once, outside the function, for efficiency.
# 'aubmindlab/aragpt2-base' is a good choice for Modern Standard Arabic.
# Other models like 'marefa-nlp/ajeeb-gpt2-large-ar' or 'CAMeL-Lab/bert-base-arabic' 
# (adapted for sequence scoring) are also available.

tokenizer = AutoTokenizer.from_pretrained('aubmindlab/aragpt2-large', trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained('aubmindlab/aragpt2-large', trust_remote_code=True)

def calculate_lm_fluency(msa_candidate, model=model, tokenizer=tokenizer):
    """
    Calculate fluency score for an MSA sentence using a pre-trained Arabic Language Model.
    The score is the negative average log-likelihood. Lower perplexity (higher log-likelihood) 
    indicates higher fluency.
    """
    if not msa_candidate or not isinstance(msa_candidate, str):
        return float('-inf')

    try:
        # Tokenize the input text.
        inputs = tokenizer(msa_candidate, return_tensors="pt", truncation=True, max_length=512)
        
        with torch.no_grad():  # Disable gradient calculation for inference.
            # Get the model's outputs. Pass the input_ids as labels to compute the loss.
            outputs = model(**inputs, labels=inputs["input_ids"])
            # The loss is the cross-entropy loss, which is the average negative log-likelihood.
            avg_log_likelihood = -outputs.loss.item() # Use .item() to get a Python float

        return avg_log_likelihood

    except Exception as e:
        print(f"Error calculating LM fluency: {e}")
        return float('-inf')

In [47]:
calculate_lm_fluency("أنا رجل تونسي",model,tokenizer)

-8.509749412536621

In [None]:


def calculate_ensemble_agreement(msa_candidates, model=None):
    """Calculate agreement among multiple MSA candidates."""
    # Example: Generate N candidates (e.g., with different sampling params)
    # Calculate pairwise similarities and average them
    if len(msa_candidates) < 2:
        return 1.0 # Perfect agreement if only one
    try:
        similarities = []
        for i in range(len(msa_candidates)):
            for j in range(i+1, len(msa_candidates)):
                sim = calculate_semantic_similarity(msa_candidates[i], msa_candidates[j], model=model)
                similarities.append(sim)
        avg_agreement = np.mean(similarities) if similarities else 0.0
        return float(avg_agreement)
    except Exception as e:
        print(f"Error calculating ensemble agreement: {e}")
        return 0.0


def calculate_composite_score(scores_dict):
    """Calculate the final composite score based on individual scores."""
    # Define weights for each component (you can adjust these)
    w_semantic = 0.3
    w_fluency = 0.2
    w_backtrans = 0.3
    w_ensemble = 0.2

    # Normalize scores if needed (especially logprob)
    semantic_score = scores_dict.get('semantic_similarity', 0.0)
    fluency_score = scores_dict.get('lm_logprob', float('-inf'))
    backtrans_score = scores_dict.get('backtranslation_score', 0.0)
    ensemble_score = scores_dict.get('ensemble_agreement', 0.0)

    # Simple normalization for logprob (example - adjust based on your model's range)
    # Assuming logprob is negative, higher (closer to 0) is better
    normalized_fluency = 1.0 / (1.0 + np.exp(-fluency_score)) if fluency_score != float('-inf') else 0.0

    composite = (
        w_semantic * semantic_score +
        w_fluency * normalized_fluency +
        w_backtrans * backtrans_score +
        w_ensemble * ensemble_score
    )
    return composite

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="google/gemma-3-270m-it")

Error calculating LM fluency: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)


-inf

In [12]:

# --- Main Pipeline ---
def generate_parallel_corpus():
    """Main function to run the corpus generation pipeline."""
    print("Loading raw Tunisian corpus...")
    try:
        raw_dataset = load_dataset(DATASET_NAME, split='train') # Assuming 'train' split contains the data
        print(f"Loaded {len(raw_dataset)} examples from {DATASET_NAME}")
    except Exception as e:
        print(f"Error loading dataset {DATASET_NAME}: {e}")
        return

    processed_data = []
    print("Starting translation and scoring pipeline...")
    for i, example in enumerate(tqdm(raw_dataset, desc="Processing")):
        raw_tn_text = example.get('text', example.get('sentence', None)) # Adjust key based on dataset structure
        if not raw_tn_text:
            continue

        processed_tn = preprocess_sentence(raw_tn_text)
        if not processed_tn or not is_valid_tunisian_arabic(processed_tn):
            continue # Skip invalid or filtered sentences

        best_msa_candidate = None
        best_scores = {}
        final_composite_score = 0.0
        accepted = False
        attempts = 0
        all_candidates = [] # For ensemble agreement

        while attempts < MAX_ATTEMPTS and not accepted:
            attempts += 1
            msa_candidate = translate_tn_to_msa(processed_tn)
            if not msa_candidate:
                continue # Skip if translation failed

            all_candidates.append(msa_candidate)

            # --- Scoring (Replace with actual model calls) ---
            scores = {}
            #scores['semantic_similarity'] = calculate_semantic_similarity(processed_tn, msa_candidate) # Pass model if needed
            #scores['lm_logprob'] = calculate_lm_fluency(msa_candidate) # Pass model if needed
            # Placeholder for backtranslation - would require another LLM call TN -> MSA -> TN
            #scores['backtranslation_score'] = calculate_backtranslation_score(processed_tn, f"[BACKTRANSLATED_{processed_tn}]") # Placeholder
            # Placeholder for ensemble - would require generating multiple candidates
            #scores['ensemble_agreement'] = calculate_ensemble_agreement([msa_candidate, f"[CANDIDATE_2_{processed_tn}]", f"[CANDIDATE_3_{processed_tn}]"]) # Placeholder

            #composite_score = calculate_composite_score(scores)

            #if composite_score > final_composite_score:
             #   final_composite_score = composite_score
              #  best_msa_candidate = msa_candidate
               # best_scores = scores.copy() # Store the best scores

            #if composite_score >= SCORE_THRESHOLD:
            #    accepted = True
            #    break # Acceptable candidate found

        # If no acceptable candidate found after MAX_ATTEMPTS, use the best one
        #if not accepted and best_msa_candidate:
         #   accepted = False # Explicitly mark as not accepted if below threshold
            # You might choose to include low-scoring examples with accepted=False
            # For now, let's include the best attempt even if below threshold
            # If you want to discard them, add a condition here.

        # --- Prepare final data record ---
        record = {
            "id": str(uuid.uuid4()),
            "source": best_msa_candidate if best_msa_candidate else "", # MSA as source
            "target": processed_tn, # TN as target
            "source_dialect": raw_tn_text, # Original raw TN text
            "msa_generated": best_msa_candidate if best_msa_candidate else "", # Initial (best) MSA candidate
            "score_composite": final_composite_score,
            "cosine_similarity": best_scores.get('semantic_similarity', 0.0),
            "lm_logprob": best_scores.get('lm_logprob', float('-inf')),
            "backtranslation_score": best_scores.get('backtranslation_score', 0.0),
            "ensemble_agreement": best_scores.get('ensemble_agreement', 0.0),
            "num_attempts": attempts,
            "accepted": accepted,
            "split": "train", # Placeholder, will assign splits later
            "date_generated": datetime.datetime.utcnow().isoformat() + 'Z',
            "model_used": GROQ_MODEL if USE_GROQ else LOCAL_LLM_NAME
        }
        processed_data.append(record)

        # Optional: Save periodically to avoid losing data
        if (i + 1) % 100 == 0:
             print(f"Processed {i+1} examples. Saving checkpoint...")
             # Save checkpoint logic here if needed (e.g., save every 100 examples)

    print(f"Pipeline completed. Total processed examples: {len(processed_data)}")

    # --- Assign Splits ---
    print("Assigning dataset splits...")
    df = pd.DataFrame(processed_data)
    if not df.empty:
        # Shuffle the dataframe
        df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)
        total_size = len(df_shuffled)
        train_end = int(SPLIT_RATIO['train'] * total_size)
        val_end = train_end + int(SPLIT_RATIO['validation'] * total_size)

        df_shuffled.loc[:train_end, 'split'] = 'train'
        df_shuffled.loc[train_end:val_end, 'split'] = 'validation'
        df_shuffled.loc[val_end:, 'split'] = 'test'

        # Convert back to list of dicts
        processed_data = df_shuffled.to_dict('records')
    else:
         print("No data to assign splits to.")

    # --- Save to JSONL ---
    print(f"Saving final dataset to {OUTPUT_FILE}...")
    try:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            for record in processed_data:
                f.write(json.dumps(record, ensure_ascii=False) + '\n')
        print("Dataset saved successfully.")
    except Exception as e:
        print(f"Error saving dataset: {e}")

In [None]:
generate_parallel_corpus()