# Data Processing Pipeline
This notebook is an interactive version of the data processing files, and also allows to explore the dataset interactively.

## Create Dataset
This script assume the dataset has already been downloaded

Since the sentences are placed in different files in the source, we haev to match each row from different language files using ids files.

In [10]:
import os
import pandas as pd
from datasets import Dataset, DatasetDict
from tqdm import tqdm

def load_text_file(file_path):
    """Load a text file and return a list of lines."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f.readlines()]

def load_ids_file(file_path):
    """
    Load an .ids file and return a list of mappings.
    
    Each line in the file is expected to have the format:
    source_file\ttarget_file\tsource_positions\ttarget_positions
    
    Returns:
        List of dictionaries with parsed mapping information
    """
    mappings = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            parts = line.strip().split('\t')
            if len(parts) == 4:
                mapping = {
                    'source_file': parts[0],
                    'target_file': parts[1],
                    'source_positions': parts[2].split(),
                    'target_positions': parts[3].split()
                }
                mappings.append(mapping)
            else:
                print(f"Warning: Skipping malformed line in {file_path}: {line.strip()}")
    return mappings

def create_north_levantine_dataset(data_dir):
    """
    Create a Hugging Face dataset from the North Levantine parallel corpus.
    
    Args:
        data_dir: Directory containing the dataset files
    
    Returns:
        A DatasetDict containing the aligned parallel corpus
    """
    # ISO codes for all languages in the dataset
    languages = ['eng', 'arb', 'apc', 'deu', 'ell', 'fra', 'spa']
    
    # Step 1: Discover and load all language files
    text_data = {}
    
    # Find all language files in the directory
    language_files = {}
    for file in tqdm(os.listdir(data_dir), desc="Finding languages"):
        if file.endswith(tuple(languages)) and not file.endswith(".ids"):
            lang_code = file.split(".")[-1]
            language_files[lang_code] = os.path.join(data_dir, file)
    
    # Load discovered language files
    for lang in tqdm(languages, desc="Loading language files"):
        if lang in language_files:
            text_data[lang] = load_text_file(language_files[lang])
            print(f"Loaded {lang} file with {len(text_data[lang])} lines")
        else:
            print(f"Warning: File for language '{lang}' not found in {data_dir}")
    
    # Step 2: Create individual datasets with index column
    language_datasets = {}
    for lang, lines in tqdm(text_data.items(),desc="Creating individual datasets"):
        df = pd.DataFrame({
            'text': lines,
            'index': list(range(len(lines)))
        })
        language_datasets[lang] = Dataset.from_pandas(df)
    
    # Step 3: Create a merged dataset
    # Start with English dataset as base
    if 'eng' not in language_datasets:
        raise ValueError("English dataset is required as the base for alignment")
    
    # Get the number of examples in the English dataset
    num_examples = len(language_datasets['eng'])
    
    # Verify all languages have the same number of examples
    for lang, dataset in language_datasets.items():
        if len(dataset) != num_examples:
            print(f"Warning: {lang} dataset has {len(dataset)} examples, but English has {num_examples}")
    
    # Create a dictionary for the merged dataset
    merged_data = {
        'line_idx': list(range(num_examples)),
        'eng': language_datasets['eng']['text']
    }
    
    # Add other languages
    for lang in languages:
        if lang != 'eng' and lang in language_datasets:
            if len(language_datasets[lang]) == num_examples:
                merged_data[lang] = language_datasets[lang]['text']
            else:
                # Handle mismatched sizes by padding with empty strings
                padded_texts = language_datasets[lang]['text'] + [''] * (num_examples - len(language_datasets[lang]))
                merged_data[lang] = padded_texts[:num_examples]
    
    # Create the merged dataset
    merged_dataset = Dataset.from_dict(merged_data)
    
    # Create a dataset dictionary with train split
    dataset_dict = DatasetDict({
        'train': merged_dataset
    })
    
    return dataset_dict

def get_parallel_sentences(dataset, index, languages=None):
    """
    Get parallel sentences for a specific example across all or specified languages.
    
    Args:
        dataset: The dataset containing parallel sentences
        index: The index of the example to retrieve
        languages: Optional list of language codes to include (default: all available)
    
    Returns:
        Dictionary mapping language codes to sentences
    """
    if index >= len(dataset) or index < 0:
        raise ValueError(f"Index {index} out of range (0-{len(dataset)-1})")
    
    example = dataset[index]
    
    if languages is None:
        # Get all available languages excluding non-text columns
        languages = [col for col in example.keys() if col not in ['line_idx', 'index']]
    
    return {lang: example[lang] for lang in languages if lang in example}


In [11]:

data_dir = ".\\..\\..\\data\\dataset\\UFAL Parallel Corpus of North Levantine 1.0"

# Create the dataset
dataset = create_north_levantine_dataset(data_dir)


Finding languages: 100%|██████████| 13/13 [00:00<?, ?it/s]
Loading language files:  43%|████▎     | 3/7 [00:00<00:00, 19.30it/s]

Loaded eng file with 120600 lines
Loaded arb file with 120600 lines
Loaded apc file with 120600 lines
Loaded deu file with 120600 lines
Loaded ell file with 120600 lines


Loading language files: 100%|██████████| 7/7 [00:00<00:00, 20.39it/s]


Loaded fra file with 120600 lines
Loaded spa file with 120600 lines


Creating individual datasets: 100%|██████████| 7/7 [00:00<00:00,  9.99it/s]


## View Dataset

In [12]:
dataset["train"][0]

{'line_idx': 0,
 'eng': "Let's talk about the time Moldova made Romania a birthday cake and Romania said it tasted good even though it didn't.",
 'arb': 'دعونا نتحدّث عن الوقت الذي (قدّمت فيه (مالدوفا) لـ(رومانيا ،كعكة عيد ميلاد و (رومانيا) قالت أنّ طعمها جيّد حتّى و إن لم تكن كذلك',
 'apc': 'خلونا نحكي عن الوقت اللي قدمت فيه مالدوفا لرومانيا، كعكة عيد ميلاد ورومانيا قالت إنو طيبة حتى لو ما كانت طيبة',
 'deu': 'Reden wir darüber, als Moldavien Rumänien einen Geburtstagskuchen machte und Rumänien meinte, er wäre lecker, obwohl er das überhaupt nicht war.',
 'ell': 'Ας μιλήσουμε για το όταν η Μολδαβία έκανε στη Ρουμανία μια τούρτα και η Ρουμανία είπε ότι είναι νόστιμη, αν και δεν ήταν.',
 'fra': "Parlons de l'époque Moldova a fait la Roumanie un gâteau d'anniversaire et de la Roumanie a déclaré qu'il avait bon goût même si elle n'a pas fait.",
 'spa': 'Hablemos de la vez que Moldavia le hizo a Rumanía un pastel de cumpleaños y Rumanía dijo que sabía bien aunque no era verdad.'}

In [14]:
# Print some information about the dataset
print("\nDataset Information:")
print(f"Number of examples: {len(dataset['train'])}")
print(f"Features: {dataset['train'].features}")

# Show some examples
print(f"\nShowing {5} random examples:")
import random
sample_indices = random.sample(range(len(dataset['train'])), min(5, len(dataset['train'])))

for i, idx in enumerate(sample_indices):
    parallel_sentences = get_parallel_sentences(dataset['train'], idx,["eng","arb","apc"])
    print(f"\nExample {i+1} (Index {idx}):")
    for lang, sentence in parallel_sentences.items():
        print(f"  {lang}: {sentence}")




Dataset Information:
Number of examples: 120600
Features: {'line_idx': Value(dtype='int64', id=None), 'eng': Value(dtype='string', id=None), 'arb': Value(dtype='string', id=None), 'apc': Value(dtype='string', id=None), 'deu': Value(dtype='string', id=None), 'ell': Value(dtype='string', id=None), 'fra': Value(dtype='string', id=None), 'spa': Value(dtype='string', id=None)}

Showing 5 random examples:

Example 1 (Index 62789):
  eng: Well, even using the ring will take its toll.
  arb: حتى استخدام الخاتم لن يفيدك
  apc: حتى استخدام الخاتم مارح يفيدك.

Example 2 (Index 38197):
  eng: It's good to see that not everybody's put off their dinner by the weather forecast.
  arb: إنه من الجيّد رؤية ليس هناك أحد يترك عشائه جراء الأرصاد الجوية.
  apc: منيح شوفة ما في حدا بيترك العشا تبعو بسبب الأرصاد الجوية.

Example 3 (Index 79060):
  eng: Someone on our team took a laptop from Mark's safe house.
  arb: شخصٌ ما في فريقنا أخذ حاسوب محمول من منزل (مارك) الأمن
  apc: شخص بفريقنا أخد حاسوب من بيت ما

## Save to disk

In [69]:
dataset = dataset["train"].train_test_split(test_size=0.15, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['line_idx', 'eng', 'arb', 'apc', 'deu', 'ell', 'fra', 'spa'],
        num_rows: 102510
    })
    test: Dataset({
        features: ['line_idx', 'eng', 'arb', 'apc', 'deu', 'ell', 'fra', 'spa'],
        num_rows: 18090
    })
})

In [68]:
# Save the dataset locally
dataset.save_to_disk("../../Data/UFAL Parallel Corpus of North Levantine 1.0/Processed")


Saving the dataset (1/1 shards): 100%|██████████| 102510/102510 [00:01<00:00, 82004.78 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 18090/18090 [00:00<00:00, 71924.37 examples/s]


## Transliterate dataset

In [16]:
from camel_tools.utils.charmap import CharMapper

sentence = "ذهبت إلى المكتبة."
print(sentence)

ar2bw = CharMapper.builtin_mapper('ar2hsb')
bw2ar = CharMapper.builtin_mapper('hsb2ar')

sent_bw = ar2bw(sentence)
sent_ar = bw2ar(sent_bw)
print(sent_bw)
print(sent_ar)


ذهبت إلى المكتبة.
ðhbt Ălý Almktbħ.
ذهبت إلى المكتبةْ


In [17]:
def format_fn(row):
    return {"eng":row["eng"], "arb": [ar2bw(ar) for ar in row["arb"]] if isinstance(row["arb"], list) else ar2bw(row["arb"])}

In [22]:
roman_dataset = dataset.map(format_fn,remove_columns=dataset["train"].column_names,batched=True,batch_size=128)

Map: 100%|██████████| 120600/120600 [00:01<00:00, 88680.57 examples/s]


In [23]:
# Show some examples
print(f"\nShowing {5} random examples:")
import random
sample_indices = random.sample(range(len(roman_dataset['train'])), min(5, len(roman_dataset['train'])))

for i, idx in enumerate(sample_indices):
    parallel_sentences = get_parallel_sentences(roman_dataset['train'], idx,["eng","arb","apc"])
    print(f"\nExample {i+1} (Index {idx}):")
    for lang, sentence in parallel_sentences.items():
        print(f"  {lang}: {sentence}")


Showing 5 random examples:

Example 1 (Index 67074):
  eng: It's about the Pan Am flight from Lockerbie.
  arb: AnhA ttHdθ ςn AxtTAf TAŷrħ lwkrby

Example 2 (Index 117225):
  eng: I said I'm gonna write a book someday, use all our hard work, take the credit for myself.
  arb: qultu sa Ăktb. a ktAb ywmAã mA، Ăstςml. kul~ ςmlnA AlšAq~، xuð. AlfDl fy nfsy.

Example 3 (Index 35784):
  eng: Then get the hell out of here!
  arb: ! ĂðAa Axrji mn hnA

Example 4 (Index 44728):
  eng: He tried to rape you.
  arb: hw HAwla ĂγtiSAbk

Example 5 (Index 78549):
  eng: I piss on the beards of those selfrighteous monkeys.
  arb: Âbwl ςlý lHý hŵlA' Alqrwd Almtςjrfyn


## Pretraining dataset

In [26]:
def format_fn(row):
    comp = [row["eng"],ar2bw(row["arb"],ar2bw(row["apc"]))] if not isinstance(row["arb"], list) else row["eng"]+[ar2bw(ar) for ar in row["arb"]]+[ar2bw(ar) for ar in row["apc"]]
    promp = [""]*len(comp)
    return {"prompt":promp,"completion":comp }

In [27]:
pretrain_dataset = dataset.map(format_fn,remove_columns=dataset["train"].column_names,batched=True,batch_size=128)

Map: 100%|██████████| 120600/120600 [00:02<00:00, 53256.34 examples/s]


In [30]:
# Show some examples
print(f"\nShowing {5} random examples:")
import random
sample_indices = random.sample(range(len(pretrain_dataset['train'])), min(5, len(pretrain_dataset['train'])))

for i, idx in enumerate(sample_indices):
    print(f"\nExample {i+1} (Index {idx}):")
    print(pretrain_dataset["train"][idx])


Showing 5 random examples:

Example 1 (Index 355528):
{'prompt': '', 'completion': 'mA Ălhn Ây Âhmyħ bAlςAlm AlHqyqy'}

Example 2 (Index 159478):
{'prompt': '', 'completion': 'And if I searched it without one, then any evidence I would find would be inadmissible.'}

Example 3 (Index 254657):
{'prompt': '', 'completion': 'Poor Mrs. Cross was obliged to accept a paid position in Buckinghamshire.'}

Example 4 (Index 113366):
{'prompt': '', 'completion': "It's impossible to steal."}

Example 5 (Index 42849):
{'prompt': '', 'completion': 'knt Âfkr fy jmςhm swyħ.'}


## Translation dataset

In [36]:
def format_fn(row):
    prompts = []
    completions = []
    
    # Check if the row values are lists or single strings
    if not isinstance(row["eng"], list):
        # English to Arabic translations
        prompts.append(f"english: {row['eng']}\nclassical arabic:")
        completions.append(f"{row['arb']}")
        
        prompts.append(f"english: {row['eng']}\nlevantine arabic:")
        completions.append(f"{row['apc']}")
        
        # Classical Arabic to other languages
        prompts.append(f"classical arabic: {row['arb']}\nenglish:")
        completions.append(f"{row['eng']}")
        
        prompts.append(f"classical arabic: {row['arb']}\nlevantine arabic:")
        completions.append(f"{row['apc']}")
        
        # Levantine Arabic to other languages
        prompts.append(f"levantine arabic: {row['apc']}\nenglish:")
        completions.append(f"{row['eng']}")
        
        prompts.append(f"levantine arabic: {row['apc']}\nclassical arabic:")
        completions.append(f"{row['arb']}")
    else:
        # Handle list case (though less likely in this context)
        for i in range(len(row["eng"])):
            # English to Arabic translations
            prompts.append(f"english: {row['eng'][i]}\nclassical arabic:")
            completions.append(f"{row['arb'][i]}")
            
            prompts.append(f"english: {row['eng'][i]}\nlevantine arabic:")
            completions.append(f"{row['apc'][i]}")
            
            # Classical Arabic to other languages
            prompts.append(f"classical arabic: {row['arb'][i]}\nenglish:")
            completions.append(f"{row['eng'][i]}")
            
            prompts.append(f"classical arabic: {row['arb'][i]}\nlevantine arabic:")
            completions.append(f"{row['apc'][i]}")
            
            # Levantine Arabic to other languages
            prompts.append(f"levantine arabic: {row['apc'][i]}\nenglish:")
            completions.append(f"{row['eng'][i]}")
            
            prompts.append(f"levantine arabic: {row['apc'][i]}\nclassical arabic:")
            completions.append(f"{row['arb'][i]}")
    
    return {"prompt": prompts, "completion": completions}



In [33]:
translation_dataset = dataset.map(format_fn,remove_columns=dataset["train"].column_names,batched=True,batch_size=128)

Map: 100%|██████████| 120600/120600 [00:01<00:00, 81307.26 examples/s]


In [37]:
# Show some examples
print(f"\nShowing {5} random examples:")
import random
sample_indices = random.sample(range(len(translation_dataset['train'])), min(5, len(translation_dataset['train'])))

for i, idx in enumerate(sample_indices):
    print(f"\nExample {i+1} (Index {idx}):")
    print(translation_dataset["train"][idx])


Showing 5 random examples:

Example 1 (Index 288211):
{'prompt': 'english: Here is a hot beverage to comfort you.\nlevantine arabic:', 'completion': 'هاد مشروب ساخن لترتاح'}

Example 2 (Index 203638):
{'prompt': 'levantine arabic: إنتي ملكة وكلشي بتحكميه هالك\nenglish:', 'completion': 'Kill a queen, and all queens are mortal.'}

Example 3 (Index 604281):
{'prompt': 'classical arabic: لاأريد أن أجرح مشاعرك\nlevantine arabic:', 'completion': 'ما بدي أجرح مشاعرك.'}

Example 4 (Index 643991):
{'prompt': 'levantine arabic: أكتر شي فينا نساويه أنو نوصلك لهنيك، و.. حتى أقدر عيش بسجن؟\nclassical arabic:', 'completion': 'جلماعلينافعلههوإيصالكهناك, و. . لأتمكن من العيش في زنزانه ؟'}

Example 5 (Index 721383):
{'prompt': 'classical arabic: (كان يجب ان اخبرك عن (اليكس\nlevantine arabic:', 'completion': 'كان لازم خبرك عن أليكس'}
