# Data Processing Pipeline

## Source data

### Download

In [26]:
import os
import urllib.request
from urllib.parse import urlparse

def download_file(url, output_path):
    """Download a file from a URL to the specified path."""
    try:
        # Create parent directories if they don't exist
        os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)
        
        # Download the file
        print(f"Downloading {url} to {output_path}...")
        urllib.request.urlretrieve(url, output_path)
        print("Download complete!")
        return True
    except Exception as e:
        print(f"Error downloading file: {e}")
        return False

def get_filename_from_url(url):
    """Extract the filename from a URL."""
    path = urlparse(url).path
    return os.path.basename(path) or "download"



In [None]:
url = "https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-5033#"
output = "../../Data"
download_file(url,output)


Downloading https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-5033# to ../../Data...
Error downloading file: [Errno 13] Permission denied: '../../Data'


False

### Create Dataset

In [None]:
import os
import pandas as pd
from datasets import Dataset, DatasetDict
from tqdm import tqdm

def load_text_file(file_path):
    """Load a text file and return a list of lines."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f.readlines()]

def load_ids_file(file_path):
    """
    Load an .ids file and return a list of mappings.
    
    Each line in the file is expected to have the format:
    source_file\ttarget_file\tsource_positions\ttarget_positions
    
    Returns:
        List of dictionaries with parsed mapping information
    """
    mappings = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            parts = line.strip().split('\t')
            if len(parts) == 4:
                mapping = {
                    'source_file': parts[0],
                    'target_file': parts[1],
                    'source_positions': parts[2].split(),
                    'target_positions': parts[3].split()
                }
                mappings.append(mapping)
            else:
                print(f"Warning: Skipping malformed line in {file_path}: {line.strip()}")
    return mappings

def create_north_levantine_dataset(data_dir):
    """
    Create a Hugging Face dataset from the North Levantine parallel corpus.
    
    Args:
        data_dir: Directory containing the dataset files
    
    Returns:
        A DatasetDict containing the aligned parallel corpus
    """
    # ISO codes for all languages in the dataset
    languages = ['eng', 'arb', 'apc', 'deu', 'ell', 'fra', 'spa']
    
    # Step 1: Discover and load all language files
    text_data = {}
    
    # Find all language files in the directory
    language_files = {}
    for file in tqdm(os.listdir(data_dir), desc="Finding languages"):
        if file.endswith(tuple(languages)) and not file.endswith(".ids"):
            lang_code = file.split(".")[-1]
            language_files[lang_code] = os.path.join(data_dir, file)
    
    # Load discovered language files
    for lang in tqdm(languages, desc="Loading language files"):
        if lang in language_files:
            text_data[lang] = load_text_file(language_files[lang])
            print(f"Loaded {lang} file with {len(text_data[lang])} lines")
        else:
            print(f"Warning: File for language '{lang}' not found in {data_dir}")
    
    # Step 2: Create individual datasets with index column
    language_datasets = {}
    for lang, lines in tqdm(text_data.items(),desc="Creating individual datasets"):
        df = pd.DataFrame({
            'text': lines,
            'index': list(range(len(lines)))
        })
        language_datasets[lang] = Dataset.from_pandas(df)
    
    # Step 3: Create a merged dataset
    # Start with English dataset as base
    if 'eng' not in language_datasets:
        raise ValueError("English dataset is required as the base for alignment")
    
    # Get the number of examples in the English dataset
    num_examples = len(language_datasets['eng'])
    
    # Verify all languages have the same number of examples
    for lang, dataset in language_datasets.items():
        if len(dataset) != num_examples:
            print(f"Warning: {lang} dataset has {len(dataset)} examples, but English has {num_examples}")
    
    # Create a dictionary for the merged dataset
    merged_data = {
        'line_idx': list(range(num_examples)),
        'eng': language_datasets['eng']['text']
    }
    
    # Add other languages
    for lang in languages:
        if lang != 'eng' and lang in language_datasets:
            if len(language_datasets[lang]) == num_examples:
                merged_data[lang] = language_datasets[lang]['text']
            else:
                # Handle mismatched sizes by padding with empty strings
                padded_texts = language_datasets[lang]['text'] + [''] * (num_examples - len(language_datasets[lang]))
                merged_data[lang] = padded_texts[:num_examples]
    
    # Create the merged dataset
    merged_dataset = Dataset.from_dict(merged_data)
    
    '''
    # Add OpenSubtitles reference information if available
    if ids_data:
        print("Adding OpenSubtitles reference information...")
        # This could be expanded to add the mapping information to the dataset
        # For now, we just note that it's available
    
    # Step 4: Load the .ids files (optional)
    ids_data = {}
    
    # Discover all .ids files
    ids_files = {}
    for file in os.listdir(data_dir):
        if file.endswith(".ids"):
            lang_pair = file.split(".")[-2]  # Get the lang-eng part
            ids_files[lang_pair] = os.path.join(data_dir, file)
    
    for lang_pair, file_path in ids_files.items():
        ids_data[lang_pair] = load_ids_file(file_path)
        print(f"Loaded {lang_pair} file with {len(ids_data[lang_pair])} lines")
   ''' 
    # Create a dataset dictionary with train split
    dataset_dict = DatasetDict({
        'train': merged_dataset
    })
    
    return dataset_dict

def get_parallel_sentences(dataset, index, languages=None):
    """
    Get parallel sentences for a specific example across all or specified languages.
    
    Args:
        dataset: The dataset containing parallel sentences
        index: The index of the example to retrieve
        languages: Optional list of language codes to include (default: all available)
    
    Returns:
        Dictionary mapping language codes to sentences
    """
    if index >= len(dataset) or index < 0:
        raise ValueError(f"Index {index} out of range (0-{len(dataset)-1})")
    
    example = dataset[index]
    
    if languages is None:
        # Get all available languages excluding non-text columns
        languages = [col for col in example.keys() if col not in ['line_idx', 'index']]
    
    return {lang: example[lang] for lang in languages if lang in example}


In [65]:

data_dir = ".\\..\\..\\Data\\UFAL Parallel Corpus of North Levantine 1.0"

# Create the dataset
dataset = create_north_levantine_dataset(data_dir)


Finding languages: 100%|██████████| 14/14 [00:00<?, ?it/s]
Loading language files:  29%|██▊       | 2/7 [00:00<00:00, 13.19it/s]

Loaded eng file with 120600 lines
Loaded arb file with 120600 lines
Loaded apc file with 120600 lines


Loading language files:  86%|████████▌ | 6/7 [00:00<00:00, 12.72it/s]

Loaded deu file with 120600 lines
Loaded ell file with 120600 lines
Loaded fra file with 120600 lines


Loading language files: 100%|██████████| 7/7 [00:00<00:00, 13.29it/s]


Loaded spa file with 120600 lines


Creating individual datasets: 100%|██████████| 7/7 [00:00<00:00,  9.06it/s]


In [66]:
dataset["train"][0]

{'line_idx': 0,
 'eng': "Let's talk about the time Moldova made Romania a birthday cake and Romania said it tasted good even though it didn't.",
 'arb': 'دعونا نتحدّث عن الوقت الذي (قدّمت فيه (مالدوفا) لـ(رومانيا ،كعكة عيد ميلاد و (رومانيا) قالت أنّ طعمها جيّد حتّى و إن لم تكن كذلك',
 'apc': 'خلونا نحكي عن الوقت اللي قدمت فيه مالدوفا لرومانيا، كعكة عيد ميلاد ورومانيا قالت إنو طيبة حتى لو ما كانت طيبة',
 'deu': 'Reden wir darüber, als Moldavien Rumänien einen Geburtstagskuchen machte und Rumänien meinte, er wäre lecker, obwohl er das überhaupt nicht war.',
 'ell': 'Ας μιλήσουμε για το όταν η Μολδαβία έκανε στη Ρουμανία μια τούρτα και η Ρουμανία είπε ότι είναι νόστιμη, αν και δεν ήταν.',
 'fra': "Parlons de l'époque Moldova a fait la Roumanie un gâteau d'anniversaire et de la Roumanie a déclaré qu'il avait bon goût même si elle n'a pas fait.",
 'spa': 'Hablemos de la vez que Moldavia le hizo a Rumanía un pastel de cumpleaños y Rumanía dijo que sabía bien aunque no era verdad.'}

In [29]:
# Print some information about the dataset
print("\nDataset Information:")
print(f"Number of examples: {len(dataset['train'])}")
print(f"Features: {dataset['train'].features}")

# Show some examples
print(f"\nShowing {5} random examples:")
import random
sample_indices = random.sample(range(len(dataset['train'])), min(5, len(dataset['train'])))

for i, idx in enumerate(sample_indices):
    parallel_sentences = get_parallel_sentences(dataset['train'], idx)
    print(f"\nExample {i+1} (Index {idx}):")
    for lang, sentence in parallel_sentences.items():
        print(f"  {lang}: {sentence}")




Dataset Information:
Number of examples: 120600
Features: {'line_idx': Value(dtype='int64', id=None), 'eng': Value(dtype='string', id=None), 'arb': Value(dtype='string', id=None), 'apc': Value(dtype='string', id=None), 'deu': Value(dtype='string', id=None), 'ell': Value(dtype='string', id=None), 'fra': Value(dtype='string', id=None), 'spa': Value(dtype='string', id=None)}

Showing 5 random examples:

Example 1 (Index 7138):
  eng: Oh, my God, what is that smell?
  arb: يا إلهي ما هذه الرائحة ؟
  apc: يا ربي شو هالريحة؟
  deu: Ich habe da immerhin 2 Jahre gelebt. Soll ich etwa so tun, als ob ich da nie gewesen wäre?
  ell: Τι είναι αυτή η μυρωδιά
  fra: Oh, mon Dieu, c'est quoi cette odeur ?
  spa: Dios mío, ¿qué es ese olor?

Example 2 (Index 36021):
  eng: Darren, you don't owe these cops anything.
  arb: دارين) أنت لا تدين للشرطة بأيّ شيء) .
  apc: دارين أنت ما بتدين للشرطة بشي.
  deu: Darren, du bist diesen Cops überhaupt nichts schuldig.
  ell: Ντάρεν, δεν χρωστάς σε αυτούς τους μ

In [67]:

dataset = dataset["train"].train_test_split(test_size=0.15, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['line_idx', 'eng', 'arb', 'apc', 'deu', 'ell', 'fra', 'spa'],
        num_rows: 102510
    })
    test: Dataset({
        features: ['line_idx', 'eng', 'arb', 'apc', 'deu', 'ell', 'fra', 'spa'],
        num_rows: 18090
    })
})

In [68]:
# Save the dataset locally
dataset.save_to_disk("../../Data/UFAL Parallel Corpus of North Levantine 1.0/Processed")


Saving the dataset (1/1 shards): 100%|██████████| 102510/102510 [00:01<00:00, 82004.78 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 18090/18090 [00:00<00:00, 71924.37 examples/s]


In [69]:
dataset.push_to_hub("KHuss/UFAL_levantine",private=True)

Creating parquet from Arrow format: 100%|██████████| 103/103 [00:08<00:00, 11.66ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:15<00:00, 15.62s/it]
Creating parquet from Arrow format: 100%|██████████| 19/19 [00:01<00:00, 11.74ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.05s/it]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


CommitInfo(commit_url='https://huggingface.co/datasets/KHuss/UFAL_levantine/commit/a24bbab9bc0e167a1ae1e45d377808cbed6b8e6c', commit_message='Upload dataset', commit_description='', oid='a24bbab9bc0e167a1ae1e45d377808cbed6b8e6c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/KHuss/UFAL_levantine', endpoint='https://huggingface.co', repo_type='dataset', repo_id='KHuss/UFAL_levantine'), pr_revision=None, pr_num=None)

DatasetDict({
    train: Dataset({
        features: ['line_idx', 'eng', 'arb', 'apc', 'deu', 'ell', 'fra', 'spa'],
        num_rows: 102510
    })
    test: Dataset({
        features: ['line_idx', 'eng', 'arb', 'apc', 'deu', 'ell', 'fra', 'spa'],
        num_rows: 18090
    })
})

## Preprocess data
in our case, this will involve transliteration, and ensuring that each row has the appropriate features, i.e target and source languages, etc..

In [36]:
from camel_tools.utils.charmap import CharMapper

sentence = "ذهبت إلى المكتبة."
print(sentence)

ar2bw = CharMapper.builtin_mapper('ar2bw')
bw2ar = CharMapper.builtin_mapper('bw2ar')

sent_bw = ar2bw(sentence)
sent_ar = bw2ar(sent_bw)
print(sent_bw)
print(sent_ar)

ذهبت إلى المكتبة.
*hbt <lY Almktbp.
ذهبت إلى المكتبة.


In [52]:
def process_row(row):
    return {"eng":row["eng"], "arb": [ar2bw(ar) for ar in row["arb"]] if isinstance(row["arb"], list) else ar2bw(row["arb"])}
    

In [56]:
dataset_test = dataset.map(process_row,remove_columns=dataset["train"].column_names,batched=True,batch_size=128)

Map: 100%|██████████| 120600/120600 [00:01<00:00, 92329.21 examples/s]


In [57]:
dataset_test["train"][0]

{'eng': "Let's talk about the time Moldova made Romania a birthday cake and Romania said it tasted good even though it didn't.",
 'arb': 'dEwnA ntHd~v En Alwqt Al*y (qd~mt fyh (mAldwfA) l_(rwmAnyA ،kEkp Eyd mylAd w (rwmAnyA) qAlt >n~ TEmhA jy~d Ht~Y w <n lm tkn k*lk'}

In [None]:
def format_fn(row):
    comp = [row["eng"],ar2bw(row["arb"])] if not isinstance(row["arb"], list) else row["eng"].extend([ar2bw(ar) for ar in row["arb"]])  
    promp = [""]*len(comp)
    import pdb
    pdb.set_trace()
    return {"prompt":promp,"completion":comp }

In [None]:
dataset_test_2 = dataset.map(format_fn,remove_columns=dataset["train"].column_names)#,batched=True,batch_size=128)

Map:   0%|          | 0/102510 [00:00<?, ? examples/s]

> [32mc:\users\karim - work\appdata\local\temp\ipykernel_49892\99433667.py[39m([92m4[39m)[36mformat_fn[39m[34m()[39m

{'line_idx': 73903, 'eng': "That farmer's dream is all some people have.", 'arb': 'وحُلم المزارع هذا هوَ كلّ ما لدى بعض الناس', 'apc': 'حلم المزارع هاد كلّ يلي عند بعض الناس.', 'deu': 'Dieser Traum eines Farmers ist für viele alles, was sie haben.', 'ell': 'To όvειρo τoυ αγρότη είναι τo μόvo πoυ έχoυv κάπoιoι.', 'fra': 'Ce rêve représente tout pour certains.', 'spa': 'Hay personas que sólo tienen ese sueño de granjero.'}
*** NameError: name 'comp' is not defined
*** SyntaxError: unmatched ']'
["That farmer's dream is all some people have.", 'wHulm AlmzArE h*A hwa kl~ mA ldY bED AlnAs']
False
*** NameError: name 'vomp' is not defined
["That farmer's dream is all some people have.", 'wHulm AlmzArE h*A hwa kl~ mA ldY bED AlnAs']
['', '']
