In [None]:
!pip install -q wget

In [None]:
import os
import sys
import torch
import wget
import zipfile
import regex as re

# OPUS download URLs (Moses-format English-French archives)
ted_moses_url = "https://object.pouta.csc.fi/OPUS-TED2020/v1/moses/en-fr.txt.zip"
europarl_moses_url = "https://object.pouta.csc.fi/OPUS-Europarl/v8/moses/en-fr.txt.zip"

download_dir = "."

# Local paths of the downloaded archives
ted_zip = os.path.join(download_dir, "ted.zip")
europarl_zip = os.path.join(download_dir, "europarl.zip")


def _download_if_missing(url, zip_path, label, lead=""):
    """
    Download an archive unless a file already exists at the target path.

    Args:
        url (str): Remote location of the zip archive
        zip_path (str): Local path the archive is saved to
        label (str): Corpus name used in the progress messages
        lead (str): Text printed immediately before the "Downloading" message

    Returns:
        str: Path of the local archive
    """
    if os.path.exists(zip_path):
        print(f"{label} zip already exists, skipping download.")
        return zip_path

    print(f"{lead}Downloading {label} parallel data...")
    saved_path = wget.download(url, out=zip_path)
    # Leading newline: presumably separates the message from wget's progress output
    print(f"\n{label} download complete.")
    return saved_path


def _extract_if_empty(zip_path, target_dir, label):
    """
    Extract an archive into a directory, skipping the work if the directory
    already contains anything.

    Args:
        zip_path (str): Path of the zip archive to unpack
        target_dir (str): Directory to extract into (created if missing)
        label (str): Corpus name used in the progress messages
    """
    os.makedirs(target_dir, exist_ok=True)

    if os.listdir(target_dir):
        print(f"{label} directory already contains files, skipping extraction.")
        return

    print(f"\nExtracting {label} parallel data...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(target_dir)
    print(f"{label} extraction complete.")


# Fetch both archives (re-running the cell does not download them again)
ted_path = _download_if_missing(ted_moses_url, ted_zip, "TED")
europarl_path = _download_if_missing(europarl_moses_url, europarl_zip, "Europarl", lead="\n")

# Extraction directories
ted_dir = os.path.join(download_dir, "TED")
europarl_dir = os.path.join(download_dir, "Europarl")

_extract_if_empty(ted_path, ted_dir, "TED")
_extract_if_empty(europarl_path, europarl_dir, "Europarl")


# File names of the two sides of each corpus
ted_fr = "TED2020.en-fr.fr"
ted_en = "TED2020.en-fr.en"
europarl_fr = "Europarl.en-fr.fr"
europarl_en = "Europarl.en-fr.en"

# Full paths to the extracted corpus files
ted_fr_path = os.path.join(ted_dir, ted_fr)
ted_en_path = os.path.join(ted_dir, ted_en)
europarl_fr_path = os.path.join(europarl_dir, europarl_fr)
europarl_en_path = os.path.join(europarl_dir, europarl_en)


# ensure lines match for all
def count_lines(file_path):
    """
    Return how many lines a UTF-8 text file contains.

    The file is streamed line by line rather than loaded whole.

    Args:
        file_path (str): Path to the file

    Returns:
        int: Number of lines in the file
    """
    total = 0
    with open(file_path, 'r', encoding='utf-8') as handle:
        for _line in handle:
            total += 1
    return total

def verify_parallel_corpus(files):
    """
    Check that every file of a parallel corpus has the same line count.

    The first file in ``files`` is the reference; every other file is compared
    against it. On success the per-file counts are printed.

    Args:
        files (list): List of file paths to compare

    Raises:
        ValueError: If files have different line counts
    """
    # Keyed by path; dict ordering keeps the first path as the reference entry
    counts_by_path = {path: count_lines(path) for path in files}

    reference_path = list(counts_by_path)[0]
    expected = counts_by_path[reference_path]

    offenders = [
        (path, found)
        for path, found in counts_by_path.items()
        if found != expected
    ]

    if offenders:
        details = "".join(
            f"{path}: {found} lines (expected {expected})\n"
            for path, found in offenders
        )
        raise ValueError("Line count mismatch in parallel corpus:\n" + details)

    print("All files have matching line counts:")
    for path, found in counts_by_path.items():
        print(f"{os.path.basename(path)}: {found} lines")

# Each corpus is checked on its own: the two sides of a corpus must have the same line count
verify_parallel_corpus([ted_fr_path, ted_en_path])
verify_parallel_corpus([europarl_en_path, europarl_fr_path])

def print_parallel_samples(en_path, fr_path, num_samples=5):
    """
    Show the leading sentence pairs of an English/French parallel corpus.

    Lines are read in lockstep from both files and printed as numbered
    EN/FR pairs, each followed by a blank line.

    Args:
        en_path (str): Path to English file
        fr_path (str): Path to French file
        num_samples (int): Number of samples to print
    """
    with open(en_path, 'r', encoding='utf-8') as english:
        with open(fr_path, 'r', encoding='utf-8') as french:
            print(f"Parallel Corpus Samples ({num_samples} lines):")
            print("-" * 50)

            for sample_no in range(1, num_samples + 1):
                # readline() yields '' past end of file, so short files print empty pairs
                en_text = english.readline().strip()
                fr_text = french.readline().strip()

                print(f"EN [{sample_no}]: {en_text}")
                print(f"FR [{sample_no}]: {fr_text}")
                print()

# Preview the first 20 sentence pairs of the TED corpus
print("TED Corpus Samples:")
print_parallel_samples(en_path=ted_en_path, fr_path=ted_fr_path, num_samples=20)

# Preview the first 20 sentence pairs of the Europarl corpus
print("\nEuroparl Corpus Samples:")
print_parallel_samples(en_path=europarl_en_path, fr_path=europarl_fr_path, num_samples=20)

# def is_valid(line):
#     """
#     Determines if a line is likely valid based on specific patterns.
#     Returns True if the line contains Fongbe, False otherwise.
#     """
#     # Skip empty lines
#     if not line.strip():
#         return False

#     # Check for non-Fongbe scripts
#     if re.search(r'[\u0400-\u04FF]', line):  # Cyrillic
#         return False
#     if re.search(r'[\u4E00-\u9FFF]', line):  # Chinese
#         return False
#     if re.search(r'[\u0600-\u06FF]', line):  # Arabic
#         return False
#     if re.search(r'[ÂÃÇÊÎÏÑÔÖ×ÜÞßåæçðñõ÷øýþÿİŞ]', line):
#         return False
#     # No disqualifying script or character was found: treat the line as valid
#     return True

# dirty_lines_fon = []
# dirty_lines_bem = []

# # Print a random sample from each pair to verify content
# def print_sample(file1_path, file2_path, lang1, lang2, sample_count=3):
#     with open(file1_path, 'r', encoding='utf-8') as f1, open(file2_path, 'r', encoding='utf-8') as f2:
#         lines1 = f1.readlines()
#         lines2 = f2.readlines()

#         import random
#         indices = random.sample(range(min(len(lines1), len(lines2))), sample_count)

#         print(f"\nSample of {sample_count} random {lang1}-{lang2} pairs:")
#         for idx in indices:
#             print(f"{lang1}: {lines1[idx].strip()}")
#             print(f"{lang2}: {lines2[idx].strip()}")
#             print("-" * 40)
#         countr=0
#         for i,line1,line2 in enumerate(lines1, lines2)
#           valid = is_valid(line1)
#           if valid is False:
#             countr+=1
#             dirty_lines.append(i)
#         print(f"found {countr} lines")

# print_sample(bem_path, en_path, "Bemba", "English")
# print_sample(fon_path, fra_path, "Fon", "French")

In [None]:
import re
from typing import Dict, List, Optional, Set, Tuple, Union
from tqdm import tqdm

class MultilingualInverseNumberNormalizer:
    """
    Base class for language-specific inverse number normalizers.

    Subclasses populate the word tables and implement the ``convert_*`` hooks.
    ``__call__`` then rewrites the numeric expressions of a text by applying
    the regex patterns defined here (or overridden by the subclass).
    """
    def __init__(self):
        # Word tables; empty here, populated by each language subclass
        self.digits: Dict[int, str] = {}
        self.teens: Dict[int, str] = {}
        self.tens: Dict[int, str] = {}
        self.multipliers: Dict[int, str] = {}

        # Special cases
        self.ordinal_suffixes: Dict[str, str] = {}
        self.currency_symbols: Dict[str, str] = {}

        # Regex sources consumed by __call__ (subclasses may override them)
        self.number_pattern = r'\b\d+\b'
        self.decimal_pattern = r'\b\d+\.\d+\b'
        self.ordinal_pattern = r'\b\d+(st|nd|rd|th)\b'
        self.currency_pattern = r'(?:([$€£])(\d+(?:[,.]\d+)?)|(\d+(?:[,.]\d+)?)([€£$]))'
        self.percentage_pattern = r'(\d+(?:\.\d+)?)%'

    def convert_number(self, num: int) -> str:
        """
        Convert a number to its word representation
        Must be implemented by subclasses
        """
        raise NotImplementedError("Subclasses must implement this method")

    def convert_decimal(self, num: Union[float, str]) -> str:
        """
        Convert a decimal number to its word representation
        Must be implemented by subclasses

        ``__call__`` passes the matched text (a ``str``), so implementations
        have to accept the literal as well as a float.
        """
        raise NotImplementedError("Subclasses must implement this method")

    def convert_ordinal(self, match: re.Match) -> str:
        """
        Convert an ordinal number (e.g. "1st") to its word representation
        Must be implemented by subclasses
        """
        raise NotImplementedError("Subclasses must implement this method")

    def convert_currency(self, match: re.Match) -> str:
        """
        Convert a currency amount to its word representation
        Must be implemented by subclasses
        """
        raise NotImplementedError("Subclasses must implement this method")

    def convert_percentage(self, match: re.Match) -> str:
        """
        Convert a percentage to its word representation
        Must be implemented by subclasses
        """
        raise NotImplementedError("Subclasses must implement this method")

    def __call__(self, text: str) -> str:
        """
        Main method to perform inverse text normalization

        Args:
            text (str): Text containing digits, currencies, percentages, ...

        Returns:
            str: The text with every matched numeric expression spelled out
        """
        # The specific patterns run before the plain-number pattern so that
        # their digits are not consumed by it first.
        text = re.sub(self.ordinal_pattern, self.convert_ordinal, text)
        text = re.sub(self.currency_pattern, self.convert_currency, text)
        text = re.sub(self.percentage_pattern, self.convert_percentage, text)

        # Decimals are handed over as the matched text so the subclass can
        # deal with comma/period separators itself
        text = re.sub(self.decimal_pattern,
                      lambda m: self.convert_decimal(m.group(0)), text)

        # Whatever digits remain are plain integers
        text = re.sub(self.number_pattern,
                      lambda m: self.convert_number(int(m.group(0))), text)

        return text


class EnglishInverseNumberNormalizer(MultilingualInverseNumberNormalizer):
    """
    Convert arabic numbers into spelled-out English text
    """
    def __init__(self):
        super().__init__()

        self.digits = {
            0: "zero",
            1: "one",
            2: "two",
            3: "three",
            4: "four",
            5: "five",
            6: "six",
            7: "seven",
            8: "eight",
            9: "nine"
        }

        self.teens = {
            10: "ten",
            11: "eleven",
            12: "twelve",
            13: "thirteen",
            14: "fourteen",
            15: "fifteen",
            16: "sixteen",
            17: "seventeen",
            18: "eighteen",
            19: "nineteen"
        }

        self.tens = {
            2: "twenty",
            3: "thirty",
            4: "forty",
            5: "fifty",
            6: "sixty",
            7: "seventy",
            8: "eighty",
            9: "ninety"
        }

        self.multipliers = {
            100: "hundred",
            1_000: "thousand",
            1_000_000: "million",
            1_000_000_000: "billion",
            1_000_000_000_000: "trillion"
        }

        # Sorted multipliers from largest to smallest for processing
        self.sorted_multipliers = sorted(self.multipliers.keys(), reverse=True)

        self.ordinal_suffixes = {
            "1": "st",
            "2": "nd",
            "3": "rd"
        }

        # Ordinals returned as-is for these exact numbers; also the source of
        # the irregular endings used for larger ordinals (see _ordinal_from_cardinal)
        self.ordinal_words = {
            1: "first",
            2: "second",
            3: "third",
            4: "fourth",
            5: "fifth",
            8: "eighth",
            9: "ninth",
            12: "twelfth"
        }

        self.currency_symbols = {
            "$": "dollar",
            "£": "pound",
            "€": "euro",
            "¢": "cent"
        }

        # Update patterns with English-specific regex
        self.ordinal_pattern = r'\b(\d+)(st|nd|rd|th)\b'
        self.currency_pattern = r'(?:([$€£])(\d+(?:[,.]\d+)?)|(\d+(?:[,.]\d+)?)([€£$]))'
        self.cents_pattern = r'¢(\d+)'

    def convert_number(self, num: int) -> str:
        """
        Convert an integer to its word representation in English

        Args:
            num (int): Number to spell out (negative values get a "negative" prefix)

        Returns:
            str: e.g. 121 -> "one hundred and twenty-one"
        """
        if num == 0:
            return "zero"

        if num < 0:
            return "negative " + self.convert_number(abs(num))

        if num < 10:
            return self.digits[num]

        if num < 20:
            return self.teens[num]

        if num < 100:
            tens_digit, units_digit = divmod(num, 10)

            if units_digit == 0:
                return self.tens[tens_digit]
            return f"{self.tens[tens_digit]}-{self.digits[units_digit]}"

        # Handle larger numbers recursively: the largest multiplier that fits
        # splits the number into "<quotient> <multiplier> <remainder>"
        for multiplier in self.sorted_multipliers:
            if num >= multiplier:
                quotient, remainder = divmod(num, multiplier)
                head = f"{self.convert_number(quotient)} {self.multipliers[multiplier]}"

                if remainder == 0:
                    return head

                # "and" is only inserted after "hundred" (remainder is always < 100 there)
                if multiplier == 100:
                    conjunction = " and " if remainder < 100 else " "
                else:
                    conjunction = " "

                return f"{head}{conjunction}{self.convert_number(remainder)}"

        # Should never reach here given the checks above
        return str(num)

    def convert_decimal(self, num: Union[float, str]) -> str:
        """
        Convert a decimal number to its word representation in English

        A plain digit string ("2.50", "3,14") is read digit by digit exactly as
        written, so trailing zeros and long fractions survive. Floats, and
        strings in any other notation, go through ``float`` formatting.

        Args:
            num (float | str): Value or literal to spell out

        Returns:
            str: e.g. "2.50" -> "two point five zero"
        """
        literal = None

        if isinstance(num, str):
            # Replace comma with period so both separators are accepted
            candidate = num.strip().replace(',', '.')
            if re.fullmatch(r'[+-]?\d+(?:\.\d+)?', candidate):
                literal = candidate
            else:
                num = float(candidate)

        if literal is None:
            literal = str(num)
            if 'e' in literal.lower():
                # For scientific notation, convert to regular notation
                literal = f"{num:.10f}".rstrip('0').rstrip('.')

        # The sign is handled separately so that "-0.5" keeps it
        # (int("-0") would silently drop it)
        negative = literal.startswith('-')
        integer_digits, _, fraction_digits = literal.lstrip('+-').partition('.')

        words = self.convert_number(int(integer_digits))

        if fraction_digits:
            # Handle the decimal part digit by digit
            spoken = " ".join(self.digits[int(digit)] for digit in fraction_digits)
            words = f"{words} point {spoken}"

        return f"negative {words}" if negative else words

    def _ordinal_from_cardinal(self, cardinal: str) -> str:
        """Turn a spelled-out cardinal into an ordinal by rewriting its last word."""
        # Irregular endings keyed by cardinal word ("one" -> "first", ...)
        irregular = {
            self.convert_number(value): word
            for value, word in self.ordinal_words.items()
        }

        # The last word starts after the final space or hyphen, so that
        # "twenty-one" becomes "twenty-first" rather than "twenty-oneth"
        cut = max(cardinal.rfind(' '), cardinal.rfind('-')) + 1
        stem, last_word = cardinal[:cut], cardinal[cut:]

        if last_word in irregular:
            last_word = irregular[last_word]
        elif last_word.endswith('y'):
            last_word = last_word[:-1] + "ieth"
        else:
            last_word = last_word + "th"

        return stem + last_word

    def convert_ordinal(self, match: re.Match) -> str:
        """
        Convert an ordinal number to its word representation in English

        Args:
            match (re.Match): Match of ``ordinal_pattern``; group 1 holds the digits

        Returns:
            str: e.g. "21st" -> "twenty-first", "121st" -> "one hundred and twenty-first"
        """
        number = int(match.group(1))

        # Special cases
        if number in self.ordinal_words:
            return self.ordinal_words[number]

        return self._ordinal_from_cardinal(self.convert_number(number))

    @staticmethod
    def _count_noun(noun: str, count: int) -> str:
        """Return ``noun`` unchanged for a count of one, with a plural "s" otherwise."""
        return noun if count == 1 else f"{noun}s"

    def convert_currency(self, match: re.Match) -> str:
        """
        Convert a currency amount to its word representation in English

        Args:
            match (re.Match): Match of ``currency_pattern`` (symbol before or after the amount)

        Returns:
            str: e.g. "$1.5" -> "one dollar and fifty cents"
        """
        # Handle both pattern formats (symbol before or after amount)
        if match.group(1) is not None:
            # Format: $42,50
            symbol, amount = match.group(1), match.group(2)
        else:
            # Format: 42,50€
            symbol, amount = match.group(4), match.group(3)

        unit = self.currency_symbols[symbol]
        whole, _, fraction = amount.replace(',', '.').partition('.')

        major = int(whole)
        # A single fractional digit means tenths: "1.5" is fifty cents, not five
        minor = int(fraction.ljust(2, '0')) if fraction else 0

        major_text = f"{self.convert_number(major)} {self._count_noun(unit, major)}"
        if minor == 0:
            return major_text

        minor_text = f"{self.convert_number(minor)} {self._count_noun('cent', minor)}"
        return f"{major_text} and {minor_text}"

    def convert_percentage(self, match: re.Match) -> str:
        """
        Convert a percentage to its word representation in English

        Args:
            match (re.Match): Match of ``percentage_pattern``; group 1 holds the value

        Returns:
            str: e.g. "2.5%" -> "two point five percent"
        """
        value = match.group(1)

        if '.' in value:
            # Pass the literal through so its digits are read as written
            return f"{self.convert_decimal(value)} percent"
        return f"{self.convert_number(int(value))} percent"


class FrenchInverseNumberNormalizer(MultilingualInverseNumberNormalizer):
    """
    Convert arabic numbers into spelled-out French text
    """
    def __init__(self):
        super().__init__()

        self.digits = {
            0: "zéro",
            1: "un",
            2: "deux",
            3: "trois",
            4: "quatre",
            5: "cinq",
            6: "six",
            7: "sept",
            8: "huit",
            9: "neuf"
        }

        self.teens = {
            10: "dix",
            11: "onze",
            12: "douze",
            13: "treize",
            14: "quatorze",
            15: "quinze",
            16: "seize",
            17: "dix-sept",
            18: "dix-huit",
            19: "dix-neuf"
        }

        self.tens = {
            2: "vingt",
            3: "trente",
            4: "quarante",
            5: "cinquante",
            6: "soixante",
            7: "soixante-dix",
            8: "quatre-vingt",
            9: "quatre-vingt-dix"
        }

        self.multipliers = {
            100: "cent",
            1_000: "mille",
            1_000_000: "million",
            1_000_000_000: "milliard",
            1_000_000_000_000: "billion"
        }

        # Special case words for French numbers (71-79 and 91-99 build on the teens)
        self.special_tens = {
            71: "soixante et onze",
            72: "soixante-douze",
            73: "soixante-treize",
            74: "soixante-quatorze",
            75: "soixante-quinze",
            76: "soixante-seize",
            77: "soixante-dix-sept",
            78: "soixante-dix-huit",
            79: "soixante-dix-neuf",
            91: "quatre-vingt-onze",
            92: "quatre-vingt-douze",
            93: "quatre-vingt-treize",
            94: "quatre-vingt-quatorze",
            95: "quatre-vingt-quinze",
            96: "quatre-vingt-seize",
            97: "quatre-vingt-dix-sept",
            98: "quatre-vingt-dix-huit",
            99: "quatre-vingt-dix-neuf"
        }

        # Sorted multipliers from largest to smallest for processing
        self.sorted_multipliers = sorted(self.multipliers.keys(), reverse=True)

        self.ordinal_suffixes = {
            "1": "er",  # premier
            "default": "ème"  # deuxième, troisième, etc.
        }

        self.ordinal_words = {
            1: "premier",
            2: "deuxième",
            3: "troisième",
            4: "quatrième",
            5: "cinquième"
        }

        self.currency_symbols = {
            "$": "dollar",
            "£": "livre",
            "€": "euro",
            "¢": "centime"
        }

        # Update patterns with French-specific regex
        self.decimal_pattern = r'\b\d+[,.]\d+\b'
        self.decimal_pattern_method = lambda m: self.convert_decimal(m.group(0))
        self.ordinal_pattern = r'\b(\d+)(er|ème|e)\b'
        self.currency_pattern = r'(?:([$€£])(\d+(?:[,.]\d+)?)|(\d+(?:[,.]\d+)?)([€£$]))'
        # Accept the comma as decimal separator so "3,5%" is matched as a whole
        self.percentage_pattern = r'(\d+(?:[,.]\d+)?)%'
        self.cents_pattern = r'¢(\d+)'

    def convert_number(self, num: int) -> str:
        """
        Convert an integer to its word representation in French

        Args:
            num (int): Number to spell out (negative values get a "moins" prefix)

        Returns:
            str: e.g. 81 -> "quatre-vingt-un", 200000 -> "deux cent mille"
        """
        if num == 0:
            return "zéro"

        if num < 0:
            return "moins " + self.convert_number(abs(num))

        if num < 10:
            return self.digits[num]

        if num < 20:
            return self.teens[num]

        if num < 100:
            # Special cases
            if num in self.special_tens:
                return self.special_tens[num]

            tens_digit, units_digit = divmod(num, 10)

            if units_digit == 0:
                # Special case for 80, which takes a plural "s" when it stands alone
                if tens_digit == 8:
                    return "quatre-vingts"
                return self.tens[tens_digit]

            # 21, 31, ... 61 use "et un"; 81 is "quatre-vingt-un" (71 and 91
            # were already returned from special_tens above)
            if units_digit == 1 and tens_digit not in (7, 8, 9):
                return f"{self.tens[tens_digit]} et un"

            return f"{self.tens[tens_digit]}-{self.digits[units_digit]}"

        # Handle larger numbers recursively
        for multiplier in self.sorted_multipliers:
            if num >= multiplier:
                quotient, remainder = divmod(num, multiplier)

                if multiplier == 100:
                    if quotient == 1:
                        # "cent", never "un cent"
                        head = "cent"
                    elif remainder == 0:
                        # Plural "s" only when nothing follows: "deux cents"
                        head = f"{self.convert_number(quotient)} cents"
                    else:
                        head = f"{self.convert_number(quotient)} cent"

                elif multiplier == 1000:
                    if quotient == 1:
                        # "mille", never "un mille"
                        head = "mille"
                    else:
                        count_words = self.convert_number(quotient)
                        # "vingt" and "cent" lose their plural "s" before "mille"
                        if count_words.endswith(("vingts", "cents")):
                            count_words = count_words[:-1]
                        head = f"{count_words} mille"

                else:  # million, milliard, etc. take a plural "s"
                    plural_suffix = "s" if quotient > 1 else ""
                    head = f"{self.convert_number(quotient)} {self.multipliers[multiplier]}{plural_suffix}"

                if remainder == 0:
                    return head
                return f"{head} {self.convert_number(remainder)}"

        # Should never reach here given the checks above
        return str(num)

    def convert_decimal(self, num: Union[float, str]) -> str:
        """
        Convert a decimal number to its word representation in French

        A plain digit string ("2,50", "3.14") is read digit by digit exactly as
        written. Floats, and strings in any other notation, go through
        ``float`` formatting. The separator is spoken as "virgule".

        Args:
            num (float | str): Value or literal to spell out

        Returns:
            str: e.g. "3,14" -> "trois virgule un quatre"
        """
        literal = None

        if isinstance(num, str):
            # Replace comma with period so both separators are accepted
            candidate = num.strip().replace(',', '.')
            if re.fullmatch(r'[+-]?\d+(?:\.\d+)?', candidate):
                literal = candidate
            else:
                num = float(candidate)

        if literal is None:
            literal = str(num)
            if 'e' in literal.lower():
                # For scientific notation, convert to regular notation
                literal = f"{num:.10f}".rstrip('0').rstrip('.')

        # The sign is handled separately so that "-0,5" keeps it
        negative = literal.startswith('-')
        integer_digits, _, fraction_digits = literal.lstrip('+-').partition('.')

        words = self.convert_number(int(integer_digits))

        if fraction_digits:
            # Handle the decimal part digit by digit
            spoken = " ".join(self.digits[int(digit)] for digit in fraction_digits)
            words = f"{words} virgule {spoken}"

        return f"moins {words}" if negative else words

    def convert_ordinal(self, match: re.Match) -> str:
        """
        Convert an ordinal number to its word representation in French

        Args:
            match (re.Match): Match of ``ordinal_pattern``; group 1 holds the digits

        Returns:
            str: e.g. "9e" -> "neuvième", "80e" -> "quatre-vingtième"
        """
        number = int(match.group(1))

        # Special cases
        if number in self.ordinal_words:
            return self.ordinal_words[number]

        base_word = self.convert_number(number)

        # The cardinal's plural mark is dropped before the suffix:
        # "quatre-vingts" -> "quatre-vingtième", "deux cents" -> "deux centième"
        if base_word.endswith(("vingts", "cents", "millions", "milliards", "billions")):
            base_word = base_word[:-1]

        if base_word.endswith("cinq"):
            # cinq -> cinquième
            return f"{base_word}uième"
        if base_word.endswith("neuf"):
            # neuf -> neuvième
            return f"{base_word[:-1]}vième"
        if base_word.endswith('e'):
            # A final mute "e" is elided: onze -> onzième
            return f"{base_word[:-1]}ième"
        return f"{base_word}ième"

    @staticmethod
    def _count_noun(noun: str, count: int) -> str:
        """Return ``noun`` unchanged for a count of one, with a plural "s" otherwise."""
        return noun if count == 1 else f"{noun}s"

    def convert_currency(self, match: re.Match) -> str:
        """
        Convert a currency amount to its word representation in French

        Args:
            match (re.Match): Match of ``currency_pattern`` (symbol before or after the amount)

        Returns:
            str: e.g. "42,5€" -> "quarante-deux euros et cinquante centimes"
        """
        # Handle both pattern formats (symbol before or after amount)
        if match.group(1) is not None:
            # Format: $42,50
            symbol, amount = match.group(1), match.group(2)
        else:
            # Format: 42,50€
            symbol, amount = match.group(4), match.group(3)

        unit = self.currency_symbols[symbol]
        whole, _, fraction = amount.replace(',', '.').partition('.')

        major = int(whole)
        # A single fractional digit means tenths: "1,5" is fifty centimes, not five
        minor = int(fraction.ljust(2, '0')) if fraction else 0

        major_text = f"{self.convert_number(major)} {self._count_noun(unit, major)}"
        if minor == 0:
            return major_text

        minor_text = f"{self.convert_number(minor)} {self._count_noun('centime', minor)}"
        return f"{major_text} et {minor_text}"

    def convert_percentage(self, match: re.Match) -> str:
        """
        Convert a percentage to its word representation in French

        Args:
            match (re.Match): Match of ``percentage_pattern``; group 1 holds the value

        Returns:
            str: e.g. "3,5%" -> "trois virgule cinq pour cent"
        """
        value = match.group(1)

        if '.' in value or ',' in value:
            # Pass the literal through so its digits are read as written
            return f"{self.convert_decimal(value)} pour cent"
        return f"{self.convert_number(int(value))} pour cent"

class ParallelTextPreprocessor:
    """
    Preprocessor for parallel text data in MT format
    Ensures that filtering is applied consistently across language pairs
    """
    def __init__(self, normalizers: Optional[dict] = None):
        """
        Args:
            normalizers: Optional mapping from language code to a callable that
                spells out the numbers of a text. Defaults to the English
                ('en') and French ('fr') inverse number normalizers.
        """
        # Compile regex patterns once
        self.non_latin_pattern = re.compile(r'[\u4e00-\u9fff\u0400-\u04FF\u0370-\u03FF\u3040-\u30FF]')
        self.double_dash_pattern = re.compile(r'--')

        # normalize() looks the language up in this table
        if normalizers is None:
            normalizers = {
                'en': EnglishInverseNumberNormalizer(),
                'fr': FrenchInverseNumberNormalizer(),
            }
        self.normalizers = normalizers

    def filter_lines(self, files_dict: Dict[str, List[str]]) -> Dict[str, List[str]]:
        """
        Filter lines consistently across parallel files

        A line index is dropped from every language as soon as one of its
        versions contains a character matched by ``non_latin_pattern`` or a
        double dash.

        Args:
            files_dict: Dictionary mapping language codes to lists of text lines

        Returns:
            Dictionary with filtered lines

        Raises:
            ValueError: If the lists do not all have the same length
        """
        result = {lang: [] for lang in files_dict}

        # Nothing to align when no language was supplied
        if not files_dict:
            return result

        # Get the number of lines (should be the same for all files)
        num_lines = len(next(iter(files_dict.values())))

        # Check if all files have the same number of lines
        if any(len(lines) != num_lines for lines in files_dict.values()):
            raise ValueError("All parallel files must have the same number of lines")

        languages = list(files_dict)

        # Walk the aligned rows: one tuple holds the same line in every language
        for row in zip(*(files_dict[lang] for lang in languages)):
            rejected = any(
                self.non_latin_pattern.search(line) or self.double_dash_pattern.search(line)
                for line in row
            )
            if rejected:
                continue

            for lang, line in zip(languages, row):
                result[lang].append(line)

        return result

    def preprocess_files(self, file_paths: Dict[str, str]) -> Dict[str, List[str]]:
        """
        Preprocess multiple parallel files

        Args:
            file_paths: Dictionary mapping language codes to file paths

        Returns:
            Dictionary with processed lines
        """
        # Read all files
        files_dict = {}
        for lang, path in file_paths.items():
            with open(path, 'r', encoding='utf-8') as f:
                files_dict[lang] = f.read().splitlines()

        # Apply filtering
        return self.filter_lines(files_dict)

    def preprocess_text(self, text: str) -> str:
        """
        Preprocess a single text (removes parentheses and brackets)

        Also strips « » quotes and surrounding whitespace, capitalizes the
        first character and turns a trailing comma into a full stop.

        Args:
            text: Text to preprocess

        Returns:
            Preprocessed text
        """
        # Remove parenthesized, then bracketed content. Each pass removes the
        # innermost groups, so repeating until nothing changes handles nesting.
        for innermost_group in (r'\([^()]*\)', r'\[[^\[\]]*\]'):
            removed = 1
            while removed:
                text, removed = re.subn(innermost_group, '', text)

        # Remove any remaining unpaired parentheses and brackets
        text = re.sub(r'[\(\)\[\]]', '', text)

        # remove « and »
        text = re.sub(r'«|»', '', text)

        # strip leading and trailing whitespace
        text = text.strip()

        # ensure the first letter is capitalized
        if text:
            text = text[0].upper() + text[1:]

        # replace commas at the end of a sentence with fullstop
        if text.endswith(','):
            text = text[:-1] + '.'

        return text

    def preprocess(self, text: str) -> str:
        """
        Common preprocessing applied by ``normalize`` before number conversion

        Args:
            text: Text to preprocess

        Returns:
            Preprocessed text (see ``preprocess_text``)
        """
        return self.preprocess_text(text)

    def normalize(self, text: str, language: str = 'en') -> str:
        """
        Perform inverse normalization on text in the specified language

        Args:
            text (str): Text to inverse normalize
            language (str): Language code ('en' for English, 'fr' for French)

        Returns:
            str: Inverse normalized text

        Raises:
            ValueError: If no normalizer is registered for ``language``
        """
        if language not in self.normalizers:
            raise ValueError(f"Unsupported language: {language}")

        # Apply common preprocessing
        text = self.preprocess(text)

        # Apply language-specific normalization
        return self.normalizers[language](text)

    def __call__(self, text: str, language: str = 'en') -> str:
        """
        Perform inverse normalization on text in the specified language

        Args:
            text (str): Text to inverse normalize
            language (str): Language code ('en' for English, 'fr' for French)

        Returns:
            str: Inverse normalized text
        """
        return self.normalize(text, language)

class TextLengthFilter:
    """
    Filter texts based on length criteria

    A text is kept when ``min_length <= len(text) <= max_length``: both bounds
    are inclusive and the length is whatever ``len`` reports for the string.
    """
    def __init__(self, min_length: int = 4, max_length: int = 500):
        """
        Initialize the filter with length constraints

        Args:
            min_length: Minimum allowed text length (inclusive)
            max_length: Maximum allowed text length (inclusive)
        """
        self.min_length = min_length
        self.max_length = max_length

    def _within_limits(self, text: str) -> bool:
        """Return True when the length of ``text`` lies inside the configured bounds"""
        return self.min_length <= len(text) <= self.max_length

    def filter_text(self, text: str) -> Optional[str]:
        """
        Filter a single text based on length criteria

        Args:
            text: Text to filter

        Returns:
            The original text if it meets the criteria, None otherwise
        """
        return text if self._within_limits(text) else None

    def filter_texts(self, texts: List[str]) -> List[str]:
        """
        Filter a list of texts based on length criteria

        Args:
            texts: List of texts to filter

        Returns:
            List of texts that meet the criteria, in their original order
        """
        # Compare against None (not truthiness) so an accepted empty string is kept
        return [text for text in texts if self.filter_text(text) is not None]

    def filter_files(self, file_paths: List[str], encoding: str = 'utf-8') -> Dict[str, List[str]]:
        """
        Filter multiple files line by line

        Each file is read completely and split with ``str.splitlines``, which
        also treats separators such as form feed or U+2028 as line breaks.

        Args:
            file_paths: List of file paths to process
            encoding: File encoding (default: utf-8)

        Returns:
            Dictionary mapping file paths to lists of filtered lines
        """
        result = {}
        for path in file_paths:
            with open(path, 'r', encoding=encoding) as f:
                lines = f.read().splitlines()
                result[path] = self.filter_texts(lines)
        return result

    def filter_parallel_texts(self, texts_dict: Dict[str, List[str]]) -> Tuple[Dict[str, List[str]], Dict[str, float]]:
        """
        Filter texts consistently across parallel data

        Position ``i`` is kept only if the text at ``i`` passes the length check
        in every language, so the returned lists stay aligned. A one-line summary
        of the retention rates is printed.

        Args:
            texts_dict: Dictionary mapping languages to lists of texts

        Returns:
            Tuple containing:
                - Dictionary with filtered texts (maintaining alignment)
                - Dictionary with retention rates (percent) for each language;
                  100.0 when there were no texts to begin with

        Raises:
            ValueError: If the languages do not all have the same number of texts
        """
        result = {lang: [] for lang in texts_dict}

        # Size of the first language's list; 0 for an empty dict instead of a
        # StopIteration escaping from next()
        num_texts = len(next(iter(texts_dict.values()), []))

        # Check if all languages have the same number of texts
        if any(len(texts) != num_texts for texts in texts_dict.values()):
            raise ValueError("All parallel texts must have the same number of items")

        kept_count = 0
        for i in range(num_texts):
            # A single out-of-range text drops the whole aligned row
            if all(self._within_limits(texts[i]) for texts in texts_dict.values()):
                kept_count += 1
                for lang, texts in texts_dict.items():
                    result[lang].append(texts[i])

        # Rows are kept or dropped for all languages at once, so the rate is shared
        retention_rate = (kept_count / num_texts) * 100 if num_texts > 0 else 100.0
        retention_rates = {lang: retention_rate for lang in texts_dict}

        print(f"Texts retained for each language: {retention_rates}")

        return result, retention_rates

    def __call__(self, text: Union[str, List[str], Dict[str, List[str]]], return_stats: bool = False) -> Union[
        Optional[str],
        List[str],
        Dict[str, List[str]],
        Tuple[Dict[str, List[str]], Dict[str, float]]
    ]:
        """
        Filter text based on its type

        Args:
            text: Single text, list of texts, or dictionary of parallel texts
            return_stats: Whether to return statistics (only applicable for dictionary input)

        Returns:
            Filtered text(s) of the same type as input, with optional stats

        Raises:
            TypeError: If ``text`` is not a str, list or dict
        """
        if isinstance(text, str):
            return self.filter_text(text)
        elif isinstance(text, list):
            return self.filter_texts(text)
        elif isinstance(text, dict):
            result = self.filter_parallel_texts(text)
            return result if return_stats else result[0]
        else:
            raise TypeError(f"Unsupported input type: {type(text)}")


# Integration with existing code
class EnhancedMultilingualInverseTextNormalizer:
    """
    Enhanced inverse text normalizer with length filtering capabilities
    """
    def __init__(self, min_length: int = 4, max_length: int = 500):
        """
        Set up the per-language normalizers, the text preprocessor and the length filter

        Args:
            min_length: Minimum text length, passed on to TextLengthFilter
            max_length: Maximum text length, passed on to TextLengthFilter
        """
        self.normalizers = {
            'en': EnglishInverseNumberNormalizer(),
            'fr': FrenchInverseNumberNormalizer()
        }
        self.preprocessor = ParallelTextPreprocessor()
        self.length_filter = TextLengthFilter(min_length, max_length)

    def normalize_and_filter(self, text: str, language: str = 'en') -> Optional[str]:
        """
        Normalize text and filter based on length criteria

        The length check runs twice: on the preprocessed text (so rejected texts
        are never normalized) and again on the normalized result.

        Args:
            text: Text to normalize and filter
            language: Language code ('en' for English, 'fr' for French)

        Returns:
            Normalized text if it meets length criteria, None otherwise

        Raises:
            ValueError: If ``language`` has no registered normalizer
        """
        if language not in self.normalizers:
            raise ValueError(f"Unsupported language: {language}")

        # Apply preprocessing
        text = self.preprocessor.preprocess_text(text)

        # The filter signals rejection with None; testing truthiness instead would
        # also discard an accepted empty string.
        if self.length_filter.filter_text(text) is None:
            return None

        # Apply language-specific normalization
        normalized = self.normalizers[language](text)

        # Apply length filtering to normalized text
        return self.length_filter.filter_text(normalized)

    def normalize_and_filter_batch(self, texts: List[str], language: str = 'en') -> List[str]:
        """
        Normalize and filter a batch of texts

        Args:
            texts: List of texts to normalize and filter
            language: Language code

        Returns:
            List of normalized texts that meet length criteria
        """
        return [
            normalized for text in texts
            if (normalized := self.normalize_and_filter(text, language)) is not None
        ]

    def normalize_and_filter_parallel(self, texts_dict: Dict[str, List[str]], return_stats: bool = False) -> Union[
        Dict[str, List[str]],
        Tuple[Dict[str, List[str]], Dict[str, float]]
    ]:
        """
        Normalize and filter parallel texts

        Steps, in order: ``preprocessor.filter_lines`` on the whole dictionary,
        ``preprocessor.preprocess_text`` on every text, a parallel length filter,
        per-language normalization, and a second parallel length filter on the
        normalized texts.

        Args:
            texts_dict: Dictionary mapping language codes to lists of texts; every
                key must have an entry in ``self.normalizers``
            return_stats: Whether to also return the retention rates

        Returns:
            The filtered, normalized texts per language; with ``return_stats`` a
            tuple of those texts and the percentage of the original input kept
            per language (100.0 for a language that had no input texts)
        """
        # Store initial counts
        initial_counts = {lang: len(texts) for lang, texts in texts_dict.items()}

        # Apply first round of filtering
        texts_dict = self.preprocessor.filter_lines(texts_dict)

        processed_dict = {
            lang: [self.preprocessor.preprocess_text(text) for text in texts]
            for lang, texts in texts_dict.items()
        }

        filtered_dict = self.length_filter.filter_parallel_texts(processed_dict)[0]

        # Apply normalization
        normalized_dict = {}
        for lang, texts in filtered_dict.items():
            normalized_dict[lang] = [
                self.normalizers[lang](text) for text in tqdm(texts, desc=f"Normalizing {lang}")
            ]

        # Final filtering
        final_filtered, _ = self.length_filter.filter_parallel_texts(normalized_dict)

        # Retention relative to the original input; an empty input counts as fully
        # retained (same convention as TextLengthFilter) instead of dividing by zero.
        final_retention_rates = {
            lang: (len(final_filtered[lang]) / count) * 100 if count > 0 else 100.0
            for lang, count in initial_counts.items()
        }

        if return_stats:
            return final_filtered, final_retention_rates
        return final_filtered

    def __call__(self, text: Union[str, List[str], Dict[str, List[str]]], language: str = 'en', return_stats: bool = False) -> Union[
        Optional[str],
        List[str],
        Dict[str, List[str]],
        Tuple[Dict[str, List[str]], Dict[str, float]]
    ]:
        """
        Normalize and filter based on input type

        Args:
            text: Single text, list of texts, or dictionary of parallel texts
            language: Language code (ignored for dictionary input)
            return_stats: Whether to return retention statistics (only for dictionary input)

        Returns:
            Normalized and filtered text(s) of the same type as input, with optional stats

        Raises:
            TypeError: If ``text`` is not a str, list or dict
        """
        if isinstance(text, str):
            return self.normalize_and_filter(text, language)
        elif isinstance(text, list):
            return self.normalize_and_filter_batch(text, language)
        elif isinstance(text, dict):
            return self.normalize_and_filter_parallel(text, return_stats)
        else:
            raise TypeError(f"Unsupported input type: {type(text)}")


def test_text_length_filter():
    """Check TextLengthFilter on single texts, a batch, and a parallel en/fr corpus."""
    too_short = "abc"
    short_ok = "hello"
    sentence_ok = "This is a test."
    too_long = "A" * 501
    samples = [too_short, short_ok, sentence_ok, too_long]

    checker = TextLengthFilter(min_length=4, max_length=500)

    # Single texts: rejected inputs come back as None, accepted ones unchanged
    assert checker.filter_text(too_short) is None
    assert checker.filter_text(short_ok) == "hello"
    assert checker.filter_text(too_long) is None

    # Batch: only the two in-range texts survive
    kept = checker.filter_texts(samples)
    assert len(kept) == 2
    assert too_short not in kept
    assert short_ok in kept
    assert sentence_ok in kept
    assert too_long not in kept

    # Parallel data: a row is dropped as a whole, so both sides stay aligned
    corpus = {
        'en': ["abc", "hello", "This is a test.", "A" * 501],
        'fr': ["def", "bonjour", "C'est un test.", "B" * 501]
    }
    kept_parallel, _ = checker.filter_parallel_texts(corpus)
    for lang in ('en', 'fr'):
        assert len(kept_parallel[lang]) == 2
    assert kept_parallel['en'] == ["hello", "This is a test."]
    assert kept_parallel['fr'] == ["bonjour", "C'est un test."]

    print("All text length filter tests passed!")


def test_integration():
    """End-to-end check of EnhancedMultilingualInverseTextNormalizer on a tiny en/fr corpus."""
    corpus = {
        'en': [
            "123",                     # 3 characters: below min_length
            "The price is $42.50",     # expected to be kept
            "A" * 501,                 # above max_length
            "Chapter 10",              # expected to be kept
            "The House rose (and observed a minute' s silence)"   # kept without the parenthetical
        ],
        'fr': [
            "123",                     # 3 characters: below min_length
            "Le prix est 42,50€",      # expected to be kept
            "B" * 501,                 # above max_length
            "Chapitre 10",             # expected to be kept
            "Le Parlement, debout(, observe une minute de silence)"   # kept without the parenthetical
        ]
    }

    pipeline = EnhancedMultilingualInverseTextNormalizer(min_length=4, max_length=500)

    # Single-string input
    assert pipeline("123", 'en') is None  # Too short
    assert pipeline("The price is $42.50", 'en') == "The price is forty-two dollars and fifty cents"
    assert pipeline("A" * 501, 'en') is None  # Too long

    # Dictionary input, asking for retention statistics as well
    result, stats = pipeline(corpus, return_stats=True)
    print(result)

    # Per language: the sentence that must be absent (its source row was too short)
    # and the sentences that must be present
    expectations = {
        'en': (
            "one hundred and twenty-three",
            [
                "The price is forty-two dollars and fifty cents",
                "Chapter ten",
                "The House rose",
            ],
        ),
        'fr': (
            "cent vingt-trois",
            [
                "Le prix est quarante-deux euros et cinquante centimes",
                "Chapitre dix",
                "Le Parlement, debout",
            ],
        ),
    }
    for lang, (dropped, kept) in expectations.items():
        assert len(result[lang]) == 3
        assert dropped not in result[lang]
        for sentence in kept:
            assert sentence in result[lang]

    print(stats)
    # 3 of the 5 input rows survive in each language
    for lang in ('en', 'fr'):
        assert stats[lang] == 60.0

    print("All integration tests passed!")
    print(f"Retention rate: {stats['en']}% of texts were retained")
    print(f"Retention rate: {stats['en']}% of texts were retained")


def process_parallel_corpus(file_paths, min_length=4, max_length=500, output_dir=None):
    """
    Process a parallel corpus with length filtering and report statistics

    Reads one text file per language, runs the texts through
    EnhancedMultilingualInverseTextNormalizer, prints a retention report and
    optionally writes the surviving texts to ``filtered_<lang>.txt`` files.

    Args:
        file_paths: Dictionary mapping language codes to file paths
        min_length: Minimum text length to keep
        max_length: Maximum text length to keep
        output_dir: Directory to write filtered files (if None, don't write files)

    Returns:
        Tuple of filtered texts and retention statistics
    """
    normalizer = EnhancedMultilingualInverseTextNormalizer(min_length, max_length)

    # Read all files
    # NOTE(review): str.splitlines also breaks on characters such as U+2028 or
    # U+0085; if the corpora contain them, the languages can end up with
    # different line counts. Confirm against the data.
    texts_dict = {}
    for lang, path in file_paths.items():
        with open(path, 'r', encoding='utf-8') as f:
            texts_dict[lang] = f.read().splitlines()

    # Get initial text counts
    initial_counts = {lang: len(texts) for lang, texts in texts_dict.items()}

    # Filter and normalize
    filtered_texts, retention_stats = normalizer(texts_dict, return_stats=True)

    # Print statistics
    print("=== Parallel Corpus Processing Report ===")
    print(f"Minimum length: {min_length}, Maximum length: {max_length}")
    print("\nRetention Statistics:")
    for lang, rate in retention_stats.items():
        initial = initial_counts[lang]
        retained = len(filtered_texts[lang])
        print(f"  {lang}: {retained}/{initial} texts retained ({rate:.2f}%)")

    # Write output files if requested
    if output_dir:
        import os
        os.makedirs(output_dir, exist_ok=True)

        for lang, texts in filtered_texts.items():
            output_path = os.path.join(output_dir, f"filtered_{lang}.txt")
            with open(output_path, 'w', encoding='utf-8') as f:
                # Terminate every text with a newline (instead of '\n'.join) so the
                # last line is complete: `wc -l` then equals the number of texts and
                # the file count does not depend on what the final text contains.
                f.writelines(f"{text}\n" for text in texts)
            print(f"Wrote filtered texts to {output_path}")

    return filtered_texts, retention_stats


# Run both self-checks when this cell executes; a failed assert stops the cell.
test_text_length_filter()
test_integration()

In [None]:
# Bind the result to names: the function returns the whole filtered corpus, and
# leaving the call as the cell's last expression would echo all of it as output.
europarl_filtered, europarl_retention = process_parallel_corpus({
    'en': '/content/Europarl/Europarl.en-fr.en',
    'fr': '/content/Europarl/Europarl.en-fr.fr'
}, output_dir='processed_europarl')

In [None]:
# Bind the result to names: the function returns the whole filtered corpus, and
# leaving the call as the cell's last expression would echo all of it as output.
ted_filtered, ted_retention = process_parallel_corpus({
    'en': '/content/TED/TED2020.en-fr.en',
    'fr': '/content/TED/TED2020.en-fr.fr'
}, output_dir='processed_ted')

In [None]:
import random
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from typing import Dict, List, Tuple, Set


def sample_sentences(texts_dict: Dict[str, List[str]], n: int = 100, seed: int = 42) -> Tuple[Dict[str, List[str]], List[int]]:
    """
    Sample the same n positions from every language in the texts dictionary.

    Args:
        texts_dict: Dictionary mapping language codes to lists of texts
        n: Number of sentences to sample (capped at the number available)
        seed: Random seed for reproducibility

    Returns:
        Tuple containing:
            - Dictionary with sampled texts
            - Sorted list of indices of the sampled texts

    Raises:
        ValueError: If the languages do not all have the same number of texts
    """
    # Private generator: yields the same indices as seeding the module-level
    # generator would, without resetting global random state for other code.
    rng = random.Random(seed)

    # Number of texts in the first language; 0 for an empty dictionary
    num_texts = len(next(iter(texts_dict.values()), []))

    # Sampling shared indices only makes sense for aligned lists
    if any(len(texts) != num_texts for texts in texts_dict.values()):
        raise ValueError("All parallel texts must have the same number of items")

    # Generate n random indices without replacement
    if n > num_texts:
        print(f"Warning: Requested sample size {n} is larger than available texts ({num_texts})")
        n = num_texts

    sample_indices = sorted(rng.sample(range(num_texts), n))

    # Pick the same positions in each language
    sampled_texts = {
        lang: [texts[i] for i in sample_indices]
        for lang, texts in texts_dict.items()
    }

    return sampled_texts, sample_indices


def analyze_lengths(texts_dict: Dict[str, List[str]], bin_size: int = 20) -> Dict[str, dict]:
    """
    Analyze the length distribution of texts for each language.

    Args:
        texts_dict: Dictionary mapping language codes to lists of texts
        bin_size: Width of the histogram bins, in characters (default 20)

    Returns:
        Dictionary mapping each language to its statistics: 'min', 'max',
        'mean', 'median', 'std' (population standard deviation from np.std),
        'count', and 'bins' (a dict from "start-end" range labels to counts).
        For a language with no texts, 'count' is 0, 'bins' is empty and the
        other statistics are None.

    Raises:
        ValueError: If ``bin_size`` is smaller than 1
    """
    if bin_size < 1:
        raise ValueError("bin_size must be a positive integer")

    results = {}

    for lang, texts in texts_dict.items():
        lengths = [len(text) for text in texts]

        # min()/max()/mean are undefined for an empty list; report that explicitly
        if not lengths:
            results[lang] = {
                'min': None,
                'max': None,
                'mean': None,
                'median': None,
                'std': None,
                'count': 0,
                'bins': {}
            }
            continue

        stats = {
            'min': min(lengths),
            'max': max(lengths),
            'mean': sum(lengths) / len(lengths),
            # True median: averages the two middle values for an even count
            # (indexing sorted(lengths)[n // 2] would return the upper one)
            'median': float(np.median(lengths)),
            'std': float(np.std(lengths)),
            'count': len(lengths)
        }

        # Histogram keyed by inclusive range label, e.g. "0-19", "20-39"
        bins = {}
        for length in lengths:
            start = (length // bin_size) * bin_size
            bin_key = f"{start}-{start + bin_size - 1}"
            bins[bin_key] = bins.get(bin_key, 0) + 1

        stats['bins'] = bins
        results[lang] = stats

    return results


# Main function to load files, sample sentences, and analyze lengths
def main(file_paths: Dict[str, str], n_samples: int = 100, save_indices: bool = True, seed:int = 0, json_path: str = None):
    """
    Load text files, sample sentences, analyze lengths and save results

    Args:
        file_paths: Dictionary mapping language codes to file paths
        n_samples: Number of sentences to sample
        save_indices: Whether to save indices to a file
        seed: Random seed passed to sample_sentences
        json_path: Where to write the sampled indices as JSON; required when
            save_indices is True

    Returns:
        Tuple of (sampled texts per language, sampled indices, length statistics)

    Raises:
        ValueError: If save_indices is True but no json_path was given
    """
    # Fail before any file is loaded rather than in open(None) at the very end
    if save_indices and json_path is None:
        raise ValueError("json_path must be provided when save_indices is True")

    # Load all files
    texts_dict = {}
    for lang, path in file_paths.items():
        with open(path, 'r', encoding='utf-8') as f:
            # Split on newlines only (not str.splitlines, which also breaks on
            # characters such as U+2028), so the sampled indices refer to the same
            # lines that readlines()/enumerate see when the indices are applied.
            texts_dict[lang] = [line.rstrip('\n') for line in f]
        print(f"Loaded {len(texts_dict[lang])} lines from {path}")

    # Sample sentences
    sampled_texts, sample_indices = sample_sentences(texts_dict, n_samples, seed=seed)
    print(f"Sampled {len(sample_indices)} sentences")

    # Analyze lengths
    stats = analyze_lengths(sampled_texts)

    # Save sample indices for future reference
    if save_indices:
        import json
        with open(json_path, 'w') as f:
            json.dump(sample_indices, f)
        print(f"Sample indices saved to {json_path}")

    return sampled_texts, sample_indices, stats


# Example usage with real files
if __name__ == "__main__":
    seed = 0

    # One entry per corpus: the filtered files to sample from and the JSON file
    # that receives the sampled indices
    analysis_jobs = [
        (
            {
                'ted_en': "/content/processed_ted/filtered_en.txt",
                'ted_fr': "/content/processed_ted/filtered_fr.txt"
            },
            'ted.json',
        ),
        (
            {
                'europarl_en': "/content/processed_europarl/filtered_en.txt",
                'europarl_fr': "/content/processed_europarl/filtered_fr.txt"
            },
            'europarl.json',
        ),
    ]

    for file_paths, indices_json in analysis_jobs:
        # Sample up to 25000 positions per corpus and print the length statistics
        sampled_texts, sample_indices, stats = main(file_paths, n_samples=25000, json_path=indices_json, seed=seed)
        print(stats)


In [None]:
# List the processed TED output directory with human-readable file sizes
!ls -lh /content/processed_ted/

In [None]:
# List the processed Europarl output directory with human-readable file sizes
!ls -lh /content/processed_europarl/

In [None]:
# Count the newline characters in the filtered French Europarl file
! cat /content/processed_europarl/filtered_fr.txt | wc -l

In [None]:
# Total line count over both languages of both filtered corpora
# (each English file was listed twice and the French files not at all)
!cat /content/processed_europarl/filtered_en.txt /content/processed_europarl/filtered_fr.txt /content/processed_ted/filtered_en.txt /content/processed_ted/filtered_fr.txt | wc -l

In [None]:
# Look for "||" in the filtered texts of BOTH languages: the sentence pairs are
# later joined and split on " || ", so the delimiter must not occur in either side
# (each English file was listed twice and the French files were never checked)
!cat /content/processed_europarl/filtered_en.txt /content/processed_europarl/filtered_fr.txt /content/processed_ted/filtered_en.txt /content/processed_ted/filtered_fr.txt | grep "||"

In [None]:
import json

In [None]:
# Write the sampled Europarl sentence pairs as "<en> || <fr>" lines.
# All files are opened as UTF-8 explicitly: the filtered files were written as
# UTF-8 and the combined file is read back as UTF-8 later on.
with open("/content/processed_europarl/filtered_en.txt", 'r', encoding='utf-8') as f_en, open(
    "/content/processed_europarl/filtered_fr.txt", 'r', encoding='utf-8') as f_fr, open(
        "/content/europarl.json", 'r', encoding='utf-8') as f_json, open(
            "/content/europarl.txt", 'w', encoding='utf-8') as f_out:

    europarl_en = f_en.readlines()
    europarl_fr = f_fr.readlines()

    # zip() would silently stop at the shorter file and hide a misalignment
    assert len(europarl_en) == len(europarl_fr), (
        f"Europarl files are not aligned: {len(europarl_en)} en lines vs {len(europarl_fr)} fr lines"
    )

    # Convert JSON list to a set for O(1) lookups
    europarl_indices = set(json.load(f_json))

    for i, (line_en, line_fr) in tqdm(enumerate(zip(europarl_en, europarl_fr)), total=len(europarl_en)):
        if i in europarl_indices:  # O(1) lookup instead of search
            f_out.write(f"{line_en.strip()} || {line_fr.strip()}\n")

In [None]:
# Write the sampled TED sentence pairs as "<en> || <fr>" lines.
# All files are opened as UTF-8 explicitly: the filtered files were written as
# UTF-8 and the combined file is read back as UTF-8 later on.
with open("/content/processed_ted/filtered_en.txt", 'r', encoding='utf-8') as f_en, open(
    "/content/processed_ted/filtered_fr.txt", 'r', encoding='utf-8') as f_fr, open(
        "/content/ted.json", 'r', encoding='utf-8') as f_json, open(
            "/content/ted.txt", 'w', encoding='utf-8') as f_out:

    ted_en = f_en.readlines()
    ted_fr = f_fr.readlines()

    # zip() would silently stop at the shorter file and hide a misalignment
    assert len(ted_en) == len(ted_fr), (
        f"TED files are not aligned: {len(ted_en)} en lines vs {len(ted_fr)} fr lines"
    )

    # Convert JSON list to a set for O(1) lookups
    ted_indices = set(json.load(f_json))

    for i, (line_en, line_fr) in tqdm(enumerate(zip(ted_en, ted_fr)), total=len(ted_en)):
        if i in ted_indices:  # O(1) lookup instead of search
            f_out.write(f"{line_en.strip()} || {line_fr.strip()}\n")

In [None]:
# Combined number of lines in the two sampled pair files
!cat europarl.txt ted.txt | wc -l

In [None]:
# Build en_fr.txt: Europarl pairs first (overwriting any previous file), then the
# TED pairs appended, and finally print the resulting line count
! cat europarl.txt > en_fr.txt
! cat ted.txt >> en_fr.txt
! cat en_fr.txt | wc -l

In [None]:
# Show size and timestamp of the combined pair file
!ls -l en_fr.txt

## Translate

In [None]:
import os
from google.cloud import translate_v3

# Google Cloud project used for the Translation API. The value below is the
# default; set the GOOGLE_CLOUD_PROJECT environment variable to use another
# project without editing this cell.
PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "upbeat-nation-454716-d0")

# Path to your service account key file. setdefault keeps a value that is already
# present in the environment and only falls back to the path below otherwise.
os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", "/content/upbeat-nation-454716-d0-4491769993f7.json")

# Shared Translation client, created on first use (re-running this cell resets it)
_translation_client = None


def _get_translation_client():
    """Return the shared TranslationServiceClient, creating it on the first call."""
    global _translation_client
    if _translation_client is None:
        _translation_client = translate_v3.TranslationServiceClient()
    return _translation_client


def translate_text(text, target_language="fr", source_language="en-US"):
    """
    Translates text to the target language.

    Args:
        text: Text to translate; either a single string or a list of strings
        target_language: Language code to translate to
        source_language: Language code of the source text

    Returns:
        TranslateTextResponse containing the translations
    """
    # Reuse one client across calls instead of constructing a new one per batch
    client = _get_translation_client()

    # Set parent resource (project and location)
    parent = f"projects/{PROJECT_ID}/locations/global"

    # "contents" is a list of strings; wrap a bare string so it is sent as one text
    contents = [text] if isinstance(text, str) else text

    # Call the API
    response = client.translate_text(
        request={
            "parent": parent,
            "contents": contents,
            "mime_type": "text/plain",
            "source_language_code": source_language,
            "target_language_code": target_language,
        }
    )

    return response


# Smoke test: send three English sentences through translate_text with target
# language code "fon". Running this cell issues a real Translation API request.
texts_to_translate = [
    "Hello, how are you?",
    "First of all, this is where we're projected to go with the U.S. contribution to global warming, under business as usual.",
    "Efficiency in end-use electricity and end-use of all energy is the low-hanging fruit."
]
response = translate_text(texts_to_translate, "fon", "en")

In [None]:
# Mount Google Drive at /content/drive; the translated files are copied to
# /content/drive/MyDrive/ by later cells
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import time

In [None]:
input_file = "en_fr.txt"
output_file = "bem_en.txt"
source_lang = "en"
target_lang = "bem"
batch_size = 64


# Process file in batches
with open(input_file, 'r', encoding='utf-8') as f_in, open(output_file, 'w', encoding='utf-8') as f_out:
    lines = f_in.readlines()
    total_batches = (len(lines) + batch_size - 1) // batch_size

    for batch_idx in tqdm(range(total_batches), desc="Processing batches"):
        start_idx = batch_idx * batch_size
        end_idx = min(start_idx + batch_size, len(lines))

        # Extract English text from each line in this batch
        batch_texts = [line.strip().split(" || ")[0] for line in lines[start_idx:end_idx]]

        # Translate the batch
        translated_texts = translate_text(batch_texts, target_language=target_lang, source_language=source_lang)
        # write the translated bem to the output file sentence by sentence as bem || en
        for i, translation in enumerate(translated_texts.translations):
            f_out.write(f"{translation.translated_text} || {batch_texts[i]}\n")
        # sleep for 100ms
        time.sleep(0.1)

# save to gdrive
!cp bem_en.txt /content/drive/MyDrive/

In [None]:
# !ls -lh bem_en.txt

In [None]:
input_file = "en_fr.txt"
output_file = "fon_fr.txt"
source_lang = "en"
target_lang = "fon"
batch_size = 64


# Process file in batches
with open(input_file, 'r', encoding='utf-8') as f_in, open(output_file, 'w', encoding='utf-8') as f_out:
    lines = f_in.readlines()
    total_batches = (len(lines) + batch_size - 1) // batch_size

    for batch_idx in tqdm(range(total_batches), desc="Processing batches"):
        start_idx = batch_idx * batch_size
        end_idx = min(start_idx + batch_size, len(lines))

        # Extract English text from each line in this batch
        batch_texts = [line.strip().split(" || ")[0] for line in lines[start_idx:end_idx]]
        french_texts = [line.strip().split(" || ")[1] for line in lines[start_idx:end_idx]]
        # Translate the batch
        translated_texts = translate_text(batch_texts, target_language=target_lang, source_language=source_lang)
        # write the translated bem to the output file sentence by sentence as fon || fr
        for i, translation in enumerate(translated_texts.translations):
            f_out.write(f"{translation.translated_text} || {french_texts[i]}\n")
        # sleep for 100ms
        time.sleep(0.1)

# save to gdrive
!cp fon_fr.txt /content/drive/MyDrive/