In [1]:
from e2e_st.text.text_preprocessor import TranscriptionPreprocessor, TranslationPreprocessor
import os
import wget
import json
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from e2e_st.text.tokenizer import CustomTokenizer
from transformers import AutoTokenizer

# Register as a fast tokenizer in the second parameter
AutoTokenizer.register("custom", None, CustomTokenizer)


In [9]:
tokenizer = AutoTokenizer.from_pretrained("alexgichamba/iwslt25_lowres_uncased_4096", use_fast=True)

In [10]:
# find vocab size
vocab_size = tokenizer.vocab_size
print(vocab_size)

4096


In [None]:
print(tokenizer.bem_lang_token, tokenizer.eng_lang_token, tokenizer.fra_lang_token, tokenizer.fon_lang_token)
print(tokenizer.bem_lang_token_id, tokenizer.eng_lang_token_id, tokenizer.fra_lang_token_id, tokenizer.fon_lang_token_id)

<|bem|> <|eng|> <|fra|> <|fon|>
5 6 7 8


In [84]:
print(tokenizer.tokenize("I shall also refer the matter to the College of Quaestors, and I am certain that they will be keen to ensure that we comply with the regulations we ourselves vote on.".lower()))
print(len(tokenizer.tokenize("I shall also refer the matter to the College of Quaestors, and I am certain that they will be keen to ensure that we comply with the regulations we ourselves vote on.".lower())))

['i', 'sh', 'all', 'also', 'refer', 'the', 'mat', 'ter', 'to', 'the', 'colle', 'ge', 'of', 'qu', 'a', 'es', 'to', 'rs', ',', 'and', 'i', 'am', 'certain', 'that', 'they', 'will', 'be', 'ke', 'en', 'to', 'ensure', 'that', 'we', 'comp', 'ly', 'with', 'the', 're', 'gu', 'lations', 'we', 'our', 'selves', 'vote', 'on', '.']
46


In [85]:
print(tokenizer.tokenize("Je vais soumettre également le problème au Collège des questeurs et je suis certaine que nos questeurs auront à cur de faire en sorte que nous respections la réglementation qu' en effet nous votons."))
print(len(tokenizer.tokenize("Je vais soumettre également le problème au Collège des questeurs et je suis certaine que nos questeurs auront à cur de faire en sorte que nous respections la réglementation qu' en effet nous votons.")))

['<|unk|>', 'e', 'vais', 'soumettre', 'également', 'le', 'problème', 'au', '<|unk|>', 'o', 'llè', 'ge', 'des', 'ques', 'teurs', 'et', 'je', 'suis', 'certaine', 'que', 'nos', 'ques', 'teurs', 'auront', 'à', 'cur', 'de', 'faire', 'en', 'sorte', 'que', 'nous', 'respe', 'ctions', 'la', 'réglementation', 'qu', "'", 'en', 'effet', 'nous', 'vo', 'tons', '.']
44


In [86]:
print(tokenizer.tokenize("Ée yě ɖɔ mɔ̌ ɔ́, Mɔyízi lɛ́ kɔ bó yi ɖɔ nú Mawu Mavɔmavɔ ɖɔ: \"Aklúnɔ, étɛ́wú a wa nǔ xá togun élɔ́?"))
print(len(tokenizer.tokenize("Ée yě ɖɔ mɔ̌ ɔ́, Mɔyízi lɛ́ kɔ bó yi ɖɔ nú Mawu Mavɔmavɔ ɖɔ: \"Aklúnɔ, étɛ́wú a wa nǔ xá togun élɔ́?")))

['<|unk|>', 'e', 'yě', 'ɖɔ', 'mɔ̌', 'ɔ́', ',', '<|unk|>', 'ɔ', 'yí', 'zi', 'lɛ́', 'kɔ', 'bó', 'yi', 'ɖɔ', 'nú', '<|unk|>', 'awu', '<|unk|>', 'avɔ', 'mavɔ', 'ɖɔ', ':', '"', '<|unk|>', 'kl', 'ú', 'nɔ', ',', 'é', 'tɛ́', 'wú', 'a', 'wa', 'nǔ', 'xá', 'togun', 'élɔ́', '?']
40


In [80]:
print(tokenizer.tokenize("\"Pa kuti kasebanya naikila pali imwe, ali ne cipyu cickalamba, pa kwishibo kuti ali ne nshita inono fye.\" - Ukusokoloa 12:12."))
print(len(tokenizer.tokenize("\"Pa kuti kasebanya naikila pali imwe, ali ne cipyu cickalamba, pa kwishibo kuti ali ne nshita inono fye.\" - Ukusokoloa 12:12.")))

['"', 'Pa', 'kuti', 'ka', 'se', 'ban', 'ya', 'nai', 'kila', 'pali', 'im', 'we', ',', 'ali', 'ne', 'cip', 'yu', 'ci', 'c', 'kalamba', ',', 'pa', 'kw', 'ishi', 'bo', 'kuti', 'ali', 'ne', 'nshita', 'inono', 'fye', '."', '-', 'Uku', 'so', 'kolo', 'a', '12', ':', '12', '.']
41


In [37]:
def test_text_preprocessor():
    token_types = ["/ocean/projects/cis210027p/gichamba/iwslt25/iwslt25_lowres/iwslt25_lowres_cased_4096"]
    for token_type in token_types:
        tokenizer = AutoTokenizer.from_pretrained(token_type)
        parallel_texts = [("I shall also refer the matter to the College of Quaestors, and I am certain that they will be keen to ensure that we comply with the regulations we ourselves vote on.",
                        "Je vais soumettre également le problème au Collège des questeurs et je suis certaine que nos questeurs auront à cur de faire en sorte que nous respections la réglementation qu' en effet nous votons."),
                            ("Aya makampani yonse yaliile ku ntanshi no kucefyako incito no kufumyapo ababomfi.", "All these firms have gone ahead with job cuts and even redundancies."),
                            ("Mɛɖaxo, mi bi jlo na blo nuɖe bo na do fun ahwan xá adingban Elɔpu tɔn lɛ.","Monsieur le Président, nous aimerions tous faire quelque chose pour aider à lutter contre la fraude en Europe."),
                            ("Mon travail a toujours dépassé la mode.","Lelo umulimo wandi lyonse wali pa lwa fyacilapo ukucila pa fya kufwala.")
        ]
        lang_pairs = [("eng", "fra"), ("bem", "eng"), ("fon", "fra"), ("fra", "bem")]
        transcripts = ["We will build a wall",
                    "\"Pa kuti kasebanya naikila pali imwe, ali ne cipyu cickalamba, pa kwishibo kuti ali ne nshita inono fye.\" - Ukusokoloa 12:12.",
                    "Mɛni he je nɛ suɔmi nɛ ngɛ Mawu kɛ e Bi ɔ a kpɛti ɔ mi wa wawɛɛ ɔ?",
                    "Moïse retourna vers l\'Eternel, et dit: Seigneur, pourquoi as-tu fait du mal à ce peuple? pourquoi m\'as-tu envoyé?..."]
        for i, (transcipt, parallel_text) in enumerate(zip(transcripts, parallel_texts)):  
            transcription_preprocessor_upper = TranscriptionPreprocessor(case_standardization="upper", tokenizer=tokenizer)
            translation_preprocessor_upper = TranslationPreprocessor(case_standardization="upper", tokenizer=tokenizer, source_language=lang_pairs[i][0], target_language=lang_pairs[i][1])
            
            transcription_preprocessor_lower = TranscriptionPreprocessor(tokenizer=tokenizer, case_standardization="lower")
            translation_preprocessor_lower = TranslationPreprocessor(case_standardization="lower", tokenizer=tokenizer, source_language=lang_pairs[i][0], target_language=lang_pairs[i][1])

            transcription_preprocessor_none = TranscriptionPreprocessor(tokenizer=tokenizer, case_standardization=None)
            translation_preprocessor_none = TranslationPreprocessor(tokenizer = tokenizer, case_standardization=None, source_language=lang_pairs[i][0], target_language=lang_pairs[i][1])


            print(f"Original transcript: {transcipt}")
            print(f"Upper case tokens: {tokenizer.tokenize(transcription_preprocessor_upper(transcipt))}")
            print(f"Lower case tokens: {tokenizer.tokenize(transcription_preprocessor_lower(transcipt))}")
            print(f"No case standardization transcript: {tokenizer.tokenize(transcription_preprocessor_none(transcipt))}")
            print("\n")
            print(f"Original translation: {parallel_text[0]} || {parallel_text[1]}")
            print(f"Upper case translation: {tokenizer.tokenize(translation_preprocessor_upper(parallel_text[0], parallel_text[1]))}")
            print(f"Lower case translation: {tokenizer.tokenize(translation_preprocessor_lower(parallel_text[0], parallel_text[1]))}")
            print(f"No case standardization translation: {tokenizer.tokenize(translation_preprocessor_none(parallel_text[0], parallel_text[1]))}")
            print("\n\n")

In [38]:
test_text_preprocessor()

Original transcript: We will build a wall


TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]