In [1]:
#!python'-m pip install git+https://github.com/NVIDIA/NeMo-text-processing.git@main#egg=nemo_text_processing

In [4]:
import pynini
import nemo_text_processing
from pynini.lib import pynutil
import os

from nemo_text_processing.text_normalization.en.graph_utils import generator_main,GraphFst, NEMO_DIGIT, NEMO_CHAR, NEMO_ALPHA, NEMO_ALNUM, delete_space, NEMO_SIGMA, NEMO_NOT_QUOTE, NEMO_NOT_SPACE, delete_extra_space, NEMO_NON_BREAKING_SPACE
from nemo_text_processing.text_normalization.normalize import Normalizer

from nemo_text_processing.inverse_text_normalization.en.taggers.word import WordFst
from nemo_text_processing.inverse_text_normalization.en.taggers.money import MoneyFst

from nemo_text_processing.inverse_text_normalization.en.taggers.punctuation import PunctuationFst
from nemo_text_processing.inverse_text_normalization.en.verbalizers.money import MoneyFst
from nemo_text_processing.inverse_text_normalization.en.verbalizers.time import TimeFst as TimeVerbalizerFst
from nemo_text_processing.inverse_text_normalization.en.verbalizers.cardinal import CardinalFst as CardinalVerbalizerFst

from nemo_text_processing.inverse_text_normalization.en.verbalizers.word import WordFst as WordVerbalizerFst

In [5]:
from pynini.lib import pynutil

def apply_fst(text, fst):
    try:
        print(pynini.shortestpath(text @ fst).string())

    except pynini.FstOpError:
        print(f"Error: no valid output with given'input: '{text}'")

In [6]:
transliterations = pynini.string_file("kinya_transliterations.tsv")

class WhiteListFst(GraphFst):
    def __init__(self):
        super().__init__(name="whitelist", kind="classify")

        whitelist = transliterations
        graph = pynutil.insert("name: \"") + whitelist + pynutil.insert("\"")
        self.fst = graph.optimize()

In [7]:
class CardinalFst(GraphFst):
    def __init__(self):
        super().__init__(name="cardinal", kind="classify")
        # Rest of the grammar here
        # ....... 
        #.........
        digit_to_word = {
            "0": "zero",
            "1": "imwe",
            "2": "kabiri",
            "3": "gatatu",
            "4": "kane",
            "5": "gatanu",
            "6": "gatandatu",
            "7": "karindwi",
            "8": "munani",
            "9": "icyenda",
        }

        # Tens mapping
        tens_to_word = {
            "0": "",
            "10": "icumi",
            "20": "makumyabiri",
            "30": "mirongo itatu",
            "40": "mirongo ine",
            "50": "mirongo itanu",
            "60": "mirongo itandatu",
            "70": "mirongo irindwi",
            "80": "mirongo inani",
            "90": "mirongo cyenda",
        }
    
        # Hundreds mapping
        hundreds_to_word = {
            "0":"",
            "100": "ijana",
            "200": "amagana abiri",
            "300": "magana atatu",
            "400": "magana ane",
            "500": "magana atanu",
            "600": "magana atandatu",
            "700": "magana irindwi",
            "800": "magana inani",
            "900": "magana icyenda",
        }
    
        # Thousands mapping
        thousands_to_word = {
            "0":"",
            "1000": "igihumbi",
            "2000": "ibihumbi bibiri",
            "3000": "ibihumbi bitatu",
            "4000": "ibihumbi bine",
            "5000": "ibihumbi bitanu",
            "6000": "ibihumbi bitandatu",
            "7000": "ibihumbi birindwi",
            "8000": "ibihumbi inani",
            "9000": "ibihumbi icyenda",
        }
    
        # Create FSTs for digits, tens, hundreds, and thousands
        digit_fst = pynini.union(*[
            pynini.cross(digit, word) for digit, word in digit_to_word.items()
        ]).optimize()
    
        tens_fst = pynini.union(*[
            pynini.cross(tens, word) for tens, word in tens_to_word.items()
        ]).optimize()
    
        hundreds_fst = pynini.union(*[
            pynini.cross(hundreds, word) for hundreds, word in hundreds_to_word.items()
        ]).optimize()
    
        thousands_fst = pynini.union(*[
            pynini.cross(thousands, word) for thousands, word in thousands_to_word.items()
        ]).optimize()
    
        # Combine FSTs for 0-9, 10, 20-90, 100, 1000
        combined_fst = digit_fst | tens_fst | hundreds_fst | thousands_fst
    
        # Combine FSTs for numbers 11-19
        eleven_to_nineteen_fst = pynini.union(*[
            pynini.cross(num, word) for num, word in {
                "11": "icumi na rimwe",
                "12": "icumi na kabiri",
                "13": "icumi na gatatu",
                "14": "icumi na kane",
                "15": "icumi na gatanu",
                "16": "icumi na gatandatu",
                "17": "icumi na karindwi",
                "18": "icumi na munani",
                "19": "icumi na icyenda",
            }.items()
        ]).optimize()
    
        combined_fst |= eleven_to_nineteen_fst
    
        # Combine FSTs for numbers 21-99 and 101-999
        for tens in range(20, 100, 10):
            for digit in range(1, 10):
                num_str = str(tens + digit)
                tens_word = tens_to_word[str(tens)]
                digit_word = digit_to_word[str(digit)]
                combined_fst |= pynini.cross(num_str, f"{tens_word} na {digit_word}")
    
        for hundreds in range(100, 1000, 100):
            for tens in range(0, 100, 10):
                for digit in range(0, 10):
                    num_str = str(hundreds + tens + digit)
                    #print(num_str)
                    hundreds_word = hundreds_to_word[str(hundreds)]
                    tens_word = tens_to_word[str(tens)]
                    digit_word = digit_to_word[str(digit)]
                    combined_fst |= pynini.cross(num_str, f"{hundreds_word} na {tens_word} na {digit_word}")
    
        for thousands in range(1000, 10000, 1000):
            for hundreds in range(0, 1000, 100):
                for tens in range(0, 100, 10):
                    for digit in range(0, 10):
                        num_str = str(thousands + hundreds + tens + digit)
                        thousands_word = thousands_to_word[str(thousands)]
                        hundreds_word = hundreds_to_word[str(hundreds)]
                        tens_word = tens_to_word[str(tens)]
                        digit_word = digit_to_word[str(digit)]
                        connector = ''
                        combined_fst |= pynini.cross(num_str, f"{thousands_word} na {hundreds_word} na {tens_word} na {digit_word}")

        graph = combined_fst.optimize()
        final_graph = pynutil.insert("integer: \"") + graph + pynutil.insert(" \"")
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph

In [16]:
#Elected to use the English cardinal verbalizer, the one below was only included for illustration
# class VerbalizeCardinalFst(GraphFst):
#     def __init__(self):
#         super().__init__(name='cardinal',kind='verbalize')
#         graph = (pynutil.delete("integer:")+delete_space+pynutil.delete(" \"")+pynini.closure(NEMO_CHAR,1)+pynutil.delete("\""))
#         delete_tokens = self.delete_tokens(graph)
#         self.fst = delete_tokens.optimize()

In [9]:
class TimeFst(GraphFst):
    def __init__(self):
        super().__init__(name="time", kind="classify")
        
        hours = pynini.string_map([
            ('1', 'saa saba'),
            ('2', 'saa mumani'),
            ('3', 'saa cyenda'),
            ('4', 'saa kumi'),
            ('5', "saa kumi n'imwe"),
            ('6', "saa kumi n'ebyiri"),
            ('7', 'saa moya'),
            ('8', 'saa mbiri'),
            ('9', 'saa tatu'),
            ('10', 'saa ine'),
            ('11', 'saa tanu'),
            ('12', 'saa sita'),

        ])
        
        minutes = pynini.string_map([
            ('00', ' '),
            ('01', " n'umunota umwe") ,
            ('02', " n'iminota ibiri") ,
            ('03', " n'iminota itatu") ,
            ('04', " n'iminota ine") ,
            ('05', " n'iminota itanu") ,
            ('06', " n'iminota itandatu") ,
            ('07', " n'iminota irindwi") ,
            ('08', " n'iminota umunani") ,
            ('09', " n'iminota icyenda") ,
            ('10', " n'iminota icumi") ,
            ('11', " n'iminota cumi n'umwe") ,
            ('12', " n'iminota cumi n'ibiri") ,
            ('13', " n'iminota cumi n'itatu") ,
            ('14', " n'iminota cumi n'ine") ,
            ('15', " n'iminota cumi n'itanu") ,
            ('16', " n'iminota cumi n'itandatu") ,
            ('17', " n'iminota cumi n'irindwi") ,
            ('18', " n'iminota cumi n'umunani") ,
            ('19', " n'iminota cumi n'icyenda") ,
            ('20', " n'iminota makumyabiri") ,
            ('21', " n'iminota makumyabiri na rimwe") ,
            ('22', " n'iminota makumyabiri n'ibiri") ,
            ('23', " n'iminota makumyabiri n'itatu") ,
            ('24', " n'iminota makumyabiri n'ine") ,
            ('25', " n'iminota makumyabiri n'itanu") ,
            ('26', " n'iminota makumyabiri n'itandatu") ,
            ('27', " n'iminota makumyabiri n'irindwi") ,
            ('28', " n'iminota makumyabiri n'umunani") ,
            ('29', " n'iminota makumyabiri n'icyenda") ,
            ('30', " n'iminota mirongo itatu") ,
            ('31', " n'iminota mirongo itatu n'umwe") ,
            ('32', " n'iminota mirongo itatu n'ibiri") ,
            ('33', " n'iminota mirongo itatu n'itatu") ,
            ('34', " n'iminota mirongo itatu n'ine") ,
            ('35', " n'iminota mirongo itatu n'itanu") ,
            ('36', " n'iminota mirongo itatu n'itandatu") ,
            ('37', " n'iminota mirongo itatu n'irindwi") ,
            ('38', " n'iminota mirongo itatu n'umunani") ,
            ('39', " n'iminota mirongo itatu n'icyenda") ,
            ('40', " n'iminota mirongo ine") ,
            ('41', " n'iminota mirongo ine n'umwe") ,
            ('42', " n'iminota mirongo ine n'ibiri") ,
            ('43', " n'iminota mirongo ine n'itatu") ,
            ('44', " n'iminota mirongo ine n'ine") ,
            ('45', " n'iminota mirongo ine n'itanu") ,
            ('46', " n'iminota mirongo ine n'itandatu") ,
            ('47', " n'iminota mirongo ine n'irindwi") ,
            ('48', " n'iminota mirongo ine n'umunani") ,
            ('49', " n'iminota mirongo ine n'icyenda") ,
            ('50', " n'iminota mirongo itanu") ,
            ('51', " n'iminota mirongo itanu n'umwe") ,
            ('52', " n'iminota mirongo itanu n'ibiri") ,
            ('53', " n'iminota mirongo itanu n'itatu") ,
            ('54', " n'iminota mirongo itanu n'ine") ,
            ('55', " n'iminota mirongo itanu n'itanu") ,
            ('56', " n'iminota mirongo itanu n'itandatu") ,
            ('57', " n'iminota mirongo itanu n'irindwi") ,
            ('58', " n'iminota mirongo itanu n'umunani") ,
            ('59', " n'iminota mirongo itanu n'icyenda") ,
        ])
        
        
        final_graph = pynutil.insert("hours: \"")+hours+pynutil.insert("\"")+pynutil.delete(":")+pynutil.insert(" minutes: \"")+minutes+pynutil.insert("\"")

        final_graph = self.add_tokens(final_graph)

        self.fst = final_graph.optimize()


In [27]:
time = TimeFst().fst
example = "12:54"
apply_fst(example, time)

time { hours: "saa sita" minutes: " n'iminota mirongo itanu n'ine" }


In [10]:
class VerbalizeTimeFst(GraphFst):
    def __init__(self):
        super().__init__(name="time",kind="verbalize")
        hour = (pynutil.delete("hours:")+delete_space+pynutil.delete("\"")+pynini.closure(NEMO_CHAR,1,60)+pynutil.delete("\"")+delete_space+pynutil.delete("minutes:")+delete_space+pynutil.delete("\"")+pynini.closure(NEMO_CHAR,1,60)+pynutil.delete("\""))

        graph = hour 
        delete_tokens = self.delete_tokens(graph)
        
        self.fst = delete_tokens.optimize()

In [29]:
verbalize_time = VerbalizeTimeFst().fst
apply_fst('time { hours: "saa sita" minutes: " n\'iminota mirongo itanu n\'ine" }', verbalize_time)

saa sita n'iminota mirongo itanu n'ine


In [11]:
class ClassifyFst(GraphFst):
    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
        super().__init__(name='tokenize_and_classify',kind='classify')
        far_file = "tokenize_and_classify.far" # None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            far_file = os.path.join(cache_dir, "tokenize_and_classify.far")
        if not overwrite_cache and far_file and os.path.exists(far_file):
            print("FAR file: ",far_file)
            self.fst = pynini.Far(far_file, mode="r")["TOKENIZE_AND_CLASSIFY"]
        else:
            cardinal = CardinalFst()
            cardinal_graph = cardinal.fst
            time_graph = TimeFst().fst
            word_graph = WordFst().fst
            punct_graph = PunctuationFst().fst
            whitelist_graph = WhiteListFst().fst
            classify = (
                pynutil.add_weight(time_graph, 1.05)
                | pynutil.add_weight(cardinal_graph, 1.1)
                | pynutil.add_weight(word_graph, 1.50)
                | pynutil.add_weight(whitelist_graph,1.01)
            )

            punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }")
            token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
            token_plus_punct = (
                pynini.closure(punct + pynutil.insert(" ")) + token+ pynini.closure(pynutil.insert(" ") + punct)
            )
    
            graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)
            graph = delete_space + graph + delete_space
            self.fst = graph.optimize()
            if far_file:
                print("generating grammar")
                generator_main(far_file, {"TOKENIZE_AND_CLASSIFY":self.fst})

In [14]:
classify = ClassifyFst().fst

 NeMo-text-processing :: INFO     :: Created tokenize_and_classify.far
INFO:NeMo-text-processing:Created tokenize_and_classify.far


generating grammar


In [21]:
apply_fst("byatangiye 10:30 haje abantu 2000 niwo mubare wabakora muri 12:02",classify)

tokens { name: "byatangiye" } tokens { time { hours: "saa ine" minutes: " n'iminota mirongo itatu" } } tokens { name: "haje" } tokens { name: "abantu" } tokens { cardinal { integer: "ibihumbi bibiri " } } tokens { name: "niwo" } tokens { name: "mubare" } tokens { name: "wabakora" } tokens { name: "muri" } tokens { time { hours: "saa sita" minutes: " n'iminota ibiri" } }


In [12]:
class VerbalizeTimeFst(GraphFst):
    def __init__(self):
        super().__init__(name="time",kind="verbalize")
        hour = (pynutil.delete("hours:")+delete_space+pynutil.delete("\"")+pynini.closure(NEMO_CHAR,1,60)+pynutil.delete("\"")+delete_space+pynutil.delete("minutes:")+delete_space+pynutil.delete("\"")+pynini.closure(NEMO_CHAR,1,60)+pynutil.delete("\""))

        graph = hour 
        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()

In [17]:


class VerbalizeFst(GraphFst):
    def __init__(self):
        super().__init__(name="verbalize", kind="verbalize")
        cardinal = CardinalVerbalizerFst()#VerbalizeCardinalFst()
        cardinal_graph = cardinal.fst
        time = VerbalizeTimeFst().fst

        graph = (
            cardinal_graph
           | time
        )
        self.fst = graph

In [18]:

class VerbalizeFinalFst(GraphFst):
    def __init__(self):
        super().__init__(name="verbalize_final", kind="verbalize")
        verbalize = VerbalizeFst().fst
        word = WordVerbalizerFst().fst
        
        types = verbalize | word
        graph = (
            pynutil.delete("tokens")
            + delete_space
            + pynutil.delete("{")
            + delete_space
            + types
            + delete_space
            + pynutil.delete("}")
        )
        graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space
        graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space

        self.fst = graph
        far_file = "verbalize.far"
        generator_main(far_file, {"ALL":self.fst,'REDUP': pynini.accep("REDUP")})

In [19]:
final_verbalize = VerbalizeFinalFst().fst 

 NeMo-text-processing :: INFO     :: Created verbalize.far
INFO:NeMo-text-processing:Created verbalize.far


In [25]:
apply_fst('tokens { name: "byatangiye" } tokens { time { hours: "saa ine" minutes: "n\'iminota mirongo itatu" } } tokens { name: "haje" } tokens { name: "abantu" } tokens { cardinal { integer: "ibihumbi bibiri " } } tokens { name: "niwo" } tokens { name: "mubare" } tokens { name: "wabakora" } tokens { name: "muri" } tokens { time { hours: "saa sita" minutes: "n\'iminota ibiri" } } ',final_verbalize)

byatangiye saa inen'iminota mirongo itatu haje abantu ibihumbi bibiri  niwo mubare wabakora muri saa sitan'iminota ibiri


"
