In [1]:
import os
import unicodedata

import numpy as np
import pandas as pd
import regex as re
from py_vncorenlp import VnCoreNLP
from transformers import pipeline

# The VnCoreNLP load is bracketed by a save/restore of the working directory
# (presumably because the loader changes it -- confirm against py_vncorenlp).
# try/finally guarantees the restore even when loading raises, so relative
# paths such as "data/..." used later keep resolving from the notebook folder.
cwd = os.getcwd()
try:
    vncorenlp = VnCoreNLP(save_dir=os.environ["VNCORENLP"])
finally:
    os.chdir(cwd)

2024-06-24 21:51:20 INFO  WordSegmenter:24 - Loading Word Segmentation model
2024-06-24 21:51:20 INFO  PosTagger:23 - Loading POS Tagging model
2024-06-24 21:51:22 INFO  NerRecognizer:34 - Loading NER model
2024-06-24 21:51:30 INFO  DependencyParser:32 - Loading Dependency Parsing model


In [2]:
# Two parallel-looking strings: accented letters (uniChars) and plain ASCII
# letters (unsignChars). Neither name is referenced in the visible cells.
# NOTE(review): presumably meant to align index-by-index for accent
# stripping -- verify that both strings have the same length before use.
uniChars = "√†√°·∫£√£·∫°√¢·∫ß·∫•·∫©·∫´·∫≠ƒÉ·∫±·∫Ø·∫≥·∫µ·∫∑√®√©·∫ª·∫Ω·∫π√™·ªÅ·∫ø·ªÉ·ªÖ·ªáƒë√¨√≠·ªâƒ©·ªã√≤√≥·ªè√µ·ªç√¥·ªì·ªë·ªï·ªó·ªô∆°·ªù·ªõ·ªü·ª°·ª£√π√∫·ªß≈©·ª•∆∞·ª´·ª©·ª≠·ªØ·ª±·ª≥√Ω·ª∑·ªπ·ªµ√Ä√Å·∫¢√É·∫†√Ç·∫¶·∫§·∫®·∫™·∫¨ƒÇ·∫∞·∫Æ·∫≤·∫¥·∫∂√à√â·∫∫·∫º·∫∏√ä·ªÄ·∫æ·ªÇ·ªÑ·ªÜƒê√å√ç·ªàƒ®·ªä√í√ì·ªé√ï·ªå√î·ªí·ªê·ªî·ªñ·ªò∆†·ªú·ªö·ªû·ª†·ª¢√ô√ö·ª¶≈®·ª§∆Ø·ª™·ª®·ª¨·ªÆ·ª∞·ª≤√ù·ª∂·ª∏·ª¥√ÇƒÇƒê√î∆†∆Ø"
unsignChars = "aaaaaaaaaaaaaaaaaeeeeeeeeeeediiiiiooooooooooooooooouuuuuuuuuuuyyyyyAAAAAAAAAAAAAAAAAEEEEEEEEEEEDIIIOOOOOOOOOOOOOOOOOOOUUUUUUUUUUUYYYYYAADOOU"


def loaddicchar():
    """Build a character lookup table from two pipe-separated strings.

    Both strings are split on ``'|'``; entry ``i`` of the first list becomes
    a key whose value is entry ``i`` of the second list.

    :return: dict mapping each source form to its replacement form.
    """
    source_forms = '√†|√°|·∫£|√£|·∫°|·∫ß|·∫•|·∫©|·∫´|·∫≠|·∫±|·∫Ø|·∫≥|·∫µ|·∫∑|√®|√©|·∫ª|·∫Ω|·∫π|·ªÅ|·∫ø|·ªÉ|·ªÖ|·ªá|√¨|√≠|·ªâ|ƒ©|·ªã|√≤|√≥|·ªè|√µ|·ªç|·ªì|·ªë|·ªï|·ªó|·ªô|·ªù|·ªõ|·ªü|·ª°|·ª£|√π|√∫|·ªß|≈©|·ª•|·ª´|·ª©|·ª≠|·ªØ|·ª±|·ª≥|√Ω|·ª∑|·ªπ|·ªµ|√Ä|√Å|·∫¢|√É|·∫†|·∫¶|·∫§|·∫®|·∫™|·∫¨|·∫∞|·∫Æ|·∫≤|·∫¥|·∫∂|√à|√â|·∫∫|·∫º|·∫∏|·ªÄ|·∫æ|·ªÇ|·ªÑ|·ªÜ|√å|√ç|·ªà|ƒ®|·ªä|√í|√ì|·ªé|√ï|·ªå|·ªí|·ªê|·ªî|·ªñ|·ªò|·ªú|·ªö|·ªû|·ª†|·ª¢|√ô|√ö|·ª¶|≈®|·ª§|·ª™|·ª®|·ª¨|·ªÆ|·ª∞|·ª≤|√ù|·ª∂|·ª∏|·ª¥'.split('|')
    target_forms = "√†|√°|·∫£|√£|·∫°|·∫ß|·∫•|·∫©|·∫´|·∫≠|·∫±|·∫Ø|·∫≥|·∫µ|·∫∑|√®|√©|·∫ª|·∫Ω|·∫π|·ªÅ|·∫ø|·ªÉ|·ªÖ|·ªá|√¨|√≠|·ªâ|ƒ©|·ªã|√≤|√≥|·ªè|√µ|·ªç|·ªì|·ªë|·ªï|·ªó|·ªô|·ªù|·ªõ|·ªü|·ª°|·ª£|√π|√∫|·ªß|≈©|·ª•|·ª´|·ª©|·ª≠|·ªØ|·ª±|·ª≥|√Ω|·ª∑|·ªπ|·ªµ|√Ä|√Å|·∫¢|√É|·∫†|·∫¶|·∫§|·∫®|·∫™|·∫¨|·∫∞|·∫Æ|·∫≤|·∫¥|·∫∂|√à|√â|·∫∫|·∫º|·∫∏|·ªÄ|·∫æ|·ªÇ|·ªÑ|·ªÜ|√å|√ç|·ªà|ƒ®|·ªä|√í|√ì|·ªé|√ï|·ªå|·ªí|·ªê|·ªî|·ªñ|·ªò|·ªú|·ªö|·ªû|·ª†|·ª¢|√ô|√ö|·ª¶|≈®|·ª§|·ª™|·ª®|·ª¨|·ªÆ|·ª∞|·ª≤|√ù|·ª∂|·ª∏|·ª¥".split('|')
    # Pair by index (not zip) so a shorter second list still raises IndexError.
    return {source_forms[pos]: target_forms[pos] for pos in range(len(source_forms))}


# Module-level lookup table, produced by a single call to loaddicchar().
dicchar = loaddicchar()


def convert_unicode(txt):
    """Return ``txt`` with accented letters in precomposed (NFC) form.

    This replaces a hand-written regex plus lookup table. That table only
    covered a fixed list of letters, and its "from" and "to" lists appear to
    be character-for-character the same in this notebook, so the substitution
    changed nothing. Unicode NFC composition folds any base letter followed
    by combining marks into its single-code-point equivalent and leaves text
    that is already composed untouched.

    :param txt: text to normalise.
    :return: NFC-normalised copy of ``txt``.
    """
    return unicodedata.normalize("NFC", txt)

In [3]:
# Vowel table, one row per vowel.
#   column 0     : the form chuan_hoa_dau_tu_tieng_viet treats as unmarked
#   columns 1..5 : the five marked variants of that vowel
#   last column  : an ASCII label ('aw', 'aa', 'ee', ...) that the reverse
#                  index nguyen_am_to_ids skips
bang_nguyen_am = [['a', '√†', '√°', '·∫£', '√£', '·∫°', 'a'],
                  ['ƒÉ', '·∫±', '·∫Ø', '·∫≥', '·∫µ', '·∫∑', 'aw'],
                  ['√¢', '·∫ß', '·∫•', '·∫©', '·∫´', '·∫≠', 'aa'],
                  ['e', '√®', '√©', '·∫ª', '·∫Ω', '·∫π', 'e'],
                  ['√™', '·ªÅ', '·∫ø', '·ªÉ', '·ªÖ', '·ªá', 'ee'],
                  ['i', '√¨', '√≠', '·ªâ', 'ƒ©', '·ªã', 'i'],
                  ['o', '√≤', '√≥', '·ªè', '√µ', '·ªç', 'o'],
                  ['√¥', '·ªì', '·ªë', '·ªï', '·ªó', '·ªô', 'oo'],
                  ['∆°', '·ªù', '·ªõ', '·ªü', '·ª°', '·ª£', 'ow'],
                  ['u', '√π', '√∫', '·ªß', '≈©', '·ª•', 'u'],
                  ['∆∞', '·ª´', '·ª©', '·ª≠', '·ªØ', '·ª±', 'uw'],
                  ['y', '·ª≥', '√Ω', '·ª∑', '·ªπ', '·ªµ', 'y']]
# Six entries, one per column 0..5 above; not referenced in the visible cells.
# NOTE(review): presumably the keyboard keys that type each mark -- confirm.
bang_ky_tu_dau = ['', 'f', 's', 'r', 'x', 'j']

# Reverse index: every vowel form -> (row, column) inside bang_nguyen_am.
# The trailing ASCII label of each row is left out (hence the "- 1").
nguyen_am_to_ids = {
    row[col]: (row_no, col)
    for row_no, row in enumerate(bang_nguyen_am)
    for col in range(len(row) - 1)
}

def chuan_hoa_dau_tu_tieng_viet(word: str) -> str:
    """Re-place the tone mark of a single word on a rule-chosen vowel.

    The word is scanned once. Every character found in ``nguyen_am_to_ids``
    that carries a mark (column != 0) is reset to its column-0 form and the
    column of the last such mark is remembered in ``dau_cau``. That column is
    then written back onto exactly one vowel, chosen by the rules commented
    in the body.

    :param word: a single token.
    :return: the token with its mark repositioned; ``word`` itself when
        ``is_valid_vietnam_word`` rejects it, or when fewer than two
        candidate vowels were found and no "qu"/"gi" prefix was seen.
    """
    if not is_valid_vietnam_word(word):
        return word

    chars = list(word)
    dau_cau = 0  # column of the mark to re-apply; 0 means no mark was seen
    nguyen_am_index = []  # positions of vowels that may receive the mark
    qu_or_gi = False
    for index, char in enumerate(chars):
        x, y = nguyen_am_to_ids.get(char, (-1, -1))
        if x == -1:
            # Not a vowel from the table.
            continue
        elif x == 9:  # row 9 is 'u': detect the "qu" cluster
            if index != 0 and chars[index - 1] == 'q':
                chars[index] = 'u'
                qu_or_gi = True
        elif x == 5:  # row 5 is 'i': detect the "gi" cluster
            if index != 0 and chars[index - 1] == 'g':
                chars[index] = 'i'
                qu_or_gi = True
        if y != 0:
            # Remember the mark and strip it from this vowel.
            dau_cau = y
            chars[index] = bang_nguyen_am[x][0]
        # Once a qu/gi cluster has been seen, position 1 is not a candidate.
        if not qu_or_gi or index != 1:
            nguyen_am_index.append(index)
    if len(nguyen_am_index) < 2:
        if qu_or_gi:
            if len(chars) == 2:
                # Two-letter word: the mark goes on the second letter.
                x, y = nguyen_am_to_ids.get(chars[1])
                chars[1] = bang_nguyen_am[x][dau_cau]
            else:
                x, y = nguyen_am_to_ids.get(chars[2], (-1, -1))
                if x != -1:
                    # Third letter is a vowel: it carries the mark.
                    chars[2] = bang_nguyen_am[x][dau_cau]
                else:
                    # Otherwise the mark goes back on the 'i' / 'u' at position 1.
                    chars[1] = bang_nguyen_am[5][dau_cau] if chars[1] == 'i' else bang_nguyen_am[9][dau_cau]
            return ''.join(chars)
        return word

    for index in nguyen_am_index:
        x, y = nguyen_am_to_ids[chars[index]]
        # Rows 4 and 8 of bang_nguyen_am (ASCII labels 'ee' and 'ow') take
        # priority: the first such vowel receives the mark.
        if x == 4 or x == 8:
            chars[index] = bang_nguyen_am[x][dau_cau]
            return ''.join(chars)

    if len(nguyen_am_index) == 2:
        if nguyen_am_index[-1] == len(chars) - 1:
            # The vowel pair ends the word: mark the first vowel.
            x, y = nguyen_am_to_ids[chars[nguyen_am_index[0]]]
            chars[nguyen_am_index[0]] = bang_nguyen_am[x][dau_cau]
        else:
            # Something follows the pair: mark the second vowel.
            x, y = nguyen_am_to_ids[chars[nguyen_am_index[1]]]
            chars[nguyen_am_index[1]] = bang_nguyen_am[x][dau_cau]
    else:
        # Three or more candidate vowels: mark the second one.
        x, y = nguyen_am_to_ids[chars[nguyen_am_index[1]]]
        chars[nguyen_am_index[1]] = bang_nguyen_am[x][dau_cau]
    return ''.join(chars)


def is_valid_vietnam_word(word):
    """Tell whether all table vowels of ``word`` form one contiguous run.

    :param word: token to inspect.
    :return: ``False`` as soon as two vowels known to ``nguyen_am_to_ids``
        are separated by any other character, ``True`` otherwise (including
        words with zero or one vowel).
    """
    vowel_positions = [
        pos for pos, letter in enumerate(word) if letter in nguyen_am_to_ids
    ]
    # Every vowel must sit directly after the previous one.
    return all(
        later - earlier == 1
        for earlier, later in zip(vowel_positions, vowel_positions[1:])
    )


def chuan_hoa_dau_cau_tieng_viet(sentence):
    """Convert a Vietnamese sentence to the old-style tone-mark convention.

    The sentence is split on whitespace. Each token is parsed as leading
    punctuation, a core made of letters and dots that ends in a letter, and
    trailing punctuation; only the core is passed to
    ``chuan_hoa_dau_tu_tieng_viet``. Tokens that do not have that shape are
    kept exactly as they are.

    Two defects of the earlier version are fixed here: the character class
    was written ``[p{L}.]`` (missing backslash, so it matched the literal
    characters ``p { L }``), and the pieces were recovered by inserting and
    splitting on ``'/'``, which deleted every slash from tokens containing one.

    :param sentence: text to normalise.
    :return: the tokens re-joined with single spaces.
    """
    # \p{L} / \p{P} are Unicode property classes supported by the third-party
    # `regex` module, which this notebook imports under the name `re`.
    token_shape = r'(^\p{P}*)([\p{L}.]*\p{L}+)(\p{P}*$)'
    words = sentence.split()
    for index, word in enumerate(words):
        parts = re.match(token_shape, word)
        if parts is None:
            continue
        prefix, core, suffix = parts.groups()
        words[index] = prefix + chuan_hoa_dau_tu_tieng_viet(core) + suffix
    return ' '.join(words)

In [4]:
# Regex pattern -> replacement, applied by normalize() one after another in
# insertion order (each replacement is padded with spaces by the caller).
# Order matters: a later pattern sees the output of every earlier one.
sp_word_sub = {
    "@@": "confuseeyes",
    "‚ÑÖ": "%",
    r"/": " fraction ",
    r":\)+": "smileface",
    r";\)+": "smileface",
    r":\*+": "kissingface",
    r"=\)+": "playfulsmileface",
    r"=\(+": "playfulsadface",
    r":\(+": "sadface",
    r":3+": "threeface",
    r":v+": "vface",
    r"\^\^": "kindsmile",
    r"\^_\^": "kindmountsmile",
    r"\^\.\^": "kindmountsmile",
    r"-_-": "disapointface",
    r"\._\.": "confusedface",
    r":>+": "cutesmile",
    r"(\|)w(\|)": "fancycryface",
    r":\|": "mutedface",
    r":d+": "laughface",
    r"<3": "loveicon",
    r"\.{2,}": "threedot",
    r"-{1,}>{1,}": "arrow",
    r"={1,}>{1,}": "arrow",
    r"(\d+)h": r"\1 gi·ªù",
    r"(\d+)'": r"\1 ph√∫t",
    r"(\d+)trieu": r"\1 tri·ªáu",
    # \b keeps this abbreviation rule from firing on a number followed by any
    # longer word starting with "tr" -- in particular on the text the
    # previous rule has just produced, which used to get expanded twice.
    r"(\d+)\s?tr\b": r"\1 tri·ªáu",
    r"blut\w+": "bluetooth",
    r"(\d+)\s\*": r"\1 sao"
}

# Token -> replacement table used by normalize() to expand abbreviations and
# fix common misspellings.
# NOTE: normalize() looks tokens up one whitespace-free run at a time
# (regex \b(\S+)\b), so keys that contain a space are never hit from there.
# Changes against the earlier table: keys that were listed twice with the
# same value appear once, and the entry that mapped a misspelling to itself
# now maps to the corrected word.
replace_dict = {
    "/": "fraction",
    "wf": "wifi",
    "wifj": "wifi",
    "wjfj": "wifi",
    "wjfi": "wifi",
    "wiffi": "wifi",
    "wj": "wifi",
    "ko": "kh√¥ng",
    "k": "kh√¥ng",
    "hong": "kh√¥ng",
    "ƒëc": "ƒë∆∞·ª£c",
    "sp": "s·∫£n ph·∫©m",
    "fb": "facebook",
    "ytb": "youtube",
    "yt": "youtube",
    "mes": "messenger",
    "mess": "messenger",
    "tgdƒë": "thegioididong",
    "nv": "nh√¢n vi√™n",
    "ss": "samsung",
    "ip": "iphone",
    "appel": "apple",
    "oke": "ok",
    "okie": "ok",
    "okey": "ok",
    "oki": "ok",
    "oce": "ok",
    "okela": "ok",
    "mk": "m√¨nh",
    "sd": "s·ª≠ d·ª•ng",
    "sdung": "s·ª≠ d·ª•ng",
    "ae": "anh em",
    "lq": "li√™n qu√¢n",
    "lqmb": "li√™n qu√¢n mobile",
    "lun": "lu√¥n",
    "ng": "ng∆∞·ªùi",
    "ad": "admin",
    "ms": "m·ªõi",
    "cx": "c≈©ng",
    "c≈©g": "c≈©ng",
    "nh√¨u": "nhi·ªÅu",
    "bth": "b√¨nh th∆∞·ªùng",
    "bthg": "b√¨nh th∆∞·ªùng",
    "ngta": "ng∆∞·ªùi ta",
    "dow": "download",
    "hdh": "h·ªá ƒëi·ªÅu h√†nh",
    "hƒëh": "h·ªá ƒëi·ªÅu h√†nh",
    "cammera": "camera",
    "dt": "ƒëi·ªán tho·∫°i",
    "dthoai": "ƒëi·ªán tho·∫°i",
    "dth": "ƒëi·ªán tho·∫°i",
    "ƒëth": "ƒëi·ªán tho·∫°i",
    "hk": "kh√¥ng",
    "j": "g√¨",
    "ji": "g√¨",
    "mn": "m·ªçi ng∆∞·ªùi",
    "m.n": "m·ªçi ng∆∞·ªùi",
    "mjh": "m√¨nh",
    "mjk": "m√¨nh",
    "l·∫Øc": "lag",
    "l√°c": "lag",
    "lang": "lag",
    "nhah": "nhanh",
    "n√≥ichung": "n√≥i chung",
    "zl": "zalo",
    "s√≥g": "s√≥ng",
    "r·∫Ω": "r·∫ª",
    "trc": "tr∆∞·ªõc",
    "ch√≠p": "chip",
    "bin": "pin",
    "lm": "l√†m",
    "bik": "bi·∫øt",
    "hog": "kh√¥ng",
    "z·ªèm": "d·ªïm",
    "z": "v·∫≠y",
    "v": "v·∫≠y",
    "r": "r·ªìi",
    "·ªón": "·ªïn",
    "w√°": "qu√°",
    "wep": "web",
    "wed": "web",
    "fim": "phim",
    "film": "phim",
    "x·∫°c": "s·∫°c",
    "x√†i": "s√†i",
    "het": "h·∫øt",
    "e": "em",
    "a": "anh",
    "bjo": "b√¢y gi·ªù",
    "vl": "v√£i l·ªìn",
    "sac": "s·∫°c",
    "vidieo": "video",
    "t√©t": "test",
    "tes": "test",
    "thik": "th√≠ch",
    "fai": "ph·∫£i",
    "‚úã": "tay",
    "üîã": "pin",
    "‚òÜ": "sao",
    "supper": "super",
    "l·ªïi": "l·ªói",
    "lo√°t": "load",
    "thui": "th√¥i",
    "r√πi": "r·ªìi",
    "su·ªëng": "xu·ªëng",
    "selfi": "selfie",
    "gg": "google",
    "cam on": "c·∫£m ∆°n",
    "tg": "th·ªùi gian",
    "nchung": "n√≥i chung",
    "‚ù§": "loveicon",
    "tr·∫°i nghi·ªám": "tr·∫£i nghi·ªám",
    "d·∫•t": "r·∫•t",
    "ƒë·ª©g": "ƒë·ª©ng",
    "b·∫±g": "b·∫±ng",
    "m√¨h": "m√¨nh",
    "ƒëag": "ƒëang",
    "thoi": "th√¥i",
    "c·ªßng": "c≈©ng",
    "ƒë·∫£": "ƒë√£",
    "m√†ng": "m√†n",
    "ff": "free fire",
    "cod": "call of duty",
    "moi th·ª©": "m·ªçi th·ª©",
    "moi thu": "m·ªçi th·ª©",
    "moi th∆∞": "m·ªçi th·ª©",
    "moi ng∆∞·ªùi": "m·ªçi ng∆∞·ªùi",
    "moi": "m·ªõi",
    "dk": "ƒë∆∞·ª£c",
    "ƒëk": "ƒë∆∞·ª£c",
    "nh·∫≠y": "nh·∫°y",
    "ak": "√°",
    "ghe": "nghe",
    "b√πn": "bu·ªìn",
    "bit": "bi·∫øt",
    "b√≠t": "bi·∫øt",
    "bnhieu": "bao nhi√™u",
    "d·ª•g": "d·ª•ng",
    "tk": "t√†i kho·∫£n",
    "sƒÖc": "s·∫°c",
    # Previously mapped to itself (a no-op); now the same target as "d·∫•t".
    "r√¢t": "r·∫•t",
    "haz": "haiz",
    "sai l√†m": "sai l·∫ßm",
    "flim": "phim",
    "x∆∞·ªõt": "x∆∞·ªõc",
    "vi·ªÅng": "vi·ªÅn"
}

# Index labels handed to DataFrame.drop for the Train split in the export cell.
drop_indices = [1537, 2754, 4267, 6536]

def normalize(text: str, track_change=False) -> str:
    """Clean one raw comment into a normalised, space-separated string.

    Stages, in order: lowercase; replace URLs with ``urllink``; collapse
    repeated trailing characters; expand emoticons and shorthand from
    ``sp_word_sub``; map tokens through ``replace_dict``; Unicode and
    tone-mark normalisation; remove thousands separators; split numbers from
    units; pad punctuation and emoji with spaces.

    :param text: raw comment.
    :param track_change: when true, print the intermediate text after each
        stage up to and including the punctuation split.
    :return: the normalised text.
    """
    def _trace(label, current):
        # Optional per-stage debug output.
        if track_change:
            print(label, current)

    def _lookup(match):
        # Swap a token for its replace_dict entry (or keep it), space-padded.
        token = match.group(1)
        return " " + replace_dict.get(token, token) + " "

    # Lowercase
    text = text.lower()

    # URLs -> placeholder token. The dot after "www" is escaped so that only
    # a literal "www." prefix counts.
    text = re.sub(
        r"((https?|ftp|file):\/{2,3})+([-\w+&@#/%=~|$?!:,.]*)|(www\.)+([-\w+&@#/%=~|$?!:,.]*)",
        "urllink",
        text,
    )

    # Remove dup trailing chars (troiiiii -> troi). Digits are excluded:
    # collapsing them would turn "5000" into "50" and "9.000" into "9.0",
    # which also breaks the thousands-separator stage further down.
    text = re.sub(r"(\D)\1+\b", r"\1", text)
    _trace("Dedup trailing: ", text)

    # Replace special symbol to word (patterns applied in dict order)
    for pttn, repl in sp_word_sub.items():
        text = re.sub(pttn, " " + repl + " ", text)
    _trace("Replace special word: ", text)

    # Correct misspelled word
    text = re.sub(r"\b(\S+)\b", _lookup, text)
    _trace("Correct misspelled word: ", text)

    # Normalize string encoding
    text = convert_unicode(text)
    _trace("Normalize string encoding: ", text)

    # Vietnamese unicode normalization
    text = chuan_hoa_dau_cau_tieng_viet(text)
    _trace("Vietnamese unicode normalization: ", text)

    # Eliminate decimal delimiter (9.000 -> 9000)
    text = re.sub(r"(?<=\d)\.(?=\d{3})", "", text)
    _trace("Eliminate decimal delimiter: ", text)

    # Split between value and unit (300km -> 300 km)
    text = re.sub(r"(\d+)(\D+)", r"\1 \2", text)
    _trace("Split between value and unit: ", text)

    # Split by punctuations (the capturing group keeps each mark as a token)
    punctuation_class = "([" + re.escape("!\"#$%&\'()*+,-./:;<=>?@[\\]^`{|}~") + "])"
    text = " ".join(re.split(punctuation_class, text))
    _trace("Split by punctuations: ", text)

    # Split by emoticons
    emoji_class = (
        "(["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u'\U00010000-\U0010ffff'
        u"\u200d"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\u3030"
        u"\ufe0f"
        u"\u221a"
        "])"
    )
    text = " ".join(re.split(emoji_class, text))

    # Word segmentation with VnCoreNLP is currently disabled:
    # text = " ".join(vncorenlp.word_segment(text))

    return text

In [5]:
normalize("·ª¶a cho em h·ªèi l√† iphone 7plus ch√≠nh h√£ng c·ªßa tgdd l√† kh√¥ng c√≥ lo·∫°i 64G hay 128G d·∫° ? Hay l√† n√≥ ƒë·ªôc quy·ªÅn 32G √† . C√≤n m·∫•y c√°i 7plus G cao l√† h√†ng nh√°i hay sao ? T·∫°i em ra tgdd ki·∫øm m√† ch·ªâ c√≥ 32G th√¥i n√™n mua lu√¥n üòÇ gi·ªù th·∫Øc m·∫Øc l√™n h·ªèi ü§îü§î", track_change=True)

Dedup trailing:  ·ªßa cho em h·ªèi l√† iphone 7plus ch√≠nh h√£ng c·ªßa tgd l√† kh√¥ng c√≥ lo·∫°i 64g hay 128g d·∫° ? hay l√† n√≥ ƒë·ªôc quy·ªÅn 32g √† . c√≤n m·∫•y c√°i 7plus g cao l√† h√†ng nh√°i hay sao ? t·∫°i em ra tgd ki·∫øm m√† ch·ªâ c√≥ 32g th√¥i n√™n mua lu√¥n üòÇ gi·ªù th·∫Øc m·∫Øc l√™n h·ªèi ü§îü§î
Replace special word:  ·ªßa cho em h·ªèi l√† iphone 7plus ch√≠nh h√£ng c·ªßa tgd l√† kh√¥ng c√≥ lo·∫°i 64g hay 128g d·∫° ? hay l√† n√≥ ƒë·ªôc quy·ªÅn 32g √† . c√≤n m·∫•y c√°i 7plus g cao l√† h√†ng nh√°i hay sao ? t·∫°i em ra tgd ki·∫øm m√† ch·ªâ c√≥ 32g th√¥i n√™n mua lu√¥n üòÇ gi·ªù th·∫Øc m·∫Øc l√™n h·ªèi ü§îü§î
Correct misspelled word:   ·ªßa   cho   em   h·ªèi   l√†   iphone   7plus   ch√≠nh   h√£ng   c·ªßa   tgd   l√†   kh√¥ng   c√≥   lo·∫°i   64g   hay   128g   d·∫°  ?  hay   l√†   n√≥   ƒë·ªôc   quy·ªÅn   32g   √†  .  c√≤n   m·∫•y   c√°i   7plus   g   cao   l√†   h√†ng   nh√°i   hay   sao  ?  t·∫°i   em   ra   tgd   ki·∫øm   m√†   ch·ªâ   c√≥   32g   th√¥i   n√™n   mua

'·ªßa cho em h·ªèi l√† iphone 7 plus ch√≠nh h√£ng c·ªßa tgd l√† kh√¥ng c√≥ lo·∫°i 64 g hay 128 g d·∫°  ?  hay l√† n√≥ ƒë·ªôc quy·ªÅn 32 g √†  .  c√≤n m·∫•y c√°i 7 plus g cao l√† h√†ng nh√°i hay sao  ?  t·∫°i em ra tgd ki·∫øm m√† ch·ªâ c√≥ 32 g th√¥i n√™n mua lu√¥n  üòÇ  gi·ªù th·∫Øc m·∫Øc l√™n h·ªèi  ü§î  ü§î '

In [6]:
from tqdm import tqdm

# Normalise the comment column of every split and write one comment per line.
for dset in ["Train", "Dev", "Test"]:
    df = pd.read_csv(f"data/{dset}.csv")
    if dset == "Train":
        # DataFrame.drop returns a new frame; the result has to be kept,
        # otherwise the rows listed in drop_indices are still exported.
        df = df.drop(index=drop_indices)
    normalized = df.comment.map(normalize)
    # Explicit UTF-8 so the output does not depend on the platform's default
    # text encoding.
    with open(f"data/prep-{dset}.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(normalized))