# Import Libary Package

In [1]:
!pip install -qU py_vncorenlp emot levenshtein tqdm

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.5/61.5 kB[0m [31m824.5 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.4/177.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m38.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for py_vncorenlp (setup.py) ... [?25l[?25hdone


In [2]:
import pandas as pd
import numpy as np
import json
import re
from difflib import SequenceMatcher
import itertools
from itertools import combinations, product, chain
from pprint import pprint
from ast import literal_eval
from tqdm import tqdm

from emot.emo_unicode import UNICODE_EMOJI
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from Levenshtein import ratio

from __future__ import unicode_literals
import operator
import sys

try:
    from html.parser import HTMLParser
except ImportError:
    from HTMLParser import HTMLParser

try:
    import html
except ImportError:
    pass
import py_vncorenlp

# Load data

In [3]:
train = pd.read_csv('/content/drive/MyDrive/LexNorm/KLTN/Data/ViLexNorm/raw_data/train.csv').drop('Unnamed: 0', axis=1)
test = pd.read_csv('/content/drive/MyDrive/LexNorm/KLTN/Data/ViLexNorm/raw_data/test.csv').drop('Unnamed: 0', axis=1)
dev = pd.read_csv('/content/drive/MyDrive/LexNorm/KLTN/Data/ViLexNorm/raw_data/dev.csv').drop('Unnamed: 0', axis=1)
print(train.shape, test.shape, dev.shape)

(8372, 2) (1045, 2) (1050, 2)


In [4]:
ViLexNorm = pd.concat([train, test, dev]).reset_index(drop=True)
ViLexNorm

Unnamed: 0,original,normalized
0,thích anh cá mập k,thích anh cá mập không
1,cứ ngây thơ thế thoai :)),cứ ngây thơ thế thôi :))
2,bà Nghê xinh vậy mà t thấy k bằng bà ChiPu luô...,bà Nghê xinh vậy mà tôi thấy không bằng bà Chi...
3,Ê k khóc được làm thế nào má =))?,Ê không khóc được làm thế nào má =))?
4,Có biến gì hong dẫy :)),Có biến gì không vậy :))
...,...,...
10462,Từ lúc đu idol Hàn và diễn viên Thái thì hầu n...,Từ lúc đu idol Hàn và diễn viên Thái thì hầu n...
10463,A hay nói e á,Anh hay nói em á
10464,Đọc jd mà nó dễ quá đâm ra sợ cty lừa :))),Đọc jd mà nó dễ quá đâm ra sợ công ty lừa :)))
10465,Chỗ còn tuyển hong rủ tui vô làm nói chuyện dí...,Chỗ còn tuyển không rủ tui vô làm nói chuyện v...


# Tokenization Fuction

In [5]:
def regex_or(*items):
    return '(?:' + '|'.join(items) + ')'

In [6]:
punctChars = r"['\"“”‘’.?!…,:;]"
#punctSeq   = punctChars+"+"	#'anthem'. => ' anthem '.
punctSeq   = r"['\"“”‘’]+|[.?!,…]+|[:;]+"	#'anthem'. => ' anthem ' .
entity     = r"&(?:amp|lt|gt|quot);"

#URLs
urlStart1  = r"(?:https?://|\bwww\.)"
commonTLDs = r"(?:com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|pro|tel|travel|xxx)"
ccTLDs	 = r"(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|" + \
r"bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|" + \
r"er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|" + \
r"hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|" + \
r"lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|" + \
r"nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|" + \
r"sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|" + \
r"va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)"	#TODO: remove obscure country domains?
urlStart2  = r"\b(?:[A-Za-z\d-])+(?:\.[A-Za-z0-9]+){0,3}\." + regex_or(commonTLDs, ccTLDs) + r"(?:\."+ccTLDs+r")?(?=\W|$)"
urlBody    = r"(?:[^\.\s<>][^\s<>]*?)?"
urlExtraCrapBeforeEnd = regex_or(punctChars, entity) + "+?"
urlEnd     = r"(?:\.\.+|[<>]|\s|$)"
url        = regex_or(urlStart1, urlStart2) + urlBody + "(?=(?:"+urlExtraCrapBeforeEnd+")?"+urlEnd+")"

# Numeric
timeLike   = r"\d+(?::\d+){1,2}"  # (1:00:00)
numdotNum = r"\d+[.,]\d+"
rationum = r"\d+\-\d+"
numberWithCommas = r"(?:(?<!\d)\d{1,3},)+?\d{3}" + r"(?=(?:[^,\d]|$))"  #(1,5  2,5)
number_slash_number = r'-?\d+[,\.]\d+/\d+([,\.]\d+)?'
#currency
numComb	 = u"[\u0024\u058f\u060b\u09f2\u09f3\u09fb\u0af1\u0bf9\u0e3f\u17db\ua838\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6\u00a2-\u00a5\u20a0-\u20b9]?\\d+(?:\\.\\d+)+%?"

boundaryNotDot = regex_or("$", r"\s", r"[“\"?!,:;]", entity)
separators  = "(?:--+|―|—|~|–|=)"
decorations = u"(?:[♫♪]+|[★☆]+|[♥❤♡]+|[\u2639-\u263b]+|[\ue001-\uebbb]+)"
thingsThatSplitWords = r"[^\s\.,?\"]"


#  Emoticons
# myleott: in Python the (?iu) flags affect the whole expression
#normalEyes = "(?iu)[:=]" # 8 and x are eyes but cause problems
normalEyes = "[:=+-]" # 8 and x are eyes but cause problems
wink = "[;]"
noseArea = "(?:|-|[^a-zA-Z0-9 ]|\s)" # doesn't get :'-(

happyMouths = r"[D\)\]\}\>]+"
sadMouths = r"[\(\[\{\<]+"
tongue = "[pPdđĐ3*]+"
otherMouths = r"(?:[oO]+|[/\\]+|[vV]+|[Ss]+|[|]+)" # remove forward slash if http://'s aren't cleaned
bfLeft = u"(♥|0|[oO]|°|[vV]|\\$|[tT]|[xX]|;|\u0ca0|@|ʘ|•|・|◕|\^|¬|\\*)"
bfCenter = r"(?:[\.]|•|・x・|・|[_-]+)"
bfRight = r"\2"
s3 = r"(?:--['\"])"
s4 = r"(?:<|&lt;|>|&gt;)[\._-]+(?:<|&lt;|>|&gt;)"
s5 = "(?:[.][_]+[.])"

double_eyes = r"\^\^|@@"
basicface = "(?:" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5
eye_with_nose = "(?:" +bfLeft+bfCenter+bfLeft+ ")" # Like ^.^, T_T,

eeLeft = r"[＼\\ƪԄ\(（<>;ヽ\-=~\*]+"
eeRight= u"[\\-=\\);'\u0022<>ʃ）/／ノﾉ丿╯σっµ~\\*]+"
eeSymbol = r"[^A-Za-z0-9\s\(\)\*:=-]"
eastEmote = eeLeft + "(?:"+basicface+"|" +eeSymbol+")+" + eeRight
oOEmote = r"(?:[oO]" + bfCenter + r"[oO])"

Hearts = "(?:<+/?3+)+"
Arrows = regex_or(r"(?:<*[-―—=]*>+|<+[-―—=]*>*)", u"[\u2190-\u21ff]+")
Hashtag = "#[a-zA-Z0-9_ÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚĂĐĨŨƠàáâãèéêìíòóôõùúăđĩũơƯĂẠẢẤẦẨẪẬẮẰẲẴẶẸẺẼỀỀỂưăạảấầẩẫậắằẳẵặẹẻẽềềểỄỆỈỊỌỎỐỒỔỖỘỚỜỞỠỢỤỦỨỪễếệỉịọỏốồổỗộớờởỡợụủứừỬỮỰỲỴÝỶỸửữựỳỵỷỹ]+"
AtMention = "[@＠][a-zA-Z0-9_ÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚĂĐĨŨƠàáâãèéêìíòóôõùúăđĩũơƯĂẠẢẤẦẨẪẬẮẰẲẴẶẸẺẼỀỀỂưăạảấầẩẫậắằẳẵặẹẻẽềềểỄỆỈỊỌỎỐỒỔỖỘỚỜỞỠỢỤỦỨỪễếệỉịọỏốồổỗộớờởỡợụủứừỬỮỰỲỴÝỶỸửữựỳỵỷỹ]+"
Bound = r"(?:\W|^|$)"
Email = regex_or("(?<=(?:\W))", "(?<=(?:^))") + r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}(?=" +Bound+")"

# Edge punctuation
edgePunctChars    = u"'\"“”‘’«»{}\\(\\)\\[\\]\\*&" #add \\p{So}? (symbols)
edgePunct    = "[" + edgePunctChars + "]"
notEdgePunct = "[a-zA-Z0-9_ÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚĂĐĨŨƠàáâãèéêìíòóôõùúăđĩũơƯĂẠẢẤẦẨẪẬẮẰẲẴẶẸẺẼỀỀỂưăạảấầẩẫậắằẳẵặẹẻẽềềểỄỆỈỊỌỎỐỒỔỖỘỚỜỞỠỢỤỦỨỪễếệỉịọỏốồổỗộớờởỡợụủứừỬỮỰỲỴÝỶỸửữựỳỵỷỹ]" # content characters
offEdge = r"(^|$|:|;|\s|\.|,)"  # colon here gets "(hello):" ==> "( hello ):"
EdgePunctLeft  = re.compile(offEdge + "("+edgePunct+"+)("+notEdgePunct+")", re.UNICODE)
EdgePunctRight = re.compile("("+notEdgePunct+")("+edgePunct+"+)" + offEdge, re.UNICODE)

In [7]:
emoticon = regex_or(
        # Standard version  :) :( :] :D :P
        "(?:>|&gt;)?" + regex_or(normalEyes, wink) + regex_or(noseArea,"[Oo]", "\*") + regex_or(tongue+r"(?=\W|$|RT|rt|Rt)", otherMouths+r"(?=\W|$|RT|rt|Rt)", sadMouths, happyMouths),

        # reversed version (: D:  use positive lookbehind to remove "(word):"
        # because eyes on the right side is more ambiguous with the standard usage of : ;
        regex_or("(?<=(?: ))", "(?<=(?:^))") + regex_or(sadMouths,happyMouths,otherMouths) + noseArea + regex_or(normalEyes, wink) + "(?:<|&lt;)?",

        #inspired by http://en.wikipedia.org/wiki/User:Scapler/emoticons#East_Asian_style
        eastEmote.replace("2", "1", 1), basicface,eye_with_nose,
        # iOS 'emoji' characters (some smileys, some symbols) [\ue001-\uebbb]
        # TODO should try a big precompiled lexicon from Wikipedia, Dan Ramage told me (BTO) he does this

        # myleott: o.O and O.o are two of the biggest sources of differences
        #          between this and the Java version. One little hack won't hurt...
        oOEmote, double_eyes
)

In [8]:
# m/d/yyyy + mm/dd/yyyy
d1 = r"^(1[0-2]|0?[1-9])/(3[01]|[12][0-9]|0?[1-9])/(?:[0-9]{2})?[0-9]{2}$"
# d/m/yyyy + dd/mm/yyyy
d2 = r"^(3[01]|[12][0-9]|0?[1-9])/(1[0-2]|0?[1-9])/(?:[0-9]{2})?[0-9]{2}$"
# m/d or mm/dd
d3 = r"^(1[0-2]|0?[1-9])/(3[01]|[12][0-9]|0?[1-9])"
# d/m or dd/mm
d4 = r"^(3[01]|[12][0-9]|0?[1-9])/(1[0-2]|0?[1-9])"

date = regex_or(d1,d2,d3,d4)

In [9]:
# We will be tokenizing using these regexps as delimiters
# Additionally, these things are "protected", meaning they shouldn't be further split themselves.
Protected  = re.compile(
    regex_or(
        Hearts,
        url,
        Email,
        timeLike,
        date,
        numdotNum,
        rationum,
        numberWithCommas,
        number_slash_number,
        numComb,
        emoticon,
        Arrows,
        entity,
        punctSeq,
        separators,
        decorations,
        Hashtag,
        AtMention), re.UNICODE)

In [10]:
emoji_list = list(UNICODE_EMOJI)

In [11]:
def splitEdgePunct(input):
    input = EdgePunctLeft.sub(r"\1\2 \3", input)
    input = EdgePunctRight.sub(r"\1 \2\3", input)
    return input

def addAllnonempty(master, smaller):
    for s in smaller:
        strim = s.strip()
        if (len(strim) > 0):
            master.append(strim)
    return master

def flatten(A):
    rt = []
    for i in A:
        if isinstance(i,list): rt.extend(flatten(i))
        else: rt.append(i)
    return rt

def split_emoji_text(text):
    result = ''.join((' {} '.format(el) if el in emoji_list else el for el in text))
    return result


def split_emoji_emoji(textArray):
    result = []
    for word in textArray:
        is_emoji = False

        # iterate element of characters and check char is emoji if yes
        # then append into result list and set is_emogi flag true.
        for char in word:
            if char in [*emoji_list]:
                result.append(char)
                is_emoji = True

        # check is_emoji flag is false then append string into list.
        if not is_emoji:
            result.append(word)

    return result

def remove_dotNotend_in_list(lst):
  for i, e in enumerate(lst):
    if e == '.':
      if (i != len(lst) -1) and (i == 0 or not lst[i + 1].isupper()):
        lst[i] = ''
      else:
        pass
  return lst

def wordSegmentandKeepicon(text):

    # Do the no-brainers first
    splitPunctText = splitEdgePunct(text)

    textLength = len(splitPunctText)

    # BTO: the logic here got quite convoluted via the Scala porting detour
    # It would be good to switch back to a nice simple procedural style like in the Python version
    # ... Scala is such a pain.  Never again.

    # Find the matches for subsequences that should be protected,
    # e.g. URLs, 1.0, U.N.K.L.E., 12:53
    bads = []
    badSpans = []
    for match in Protected.finditer(splitPunctText):
        # The spans of the "bads" should not be split.
        if (match.start() != match.end()): #unnecessary?
            bads.append( [splitPunctText[match.start():match.end()]] )
            badSpans.append( (match.start(), match.end()) )
    #print(bads)
    #print(badSpans)
    # Create a list of indices to create the "goods", which can be
    # split. We are taking "bad" spans like
    #     List((2,5), (8,10))
    # to create
    #     List(0, 2, 5, 8, 10, 12)
    # where, e.g., "12" here would be the textLength
    # has an even length and no indices are the same
    indices = [0]
    for (first, second) in badSpans:
        indices.append(first)
        indices.append(second)
    indices.append(textLength)
    #print(indices)
    # Group the indices and map them to their respective portion of the string
    splitGoods = []
    for i in range(0, len(indices), 2):
        goodstr = splitPunctText[indices[i]:indices[i+1]]
        goodstr = rdrsegmenter.word_segment(goodstr)
        #splitstr = goodstr.strip().split(" ")
        splitGoods.append(goodstr)

    #  Reinterpolate the 'good' and 'bad' Lists, ensuring that
    #  additonal tokens from last good item get included
    zippedStr = []
    for i in range(len(bads)):
        zippedStr = addAllnonempty(zippedStr, splitGoods[i])
        zippedStr = addAllnonempty(zippedStr, bads[i])
    zippedStr = addAllnonempty(zippedStr, splitGoods[len(bads)])

    # BTO: our POS tagger wants "ur" and "you're" to both be one token.
    # Uncomment to get "you 're"
    #splitStr = []
    #for tok in zippedStr:
    #    splitStr.extend(splitToken(tok))
    #zippedStr = splitStr

    return zippedStr

def simpleTokenize(text):

    # Do the no-brainers first
    splitPunctText = splitEdgePunct(text)

    textLength = len(splitPunctText)

    # BTO: the logic here got quite convoluted via the Scala porting detour
    # It would be good to switch back to a nice simple procedural style like in the Python version
    # ... Scala is such a pain.  Never again.

    # Find the matches for subsequences that should be protected,
    # e.g. URLs, 1.0, U.N.K.L.E., 12:53
    bads = []
    badSpans = []
    for match in Protected.finditer(splitPunctText):
        # The spans of the "bads" should not be split.
        if (match.start() != match.end()): #unnecessary?
            bads.append( [splitPunctText[match.start():match.end()]] )
            badSpans.append( (match.start(), match.end()) )

    # Create a list of indices to create the "goods", which can be
    # split. We are taking "bad" spans like
    #     List((2,5), (8,10))
    # to create
    #     List(0, 2, 5, 8, 10, 12)
    # where, e.g., "12" here would be the textLength
    # has an even length and no indices are the same
    indices = [0]
    for (first, second) in badSpans:
        indices.append(first)
        indices.append(second)
    indices.append(textLength)

    # Group the indices and map them to their respective portion of the string
    splitGoods = []
    for i in range(0, len(indices), 2):
        goodstr = splitPunctText[indices[i]:indices[i+1]]
        splitstr = goodstr.strip().split(" ")
        splitGoods.append(splitstr)

    #  Reinterpolate the 'good' and 'bad' Lists, ensuring that
    #  additonal tokens from last good item get included
    zippedStr = []
    for i in range(len(bads)):
        zippedStr = addAllnonempty(zippedStr, splitGoods[i])
        zippedStr = addAllnonempty(zippedStr, bads[i])
    zippedStr = addAllnonempty(zippedStr, splitGoods[len(bads)])

    # BTO: our POS tagger wants "ur" and "you're" to both be one token.
    # Uncomment to get "you 're"
    #splitStr = []
    #for tok in zippedStr:
    #    splitStr.extend(splitToken(tok))
    #zippedStr = splitStr

    return zippedStr

def preprocess_with_word_segment(text):
    #text = text.lower()
    text = split_emoji_text(text)
    text = wordSegmentandKeepicon(text)
    text = remove_dotNotend_in_list(text)
    text = split_emoji_emoji(text)
    text = list(filter(str.strip, text))
    sentence = ' '.join(text)
    sentence = sentence.lower()
    return sentence

def preprocess(text):
    #text = text.lower()
    text = split_emoji_text(text)
    text = simpleTokenize(text)
    text = remove_dotNotend_in_list(text)
    text = split_emoji_emoji(text)
    text = list(filter(str.strip, text))
    sentence = ' '.join(text)
    sentence = sentence.lower()
    return sentence

# Word Segmentation

In [12]:
# Automatically download VnCoreNLP components from the original repository
# and save them in some local machine folder
py_vncorenlp.download_model(save_dir='/content')

In [13]:
# Load the word and sentence segmentation component
rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir='/content')

In [14]:
text = 'con mẹ nó đoạn hôm qua t làm rồi nhưng mà nay lỗi'
output = preprocess_with_word_segment(text)
print("Orignal:", text)
print("Segmented:", output)

Orignal: con mẹ nó đoạn hôm qua t làm rồi nhưng mà nay lỗi
Segmented: con_mẹ nó đoạn hôm_qua t làm rồi nhưng_mà nay lỗi


In [15]:
text = ViLexNorm['original'][3964]
output = preprocess_with_word_segment(text)
print("Orignal:")
print("Text:", text)
print("Segmented:", output)

text = ViLexNorm['normalized'][3964]
output = preprocess_with_word_segment(text)
print("Normalized:")
print("Text:", text)
print("Segmented:", output)

Orignal:
Text: cô lại đây mà xem, tiểu bảo bối còn quá nhỏ v mà
Segmented: cô lại đây mà xem , tiểu bảo_bối còn quá nhỏ v mà
Normalized:
Text: cô lại đây mà xem, tiểu bảo bối còn quá nhỏ vậy mà
Segmented: cô lại đây mà xem , tiểu bảo_bối còn quá nhỏ vậy_mà


In [16]:
print(ViLexNorm['original'][8787])
print(ViLexNorm['normalized'][8787])

Ăn không có ngon k mn? Có chua k?
Ăn có không ngon không mọi người? Có chua không?


# **Post-processing and Create NSW dictionary**

In [17]:
path = '/content/drive/MyDrive/LexNorm/KLTN/Data/ViLexNorm/raw_data/word_dict/LexNorm_word(checked).csv'

In [18]:
data = pd.read_csv(path, keep_default_na=False)
data['label'] = data['token'] != data['norm']
# label: 0 (standard), 1 (non-standard)
data['token'] = data['token'].str.replace('_',' ')
data['norm'] = data['norm'].str.replace('_',' ')
data

Unnamed: 0,sent_idx,token_idx,token,norm,label
0,0,0,thích,thích,False
1,0,1,anh,anh,False
2,0,2,cá mập,cá mập,False
3,0,3,k,không,True
4,1,0,cứ,cứ,False
...,...,...,...,...,...
121097,10466,0,bí ẩn,bí ẩn,False
121098,10466,1,j,gì,True
121099,10466,2,v,vậy,True
121100,10466,3,mn,mọi người,True


In [19]:
std_df = data[['sent_idx', 'label']].groupby(['sent_idx']).sum()
std_idx = std_df.loc[std_df['label'] == 0].index
std_idx

Index([433, 4904, 5334, 8435], dtype='int64', name='sent_idx')

In [20]:
data = data[~data['sent_idx'].isin(std_idx)].reset_index(drop = True)
data

Unnamed: 0,sent_idx,token_idx,token,norm,label
0,0,0,thích,thích,False
1,0,1,anh,anh,False
2,0,2,cá mập,cá mập,False
3,0,3,k,không,True
4,1,0,cứ,cứ,False
...,...,...,...,...,...
121067,10466,0,bí ẩn,bí ẩn,False
121068,10466,1,j,gì,True
121069,10466,2,v,vậy,True
121070,10466,3,mn,mọi người,True


In [21]:
new_data = []
i=0
for sent_idx in data.groupby(['sent_idx']).sum().index:
  sentence = {}
  sentence['id'] = i
  sentence['input'] = list(data.loc[data['sent_idx'] == sent_idx]['token'])
  sentence['output'] = list(data.loc[data['sent_idx'] == sent_idx]['norm'])
  new_data.append(sentence)
  i+=1

In [22]:
train_json = new_data[:8369]
test_json = new_data[8370:9412]
dev_json = new_data[9413:]

In [23]:
#with open("/content/drive/MyDrive/LexNorm/KLTN/Data/ViLexNorm/Token_data/Segment/ViLexNorm_segment.json", "w") as outfile:
    #json.dump(new_data, outfile, ensure_ascii=False)

#with open("/content/drive/MyDrive/LexNorm/KLTN/Data/ViLexNorm/Token_data/Segment/train_segment.json", "w") as outfile:
    #json.dump(train_json, outfile, ensure_ascii=False)

#with open("/content/drive/MyDrive/LexNorm/KLTN/Data/ViLexNorm/Token_data/Segment/test_segment.json", "w") as outfile:
    #json.dump(test_json, outfile, ensure_ascii=False)

#with open("/content/drive/MyDrive/LexNorm/KLTN/Data/ViLexNorm/Token_data/Segment/dev_segment.json", "w") as outfile:
    #json.dump(dev_json, outfile, ensure_ascii=False)

In [24]:
ViLexNorm = []
for sent_idx in data.groupby(['sent_idx']).sum().index:
  sentence = {}
  sentence['id'] = sent_idx
  sentence['input'] = ' '.join(list(data.loc[data['sent_idx'] == sent_idx]['token'])).replace('_', ' ')
  sentence['output'] = ' '.join(list(data.loc[data['sent_idx'] == sent_idx]['norm'])).replace('_', ' ')
  ViLexNorm.append(sentence)
ViLexNorm = pd.DataFrame(ViLexNorm)
ViLexNorm = ViLexNorm.drop(columns=['id'])
ViLexNorm.columns = ['original', 'normalized']
ViLexNorm

Unnamed: 0,original,normalized
0,thích anh cá mập k,thích anh cá mập không
1,cứ ngây thơ thế thoai :)),cứ ngây thơ thế thôi :))
2,bà nghê xinh vậy mà t thấy k bằng bà chipu luô...,bà nghê xinh vậy mà tôi thấy không bằng bà chi...
3,ê k khóc được làm thế nào má =)) ?,ê không khóc được làm thế nào má =)) ?
4,có biến gì hong dẫy :)),có biến gì không vậy :))
...,...,...
10458,từ lúc đu idol hàn và diễn viên thái thì hầu n...,từ lúc đu idol hàn và diễn viên thái thì hầu n...
10459,a hay nói e á,anh hay nói em á
10460,đọc jd mà nó dễ quá đâm ra sợ cty lừa :))),đọc jd mà nó dễ quá đâm ra sợ công ty lừa :)))
10461,chỗ còn tuyển hong rủ tui vô làm nói chuyện dí...,chỗ còn tuyển không rủ tui vô làm nói chuyện v...


In [25]:
ViLexNorm.original.str.len().max()

556

In [26]:
train = ViLexNorm[:8369]
test = ViLexNorm[8370:9412]
dev = ViLexNorm[9413:]
print(len(train), len(test), len(dev))

8369 1042 1050


In [27]:
#train.to_csv("/content/drive/MyDrive/LexNorm/KLTN/Data/ViLexNorm/train.csv", index=False)
#test.to_csv("/content/drive/MyDrive/LexNorm/KLTN/Data/ViLexNorm/test.csv", index=False)
#dev.to_csv("/content/drive/MyDrive/LexNorm/KLTN/Data/ViLexNorm/dev.csv", index=False)
#ViLexNorm.to_csv("/content/drive/MyDrive/LexNorm/KLTN/Data/ViLexNorm/ViLexNorm.csv", index=False)

## Create LexNorm Dictionary

In [28]:
def count_vocab(data):
  vocab = []
  for sent in data:
    sent = sent['input']
    for token in sent:
      if (re.search('([.,!?()“”;"\':…])', token) is None) & (token != ''):
        vocab.append(token)

  return len(vocab), len(set(vocab))

In [29]:
count_vocab(new_data)

(114759, 10730)

In [30]:
def nsw(data):
  nsw_lst = []
  for sent in data:
    inp = sent['input']
    out = sent['output']
    for i in range(len(inp)):
      if((inp[i] != '') & (inp[i] != out[i])):
        nsw_lst.append(inp[i])

  return(nsw_lst)

In [31]:
LexNorm_dict_word = {}
for i in tqdm (range(len(new_data)), desc="Loading..."):
  inp = new_data[i].get('input')
  out = new_data[i].get('output')
  for i in range(len(inp)):
    if((inp[i] != '') & (inp[i] != out[i])):
      key = inp[i]
      value = out[i]
      key_str = ''.join(key)
      value_str = ''.join(value)
      if key_str not in LexNorm_dict_word:
        LexNorm_dict_word[key_str] = []
      if value_str not in LexNorm_dict_word[key_str]:
        LexNorm_dict_word[key_str].append(value_str)

Loading...: 100%|██████████| 10463/10463 [00:00<00:00, 105061.19it/s]


In [32]:
LexNorm_dict_word

{'k': ['không'],
 'thoai': ['thôi'],
 't': ['tôi',
  'thằng',
  'tao',
  'tớ',
  'tiếng',
  'tui',
  'tồi',
  'ta',
  'tới',
  'tuổi'],
 'chời': ['trời'],
 'hong': ['không', 'hông', 'hóng'],
 'dẫy': ['vậy', 'giẫy'],
 'luý truý': ['lí trí'],
 'zậy': ['vậy'],
 'đc': ['được'],
 'hông': ['không'],
 'fò': ['phò'],
 'bíc': ['biết'],
 'v': ['vậy', 'với'],
 'mk': ['mình'],
 'phia': ['khuya'],
 'hảng': ['háng'],
 'mõi': ['mỗi'],
 'z': ['vậy'],
 'dthoai': ['điện thoại'],
 'ko': ['không'],
 'bánh mỳ': ['bánh mì'],
 'nhứt': ['nhức', 'nhất'],
 'ở riêg': ['ở riêng'],
 'e': ['em'],
 'ồi': ['rồi'],
 'ca': ['công an', 'cả'],
 'nhìu': ['nhiều'],
 'r': ['rồi', 'rượu'],
 'dùm': ['giùm'],
 'quákkk': ['quá'],
 'ck': ['chồng', 'chuyển khoản', 'chợ', 'chị'],
 'b': ['bạn', 'bố'],
 'ntn': ['như thế nào', 'như thế này'],
 'cmn': ['con mẹ nó'],
 'cx': ['cũng', 'củ'],
 't2': ['thứ 2', 'thứ hai'],
 'iu': ['yêu'],
 'deptrai': ['đẹp trai'],
 'hỏng': ['không', 'hông'],
 'dậy': ['vậy'],
 'zị': ['vậy'],
 'tr': ['trời', 

In [33]:
#with open("/content/drive/MyDrive/LexNorm/KLTN/Data/ViLexNorm/LexNorm_dict_segment.json", "w") as outfile:
    #json.dump(LexNorm_dict_word, outfile, ensure_ascii=False)