# Import libraries

In [1]:
!pip install emot tqdm

Collecting emot
  Downloading emot-3.1-py3-none-any.whl (61 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m61.5/61.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: emot
Successfully installed emot-3.1


In [2]:
import pandas as pd
import numpy as np
import json
import re
from emot.emo_unicode import UNICODE_EMOJI

from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

from __future__ import unicode_literals
import operator
import sys

try:
    from html.parser import HTMLParser
except ImportError:
    from HTMLParser import HTMLParser

try:
    import html
except ImportError:
    pass

# Load data

In [3]:
# Single root folder that holds every raw corpus used in this notebook.
# Point this one value elsewhere to relocate all five datasets at once.
data_root = "/content/drive/MyDrive/LexNorm/KLTN/Data"

# Per-corpus folders; the loading cells below read from "<folder>/raw_data/...".
vihsd_path = data_root + "/ViHSD"
vsmec_path = data_root + "/UIT_VSMEC"
visfd_path = data_root + "/UIT_ViSFD"
vihos_path = data_root + "/ViHOS"
# NOTE: this name has no "_path" suffix; it is kept because later cells use it.
vispamreviews = data_root + "/ViSpamReviews"

In [4]:
def split_sentence_by_punctuation(text):
    """Cut ``text`` into pieces, each ending with the punctuation run that closed it.

    A "run" is 1 to 6 consecutive characters taken from ``.``, ``!``, ``?``,
    ``{``, ``}`` and newline. Whatever follows the last run becomes a final
    piece of its own, unless it is empty.

    Returns a list of strings; an empty input yields an empty list.
    """
    # Because the pattern has a capturing group, re.split alternates
    # text, run, text, run, ..., text - so the list always has odd length.
    parts = re.split(r'([.!?{\n}]{1,6})', text)

    # Glue every text fragment (even position) to the run right after it.
    chunks = [parts[k] + parts[k + 1] for k in range(0, len(parts) - 1, 2)]

    # The trailing fragment has no closing run; keep it only when non-empty.
    tail = parts[-1]
    if tail:
        chunks.append(tail)

    return chunks

In [5]:
def split_paragraph(text):
    """Break ``text`` at runs of newlines and return the non-blank pieces.

    Pieces that are empty or contain only whitespace are discarded; the
    surviving pieces are returned unchanged (they are not stripped).
    """
    paragraphs = []
    for block in re.split(r'\n+', text):
        # Skip fragments that carry no visible content.
        if not block.strip():
            continue
        paragraphs.append(block)
    return paragraphs

In [6]:
def remove_url(text):
    """Return ``text`` with every URL cut out.

    A URL is anything matching ``https?://\\S+`` or ``www\\.\\S+``. The
    fragments left between URLs are joined with a single space; whitespace
    that already surrounded a URL is kept, so the result may contain runs of
    spaces. An empty fragment after the last URL is dropped before joining.
    """
    # Splitting on the URL pattern leaves only the text around the URLs.
    *leading, trailing = re.split(r'https?://\S+|www\.\S+', text)

    # Fragments before each URL are always kept (even when empty); the final
    # fragment is kept only if it has content.
    kept = leading + [trailing] if trailing else leading

    return " ".join(kept)

# Tokenize data and create dataset

## ViHSD

In [None]:
# Read the three ViHSD splits, tagging every row with the split it came from.
train = pd.read_csv(vihsd_path + '/raw_data/train.csv').assign(type='train')
dev = pd.read_csv(vihsd_path + '/raw_data/dev.csv').assign(type='dev')
test = pd.read_csv(vihsd_path + '/raw_data/test.csv').assign(type='test')

# Stack the splits under a fresh 0..n-1 index, label the corpus, and replace
# missing values with the literal string 'None'.
data = (
    pd.concat([train, dev, test])
    .reset_index(drop=True)
    .assign(dataset='ViHSD')
    .fillna('None')
)

# Expose the row position as an explicit column and keep only the export columns.
data = data.assign(index=data.index).loc[:, ['dataset', 'type', 'index', 'free_text', 'label_id']]
data

Unnamed: 0,dataset,type,index,free_text,label_id
0,ViHSD,train,0,Em ƒë∆∞·ª£c l√†m fan c·ª©ng lu√¥n r·ªìi n√® ‚ù§Ô∏è reaction q...,0
1,ViHSD,train,1,ƒê√∫ng l√† b·ªçn m·∫Øt h√≠p l√≤ xo th·ª•t :))) b√™n vi·ªát n...,2
2,ViHSD,train,2,ƒê·∫≠u VƒÉn C∆∞·ªùng gi·ªù gi·ªëng th·∫±ng sida h∆°n √†,0
3,ViHSD,train,3,C√îN ƒê·ªí C·ª§C S√öC V√î NH√ÇN T√çNH ƒê·ªÄ NGHI VN. NH√Ä N∆Ø...,2
4,ViHSD,train,4,T·ª´ l√Ω thuy·∫øt ƒë·∫øn th·ª±c h√†nh l√† c·∫£ 1 c√¢u chuy·ªán ...,0
...,...,...,...,...,...
33395,ViHSD,test,33395,Nghe c≈©ng ƒë∆∞·ª£c. Nh∆∞ng v·∫´n c√†y views,0
33396,ViHSD,test,33396,Pha team up cƒÉng c·ª±c ƒë·∫øn t·ª´ v·ªã tr√≠ √¥ng ch·ªß ƒë·∫ßm...,0
33397,ViHSD,test,33397,Thanh Nh∆∞·ªùng sao m ƒë·ªçc ƒëc cmt c·ªßa t th·∫ø ti√™n s...,0
33398,ViHSD,test,33398,"ƒê√∫ng r·ªìi, nh√¨n c√≥ ria m√©p , v·ªõi m·∫∑t gi·ªëng ƒë√†n ...",0


In [None]:
# Persist the combined ViHSD frame as CSV; index=False leaves the DataFrame's
# row index out of the file (the explicit 'index' column already carries it).
data.to_csv('/content/drive/MyDrive/LexNorm/KLTN/Data/ASTRA/ViHSD.csv', index=False)

## UIT-VSMEC

In [None]:
# Read the three UIT-VSMEC spreadsheets, tagging every row with its split.
train = pd.read_excel(vsmec_path + '/raw_data/train_nor_811.xlsx').assign(type='train')
dev = pd.read_excel(vsmec_path + '/raw_data/valid_nor_811.xlsx').assign(type='dev')
test = pd.read_excel(vsmec_path + '/raw_data/test_nor_811.xlsx').assign(type='test')

# Stack the splits under a fresh 0..n-1 index, label the corpus, and replace
# missing values with the literal string 'None'.
data = (
    pd.concat([train, dev, test])
    .reset_index(drop=True)
    .assign(dataset='UIT-VSMEC')
    .fillna('None')
)

# Expose the row position as an explicit column and keep only the export columns.
data = data.assign(index=data.index).loc[:, ['dataset', 'type', 'index', 'Sentence', 'Emotion']]
data

Unnamed: 0,dataset,type,index,Sentence,Emotion
0,UIT-VSMEC,train,0,cho m√¨nh xin b√†i nh·∫°c t√™n l√† g√¨ v·ªõi ·∫°,Other
1,UIT-VSMEC,train,1,cho ƒë√°ng ƒë·ªùi con qu·ª∑ . v·ªÅ nh√† l√¥i con nh√† m√†y ...,Disgust
2,UIT-VSMEC,train,2,lo h·ªçc ƒëi . y√™u ƒë∆∞∆°ng lol g√¨ hay l·∫°i th√≠ch h·ªçc...,Disgust
3,UIT-VSMEC,train,3,u·ªõc g√¨ sau n√†y v·ªÅ gi√† v·∫´n c√≥ th·ªÉ nh∆∞ c·ª• n√†y :)),Enjoyment
4,UIT-VSMEC,train,4,m·ªói l·∫ßn c√≥ video c·ªßa con l√† c·ª© coi ƒëi coi l·∫°i ...,Enjoyment
...,...,...,...,...,...
6922,UIT-VSMEC,test,6922,l√∫c th√¨ cu·ªëc ƒë·∫•t √® c·ªï ra kh√¥ng c√≥ giun m√† c√¢u ...,Sadness
6923,UIT-VSMEC,test,6923,c√°c b√°c d·∫´n crush qua c√¢y s·ªØa nay m√† h·ª©ng n∆∞·ªõc...,Enjoyment
6924,UIT-VSMEC,test,6924,v·ª£ ch·ªìng r·ªìi m√† trong s√°ng nh∆∞ n√†y ch·ªâ c√≥ tron...,Surprise
6925,UIT-VSMEC,test,6925,bep n√†y v√†o binh d∆∞∆°ng d√πng xu·∫•t,Other


In [None]:
# Persist the combined UIT-VSMEC frame as CSV; index=False leaves the DataFrame's
# row index out of the file (the explicit 'index' column already carries it).
data.to_csv('/content/drive/MyDrive/LexNorm/KLTN/Data/ASTRA/VSMEC.csv', index=False)

## ViSFD

In [None]:
# Read the three UIT-ViSFD splits, tagging every row with the split it came from.
train = pd.read_csv(visfd_path + '/raw_data/Train.csv').assign(type='train')
dev = pd.read_csv(visfd_path + '/raw_data/Dev.csv').assign(type='dev')
test = pd.read_csv(visfd_path + '/raw_data/Test.csv').assign(type='test')

# Stack the splits under a fresh 0..n-1 index, label the corpus, and replace
# missing values with the literal string 'None'.
data = (
    pd.concat([train, dev, test])
    .reset_index(drop=True)
    .assign(dataset='UIT-ViSFD')
    .fillna('None')
)

# Expose the row position as an explicit column and keep only the export columns.
data = data.assign(index=data.index).loc[
    :, ['dataset', 'type', 'index', 'comment', 'n_star', 'date_time', 'label']
]
data

Unnamed: 0,dataset,type,index,comment,n_star,date_time,label
0,UIT-ViSFD,train,0,M·ªõi mua m√°y n√†y T·∫°i thegioididong th·ªët n·ªët c·∫£m...,5,2 tu·∫ßn tr∆∞·ªõc,{CAMERA#Positive};{FEATURES#Positive};{BATTERY...
1,UIT-ViSFD,train,1,Pin k√©m c√≤n l·∫°i mi·ªÖn ch√™ mua 8/3/2019 t√¨nh tr·∫°...,5,14/09/2019,{BATTERY#Negative};{GENERAL#Positive};{OTHERS};
2,UIT-ViSFD,train,2,Sao l√∫c g·ªçi ƒëi·ªán tho·∫°i m√†n h√¨nh b·ªã ch·∫•m nh·ªè nh...,3,17/08/2020,{FEATURES#Negative};
3,UIT-ViSFD,train,3,"M·ªçi ng∆∞·ªùi c·∫≠p nh·∫≠t ph·∫ßn m·ªÅm l·∫°i , n√≥ s·∫Ω b·ªõt t·ªë...",3,29/02/2020,{FEATURES#Negative};{BATTERY#Neutral};{GENERAL...
4,UIT-ViSFD,train,4,"M·ªõi mua S√†i ƒë∆∞·ª£c 1 th√°ng th·∫•y pin r·∫•t tr√¢u, S√†...",5,4/6/2020,{BATTERY#Positive};{PERFORMANCE#Positive};{SER...
...,...,...,...,...,...,...,...
11117,UIT-ViSFD,test,11117,"M·∫´u m√£ ƒë·∫πp lung linh. M√°y ch·∫°y c·ª±c nhanh, m∆∞·ª£t...",5,13/05/2019,{FEATURES#Positive};{PERFORMANCE#Positive};{DE...
11118,UIT-ViSFD,test,11118,C√≥ ai b·ªã gi·ªëng m√¨nh kh√¥ng m√°y th√¨ s√†i b√¨nh th∆∞...,4,8/7/2020,{PERFORMANCE#Negative};
11119,UIT-ViSFD,test,11119,S·∫£n ph·∫©m t·ªët üòä\nAi ch∆°i game c·ª© mang 1 em v·ªÅ m...,5,31/05/2020,{PERFORMANCE#Positive};{GENERAL#Positive};
11120,UIT-ViSFD,test,11120,V·ª´a m·ªõi mua xong m√°y r·∫•t ƒë·∫πp nh√¢n vi√™n r·∫•t nhi...,5,8/3/2020,{CAMERA#Positive};{BATTERY#Positive};{PERFORMA...


In [None]:
# Persist the combined UIT-ViSFD frame as CSV; index=False leaves the DataFrame's
# row index out of the file (the explicit 'index' column already carries it).
data.to_csv('/content/drive/MyDrive/LexNorm/KLTN/Data/ASTRA/ViSFD.csv', index=False)

## ViSpamReviews

In [None]:
# Read the three ViSpamReviews splits, tagging every row with its split.
train = pd.read_csv(vispamreviews + '/raw_data/train.csv').assign(type='train')
dev = pd.read_csv(vispamreviews + '/raw_data/dev.csv').assign(type='dev')
test = pd.read_csv(vispamreviews + '/raw_data/test.csv').assign(type='test')

# Stack the splits under a fresh 0..n-1 index, label the corpus, and replace
# missing values with the literal string 'None'.
data = (
    pd.concat([train, dev, test])
    .reset_index(drop=True)
    .assign(dataset='ViSpamReviews')
    .fillna('None')
)

# Expose the row position as an explicit column and keep only the export columns.
data = data.assign(index=data.index).loc[
    :, ['dataset', 'type', 'index', 'Comment', 'Label', 'SpamLabel']
]
data

Unnamed: 0,dataset,type,index,Comment,Label,SpamLabel
0,ViSpamReviews,train,0,"Ch·∫•t l∆∞·ª£ng s·∫£n ph·∫©m tuy·ªát v·ªùi, m√°",0,0
1,ViSpamReviews,train,1,Jdjd is a waste of my time and place to ask qu...,1,3
2,ViSpamReviews,train,2,Dhnxncbcncncncncnncncncnncncncncncncncnncncncj...,1,3
3,ViSpamReviews,train,3,"giao h√†ng nhanh, ƒë√≥ng g√≥i c·∫©n th·∫≠n, s·∫£n ph·∫©m n...",0,0
4,ViSpamReviews,train,4,B·∫°t m·ªèng vs l·∫°i kh√¥ng c√≥ t√∫i ƒëi k√®m ƒë·ªÉ c·∫•t khi...,0,0
...,...,...,...,...,...,...
19865,ViSpamReviews,test,19865,S·∫£n ph·∫©m OK r·∫•t ƒë·∫πp !c·∫£m ∆°n Shop Sp r·∫ª m√† r·∫•t ...,0,0
19866,ViSpamReviews,test,19866,R√™u ƒë∆∞·ª£c l·∫•y t·ª´ r·ª´ng v·ªÅ ƒë√£ r·ª≠a s·∫°ch ƒë·∫•t c√°t r·ªì...,1,3
19867,ViSpamReviews,test,19867,Shop giao h√†ng nhanh nh∆∞ng m·ªói t·ªôi c√°i shopee ...,1,2
19868,ViSpamReviews,test,19868,Shop ∆°i em c√≥ ƒë·∫∑t 2 ram 8g dr4 nh∆∞ng m√°y em nh...,1,3


In [None]:
# Persist the combined ViSpamReviews frame as CSV; index=False leaves the DataFrame's
# row index out of the file (the explicit 'index' column already carries it).
data.to_csv('/content/drive/MyDrive/LexNorm/KLTN/Data/ASTRA/ViSpamReviews.csv', index=False)

## ViHOS

In [7]:
# Folder holding the ViHOS corpus (re-declared so this cell can run on its own).
vihos_path = "/content/drive/MyDrive/LexNorm/KLTN/Data/ViHOS"

# Every file is addressed relative to vihos_path so that relocating the corpus
# needs a single edit. Previously the span files used hard-coded absolute paths.
#
# *_BIO  : one row per token; create_vihos_df reads 'sentence_id', 'Word', 'Tag'.
# *_span : one row per sentence; create_vihos_df reads 'content', 'index_spans'.
train_BIO = pd.read_csv(vihos_path + '/raw_data/Sequence_labeling_based_version/Syllable/train_BIO_syllable.csv')
train_span = pd.read_csv(vihos_path + '/raw_data/Span_Extraction_based_version/train.csv')
dev_BIO = pd.read_csv(vihos_path + '/raw_data/Sequence_labeling_based_version/Syllable/dev_BIO_syllable.csv')
dev_span = pd.read_csv(vihos_path + '/raw_data/Span_Extraction_based_version/dev.csv')
test_BIO = pd.read_csv(vihos_path + '/raw_data/Sequence_labeling_based_version/Syllable/test_BIO_syllable.csv')
# The test sentences live under Test_data/, not Span_Extraction_based_version/.
test_span = pd.read_csv(vihos_path + '/raw_data/Test_data/test.csv')

In [8]:
def create_vihos_df(BIO, spans, type):
    """Collapse token-level BIO rows into one row per sentence.

    Parameters
    ----------
    BIO : pandas.DataFrame
        One row per token with columns 'sentence_id', 'Word' and 'Tag'.
        The scan relies on sentence ids starting at 0 and growing by exactly
        one each time a new sentence begins, with each sentence's tokens
        stored contiguously.
    spans : pandas.DataFrame
        Sentence-level table; 'content' and 'index_spans' are looked up by
        sentence id (label-based), so its index must contain those ids.
    type : str
        Split name stored verbatim in the 'type' column. The name shadows the
        builtin but is kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        Columns: 'dataset', 'type', 'sentence_id', 'sentence', 'word',
        'index_spans', 'tag'. 'word' and 'tag' hold Python lists. An empty
        ``BIO`` yields an empty frame with these columns.
    """
    columns = ['dataset', 'type', 'sentence_id', 'sentence', 'word', 'index_spans', 'tag']

    def _row(sentence_id, words, tags):
        # Assemble the record for one finished sentence.
        return {
            'dataset': 'ViHOS',
            'type': type,
            'sentence_id': sentence_id,
            'sentence': spans['content'][sentence_id],
            'word': words,
            'index_spans': spans['index_spans'][sentence_id],
            'tag': tags,
        }

    rows = []
    sentence_id = 0
    words, tags = [], []

    # Iterate positionally over the three columns; this does not depend on the
    # frame carrying a default 0..n-1 index.
    for current_id, word, tag in zip(BIO['sentence_id'], BIO['Word'], BIO['Tag']):
        if current_id == sentence_id + 1:
            # A new sentence starts: emit the one collected so far.
            rows.append(_row(sentence_id, words, tags))
            sentence_id += 1
            words, tags = [], []
        words.append(word)
        tags.append(tag)

    # Flush the last sentence whenever any tokens were collected. Testing the
    # collected tokens (rather than the truthiness of the last sentence id)
    # keeps an input that only contains sentence 0 and tolerates empty input.
    if words:
        rows.append(_row(sentence_id, words, tags))

    # Build the frame once instead of growing it row by row.
    return pd.DataFrame(rows, columns=columns)

In [9]:
# Build the sentence-level frame for each split from its (BIO, span) pair.
df_train, df_dev, df_test = (
    create_vihos_df(bio_frame, span_frame, split_name)
    for bio_frame, span_frame, split_name in (
        (train_BIO, train_span, 'train'),
        (dev_BIO, dev_span, 'dev'),
        (test_BIO, test_span, 'test'),
    )
)

In [10]:
# Stack the three ViHOS splits under a fresh 0..n-1 index and replace missing
# values with the literal string 'None'.
data = pd.concat([df_train, df_dev, df_test]).reset_index(drop=True).fillna('None')

# Expose the row position as an explicit column and keep only the export columns.
data = data.assign(index=data.index).loc[
    :, ['dataset', 'type', 'index', 'sentence', 'word', 'index_spans', 'tag']
]
data

Unnamed: 0,dataset,type,index,sentence,word,index_spans,tag
0,ViHOS,train,0,D·ª´a l·∫Øm :)),"[D·ª´a, l·∫Øm, :, ), )]",[],"[O, O, O, O, O]"
1,ViHOS,train,1,B·∫•p b√™nh vl th·∫ø,"[B·∫•p_b√™nh, vl, th·∫ø]","[9, 10]","[O, B-T, O]"
2,ViHOS,train,2,Ch·∫Øc c≈©ng bi·∫øt ko t·ªìn t·∫°i ƒëc bao l√¢u n·ªØa n√™n c...,"[Ch·∫Øc, c≈©ng, bi·∫øt, ko, t·ªìn_t·∫°i, ƒëc, bao_l√¢u, n...","[53, 54, 55]","[O, O, O, O, O, O, O, O, O, O, B-T]"
3,ViHOS,train,3,Th·∫•y ch√°n ad page n√†y ki·∫øn th·ª©c th√¨ n√¥ng c·∫£n c...,"[Th·∫•y, ch√°n, ad, page, n√†y, ki·∫øn_th·ª©c, th√¨, n√¥...","[5, 6, 7, 8, 36, 37, 38, 39, 40, 41, 42, 43, 6...","[O, B-T, O, O, O, O, O, B-T, I-T, O, O, O, O, ..."
4,ViHOS,train,4,Giang Giang ƒê·ªó Th·ªã Ng·ªçc H√† trend m·ªõi k√¨a k√¨a,"[Giang_Giang_ƒê·ªó_Th·ªã, Ng·ªçc_H√†, trend, m·ªõi, k√¨a,...",[],"[O, O, O, O, O, O]"
...,...,...,...,...,...,...,...
11051,ViHOS,test,11051,t·ª´ d·ªãch n√†y th·∫•y √Ω th·ª©c ng∆∞·ªùi nh·∫≠t xƒÉng ƒë·ªï n·ª≠a...,"[t·ª´_d·ªãch, n√†y, th·∫•y, √Ω_th·ª©c, ng∆∞·ªùi, nh·∫≠t, xƒÉng...","[137, 138, 139, 140, 141, 142, 143, 144, 145, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
11052,ViHOS,test,11052,ƒêang n√≥i chuy·ªán n√†y c√°i ·ªïng b·∫ª l√°i qua chuy·ªán ...,"[ƒêang, n√≥i_chuy·ªán, n√†y, c√°i, ·ªïng, b·∫ª_l√°i, qua,...",[],"[O, O, O, O, O, O, O, O, O, O, O, O]"
11053,ViHOS,test,11053,√îi tr·ªùi giai c·∫•p lƒë ·ªü ta th√¨ ƒë·ªïi tr·∫Øng thay ƒëe...,"[√îi, tr·ªùi, giai_c·∫•p, lƒë, ·ªü, ta, th√¨, ƒë·ªïi_tr·∫Øng...","[29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 4...","[O, O, O, O, O, O, O, B-T, O, O, O, O, O, O, O..."
11054,ViHOS,test,11054,ƒê√©o ai l√†m th·∫ø,"[ƒê√©o, ai, l√†m, th·∫ø]","[0, 1, 2]","[B-T, O, O, O]"


In [None]:
# Persist the combined ViHOS frame as CSV; index=False leaves the DataFrame's
# row index out of the file (the explicit 'index' column already carries it).
data.to_csv('/content/drive/MyDrive/LexNorm/KLTN/Data/ASTRA/ViHOS.csv', index=False)