## STEP 1: DATA PRE-PROCESSING: TIỀN XỬ LÝ DỮ LIỆU

In [179]:
import pandas as pd
import string
import re
from underthesea import word_tokenize
import json

### Tạo đường dẫn đến dataset và đọc file

In [180]:
# Path to the raw sentiment dataset (CSV); the file is read into a DataFrame.
sentiment_dataset_path = "Dataset/vietnamese_sentiment_dataset.csv"
sentiment_data_df = pd.read_csv(filepath_or_buffer=sentiment_dataset_path)

### Loại bỏ text trùng nhau và text/label không có dữ liệu

In [181]:
# Drop rows with a missing text or label first, then de-duplicate on text.
# drop_duplicates keeps only the first occurrence of each text, so running it
# before dropna would discard a text entirely whenever its first occurrence
# has a missing label, even if a later duplicate carries a valid one.
sentiment_data_df = sentiment_data_df.dropna(subset=['text', 'label'])
sentiment_data_df = sentiment_data_df.drop_duplicates(subset=['text'])

### Normalization: Chuẩn hóa văn bản 


In [182]:
# Special characters: the ASCII punctuation set that clean_text turns into spaces.
remove_character = string.punctuation

# Character class matching runs of emoji / pictographic symbols.
# The last two ranges repeat the Misc-symbols and Dingbats ranges in the
# 4-digit escape form; they are kept so the pattern text stays unchanged.
emoji_pattern = re.compile(
    "[" + "".join((
        "\U0001F600-\U0001F64F",  # Emoticons
        "\U0001F300-\U0001F5FF",  # Symbols & Pictographs
        "\U0001F680-\U0001F6FF",  # Transport & Map
        "\U0001F1E0-\U0001F1FF",  # Flags
        "\U00002700-\U000027BF",  # Dingbats
        "\U0001F900-\U0001F9FF",  # Supplemental Symbols & Pictographs
        "\U0001FA70-\U0001FAFF",  # Symbols & Pictographs Extended-A
        "\U00002600-\U000026FF",  # Misc symbols
        "\u2600-\u26FF",          # sun, rain, ...
        "\u2700-\u27BF",          # special signs
    )) + "]+",
    flags=re.UNICODE,
)
# Load the abbreviation file: a JSON mapping that clean_text looks each word up in.
file_replace_dict = 'StopWords/vietnamese_replace_dict.json'
with open(file_replace_dict, mode='r', encoding='utf-8') as f:
    replace_dict = json.loads(f.read())

def clean_text(text: str) -> str:
    """Normalise one raw text string before tokenisation.

    Steps, in order: lowercase, strip emoji, turn ASCII punctuation into
    spaces, replace words found in ``replace_dict``, squeeze runs of a
    repeated character down to a single character, drop digits, and finally
    collapse whitespace.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string, with single spaces between words and no leading
        or trailing whitespace.
    """
    # Convert everything to lowercase.
    text = text.lower()

    # Remove all emoji.
    text = emoji_pattern.sub('', text)

    # Replace every punctuation character with a space in a single pass.
    punctuation_to_space = str.maketrans(
        remove_character, ' ' * len(remove_character)
    )
    text = text.translate(punctuation_to_space)

    # Expand abbreviations word by word; unknown words are kept as they are.
    words = [replace_dict.get(w, w) for w in text.split()]
    text = ' '.join(words)

    # Squeeze elongated characters ("ngonnnn" -> "ngon"). The replacement
    # must be r'\1': the earlier r'\1+' appended a literal '+' to every
    # squeezed run. NOTE(review): this also shortens legitimately doubled
    # letters (e.g. "xoong" -> "xong"), as the original pattern did.
    text = re.sub(r'(.)\1+', r'\1', text)

    # Remove all digits.
    text = re.sub(r'\d+', '', text)

    # Collapse whitespace last: removing digits can leave double spaces
    # behind ("a 12 b" -> "a  b"). split/join also trims both ends.
    return ' '.join(text.split())


sentiment_data_df['text'] = sentiment_data_df['text'].apply(clean_text)

### Tokenization: Tách từ

In [183]:
# Run word_tokenize on every cleaned text; format='text' is forwarded to it
# by Series.apply. NOTE(review): presumably this yields one string per row
# with multi-syllable words joined by underscores — confirm in underthesea docs.
sentiment_data_df['tokens'] = sentiment_data_df["text"].apply(word_tokenize, format='text')

### Loại bỏ stopwords

In [184]:
# The two stop-word list files that are read and merged into one set below.
stopwords_path_1, stopwords_path_2 = (
    'StopWords/vietnamese-stopwords.txt',
    'StopWords/vietnamese_stopwords.txt',
)

In [193]:
# First list: one entry per line; spaces inside an entry become underscores.
# NOTE(review): presumably so multi-syllable entries match the
# underscore-joined tokens produced by the tokenizer — confirm.
with open(stopwords_path_1, 'r', encoding='utf-8') as f:
    stopwords_1 = {line.strip().replace(' ', '_') for line in f}
# Debug check: prints 1 if "tạm" is in this list, otherwise 0. The earlier
# counting loop incremented a misspelled name (`cout`) and would have raised
# NameError as soon as the word was actually present.
count = int("tạm" in stopwords_1)
print(count)

# Second list: entries are used verbatim (only surrounding whitespace removed).
with open(stopwords_path_2, 'r', encoding='utf-8') as f:
    stopwords_2 = {line.strip() for line in f}
count = int("tạm" in stopwords_2)
print(count)

# Combined stop-word set used by remove_stopwords.
stopwords = stopwords_1 | stopwords_2

def remove_stopwords(text):
    """Drop stop words and single-character tokens from a tokenised string.

    The input is split on whitespace. Tokens that appear in the module-level
    ``stopwords`` set, or that are at most one character long, are discarded;
    the remaining tokens are re-joined with single spaces.
    """
    kept = []
    for token in text.split():
        if len(token) <= 1 or token in stopwords:
            continue
        kept.append(token)
    return " ".join(kept)


sentiment_data_df['tokens'] = sentiment_data_df['tokens'].apply(remove_stopwords)


0
0


### Lưu lại dữ liệu sau khi hoàn thành bước tiền xử lý dữ liệu

In [186]:
# Write only the processed tokens and their labels to a new CSV, without the index.
sentiment_data_df[['tokens', 'label']].to_csv(
    path_or_buf='Dataset/vietnamese_sentiment_dataset_processed.csv',
    index=False,
    encoding='utf-8',
)