In [None]:
# Clone the VNTC repository; later cells read its Data/10Topics/Ver1.1 archives
# from /content/VNTC (assumes the working directory is /content -- TODO confirm).
!git clone https://github.com/duyvuleo/VNTC.git

In [None]:
# Unpack the Test_Full and Train_Full archives into the directory that holds the
# .rar files, producing the Test_Full/ and Train_Full/ folders read further down.
!unrar x -Y "/content/VNTC/Data/10Topics/Ver1.1/Test_Full.rar" "/content/VNTC/Data/10Topics/Ver1.1/"
!unrar x -Y "/content/VNTC/Data/10Topics/Ver1.1/Train_Full.rar" "/content/VNTC/Data/10Topics/Ver1.1/"

In [None]:
# Install the third-party packages for this notebook.
# NOTE(review): no versions are pinned, so a rerun may resolve different releases.
# NOTE(review): in the cells visible here only transformers and underthesea are
# actually called (vncorenlp is imported but unused; fairseq/fastbpe are never
# imported) -- confirm whether the other installs are still needed.
!pip3 install fairseq
!pip3 install fastbpe
!pip3 install vncorenlp
!pip3 install transformers
!pip3 install underthesea

In [None]:
import os
import re
import torch
from transformers import AutoModel, AutoTokenizer
import numpy as np
import pickle
from vncorenlp import VnCoreNLP
import underthesea

In [None]:
# Mount Google Drive at /content/drive; the final cell writes the pickled
# feature dumps under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def save_dump(file_path, data, labels):
    """Pickle a ``(data, labels)`` pair to ``file_path``.

    Args:
        file_path: Destination path; the file is created or overwritten
            (opened in binary write mode).
        data: Feature container to store (any picklable object).
        labels: Label container to store (any picklable object).

    Returns:
        None.
    """
    # The context manager closes the handle even when pickling raises, so a
    # failed dump can no longer leak an open file descriptor.
    with open(file_path, 'wb') as file:
        pickle.dump((data, labels), file)


def load_data(path_file):
    """Read back a ``(data, labels)`` pair pickled by ``save_dump``.

    Both objects must expose a ``.shape`` attribute (e.g. NumPy arrays),
    because their shapes are printed as a quick sanity check.

    Args:
        path_file: Path of the pickle file to read.

    Returns:
        tuple: ``(data, labels)`` exactly as stored in the file.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load dumps that this notebook (or another trusted source) produced.
    with open(path_file, 'rb') as file:
        # The handle is closed by the context manager even if unpickling fails.
        (data, labels) = pickle.load(file)
    print(data.shape)
    print(labels.shape)
    return data, labels


def load_pho_bert():
    """Fetch the pretrained "vinai/phobert-base" encoder and its tokenizer.

    Returns:
        tuple: ``(model, tokenizer)`` -- the objects returned by
        ``AutoModel.from_pretrained`` and ``AutoTokenizer.from_pretrained``.
    """
    # The model is requested first, then the tokenizer, as before.
    encoder = AutoModel.from_pretrained("vinai/phobert-base")
    return encoder, AutoTokenizer.from_pretrained("vinai/phobert-base")


# Translation table: every character listed here is replaced by one space.
_PUNCTUATION_TO_SPACE = str.maketrans({ch: " " for ch in ',;“:”"\'!?-*=()_.'})


def standardize_data(row):
    """Strip punctuation from a text and lowercase it.

    Each of the characters ``, ; “ : ” " ' ! ? - * = ( ) _ .`` is replaced by a
    single space (not deleted), so runs of spaces may remain inside the text.
    Leading/trailing whitespace is then trimmed and the result lowercased.

    Args:
        row (str): Raw text of one post.

    Returns:
        str: The cleaned, lowercased text.
    """
    # The former end-of-sentence regex pass was removed: its pattern required a
    # "-" after the end-of-string anchor, so it could never match. Trailing
    # punctuation is already handled by the replacement + strip below, and the
    # single translate() pass yields the same result as the old replace chain.
    return row.translate(_PUNCTUATION_TO_SPACE).strip().lower()


def get_max_decode_token(v_token, max_len):
    """Return the leading items of ``v_token`` as a new list, capped at ``max_len``.

    Args:
        v_token: Iterable of token ids.
        max_len: Maximum number of items to keep; zero or less keeps nothing.

    Returns:
        list: At most ``max_len`` items, in their original order.
    """
    kept = []
    for position, token_id in enumerate(v_token):
        # Stop as soon as the cap is reached.
        if position >= max_len:
            break
        kept.append(token_id)
    return kept


def get_text_feature(sentence, v_pho_bert, v_tokenizer):
    """Embed one text with the given encoder and return its per-token features.

    The text is word-segmented with underthesea, encoded to at most 100 token
    ids, right-padded to exactly 100, and passed through ``v_pho_bert`` without
    gradient tracking.

    Args:
        sentence (str): Cleaned text of one post.
        v_pho_bert: Callable model accepting ``input_ids`` and
            ``attention_mask`` tensors; element ``[0]`` of its output is taken
            as the hidden states.
        v_tokenizer: Tokenizer exposing ``encode(text, max_length=...,
            truncation=True)`` and, optionally, ``pad_token_id``.

    Returns:
        numpy.ndarray: The hidden states with axes reversed, i.e. shape
        ``(hidden_size, 100, 1)`` for a ``(1, 100, hidden_size)`` model output.
    """
    max_len_word = 100

    # NOTE(review): the text is segmented twice (list form, re-joined, then
    # "text" form). The first pass looks redundant, but it is kept so the
    # produced features stay comparable with earlier runs -- confirm and drop.
    word_segmented_text = underthesea.word_tokenize(sentence)
    line = " ".join(word_segmented_text)
    line = underthesea.word_tokenize(line, format="text")

    # Let the tokenizer truncate: it keeps the special tokens it adds (such as
    # the closing one), which slicing the id list afterwards would cut off.
    token_ids = list(v_tokenizer.encode(line, max_length=max_len_word, truncation=True))
    # Defensive cap so the output shape is fixed even if truncation is ignored.
    token_ids = token_ids[:max_len_word]

    # Take the pad id from the tokenizer; fall back to 1, the value that was
    # previously hard-coded here.
    pad_id = getattr(v_tokenizer, "pad_token_id", None)
    if pad_id is None:
        pad_id = 1

    n_real = len(token_ids)
    n_pad = max_len_word - n_real
    input_ids = torch.tensor([token_ids + [pad_id] * n_pad], dtype=torch.long)
    # Padding positions get mask 0 so they are ignored during attention. The
    # mask is built from the length rather than by comparing ids to pad_id.
    attention_mask = torch.tensor([[1] * n_real + [0] * n_pad], dtype=torch.long)

    # Feature extraction only: no gradients needed.
    with torch.no_grad():
        outputs = v_pho_bert(input_ids=input_ids, attention_mask=attention_mask)

    # .T on the 3-D array reverses all axes: (1, seq, hidden) -> (hidden, seq, 1).
    v_features = outputs[0].numpy().T
    return v_features


def load_data_post_directory(DIRECTORY, CATEGORIES, cnt_max_category):
    """Embed up to ``cnt_max_category`` posts from each category folder.

    For every name in ``CATEGORIES`` the folder ``DIRECTORY/<name>`` is listed,
    each post is read as UTF-16 text, cleaned with ``standardize_data`` and
    embedded with ``get_text_feature``.

    Args:
        DIRECTORY (str): Root folder containing one sub-folder per category.
        CATEGORIES (list[str]): Sub-folder names; each is also used as the
            label of the posts it contains.
        cnt_max_category (int): Number of posts to take per category. The
            loop stops when the running count equals this value.

    Returns:
        tuple: ``(dataset, labels)`` -- a float32 array stacking the feature
        arrays of all posts, and an array of the matching category names.
    """
    print("[INFO] loading post...")
    data = []
    labels = []
    pho_bert, tokenizer = load_pho_bert()
    for category in CATEGORIES:
        path = os.path.join(DIRECTORY, category)
        print(path)
        cnt_category = 0
        # os.listdir yields entries in arbitrary order; sorting makes the
        # capped subset identical on every run and every machine.
        for post in sorted(os.listdir(path)):
            if cnt_category == cnt_max_category:
                break
            post_path = os.path.join(path, post)
            print(post_path)
            # Close each file as soon as it is read; the loop may visit
            # thousands of posts and must not accumulate open handles.
            with open(post_path, "r", encoding='utf-16') as f:
                text_post = f.read()
            text_post = standardize_data(text_post)

            v_feat = get_text_feature(text_post, pho_bert, tokenizer)

            data.append(v_feat)
            labels.append(category)
            cnt_category += 1

    dataset = np.array(data, dtype="float32")
    labels = np.array(labels)
    return dataset, labels

In [None]:
# Dataset locations: the folders produced by extracting the two archives.
DIRECTORY_train = "/content/VNTC/Data/10Topics/Ver1.1/Train_Full"
DIRECTORY_test = "/content/VNTC/Data/10Topics/Ver1.1/Test_Full"

# Category folder names; each one is also used as the label of its posts.
CATEGORIES = [
    'Chinh tri Xa hoi',
    'Doi song',
    'Khoa hoc',
    'Kinh doanh',
    'Phap luat',
    'Suc khoe',
    'The gioi',
    'The thao',
    'Van hoa',
    'Vi tinh',
]

In [None]:
# Embed the training split, taking 700 posts from each category.
data_train, labels_train = load_data_post_directory(
    DIRECTORY=DIRECTORY_train,
    CATEGORIES=CATEGORIES,
    cnt_max_category=700,
)

In [None]:
# Embed the test split, taking 300 posts from each category.
data_test, labels_test = load_data_post_directory(
    DIRECTORY=DIRECTORY_test,
    CATEGORIES=CATEGORIES,
    cnt_max_category=300,
)

In [None]:
# Persist both splits on the mounted Drive as pickled (features, labels) pairs.
save_dump('/content/drive/MyDrive/data_train.data', data_train, labels_train)
save_dump('/content/drive/MyDrive/data_test.data', data_test, labels_test)

# Report shapes, one per line: features (train, test), then labels (train, test).
print(data_train.shape, data_test.shape, sep="\n")
print(labels_train.shape, labels_test.shape, sep="\n")

(1400, 768, 100, 1)
(1400,)
