In [1]:
!pip install transformers
!pip install py_vncorenlp

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m49.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m94.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m86.7 MB/s[0m eta [36m0:00:00[0m
Col

In [2]:
from google.colab import drive

# Mount Google Drive at an absolute path so it always matches the
# '/content/drive/MyDrive/...' paths the notebook loads the model from,
# independent of the kernel's current working directory.
drive.mount('/content/drive')

Mounted at ./drive


In [19]:
import torch
from torch.utils.data import TensorDataset, DataLoader

In [4]:
import re
import py_vncorenlp

# Download the VnCoreNLP resources into the current directory, then build a
# VnCoreNLP instance from that same directory with only the "wseg" annotator.
vncorenlp_dir = './'
py_vncorenlp.download_model(save_dir=vncorenlp_dir)
segment_model = py_vncorenlp.VnCoreNLP(save_dir=vncorenlp_dir, annotators=["wseg"])

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# The tokenizer and the sequence-classification model are both restored from
# the same saved directory.
model_dir = '/content/drive/MyDrive/DS105/model_save'
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [34]:
def remove_emoji(text):
    """Return ``text`` with emoji and other symbol characters stripped out.

    A single regex character class is assembled from the code-point ranges
    below; every run of one or more matching characters is replaced by the
    empty string. Characters outside the ranges are left untouched.
    """
    code_point_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
        "\U00002500-\U00002BEF",  # box drawing through misc symbols and arrows
        "\U00002702-\U000027B0",  # dingbats
        "\U00002702-\U000027B0",  # same range listed twice; redundant but harmless
        "\U000024C2-\U0001F251",  # very broad: also spans CJK and other non-emoji blocks
        "\U0001f926-\U0001f937",  # gesture emoji
        "\U00010000-\U0010ffff",  # every supplementary-plane code point
        "\u2640-\u2642",          # gender signs
        "\u2600-\u2B55",          # misc symbols through heavy large circle
        "\u200d",                 # zero width joiner
        "\u23cf",                 # eject symbol
        "\u23e9",                 # fast-forward symbol
        "\u231a",                 # watch
        "\ufe0f",                 # variation selector-16
        "\u3030",                 # wavy dash
    )
    char_class = "[" + "".join(code_point_ranges) + "]+"
    return re.sub(char_class, r'', text, flags=re.UNICODE)

def remove_special_characters(text):
    """Drop every character that is not a word character, whitespace, or one of ``. , ; : ?``."""
    disallowed = re.compile(r"[^\w\s.,;:?]")
    return disallowed.sub("", text)

def from_logit_to_label(logit):
  """Translate a predicted class value into its label string.

  A value equal to 0 maps to 'Positive'; any other value maps to 'Negative'.
  """
  if logit == 0:
    return 'Positive'
  return 'Negative'

In [35]:
def _clean_review(raw_text):
  """Turn one raw review string into the word-segmented text given to the tokenizer."""
  cleaned = raw_text.replace('\n', '. ')
  cleaned = remove_emoji(cleaned)
  cleaned = remove_special_characters(cleaned)
  segments = segment_model.word_segment(cleaned)
  # NOTE(review): as before, only the first segment returned by the segmenter
  # is kept - confirm whether the remaining segments should be used as well.
  # The guard avoids an IndexError if the segmenter returns nothing.
  return segments[0] if segments else ''


def classification_review(text: list, tokenizer=tokenizer, model=model,
                          max_length=100, batch_size=32):
  """Classify each review in ``text`` and print ``"<review> : <label>"`` per item.

  Args:
    text: list of raw review strings. The list is not modified.
    tokenizer: tokenizer exposing ``encode_plus``; defaults to the notebook's tokenizer.
    model: sequence-classification model whose output has ``.logits``;
      defaults to the notebook's model.
    max_length: length every encoded review is padded/truncated to.
    batch_size: number of reviews sent through the model per forward pass.

  Returns:
    A 1-D tensor holding the argmax class index for every input review, in
    input order. An empty input list yields an empty ``torch.long`` tensor.
  """
  # Nothing to classify: return early instead of failing on an unset result.
  if not text:
    return torch.empty(0, dtype=torch.long)

  input_ids, attention_masks = [], []
  # Work on cleaned copies so the caller's list keeps the original strings.
  for cleaned in map(_clean_review, text):
    encoded_dict = tokenizer.encode_plus(
        cleaned,
        max_length=max_length,
        padding='max_length',
        return_attention_mask=True,
        truncation=True
    )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

  dataset = TensorDataset(torch.tensor(input_ids), torch.tensor(attention_masks))
  dataloader = DataLoader(dataset, shuffle=False, batch_size=batch_size)

  model.eval()

  # Collect the logits of every batch; keeping only the last batch would
  # lose the predictions for all earlier reviews.
  batch_logits = []
  with torch.no_grad():
    for b_input_ids, b_attention_mask in dataloader:
      result = model(b_input_ids,
                     token_type_ids=None,
                     attention_mask=b_attention_mask)
      batch_logits.append(result.logits)

  logits = torch.cat(batch_logits, dim=0)
  labels_out = torch.argmax(logits, dim=1)

  for sent, label in zip(text, labels_out):
    print("{} : {}".format(sent, from_logit_to_label(label)))

  return labels_out

In [40]:
# Smoke test: classify a single Vietnamese review with the default tokenizer and model.
res = classification_review(text=['Hàng hóa tệ'])

Hàng hóa tệ : Negative
