# __I. DEPENDENCIES__

In [None]:
!pip install bitsandbytes -q
!pip install datasets transformers &>> /dev/null -q
!pip install datasets==3.6.0 -q
!pip install --upgrade scikit-learn &>> /dev/null -q
!pip install evaluate -q
!pip install regex -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
'''
Block : Importing libraries
'''

# ================================= transformers =================================
from transformers import (
    PreTrainedTokenizerBase,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    EarlyStoppingCallback,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    pipeline,
)
# ================================================================================

# =================================== datasets ===================================
from datasets import (
    load_dataset,
    Dataset,
    concatenate_datasets
)
# ================================================================================

# ==================================== torch =====================================
import torch
from torch.utils import data
# ================================================================================

# =================================== sklearn ====================================
import evaluate
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    precision_score,
    recall_score,
    accuracy_score,
    f1_score,
    classification_report,
)
from sklearn.utils import resample
# ================================================================================

# ================================= other utils ==================================
import numpy as np
import pandas as pd
import os
import random
# ================================================================================

# __II. REPRODUCIBILITY__

In [None]:
'''
Block : Set SEEDs for reproducibility
'''

# One seed shared by every random number generator used in this notebook.
SEED = 42

# Seed Python, NumPy and PyTorch (CPU and all CUDA devices) with the same value.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(SEED)
del _seed_fn

# Ask cuDNN for deterministic kernels and turn its benchmark mode off.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# __III. MODEL TRAINING AND EVALUATING__

## __1. Hyperparameter declaration__

In [None]:
# ============================= Model configuration ==============================
# Maximum sequence length in tokens; assigned to the tokenizer and passed as
# `max_length` when the splits are tokenized.
MAX_INPUT_LEN = 128
# Checkpoint identifier handed to `from_pretrained` for both model and tokenizer.
MODEL_PATH = "vinai/phobert-base"
# ================================================================================

# =========================== Training hyperparameters ===========================
# All values in this section are forwarded to `TrainingArguments`.
EPOCHS = 5                  # num_train_epochs
LEARNING_RATE = 1e-5        # learning_rate
SCHEDULER_TYPE = "cosine"   # lr_scheduler_type
WARM_UP_RATIO = 0.03        # warmup_ratio
WEIGHT_DECAY = 0            # weight_decay
TRAIN_BATCH_SIZE = 32       # per_device_train_batch_size
GRAD_ACCUM_STEPS = 1        # gradient_accumulation_steps
# ================================================================================

# ============================ Evaluation parameters =============================
# Batch size meant for evaluation, per this section's header.
TEST_BATCH_SIZE = 128
# ================================================================================

## __2. Data preparation__

### __2.1. Data loading__

In [None]:
'''
Block : Load dataset splits
'''

# Fetch the train / validation / test splits of the same dataset in one pass.
trainDataset, evalDataset, testDataset = (
    load_dataset("uitnlp/vietnamese_students_feedback", split=split_name)
    for split_name in ("train", "validation", "test")
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

vietnamese_students_feedback.py: 0.00B [00:00, ?B/s]

default/train/0000.parquet:   0%|          | 0.00/475k [00:00<?, ?B/s]

default/validation/0000.parquet:   0%|          | 0.00/63.3k [00:00<?, ?B/s]

default/test/0000.parquet:   0%|          | 0.00/134k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11426 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1583 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3166 [00:00<?, ? examples/s]

In [None]:
'''
Block : Get Label2Id and Id2Label
'''

# Sort the distinct labels once so both mappings are built from the same,
# reproducible order instead of relying on the iteration order of two sets.
_sorted_labels = sorted(trainDataset.unique("sentiment"))

# label value -> class index, and its exact inverse.
LABEL2ID = {label: idx for idx, label in enumerate(_sorted_labels)}
ID2LABEL = dict(enumerate(_sorted_labels))

### __2.2. Data sentences preprocessing__

In [None]:
'''
Block : Preprocess text function definition
'''

import regex as re
import string

# ================================ Emoji pattern =================================
# Character class matching one or more consecutive emoji / pictographic code
# points. `clean_text` replaces every match with the '[emoji]' flag.
# NOTE(review): several ranges overlap, and "\U000024C2-\U0001F251" spans every
# code point between those two values, not only pictographs.
emoji_pattern = re.compile(
    "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols and pictographs
        u"\U0001F680-\U0001F6FF"  # transport and map symbols
        u"\U0001F1E0-\U0001F1FF"  # regional indicator symbols (flags)
        u"\U00002702-\U000027B0"  # dingbats
        u"\U000024C2-\U0001F251"  # very wide range, see note above
        u"\U0001f926-\U0001f937"  # gesture pictographs
        u'\U00010000-\U0010ffff'  # every supplementary-plane code point
        u"\u200d"                 # zero width joiner
        u"\u2640-\u2642"          # gender signs
        u"\u2600-\u2B55"          # miscellaneous symbols up to U+2B55
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\u3030"
        u"\ufe0f"                 # variation selector-16
    "]+",
    flags=re.UNICODE
)
# ================================================================================

# ======================= 'clean_text' function definition =======================
def clean_text(
        examples: dict | str
) -> dict[str, str]:
    """
    Clean a sentence by lowercasing it, replacing emoji with a flag, collapsing repeated
    letters and punctuation, spacing out punctuation marks and squeezing whitespace.

    :param examples: Either a raw string, or a single example (a mapping with a
                     "sentence" key) as passed by ``Dataset.map(..., batched=False)``.
    :return: A dictionary with key "sentence" and the cleaned text as value.
    """

    text = examples if isinstance(examples, str) else examples["sentence"]

    # Lowercase text
    text = text.lower()

    # Replace emoji with '[emoji]' flag.
    # NOTE: the punctuation spacing below also separates the brackets, so the
    # flag ends up as '[ emoji ]' in the returned text.
    text = emoji_pattern.sub("[emoji]", text)

    # Collapse any run of the same letter into a single letter
    text = re.sub(r"(\p{L})\1+", r"\1", text)

    # Limit repeated punctuation (., ,, !, ?) to a normalized form
    text = re.sub(r"([.])\1{3,}", r"\1\1\1", text)
    text = re.sub(r"[.,]{2,}", lambda m: m.group(0)[0], text)
    text = re.sub(r"([!?])\1+", r"\1\1\1", text)

    # Character class for punctuation. The characters are escaped so that
    # ']', '\\', '^' and '-' are matched literally, and every piece is a raw
    # string so no invalid escape sequence ('\s', '\w') reaches the parser.
    punct = "[" + re.escape(string.punctuation) + "]"

    # Ensure proper spacing around punctuation marks
    text = re.sub(r"(\w)\s*(" + punct + r")\s*(\w)", r"\1 \2 \3", text)
    text = re.sub(r"(\w)\s*(" + punct + r")", r"\1 \2", text)
    text = re.sub(r"(" + punct + r")\s*(\w)", r"\1 \2", text)

    # Strip leading and trailing whitespace
    text = text.strip()

    # Remove all unnecessary repeated spaces
    text = re.sub(r"\s+", " ", text)

    return {"sentence": text}
# ================================================================================


  text = re.sub(r"(\w)\s*([" + string.punctuation + "])\s*(\w)", r"\1 \2 \3", text)
  text = re.sub(r"([" + string.punctuation + "])\s*(\w)", r"\1 \2", text)


In [None]:
'''
Block : Preprocess sentences in dataset splits
'''

# Apply `clean_text` example by example to each split, in train / eval / test order.
trainDataset, evalDataset, testDataset = (
    split.map(clean_text, batched=False)
    for split in (trainDataset, evalDataset, testDataset)
)

Map:   0%|          | 0/11426 [00:00<?, ? examples/s]

Map:   0%|          | 0/1583 [00:00<?, ? examples/s]

Map:   0%|          | 0/3166 [00:00<?, ? examples/s]

### __2.3. Data oversampling__

In [None]:
'''
Block : Define Oversampling function
'''

# =================== 'oversample_dataset' function definition ===================
def oversample_dataset(
        dataset: Dataset,
        label_column: str = "sentiment",
        seed: int = 42,
):
    """
    Balance a dataset by resampling every label, with replacement, up to the size of
    the most frequent label.

    :param dataset: Dataset to balance.
    :param label_column: Labels column name.
    :param seed: Seed used for both the resampling and the final shuffle.
    :return: A shuffled Dataset in which every label has the same number of rows.
    """

    label_values, label_counts = np.unique(dataset[label_column], return_counts=True)
    target_size = label_counts.max()

    def _resampled_part(label):
        # Rows of one label, drawn with replacement until the target size is reached
        rows = dataset.filter(lambda example: example[label_column] == label).to_pandas()
        rows = resample(rows, replace=True, n_samples=target_size, random_state=seed)
        return Dataset.from_pandas(rows.reset_index(drop=True))

    # Merge the per-label parts, then mix them
    balanced = concatenate_datasets([_resampled_part(label) for label in label_values])
    return balanced.shuffle(seed=seed)
# ================================================================================

In [None]:
'''
Block : Oversample train split
'''

# Only the training split is balanced; the evaluation and test splits are not passed through here.
trainDataset = oversample_dataset(trainDataset, seed = SEED)

Filter:   0%|          | 0/11426 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11426 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11426 [00:00<?, ? examples/s]

## __3. Model initialization__

In [None]:
'''
Block : Initialize tokenizer and model
'''

# ============================= Model initialization =============================
# Sequence-classification model built on the pretrained checkpoint; the label
# maps are stored in the model config and the weights are placed on the first GPU.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    device_map="cuda:0",
    label2id=LABEL2ID,
    id2label=ID2LABEL,
    num_labels=len(ID2LABEL),
)
# ================================================================================

# =========================== Tokenizer initialization ===========================
# Matching tokenizer, capped at the configured maximum input length.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
tokenizer.model_max_length = MAX_INPUT_LEN
# ================================================================================


config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


vocab.txt: 0.00B [00:00, ?B/s]

bpe.codes: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

## __4. Model training__

### __4.1. Data tokenization__

In [None]:
'''
Block : Define Tokenizing function
'''

# ===================== 'tokenize_func' function definition ======================
def tokenize_func(
        examples,
        tokenizer: PreTrainedTokenizerBase,
        text_column: str,
        label_column: str,
        label2id: dict[str, int],
) -> dict:
    """
    Tokenize a batch of texts and convert its labels to integer IDs.

    :param examples: A batch of examples as a mapping from column names
                     to lists of values.
    :param tokenizer: Hugging Face tokenizer used for text tokenization.
    :param text_column: Texts column name
    :param label_column: Labels column name
    :param label2id: Labels to IDs mapping
    :return: A mapping with "input_ids", "attention_mask" and "labels", each a
             list with one entry per example in the batch.
    """
    tokenized = tokenizer(
        examples[text_column],
        max_length=MAX_INPUT_LEN,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
    )
    return {
        "input_ids": tokenized["input_ids"],
        # Every sequence is padded to MAX_INPUT_LEN above. Without this mask the
        # model cannot tell real tokens from that padding.
        "attention_mask": tokenized["attention_mask"],
        "labels": [label2id[lb] for lb in examples[label_column]],
    }
# ================================================================================


In [None]:
'''
Block : Tokenize train and evaluate splits
'''

# Tokenize the training split. Its own raw columns are dropped (not those of
# the evaluation split) so the call stays correct if the two splits differ.
trainDataset = trainDataset.map(
    tokenize_func,
    batched = True,
    fn_kwargs={
        "tokenizer": tokenizer,
        "text_column": "sentence",
        "label_column": "sentiment",
        "label2id": LABEL2ID
    },
    remove_columns=trainDataset.column_names,
)

# Tokenize the evaluation split the same way.
evalDataset = evalDataset.map(
    tokenize_func,
    batched = True,
    fn_kwargs={
        "tokenizer": tokenizer,
        "text_column": "sentence",
        "label_column": "sentiment",
        "label2id": LABEL2ID
    },
    remove_columns=evalDataset.column_names,
)

Map:   0%|          | 0/16929 [00:00<?, ? examples/s]

Map:   0%|          | 0/1583 [00:00<?, ? examples/s]

### __4.2. Training Configuration__

In [None]:
'''
Block : Declare Data collator and Training arguments
'''

# Collator that pads each batch with the model's tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    # Save trainer checkpoints
    output_dir="FTbert",
    save_total_limit = 2,

    # Train/eval batch sizes and Gradient accumulation steps
    per_device_train_batch_size = TRAIN_BATCH_SIZE,
    # Use the declared evaluation batch size; it was previously never passed,
    # so the periodic evaluation below fell back to the library default.
    per_device_eval_batch_size = TEST_BATCH_SIZE,
    gradient_accumulation_steps = GRAD_ACCUM_STEPS,

    # Basic training hyperparameters
    num_train_epochs = EPOCHS,
    learning_rate = LEARNING_RATE,
    lr_scheduler_type = SCHEDULER_TYPE,
    warmup_ratio = WARM_UP_RATIO,
    weight_decay = WEIGHT_DECAY,
    optim = "adamw_bnb_8bit",

    # Logging, Evaluating and Saving all happen every 50 optimizer steps
    logging_steps = 50,
    logging_strategy="steps",

    eval_steps = 50,
    eval_strategy = "steps",

    save_steps = 50,
    save_strategy = "steps",

    # Model selection: keep the checkpoint with the highest "f1" reported by
    # the metrics function and reload it when training ends
    metric_for_best_model = "f1",
    greater_is_better = True,
    load_best_model_at_end=True,

    # No external experiment tracker
    report_to="none"
)

In [None]:
'''
Block : Define Computing metrics function
'''

def compute_metrics(eval_pred):
    """
    Compute accuracy and macro-averaged precision, recall and F1.

    :param eval_pred: A pair (class scores, reference label ids); the predicted
                      class is the argmax over the last axis of the scores.
    :return: A dictionary with keys "accuracy", "precision", "recall" and "f1".
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)

    # Options shared by the three macro-averaged metrics
    macro_options = {"average": "macro", "zero_division": 0}

    results = {"accuracy": accuracy_score(references, predicted)}
    results["precision"] = precision_score(references, predicted, **macro_options)
    results["recall"] = recall_score(references, predicted, **macro_options)
    results["f1"] = f1_score(references, predicted, **macro_options)
    return results

In [None]:
'''
Block : Declare Trainer
'''

# Trainer wiring: model, arguments, data, metrics, and early stopping after
# three evaluations without improvement.
trainer = Trainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    data_collator=data_collator,
    train_dataset=trainDataset,
    eval_dataset=evalDataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

The model is already on multiple devices. Skipping the move to device specified in `args`.


### __4.3. Train and Save__

In [None]:
'''
Block : Train
'''

# Run fine-tuning; as the last expression of the cell, the returned TrainOutput is displayed.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,1.0959,1.049951,0.535692,0.452979,0.400002,0.358804
100,1.0571,1.087093,0.402401,0.465228,0.517238,0.372589
150,0.9576,0.787878,0.683512,0.574411,0.642469,0.567753
200,0.8145,0.671461,0.707517,0.604541,0.660007,0.593808
250,0.6341,0.433739,0.862919,0.718854,0.775379,0.737179
300,0.5209,0.340672,0.88945,0.746638,0.800623,0.765626
350,0.4073,0.378232,0.862287,0.724847,0.834446,0.744376
400,0.381,0.40446,0.847757,0.718952,0.839592,0.726623
450,0.3298,0.290618,0.899558,0.763528,0.848417,0.789772
500,0.2905,0.293353,0.901453,0.758751,0.833227,0.782272


TrainOutput(global_step=700, training_loss=0.5326316288539341, metrics={'train_runtime': 913.3391, 'train_samples_per_second': 92.676, 'train_steps_per_second': 2.901, 'total_flos': 1471396010289408.0, 'train_loss': 0.5326316288539341, 'epoch': 1.320754716981132})

In [None]:
'''
Block : Save model
'''

# Make sure the export directory exists, then write the model and the tokenizer into it.
os.makedirs("./output", exist_ok=True)
model.save_pretrained("./output")
# Last expression of the cell: the tuple of written tokenizer file paths is displayed.
tokenizer.save_pretrained("./output")

('./output/tokenizer_config.json',
 './output/special_tokens_map.json',
 './output/vocab.txt',
 './output/bpe.codes',
 './output/added_tokens.json')

## __5. Model testing__

In [None]:
'''
Block : Declare Transformer pipeline
'''

# Text-classification pipeline wrapping the fine-tuned model and its tokenizer; reused by the later cells.
tagger = pipeline("text-classification", model=model, tokenizer=tokenizer)

Device set to use cuda:0


In [None]:
'''
Block : Show test result
'''
# Classify the whole test split in batches, truncating to the length used at
# training time so over-long sentences are handled the same way.
raw_preds = tagger(
    testDataset["sentence"],
    batch_size=TEST_BATCH_SIZE,
    truncation=True,
    max_length=MAX_INPUT_LEN,
)
preds = [pred["label"] for pred in raw_preds]
true_labels = testDataset["sentiment"]

# zero_division=0 matches `compute_metrics`, so a class that is never
# predicted scores 0 instead of inflating the macro average.
print(classification_report(true_labels, preds, digits=4, zero_division=0))

              precision    recall  f1-score   support

           0     0.9167    0.9290    0.9228      1409
           1     0.3984    0.5988    0.4785       167
           2     0.9516    0.8899    0.9197      1590

    accuracy                         0.8920      3166
   macro avg     0.7556    0.8059    0.7737      3166
weighted avg     0.9069    0.8920    0.8978      3166



# __III. MODEL INFERENCE WITH EXAMPLE__

In [None]:
'''
Block : Change model to evaluate mode
'''

# Put the model in evaluation mode; as the last expression, the module structure is displayed.
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(258, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

## __1. Example sentence__

In [None]:
# Example sentence with repeated letters, an emoji and repeated dots, to show what `clean_text` normalizes.
text = "Thầy abcdef dạy tâm huyết lémmmm áá mọi người, mỗi buổi dạạy rất nà nhiều kiến thức 😀.."
cleaned_text = clean_text(text)['sentence']
print(cleaned_text)

thầy abcdef dạy tâm huyết lém á mọi người , mỗi buổi dạy rất nà nhiều kiến thức [ emoji ].


## __2. Tokenize preprocessed example__

In [None]:
# Split the cleaned sentence into the tokenizer's subword tokens (no ids yet).
tokens = tokenizer.tokenize(cleaned_text)
print(tokens)

['thầy', 'ab@@', 'c@@', 'de@@', 'f', 'dạy', 'tâm', 'huyết', 'l@@', 'ém', 'á', 'mọi', 'người', ',', 'mỗi', 'buổi', 'dạy', 'rất', 'nà', 'nhiều', 'kiến', 'thức', '[', 'emoji', ']@@', '.']


## __3. Encode tokenized example__

In [None]:
# Turn the token list into vocabulary ids; the printed result has two more entries than `tokens`.
encoded_tokens = tokenizer.encode(tokens)
print(encoded_tokens)

[0, 1249, 3377, 1395, 4251, 3961, 940, 2652, 6418, 1494, 18598, 6992, 207, 18, 4, 205, 391, 940, 59, 49929, 36, 5603, 2908, 2735, 45101, 3, 5, 2]


## __4. Outputs__

In [None]:
'''
Block : Get Inputs and Outputs
'''

# Encode the cleaned sentence as PyTorch tensors. The length limit comes from
# the shared constant and the tensors follow the model's device, instead of
# repeating the literals 128 and "cuda:0".
inputs = tokenizer(
    cleaned_text,
    return_tensors="pt",
    truncation=True,
    padding=True,
    max_length=MAX_INPUT_LEN,
).to(model.device)

# Forward pass without gradient tracking; hidden states and attentions are
# requested so the following cells can inspect them.
with torch.no_grad():
    outputs = model(
        **inputs,
        output_hidden_states=True,
        output_attentions=True,
        return_dict=True
    )



In [None]:
'''
Block : Show last encoder layer output matrix
'''
# Take the last entry of the returned hidden states.
last_hidden_state = outputs.hidden_states[-1]

# Show its shape first, then the values.
print(last_hidden_state.shape)
print(last_hidden_state)

torch.Size([1, 28, 768])
tensor([[[-0.3987, -0.5553, -0.3003,  ..., -0.6324,  0.4141,  0.3192],
         [-0.0522, -0.2769, -0.4171,  ..., -0.3485,  0.3849,  0.0753],
         [-0.1600, -0.1483, -0.3340,  ..., -0.5536,  0.0190,  0.0715],
         ...,
         [ 0.0381, -0.5258, -0.5089,  ..., -0.4999,  0.0299,  0.0442],
         [-0.3779, -0.5521,  0.0215,  ...,  0.1698,  0.0105, -0.1147],
         [-0.4862, -0.5384, -0.2769,  ..., -0.5852,  0.4383,  0.3454]]],
       device='cuda:0')


In [None]:
'''
Block : Show logits after classification head
'''
# Apply the model's classification head directly to the last hidden states, without gradient tracking.
with torch.no_grad():
    logits = model.classifier(last_hidden_state)
print(logits)

tensor([[-1.7500, -0.7256,  2.6119]], device='cuda:0')


In [None]:
'''
Block : Show final predicts
'''

# Class probabilities of the single example, as a NumPy vector.
probs = torch.softmax(outputs.logits, dim=-1)[0].cpu().numpy()

# Display names for the three class ids.
LABELS = {
    0: "Tiêu cực",
    1: "Trung tính",
    2: "Tích cực"
}

# Most probable class, its display name and its probability.
pred_label_idx = probs.argmax()
pred_label = LABELS.get(pred_label_idx, "Unknown")
confidence = float(probs[pred_label_idx])

for title, value in (
    ("Probabilities: \n\n", probs),
    ("Predicted Label: \n\n", pred_label),
    ("Confidence: \n\n", confidence),
):
    print(title, value, end="\n\n")

Probabilities: 

 [0.01216585 0.03388898 0.9539451 ]

Predicted Label: 

 Tích cực

Confidence: 

 0.9539451003074646



# __IV. CORRECT AND WRONG SAMPLES__

In [None]:
'''
correct sample
'''
# Display names indexed by label id when printing predictions.
targets = ["Negative","Neutral","Positive"]
# Hand-picked sentences that are looked up in the cleaned test split below.
samples = ["sinh viên không tiếp thu kịp cũng như không hiểu gì .",'chưa giỏi chuyên môn cho lắm .',"giảng viên đảm bảo thời gian trên lớp , tạo điều kiện trong quá trình thực hành và thi thực hành ."]

In [None]:
# Read the sentence column once instead of re-reading it on every iteration.
test_sentences = testDataset["sentence"]

# Hand-picked samples: show the predicted label next to the reference label.
for sample in samples:
    predicted_name = targets[tagger(sample)[0]['label']]
    expected_name = targets[true_labels[test_sentences.index(sample)]]
    print(f"Câu mẫu {sample} model dự đoán nhãn là {predicted_name} và nhãn đúng là {expected_name}  ")

# First 7 test sentences the model classifies correctly. Iterating over the
# split itself ends cleanly if fewer than 7 exist, rather than indexing past
# the last sentence.
shown = 0
for sentence, true_label in zip(test_sentences, true_labels):
    if shown >= 7:
        break
    if tagger(sentence)[0]['label'] == true_label:
        label_name = targets[true_label]
        print(f"Câu mẫu {sentence} model dự đoán nhãn là {label_name} và nhãn đúng là {label_name}  ")
        shown += 1

Câu mẫu sinh viên không tiếp thu kịp cũng như không hiểu gì . model dự đoán nhãn là Negative và nhãn đúng là Negative  
Câu mẫu chưa giỏi chuyên môn cho lắm . model dự đoán nhãn là Negative và nhãn đúng là Negative  
Câu mẫu giảng viên đảm bảo thời gian trên lớp , tạo điều kiện trong quá trình thực hành và thi thực hành . model dự đoán nhãn là Positive và nhãn đúng là Positive  
Câu mẫu giáo viên rất vui tính . model dự đoán nhãn là Positive và nhãn đúng là Positive  
Câu mẫu cô max có tâm . model dự đoán nhãn là Positive và nhãn đúng là Positive  
Câu mẫu giáo viên không giảng dạy kiến thức , hướng dẫn thực hành trong quá trình học . model dự đoán nhãn là Negative và nhãn đúng là Negative  


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Câu mẫu thầy dạy nhiệt tình và tâm huyết . model dự đoán nhãn là Positive và nhãn đúng là Positive  
Câu mẫu thầy nhiệt tình giảng lại cho học sinh . model dự đoán nhãn là Positive và nhãn đúng là Positive  
Câu mẫu có đôi lúc nói hơi nhanh làm sinh viên không theo kịp . model dự đoán nhãn là Negative và nhãn đúng là Negative  
Câu mẫu giảng dạy nhiệt tình , liên hệ thực tế khá nhiều , tương tác với sinh viên tương đối tốt . model dự đoán nhãn là Positive và nhãn đúng là Positive  


In [None]:
'''
wrong sample
'''
# Hand-picked sentences for the misclassification listing below.
samples = ["chô tiếng việt giống như là copy nguyên văn từ gogle dịch vậy .",'không có điều gì không hài lòng .',"cô cho tài liệu học tập là một trang web , lên đó tự học và làm đồ án ."]

In [None]:
# Read the sentence column once instead of re-reading it on every iteration.
test_sentences = testDataset["sentence"]

# Hand-picked samples: show the predicted label next to the reference label.
for sample in samples:
    predicted_name = targets[tagger(sample)[0]['label']]
    expected_name = targets[true_labels[test_sentences.index(sample)]]
    print(f"Câu mẫu {sample} model dự đoán nhãn là {predicted_name} và nhãn đúng là {expected_name}  ")

# First 10 test sentences the model gets wrong. Each sentence is classified
# once and the result reused; iterating over the split itself ends cleanly if
# fewer than 10 exist.
shown = 0
for sentence, true_label in zip(test_sentences, true_labels):
    if shown >= 10:
        break
    predicted_label = tagger(sentence)[0]['label']
    if predicted_label != true_label:
        print(f"Câu mẫu {sentence} model dự đoán nhãn là {targets[predicted_label]} và nhãn đúng là {targets[true_label]}  ")
        shown += 1

Câu mẫu chô tiếng việt giống như là copy nguyên văn từ gogle dịch vậy . model dự đoán nhãn là Negative và nhãn đúng là Neutral  
Câu mẫu không có điều gì không hài lòng . model dự đoán nhãn là Neutral và nhãn đúng là Positive  
Câu mẫu cô cho tài liệu học tập là một trang web , lên đó tự học và làm đồ án . model dự đoán nhãn là Neutral và nhãn đúng là Negative  
Câu mẫu nói tiếng anh lưu loát . model dự đoán nhãn là Neutral và nhãn đúng là Positive  
Câu mẫu giảng bài thu hút , dí dỏm . model dự đoán nhãn là Negative và nhãn đúng là Positive  
Câu mẫu tính điểm thi đua các nhóm . model dự đoán nhãn là Neutral và nhãn đúng là Positive  
Câu mẫu trong trường macbok thầy số hai thì không có máy nào số một . model dự đoán nhãn là Negative và nhãn đúng là Positive  
Câu mẫu bắt đầu buổi học đúng giờ . model dự đoán nhãn là Negative và nhãn đúng là Positive  
Câu mẫu giữa lý thuyết từ vựng với trò chơi để dễ tiếp thu . model dự đoán nhãn là Positive và nhãn đúng là Negative  
Câu mẫu cung cấ

# __IV. MACHINE LEARNING METHOD__

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import xgboost as xgb

In [None]:
'''
Block : Load dataset splits
'''

# Reload the raw splits, replacing the tokenized versions used for fine-tuning.
trainDataset, evalDataset, testDataset = (
    load_dataset("uitnlp/vietnamese_students_feedback", split=split_name)
    for split_name in ("train", "validation", "test")
)

In [None]:
'''
Block : Preprocess sentences in dataset splits
'''

# Apply `clean_text` example by example to each reloaded split, in train / eval / test order.
trainDataset, evalDataset, testDataset = (
    split.map(clean_text, batched=False)
    for split in (trainDataset, evalDataset, testDataset)
)

In [None]:
'''
Block : Oversample train split
'''

# Balance the reloaded training split with the same seed as before; the other splits are left as loaded.
trainDataset = oversample_dataset(trainDataset, seed = SEED)

In [None]:
'''
Transform words into vectors of numbers by using TF-IDF
'''

# TF-IDF over unigrams and bigrams, capped at 30000 features; terms seen in
# fewer than 3 documents or in more than 90% of documents are dropped.
vectorizer = TfidfVectorizer(
    lowercase=True,
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.9,
    max_features=30000,
)

# The vocabulary is learned on the training sentences only ...
X_train_tfidf = vectorizer.fit_transform(trainDataset['sentence'])

# ... and then reused unchanged for the validation and test sentences.
X_val_tfidf, X_test_tfidf = (
    vectorizer.transform(split['sentence']) for split in (evalDataset, testDataset)
)

In [None]:
'''
LOGISTIC REGRESSION
'''
# Class-weighted logistic regression on the TF-IDF features.
clf = LogisticRegression(
    solver='lbfgs',
    class_weight='balanced',
    max_iter=1000,
    n_jobs=-1,
)
# Last expression of the cell: the fitted estimator is displayed.
clf.fit(X_train_tfidf, trainDataset['sentiment'])



0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",1.0
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",'balanced'
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. 
seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'lbfgs'


In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the fitted `clf` on the test split.
print(
    classification_report(
        y_true=testDataset['sentiment'],
        y_pred=clf.predict(X_test_tfidf),
    )
)

              precision    recall  f1-score   support

           0       0.87      0.92      0.89      1409
           1       0.34      0.44      0.38       167
           2       0.95      0.87      0.91      1590

    accuracy                           0.87      3166
   macro avg       0.72      0.74      0.73      3166
weighted avg       0.88      0.87      0.87      3166



In [None]:
'''
SVM
'''
# No kernel is passed, so SVC falls back to its default kernel.
# class_weight='balanced' reweights classes inversely to their frequency in
# the training labels.
svm = SVC(C=1.0, class_weight='balanced')

# fit() is kept as the cell's final expression so the notebook still renders
# the fitted estimator.
svm.fit(X=X_train_tfidf, y=trainDataset['sentiment'])

0,1,2
,"C  C: float, default=1.0 Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty. For an intuitive visualization of the effects of scaling the regularization parameter C, see :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`.",1.0
,"kernel  kernel: {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, default='rbf' Specifies the kernel type to be used in the algorithm. If none is given, 'rbf' will be used. If a callable is given it is used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape ``(n_samples, n_samples)``. For an intuitive visualization of different kernel types see :ref:`sphx_glr_auto_examples_svm_plot_svm_kernels.py`.",'rbf'
,"degree  degree: int, default=3 Degree of the polynomial kernel function ('poly'). Must be non-negative. Ignored by all other kernels.",3
,"gamma  gamma: {'scale', 'auto'} or float, default='scale' Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. - if ``gamma='scale'`` (default) is passed then it uses  1 / (n_features * X.var()) as value of gamma, - if 'auto', uses 1 / n_features - if float, must be non-negative. .. versionchanged:: 0.22  The default value of ``gamma`` changed from 'auto' to 'scale'.",'scale'
,"coef0  coef0: float, default=0.0 Independent term in kernel function. It is only significant in 'poly' and 'sigmoid'.",0.0
,"shrinking  shrinking: bool, default=True Whether to use the shrinking heuristic. See the :ref:`User Guide `.",True
,"probability  probability: bool, default=False Whether to enable probability estimates. This must be enabled prior to calling `fit`, will slow down that method as it internally uses 5-fold cross-validation, and `predict_proba` may be inconsistent with `predict`. Read more in the :ref:`User Guide `.",False
,"tol  tol: float, default=1e-3 Tolerance for stopping criterion.",0.001
,"cache_size  cache_size: float, default=200 Specify the size of the kernel cache (in MB).",200
,"class_weight  class_weight: dict or 'balanced', default=None Set the parameter C of class i to class_weight[i]*C for SVC. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``.",'balanced'


In [None]:
# Per-class precision / recall / F1 of the fitted `svm` on the test split.
print(
    classification_report(
        y_true=testDataset['sentiment'],
        y_pred=svm.predict(X_test_tfidf),
    )
)

              precision    recall  f1-score   support

           0       0.86      0.97      0.91      1409
           1       0.64      0.20      0.31       167
           2       0.94      0.90      0.92      1590

    accuracy                           0.89      3166
   macro avg       0.81      0.69      0.71      3166
weighted avg       0.89      0.89      0.88      3166



In [None]:
'''
XGBOOST
'''
# Multi-class gradient-boosted trees; num_class=3 matches the three
# sentiment labels (0, 1, 2) that appear in the evaluation reports.
xgb_clf = xgb.XGBClassifier(
    objective="multi:softprob",
    num_class=3,
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    tree_method='hist',
)

# fit() is kept as the cell's final expression so the notebook still renders
# the fitted estimator.
xgb_clf.fit(X=X_train_tfidf, y=trainDataset['sentiment'])

0,1,2
,"objective  objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType] Specify the learning task and the corresponding learning objective or a custom objective function to be used. For custom objective, see :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more information, along with the end note for function signatures.",'multi:softprob'
,"base_score  base_score: typing.Union[float, typing.List[float], NoneType] The initial prediction score of all instances, global bias.",
,booster,
,"callbacks  callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]] List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. .. note::  States in callback are not preserved during training, which means callback  objects can not be reused for multiple training sessions without  reinitialization or deepcopy. .. code-block:: python  for params in parameters_grid:  # be sure to (re)initialize the callbacks before each run  callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]  reg = xgboost.XGBRegressor(**params, callbacks=callbacks)  reg.fit(X, y)",
,colsample_bylevel  colsample_bylevel: typing.Optional[float] Subsample ratio of columns for each level.,
,colsample_bynode  colsample_bynode: typing.Optional[float] Subsample ratio of columns for each split.,
,colsample_bytree  colsample_bytree: typing.Optional[float] Subsample ratio of columns when constructing each tree.,
,"device  device: typing.Optional[str] .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`.",
,"early_stopping_rounds  early_stopping_rounds: typing.Optional[int] .. versionadded:: 1.6.0 - Activates early stopping. Validation metric needs to improve at least once in  every **early_stopping_rounds** round(s) to continue training. Requires at  least one item in **eval_set** in :py:meth:`fit`. - If early stopping occurs, the model will have two additional attributes:  :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the  :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal  number of trees during inference. If users want to access the full model  (including trees built after early stopping), they can specify the  `iteration_range` in these inference methods. In addition, other utilities  like model plotting can also use the entire model. - If you prefer to discard the trees after `best_iteration`, consider using the  callback function :py:class:`xgboost.callback.EarlyStopping`. - If there's more than one item in **eval_set**, the last entry will be used for  early stopping. If there's more than one metric in **eval_metric**, the last  metric will be used for early stopping.",
,enable_categorical  enable_categorical: bool See the same parameter of :py:class:`DMatrix` for details.,False


In [None]:
# Per-class precision / recall / F1 of the fitted `xgb_clf` on the test split.
print(
    classification_report(
        y_true=testDataset['sentiment'],
        y_pred=xgb_clf.predict(X_test_tfidf),
    )
)

              precision    recall  f1-score   support

           0       0.89      0.89      0.89      1409
           1       0.34      0.57      0.43       167
           2       0.94      0.87      0.90      1590

    accuracy                           0.86      3166
   macro avg       0.72      0.78      0.74      3166
weighted avg       0.88      0.86      0.87      3166

