In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('/content/data_train_vn.csv')

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42
)

label_to_id = {label: idx for idx, label in enumerate(set(train_labels))}
train_labels = [label_to_id[label] for label in train_labels]
val_labels = [label_to_id[label] for label in val_labels]
id_to_label = {v: k for k, v in label_to_id.items()}


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)


In [None]:
import torch
from torch.utils.data import Dataset

class IntentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = IntentDataset(train_encodings, train_labels)
val_dataset = IntentDataset(val_encodings, val_labels)


In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("vinai/phobert-base", num_labels=len(label_to_id))


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=12,
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()




model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss
1,No log,1.07615
2,No log,0.796899
3,No log,0.57174
4,No log,0.39803
5,No log,0.271024
6,No log,0.173977
7,No log,0.135278
8,No log,0.110725
9,No log,0.08562
10,No log,0.069714


TrainOutput(global_step=192, training_loss=0.4206454356511434, metrics={'train_runtime': 972.609, 'train_samples_per_second': 1.505, 'train_steps_per_second': 0.197, 'total_flos': 13542240306816.0, 'train_loss': 0.4206454356511434, 'epoch': 12.0})

In [None]:
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)


Evaluation results: {'eval_loss': 0.06690804660320282, 'eval_runtime': 1.9277, 'eval_samples_per_second': 16.081, 'eval_steps_per_second': 2.075, 'epoch': 12.0}


In [None]:
model.save_pretrained('./intent_classifier')
tokenizer.save_pretrained('./intent_classifier')


('./intent_classifier/tokenizer_config.json',
 './intent_classifier/special_tokens_map.json',
 './intent_classifier/vocab.txt',
 './intent_classifier/bpe.codes',
 './intent_classifier/added_tokens.json')

In [None]:
import torch

def predict_intent(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    outputs = model(**inputs)
    predicted_class = torch.argmax(outputs.logits, dim=1).item()
    return id_to_label[predicted_class]

print(predict_intent("Can you tell me the price of this product?"))
print(predict_intent("Hello, I need some assistance!"))
print(predict_intent("Show me items under $50."))
print(predict_intent("Could you describe this product?"))
print(predict_intent("What is the status of my order?"))


description
greeting
price range inquiry
greeting
price range inquiry


In [None]:

def predict_intent(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    outputs = model(**inputs)
    predicted_class = torch.argmax(outputs.logits, dim=1).item()
    probabilities = torch.softmax(outputs.logits, dim=1)
    confidence = probabilities[0][predicted_class].item()
    return id_to_label[predicted_class], confidence


('description', 0.414120614528656)
('greeting', 0.9647712707519531)
('price range inquiry', 0.901065468788147)
('greeting', 0.5367602705955505)
('greeting', 0.9039254188537598)


In [None]:

from google.colab import files
import shutil

shutil.make_archive('intent_classifier', 'zip', './intent_classifier')

files.download('intent_classifier.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Enter a text (or type 'exit' to quit): Cho tôi thông tin sản phẩm
Intent: price range inquiry, Confidence: 0.5844980478286743
Enter a text (or type 'exit' to quit): Xin chào bạn
Intent: greeting, Confidence: 0.864778995513916
Enter a text (or type 'exit' to quit): Xin chào
Intent: greeting, Confidence: 0.9225947856903076
Enter a text (or type 'exit' to quit): Gía của sản phẩm trên là bao nhiêu
Intent: description, Confidence: 0.7782225608825684
Enter a text (or type 'exit' to quit): Sản phẩm nào dưới 200k
Intent: price range inquiry, Confidence: 0.944764256477356
Enter a text (or type 'exit' to quit): Sản phẩm đó có giá bao nhiêu tiền
Intent: price inquiry, Confidence: 0.927739143371582
Enter a text (or type 'exit' to quit): Thông tin của sản phẩm đó
Intent: description, Confidence: 0.9290415644645691
Enter a text (or type 'exit' to quit): Hello
Intent: greeting, Confidence: 0.9565747976303101
Enter a text (or type 'exit' to quit): Hello
Intent: greeting, Confidence: 0.9565747976303101

KeyboardInterrupt: Interrupted by user

In [36]:
for label_id in range(4):
    print(f"Label ID: {label_id}, Intent: {id_to_label[label_id]}")

Label ID: 0, Intent: price range inquiry
Label ID: 1, Intent: price inquiry
Label ID: 2, Intent: greeting
Label ID: 3, Intent: description
