In [2]:
from datasets import DatasetDict, Dataset
import pandas as pd

# Raw Vietnamese category names -> binary class ids used as training labels.
LABEL_MAP = {'Kinh tế': 1, 'Không phải kinh tế': 0}


def load_split(csv_path):
    """Read one CSV split and return a frame with `text` and `label` columns.

    The 'Title', 'Link' and 'Publish Date' columns are dropped, 'Contents' is
    renamed to 'text', and 'CatName' is renamed to 'label' and mapped through
    LABEL_MAP. Any other columns in the file are kept as they are.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        A new pandas DataFrame; nothing is mutated in place.

    Raises:
        ValueError: If a row's category is missing or is not a key of
            LABEL_MAP. `Series.map` would otherwise turn such rows into NaN
            labels without any warning.
    """
    frame = (
        pd.read_csv(csv_path)
        .drop(columns=['Title', 'Link', 'Publish Date'])
        .rename(columns={'Contents': 'text', 'CatName': 'label'})
    )
    mapped = frame['label'].map(LABEL_MAP)
    unmapped = mapped.isna()
    if unmapped.any():
        unknown = frame.loc[unmapped, 'label'].unique().tolist()
        raise ValueError(
            f"{csv_path}: {int(unmapped.sum())} row(s) have a category outside "
            f"{list(LABEL_MAP)}: {unknown}"
        )
    frame['label'] = mapped
    return frame


df_train = load_split('data/data_train.csv')
print(df_train['label'].value_counts())

df_test = load_split('data/data_test.csv')
print(df_test['label'].value_counts())

train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)
dataset = DatasetDict({'train': train_dataset, 'test': test_dataset})
dataset

label
1    800
0    800
Name: count, dtype: int64
label
1    200
0    200
Name: count, dtype: int64


DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 1600
    })
    test: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 400
    })
})

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding

model_name = 'vinai/phobert-base-v2'

# Build the model inputs from the untouched DatasetDict splits instead of
# reassigning train_dataset/test_dataset from themselves. remove_columns
# raises when the column is absent, so the self-referential form failed on a
# second run of this cell; deriving from `dataset` keeps the cell re-runnable.
train_dataset = dataset['train'].remove_columns('id')
test_dataset = dataset['test'].remove_columns('id')

# Tokenizer and a two-class sequence-classification model from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

def preprocess_function(examples):
    """Tokenize the 'text' field of `examples` with the notebook's tokenizer.

    Inputs longer than 256 tokens are truncated. No padding is requested here.

    Args:
        examples: Mapping with a 'text' entry (a batch when used with
            `Dataset.map(..., batched=True)`).

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, max_length=256)
    return encoded

# Tokenize both splits in batched mode: train first, then test.
tokenized_train, tokenized_test = [
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, test_dataset)
]

# Collator built from the same tokenizer; it is handed to the Trainer later.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    """Compute evaluation metrics from a prediction object.

    Args:
        pred: Object exposing `label_ids` (true class ids) and `predictions`
            (per-class scores; the predicted class is the argmax over the
            last axis).

    Returns:
        dict with 'accuracy', 'precision', 'recall' and 'f1'. The last three
        use scikit-learn's default binary averaging, i.e. they describe the
        positive class (label 1).
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # zero_division=0: an epoch in which the model predicts no positives
    # reports 0 instead of emitting an UndefinedMetricWarning.
    return {
      'accuracy': accuracy_score(labels, preds),
      'precision': precision_score(labels, preds, zero_division=0),
      'recall': recall_score(labels, preds, zero_division=0),
      'f1': f1_score(labels, preds, zero_division=0),
    }

training_args = TrainingArguments(
    # Where checkpoints go; also pushed to the Hub.
    output_dir="PhoBert_banking_classification",
    push_to_hub=True,
    report_to='tensorboard',
    # Optimisation settings.
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch of 8 per device accumulated over 4 steps before each update.
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=4,
    eval_accumulation_steps=4,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint.
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

# NOTE(review): the test split doubles as the per-epoch evaluation set, so the
# checkpoint reloaded by load_best_model_at_end is selected on test data.
# Consider holding out a validation split from train for that purpose.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
from sklearn.metrics import classification_report

# Run the trained model over the tokenized test split.
predictions = trainer.predict(tokenized_test)

# True labels, and the predicted class as the highest-scoring index on the last axis.
labels, preds = predictions.label_ids, predictions.predictions.argmax(axis=-1)

# Per-class precision / recall / F1 summary.
print(classification_report(labels, preds))