In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv(r"C:\Users\Admin\OneDrive - VNU-HCMUS\cleaned")
print(df.head())
print(df.shape)

                                                text  label  text_length
0  link imo happening guys jumping conclusions co...      1         2717
1  house passed budget today slim margin democrat...      1         3203
2  youtube video posted january cloudgate studio ...      0         2549
3  marseille france sang danced chanted even reas...      0         8149
4  sunday nbc meet press discussing president don...      0         1235
(68604, 3)


In [None]:
# Kiểm tra dữ liệu rỗng
df.isnull().sum()

text           7
label          0
text_length    0
dtype: int64

In [None]:
# Loại bỏ các hàng có giá trị NaN trong cột 'text'
df = df.dropna(subset=['text'])

In [None]:
# Chia dữ liệu thành 3 phần: train, valid, test
# 70% train, 10% valid, 20% test
train_df = df[:int(0.7 * len(df))]
valid_df = df[int(0.7 * len(df)):int(0.8 * len(df))]
test_df = df[int(0.8 * len(df)):]

# Tokenize data

In [5]:
# Tokenize dữ liệu
from transformers import BertTokenizerFast
from transformers import XLNetTokenizerFast

class Tokenizer:
    def __init__(self, model_name, max_length=256):
        if 'bert' in model_name:
            self.tokenizer = BertTokenizerFast.from_pretrained(model_name)
        elif 'xlnet' in model_name:
            self.tokenizer = XLNetTokenizerFast.from_pretrained(model_name)
        self.max_length = max_length

    def tokenize_function(self, examples):
        return self.tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=self.max_length
        )

from datasets import Dataset

tokenizer = Tokenizer('bert-base-uncased')

# Convert DataFrames to HuggingFace Datasets
train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)
test_dataset = Dataset.from_pandas(test_df)

# Tokenize datasets
train_dataset = train_dataset.map(tokenizer.tokenize_function, batched=True)
valid_dataset = valid_dataset.map(tokenizer.tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenizer.tokenize_function, batched=True)

# Set format for PyTorch
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
valid_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

Map:   0%|          | 0/48017 [00:00<?, ? examples/s]

Map:   0%|          | 0/6860 [00:00<?, ? examples/s]

Map:   0%|          | 0/13720 [00:00<?, ? examples/s]

# Build and Load model

In [6]:
from transformers import BertForSequenceClassification, XLNetForSequenceClassification

class ModelBuilder:
    def __init__(self, num_labels=2):
        self.num_labels = num_labels
        self.model = None

    def build_model(self, model_name):
        if 'bert' in model_name:
            self.model = BertForSequenceClassification.from_pretrained(model_name, num_labels=self.num_labels)
        elif 'xlnet' in model_name:
            self.model = XLNetForSequenceClassification.from_pretrained(model_name, num_labels=self.num_labels)
        return self.model

In [7]:
model_builder = ModelBuilder()
bert_model = model_builder.build_model('bert-base-uncased')
xlnet_model = model_builder.build_model('xlnet-base-cased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Train model

In [8]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model.to(device)
xlnet_model.to(device)

XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0-11): 12 x XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): GELUActivation()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (sequence_summary): SequenceSummary(
    (summary): Linear(in_features=768, out_features=768, bias=True)
    (activation): Tanh()
    (first_dropout): Identity()
    (last

In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds)
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': float(f1),
        'precision': float(precision),
        'recall': float(recall)
    }

actual_tokenizer = tokenizer.tokenizer

data_collator = DataCollatorWithPadding(tokenizer=actual_tokenizer)

training_args = TrainingArguments(
    output_dir='./bert_results',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    dataloader_num_workers=8,
    fp16=True
)

trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=actual_tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()



  0%|          | 0/6003 [00:00<?, ?it/s]

In [None]:
# Evaluate the model
eval_results = trainer.evaluate(eval_dataset=test_dataset)
print(f"Evaluation results: {eval_results}")

  0%|          | 0/1715 [00:00<?, ?it/s]

Evaluation results: {'eval_loss': 0.2187958061695099, 'eval_runtime': 139.296, 'eval_samples_per_second': 98.495, 'eval_steps_per_second': 12.312, 'epoch': 1.0}
