In [None]:
!pip install transformers evaluate datasets rouge_score -q

In [2]:
# Import essential libraries
import os
import tensorflow as tf
import pandas as pd
import numpy as np
import warnings
from datasets import Dataset
from transformers import (
    TFAutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
import evaluate

In [43]:
# --- Runtime and training settings -------------------------------------------
num_workers = os.cpu_count()
epochs = 7
learning_rate = 2e-5

# --- Pretrained checkpoint ----------------------------------------------------
model_name = "vinai/bartpho-word"

# Silence all Python warnings from here on (applies to every category).
warnings.filterwarnings("ignore")

# Instantiate the tokenizer and the TensorFlow seq2seq model from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name)


All model checkpoint layers were used when initializing TFMBartForConditionalGeneration.

Some layers of TFMBartForConditionalGeneration were not initialized from the model checkpoint at vinai/bartpho-word and are newly initialized: ['final_logits_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
out_dir = '/kaggle/working'  # Output directory
train_path = '/kaggle/input/vietnews-dataset/train.csv'  # Path to training data
valid_path = '/kaggle/input/vietnews-dataset/valid.csv'  # Path to validation data
test_path = '/kaggle/input/vietnews-dataset/test.csv'  # Path to test data

# Read each split from disk so this cell works on a fresh kernel instead of
# depending on DataFrames left behind by an earlier, no-longer-present cell.
# NOTE(review): assumes comma-separated files with a header row that includes
# "Content" and "Abstract" -- confirm against the dataset.
train_df = pd.read_csv(train_path)
valid_df = pd.read_csv(valid_path)
test_df = pd.read_csv(test_path)

# The tokenizer needs plain strings; cast both text columns in every split.
# Caveat: astype(str) turns missing values into the literal string "nan".
for split_df in (train_df, valid_df, test_df):
    for text_col in ("Content", "Abstract"):
        split_df[text_col] = split_df[text_col].astype(str)

# Wrap the pandas frames as Hugging Face datasets.
train_ds = Dataset.from_pandas(train_df)
valid_ds = Dataset.from_pandas(valid_df)
test_ds = Dataset.from_pandas(test_df)

# Preprocess function to tokenize Vietnamese inputs and outputs
def preprocessing(examples, max_input_length=1024, max_target_length=1024, ignore_pad_token_for_loss=True):
    """
    Tokenize a batch of articles ("Content") and their summaries ("Abstract").

    Both sides are truncated and padded to a fixed length so every example has
    the same shape, which the downstream batching code relies on.

    Args:
    - examples: Batch mapping with "Content" and "Abstract" lists of strings
      (the form `Dataset.map(..., batched=True)` passes in).
    - max_input_length: Fixed token length of the encoded articles.
    - max_target_length: Fixed token length of the encoded summaries.
    - ignore_pad_token_for_loss: When True, padded label positions are set to
      -100 so the loss skips them instead of rewarding the model for emitting
      padding. Anything that decodes labels back to text must map -100 back to
      the pad token id first.

    Returns:
    - The tokenizer output for the articles with an added "labels" entry.
    """
    model_inputs = tokenizer(
        list(examples["Content"]),
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
    )

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    targets = tokenizer(
        text_target=list(examples["Abstract"]),
        max_length=max_target_length,
        truncation=True,
        padding="max_length",
    )

    label_ids = targets["input_ids"]
    if ignore_pad_token_for_loss:
        pad_id = tokenizer.pad_token_id
        label_ids = [
            [-100 if token_id == pad_id else token_id for token_id in sequence]
            for sequence in label_ids
        ]

    model_inputs["labels"] = label_ids
    return model_inputs

# Run the batched tokenization over train, validation and test (in that order).
tokenized_train, tokenized_valid, tokenized_test = (
    split.map(preprocessing, batched=True)
    for split in (train_ds, valid_ds, test_ds)
)


Map:   0%|          | 0/105418 [00:00<?, ? examples/s]

Map:   0%|          | 0/22642 [00:00<?, ? examples/s]

Map:   0%|          | 0/22644 [00:00<?, ? examples/s]

In [45]:
# Display the tokenized training split to check its columns and row count.
tokenized_train

Dataset({
    features: ['Filename', 'Title', 'Abstract', 'Content', 'Keyword', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 105418
})

In [47]:
def _collate_batch(samples):
    """
    Stack a list of per-example dicts into one dict of int32 arrays.

    Every example was padded to a fixed length during tokenization, so plain
    stacking is enough; a length mismatch raises instead of being silently
    padded with an arbitrary id.
    """
    return {
        key: np.asarray([sample[key] for sample in samples], dtype=np.int32)
        for key in ("input_ids", "attention_mask", "labels")
    }


def _build_tf_dataset(split, shuffle, batch_size=16):
    """Convert a tokenized split into a batched tf.data.Dataset of (features, labels)."""
    return split.to_tf_dataset(
        columns=['input_ids', 'attention_mask'],  # Input features
        label_cols='labels',                      # Label column
        shuffle=shuffle,
        batch_size=batch_size,
        collate_fn=_collate_batch,
    )


# Shuffle only the training data; validation order does not affect the metrics.
tf_train_dataset = _build_tf_dataset(tokenized_train, shuffle=True)
tf_valid_dataset = _build_tf_dataset(tokenized_valid, shuffle=False)

# Verify a batch
for inputs, labels in tf_train_dataset.take(1):
    print("Input IDs shape:", inputs['input_ids'].shape)
    print("Attention Mask shape:", inputs['attention_mask'].shape)
    print("Labels shape:", labels.shape)


Input IDs shape: (16, 1024)
Attention Mask shape: (16, 1024)
Labels shape: (16, 1024)


In [None]:
max_len = 512


def _truncate_batch(features, labels):
    """Keep only the first `max_len` token positions of every sequence in a batch."""
    return (
        {
            'input_ids': features['input_ids'][:, :max_len],
            'attention_mask': features['attention_mask'][:, :max_len],
        },
        labels[:, :max_len],
    )


# Shrink the sequences to reduce memory use; the same cap is applied to the
# validation data so both splits feed the model equally shaped batches.
tf_train_dataset = tf_train_dataset.map(_truncate_batch)
tf_valid_dataset = tf_valid_dataset.map(_truncate_batch)

# No model.fit here: the model is only compiled in the training cell further
# down, and the datasets are already batched, so batching them again would add
# an extra leading dimension the model cannot consume.


In [53]:
from transformers import create_optimizer

# Must equal the batch size used when tf_train_dataset was built (16).
batch_size = 16

# The final partial batch is kept when batching, so round up; otherwise the
# learning-rate schedule finishes before the last steps of training.
steps_per_epoch = -(-len(tokenized_train) // batch_size)
total_training_steps = steps_per_epoch * epochs

# Optimizer with a decay schedule spanning the whole run (no warm-up).
optimizer, schedule = create_optimizer(
    init_lr=learning_rate,
    num_train_steps=total_training_steps,
    num_warmup_steps=0
)

# Compile WITHOUT an explicit loss so the model's built-in loss is used.
# NOTE(review): with a custom Keras loss the labels exist only as the `y` target
# and are never handed to the model, so it cannot derive decoder inputs from
# them; the built-in loss path feeds the labels into the model -- confirm
# against the installed transformers version.
model.compile(optimizer=optimizer)
model.summary()

# Train the model
model.fit(
    tf_train_dataset,
    validation_data=tf_valid_dataset,
    epochs=epochs
)

Model: "tfm_bart_for_conditional_generation_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model (TFMBartMainLayer)    multiple                  420361216 
                                                                 
 final_logits_bias (BiasLay  multiple                  64001     
 er)                                                             
                                                                 
Total params: 420425217 (1.57 GB)
Trainable params: 420361216 (1.57 GB)
Non-trainable params: 64001 (250.00 KB)
_________________________________________________________________


In [26]:
def summarize_text(texts, max_length=150, min_length=40, num_beams=4, length_penalty=2.0, max_input_length=1024):
    """
    Generate summaries for one or more input texts using the trained model.

    Args:
    - texts: A string or a list of strings containing the texts to summarize.
    - max_length: Maximum length of each generated summary (in tokens).
    - min_length: Minimum length of each generated summary (in tokens).
    - num_beams: Beam width used for beam search.
    - length_penalty: Length penalty applied during beam search.
    - max_input_length: Inputs longer than this many tokens are truncated.

    Returns:
    - List of generated summaries, one per input text (empty list for empty input).

    NOTE(review): the sample output shows underscore-joined words, which suggests
    the checkpoint works on word-segmented text; confirm whether raw input should
    be segmented before calling this function.
    """
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        # Nothing to summarize; avoid calling the tokenizer with an empty batch.
        return []

    # Pad only to the longest text in this batch rather than to the maximum
    # length, so short inputs do not pay for 1024-token generation.
    inputs = tokenizer(
        texts,
        max_length=max_input_length,
        truncation=True,
        padding=True,
        return_tensors="tf",
    )

    # Beam-search generation
    summaries = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        min_length=min_length,
        length_penalty=length_penalty,
        num_beams=num_beams,
        early_stopping=True,
    )

    # Decode token ids back to text
    return [
        tokenizer.decode(sequence, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for sequence in summaries
    ]

# Demo: summarize a single Vietnamese news snippet and print every result.
sample_article = "Giá vàng sẽ tiếp tục tăng trong dài hạn, một phần do tình hình tài chính bấp bênh của nhiều quốc gia phương Tây. Hiệp hội Thị trường Vàng London (LBMA) dự đoán, giá vàng có thể tăng lên 2.941 USD/ounce trong 12 tháng tới."
input_texts = [sample_article]

summarized_texts = summarize_text(input_texts)

# Number the summaries starting from 1 for display.
for i, summary in enumerate(summarized_texts):
    print(f"Summary {i+1}: {summary}")


Summary 1: Giá vàng sẽ tiếp_tục tăng trong dài hạn, một phần do tình_hình tài_chính bấp_bênh của nhiều quốc_gia phương Tây( * ) : 1 - 1 - 1 - 1 - 1 - 1 - 1


----