In [None]:
# This is an example testing original model:
# from transformers import MT5ForConditionalGeneration, T5Tokenizer
# #!pip install sentencepiece

# #import sentencepiece

# model = MT5ForConditionalGeneration.from_pretrained("heack/HeackMT5-ZhSum100k")
# tokenizer = T5Tokenizer.from_pretrained("heack/HeackMT5-ZhSum100k")

# chunk = """太平天国占领区街市没有刻字铺，所有刻字匠人都编入镌刻营，“朝勋詹记”一印应为太平天国镌刻营所出。但它不属于太平天国礼部统一制发的印章。太平天国私人便章实物从无发现，“朝勋詹记”一印的发现，弥足珍贵，它为研究太平天国用印情况及制度提供了第一手重要实物资料。
# """
# inputs = tokenizer.encode("summarize: " + chunk, return_tensors='pt', max_length=4096, truncation=True)
# summary_ids = model.generate(inputs, max_length=512, num_beams=4, length_penalty=0.2, no_repeat_ngram_size=2)
# summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# print(summary)

## Data loading

We use the cleaned data from CSL dataset, and load the evaluation method, Rouge.

In [None]:
from datasets import load_dataset, load_metric, load_from_disk
raw_datasets = load_from_disk("./Paper")
metric = load_metric("rouge")

  metric = load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Each piece of data containing document(Paper abstract), summary(Paper title) and id, like the example below:

In [None]:
raw_datasets["test"][0]

{'document': '双官能团活性艳蓝GN和RN在固色浴中凝聚性小、骤染性小、匀染性好,且吸尽率和固色率高、提升性和重现性好,较好地克服了常用单乙烯砜型活性艳蓝(C.I.B-19)的性能缺陷.该染料最适合70℃染色,与嫩黄Y-160或翠蓝B-21配伍拼染艳绿色或艳蓝色,可以大幅提高染色一等品率.',
 'summary': '双官能团活性艳蓝的应用性能',
 'id': 0}

## Data Tokenize

Before we input the data, we should use tokenizer to transform the natural language to vector. We use the tokenizer from pretrained model MT5

In [3]:
import torch

from transformers import MT5ForConditionalGeneration, T5Tokenizer

model_name = "heack/HeackMT5-ZhSum100k"
model = MT5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


To ensure all the data are computed in the same device, we assign the device:

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 47.45 GiB of which 7.69 MiB is free. Process 4002836 has 37.81 GiB memory in use. Process 4002839 has 8.19 GiB memory in use. Including non-PyTorch memory, this process has 1.42 GiB memory in use. Of the allocated memory 1.23 GiB is allocated by PyTorch, and 38.12 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

Before tokenizing, we also need to decide the max input and output length. Given that the title and abstract are not to long, I decide to use small length.

In [None]:
max_input_length = 256
max_target_length = 32

def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["document"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=max_target_length, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

## Train

We can now start the training. First we import the necessary packages and set the hyperparameter within the training. 
You can set the batch_size and num_train_epochs here, I set the epoch in 1, 2, and 5 for comparison

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

batch_size = 49
args = Seq2SeqTrainingArguments(
    "test-summarization",
    evaluation_strategy = "epoch",
    learning_rate=3e-3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True,
    generation_max_length=max_target_length,
    logging_dir='./logs',  # Set the logging directory
    logging_steps=100,  # Log every 100 steps
    disable_tqdm=False,  # Ensure the progress bar and logging are enabled
)




We also need a data collator to input the raw data into our model:

In [5]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

NameError: name 'DataCollatorForSeq2Seq' is not defined

After training, we set the evaluation method.

In [104]:
import nltk
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    print("reference summary: ", decoded_labels[0])
    print("generated summary: ", decoded_preds[0])
    
    predictions_tensor = torch.tensor(predictions)
    generated_preds = []
    for input_ids in predictions_tensor:
        input_ids = input_ids.to(device)
    
        generated_summary_ids = model.generate(
            input_ids.unsqueeze(0),  
            max_length=max_target_length,
            num_beams=4,
            length_penalty=0.4,
            no_repeat_ngram_size=2
        )
        generated_summary = tokenizer.decode(generated_summary_ids[0], skip_special_tokens=True)
        generated_preds.append(generated_summary)
    
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

Here is the final step before our training: send all the parameter into trainer.

In [105]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)



Each epoch the trainer will evaluate the model and output the score and I also set the trainer to output the first generation example.

In [106]:
output = trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.6427,1.282349,4.7967,0.75,4.7933,4.7417,12.952
2,0.6993,1.263666,5.4984,0.8733,5.5082,5.5082,14.112
3,0.3632,1.364481,6.2879,0.9667,6.2998,6.2226,13.948
4,0.151,1.503404,6.2673,1.4,6.2973,6.2377,14.43
5,0.059,1.731158,5.9945,1.0833,6.0229,5.9587,14.63


reference summary:  一种基于平方和优化的飞行器大角度机动镇定控制器设计方法
generated summary:  基于平方/M的飞行器姿态控制


Non-default generation parameters: {'max_length': 84, 'num_beams': 4, 'length_penalty': 0.6}


reference summary:  一种基于平方和优化的飞行器大角度机动镇定控制器设计方法
generated summary:  基于层次分析的飞行器纵向控制
reference summary:  一种基于平方和优化的飞行器大角度机动镇定控制器设计方法
generated summary:  基于修正的飞行器姿态控制系统建模与仿真


Non-default generation parameters: {'max_length': 84, 'num_beams': 4, 'length_penalty': 0.6}


reference summary:  一种基于平方和优化的飞行器大角度机动镇定控制器设计方法
generated summary:  基于修正的飞行器姿态控制系统建模与仿真


Non-default generation parameters: {'max_length': 84, 'num_beams': 4, 'length_penalty': 0.6}


reference summary:  一种基于平方和优化的飞行器大角度机动镇定控制器设计方法
generated summary:  基于修正的飞行器姿态控制


Remember to save the model after training

In [107]:
trainer.save_model("finetune5")


Non-default generation parameters: {'max_length': 84, 'num_beams': 4, 'length_penalty': 0.6}


## Comparison

I want to compare the performance between original and finetuned model.

First let us see how to evaluate a single model using test dataset:

In [108]:
from transformers import MT5ForConditionalGeneration, T5Tokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
import torch

batch_size = 48
max_target_length = 32

# Load the original model and tokenizer
original_model = MT5ForConditionalGeneration.from_pretrained("heack/HeackMT5-ZhSum100k")
tokenizer = T5Tokenizer.from_pretrained("heack/HeackMT5-ZhSum100k")

# Select the training device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
original_model.to(device)

tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=original_model)

# Create training arguments
ori_training_args = Seq2SeqTrainingArguments(
    "test-summarization",
    evaluation_strategy = "epoch",
    learning_rate=3e-3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    generation_max_length=max_target_length,
)

# Ensure raw_datasets["test"] is not empty
if raw_datasets["test"] is None:
    raise ValueError("The validation dataset is None. Please provide a valid dataset.")

# Create Seq2SeqTrainer
ori_trainer = Seq2SeqTrainer(
    model=original_model,
    args=ori_training_args,
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Evaluate the original model
results = ori_trainer.evaluate()

print("Evaluation Results for Original Model:")
print(results)

# # Print some example results

# val_dataset = raw_datasets["validation"]
# if val_dataset is not None:
#     for i in range(5):
#         inputs = tokenizer("summarize: " + val_dataset[i]["document"], return_tensors='pt', max_length=4096, truncation=True).to(device)
#         summary_ids = original_model.generate(inputs.input_ids, max_length=512, num_beams=4, length_penalty=0.2, no_repeat_ngram_size=2)
#         generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
#         reference_summary = val_dataset[i]["summary"]

#         print(f"Reference: {reference_summary}")
#         print(f"Generated: {generated_summary}")
#         print("\n")
# else:
#     raise ValueError("The validation dataset is None. Please provide a valid dataset.")

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]



reference summary:  双官能团活性艳蓝的应用性能
generated summary:  双官能团活性艳蓝在固色浴中发挥重要作用
Evaluation Results for Original Model:
{'eval_loss': 1.2633886337280273, 'eval_rouge1': 7.5571, 'eval_rouge2': 1.1767, 'eval_rougeL': 7.5714, 'eval_rougeLsum': 7.6779, 'eval_gen_len': 15.773, 'eval_runtime': 341.6383, 'eval_samples_per_second': 2.927, 'eval_steps_per_second': 0.061}


And similar to the code above, we evaluate the score among original and finetuned models in 1, 2 and 5 epochs at the same time.

In [109]:
from transformers import MT5ForConditionalGeneration, T5Tokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
import torch
import nltk
import numpy as np
from datasets import load_metric

# Load evaluation metric
rouge = load_metric("rouge")

# Load the original model and tokenizer
original_model = MT5ForConditionalGeneration.from_pretrained("heack/HeackMT5-ZhSum100k")
tokenizer = T5Tokenizer.from_pretrained("heack/HeackMT5-ZhSum100k")

# Load fine-tuned models
finetuned_model = MT5ForConditionalGeneration.from_pretrained("./finetune1")
finetuned_model2 = MT5ForConditionalGeneration.from_pretrained("./finetune2")
finetuned_model5 = MT5ForConditionalGeneration.from_pretrained("./finetune5")

# Select the training device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
original_model.to(device)
finetuned_model.to(device)
finetuned_model2.to(device)
finetuned_model5.to(device)

# Create training arguments
testing_args = Seq2SeqTrainingArguments(
    "test-summarization",
    evaluation_strategy = "epoch",
    learning_rate=2e-3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    generation_max_length=max_target_length,
    logging_dir='./logs',  # Set the logging directory
    logging_steps=10,  # Log every 10 steps
    disable_tqdm=False,  # Ensure the progress bar and logging are enabled
)

# Ensure raw_datasets["test"] is not empty
if raw_datasets["test"] is None:
    raise ValueError("The test dataset is None. Please provide a valid dataset.")

# Create Seq2SeqTrainer instances to evaluate the original model
original_trainer = Seq2SeqTrainer(
    model=original_model,
    args=testing_args,
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Create Seq2SeqTrainer instances to evaluate the fine-tuned model
finetuned_trainer = Seq2SeqTrainer(
    model=finetuned_model,
    args=testing_args,
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Create Seq2SeqTrainer instances to evaluate the fine-tuned2 model
finetuned2_trainer = Seq2SeqTrainer(
    model=finetuned_model2,
    args=testing_args,
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Create Seq2SeqTrainer instances to evaluate the fine-tuned5 model
finetuned5_trainer = Seq2SeqTrainer(
    model=finetuned_model5,
    args=testing_args,
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Evaluate the original model
original_results = original_trainer.evaluate()

# Evaluate the fine-tuned models
finetuned_results = finetuned_trainer.evaluate()
finetuned2_results = finetuned2_trainer.evaluate()
finetuned5_results = finetuned5_trainer.evaluate()

# Print evaluation results
print("Evaluation Results for Original Model:")
print(original_results)
print("Evaluation Results for Fine-tuned Model after 1 epoch:")
print(finetuned_results)
print("Evaluation Results for Fine-tuned Model after 2 epoch:")
print(finetuned2_results)
print("Evaluation Results for Fine-tuned Model after 5 epoch:")
print(finetuned5_results)



You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


reference summary:  双官能团活性艳蓝的应用性能
generated summary:  双官能团活性艳蓝在固色浴中发挥重要作用


reference summary:  双官能团活性艳蓝的应用性能
generated summary:  双官能团活性艳蓝芪染料的染色


reference summary:  双官能团活性艳蓝的应用性能
generated summary:  双官能团活性艳蓝的染色


reference summary:  双官能团活性艳蓝的应用性能
generated summary:  双官能团活性艳蓝领及林业在紫菜中的应用
Evaluation Results for Original Model on Test Set:
{'eval_loss': 1.2633886337280273, 'eval_rouge1': 7.5571, 'eval_rouge2': 1.1767, 'eval_rougeL': 7.5714, 'eval_rougeLsum': 7.6779, 'eval_gen_len': 15.773, 'eval_runtime': 431.7017, 'eval_samples_per_second': 2.316, 'eval_steps_per_second': 0.049}
Evaluation Results for Fine-tuned Model on Test Set:
{'eval_loss': 1.0314708948135376, 'eval_rouge1': 7.4925, 'eval_rouge2': 0.7, 'eval_rougeL': 7.4099, 'eval_rougeLsum': 7.4257, 'eval_gen_len': 14.044, 'eval_runtime': 325.5001, 'eval_samples_per_second': 3.072, 'eval_steps_per_second': 0.065}
Evaluation Results for Fine-tuned Model2 on Test Set:
{'eval_loss': 1.1142152547836304, 'eval_rouge1': 7.8213, 'eval_rouge2': 0.69, 'eval_rougeL': 7.7348, 'eval_rougeLsum': 7.8231, 'eval_gen_len': 13.999, 'eval_runtime': 432.3147, 'eval_samples_per_second': 2.313, 'eval_steps_per_second': 0.049}
Evaluation Results for Fine-tuned Mode