# 数据准备

In [28]:
! pip install datasets transformers rouge-score nltk evaluate sentencepiece



In [29]:
# Checkpoint name used below to load both the tokenizer and the seq2seq model,
# and to decide whether a task prefix is needed.
model_checkpoint = "t5-small"

In [30]:
from transformers.utils import move_cache

# Migrate the model cache
move_cache()

0it [00:00, ?it/s]

In [31]:
from datasets import load_dataset
import evaluate

# Load the XSum dataset (train/validation/test splits with document, summary
# and id columns) and the ROUGE metric used to score generated summaries.
raw_datasets = load_dataset("xsum")
metric = evaluate.load("rouge")

In [32]:
# Show the available splits, their columns and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11334
    })
})

In [33]:
# Look at the first training example.
raw_datasets["train"][0]

 'summary': 'Clean-up operations are continuing across the Scottish Borders and Dumfries and Galloway after flooding caused by Storm Frank.',
 'id': '35232142'}

In [34]:
import datasets
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=2):
    """Display a random sample of distinct rows from ``dataset`` as an HTML table.

    Args:
        dataset: Object supporting ``len()``, indexing with a list of row
            indices, and a ``features`` mapping of column name to feature type.
        num_examples: Number of distinct rows to display.

    Raises:
        AssertionError: If ``num_examples`` exceeds the number of rows.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws distinct indices in a single call, replacing the
    # manual draw-and-retry loop.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            # Show class names instead of integer label ids.
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

In [35]:
# Preview a couple of random training rows (default num_examples=2).
show_random_elements(raw_datasets["train"])

Unnamed: 0,document,summary,id
0,"The university announced back in March it needed to save £10.5m and planned to cut 150 posts.\nMembers of the University and College Union (UCU) Scotland were balloted as a result.\nAfter the ballot closed on Monday, the union said 73% of those who voted backed strike action. And 80% also voted for action short of a strike.\nAndrew MacKillop, Aberdeen UCU representative, said: ""Members have made it quite clear that they reject the job losses proposed by the university.\n""Strike action is always a last resort but we can't sit back and see jobs lost with the accompanying damage to the student experience and the reputation of the university.""\nIn a statement, the university said it was disappointed that the union had vote for strike action ""in the midst of ongoing dialogue"".\nIt added: ""According to the results of the ballot, 263 UCU members voted in favour of strike action, representing 12.5% of our total academic and academic-related workforce.\n""The UCU had asked for assurance that the university would rule out compulsory redundancies as it seeks to make savings of £10.5m.\n""We were unable to give that assurance, although we are working tirelessly to achieve the savings we need through voluntary measures as far as possible, and are pursuing a range of additional options to increase our efficiency as a world-leading university.""",Staff at the University of Aberdeen have backed plans for industrial action in a dispute over planned job losses.,33049482
1,A selection of your pictures of Scotland sent in between 18 and 25 August. Send your photos to scotlandpictures@bbc.co.uk or via Instagram at #bbcscotlandpics,All images are copyrighted.,41047208


In [36]:
# Display the metric object; its repr includes the usage documentation.
metric

EvaluationModule(name: "rouge", module_type: "metric", features: [{'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id=None)}, {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')}], usage: """
Calculates average rouge scores for a list of hypotheses and references
Args:
    predictions: list of predictions to score. Each prediction
        should be a string with tokens separated by spaces.
    references: list of reference for each prediction. Each
        reference should be a string with tokens separated by spaces.
    rouge_types: A list of rouge types to calculate.
        Valid names:
        `"rouge{n}"` (e.g. `"rouge1"`, `"rouge2"`) where: {n} is the n-gram based scoring,
        `"rougeL"`: Longest common subsequence based scoring.
        `"rougeLsum"`: rougeLsum splits text using `"
"`.
        See details in https://github.com/huggingface/

In [37]:
# Sanity check: predictions identical to the references should give a perfect
# score on every ROUGE variant.
fake_preds = ["hello there", "general kenobi"]
fake_labels = ["hello there", "general kenobi"]
metric.compute(predictions=fake_preds, references=fake_labels)

{'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}

In [38]:
from transformers import AutoTokenizer

# Load the tokenizer that matches the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [39]:
# Tokenize a single sentence: returns input_ids and an attention_mask.
tokenizer("Hello, this one sentence!")

{'input_ids': [8774, 6, 48, 80, 7142, 55, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [40]:
# Tokenize a list of sentences: returns one list of ids per sentence.
tokenizer(["Hello, this one sentence!", "This is another sentence."])

{'input_ids': [[8774, 6, 48, 80, 7142, 55, 1], [100, 19, 430, 7142, 5, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}

In [41]:
# Tokenize the sentences as targets via `text_target`; the
# `as_target_tokenizer()` context manager is deprecated.
print(tokenizer(text_target=["Hello, this one sentence!", "This is another sentence."]))

{'input_ids': [[8774, 6, 48, 80, 7142, 55, 1], [100, 19, 430, 7142, 5, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}




In [42]:
# The T5 checkpoints listed here get the "summarize: " task prefix prepended to
# every document; any other checkpoint gets no prefix.
# Fixed: the list previously contained the typo "t5-larg", so "t5-large" never
# received the prefix.
if model_checkpoint in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]:
    prefix = "summarize: "
else:
    prefix = ""

In [43]:
# Truncation limits (in tokens) for the source documents and target summaries.
max_input_length = 1024
max_target_length = 128

def preprocess_function(examples):
    """Tokenize a batch of documents and summaries for seq2seq training.

    Args:
        examples: Batch mapping with "document" and "summary" entries, each a
            list of strings.

    Returns:
        The tokenizer output for the prefixed documents, with the token ids of
        the summaries stored under the "labels" key.
    """
    inputs = [prefix + doc for doc in examples["document"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Tokenize the summaries as targets. `text_target` replaces the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples["summary"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [44]:
# Try the preprocessing on the first two training examples.
preprocess_function(raw_datasets['train'][:2])

{'input_ids': [[21603, 10, 37, 423, 583, 13, 1783, 16, 20126, 16496, 6, 80, 13, 8, 844, 6025, 4161, 6, 19, 341, 271, 14841, 5, 7057, 161, 19, 4912, 16, 1626, 5981, 11, 186, 7540, 16, 1276, 15, 2296, 7, 5718, 2367, 14621, 4161, 57, 4125, 387, 5, 15059, 7, 30, 8, 4653, 4939, 711, 747, 522, 17879, 788, 12, 1783, 44, 8, 15763, 6029, 1813, 9, 7472, 5, 1404, 1623, 11, 5699, 277, 130, 4161, 57, 18368, 16, 20126, 16496, 227, 8, 2473, 5895, 15, 147, 89, 22411, 139, 8, 1511, 5, 1485, 3271, 3, 21926, 9, 472, 19623, 5251, 8, 616, 12, 15614, 8, 1783, 5, 37, 13818, 10564, 15, 26, 3, 9, 3, 19513, 1481, 6, 18368, 186, 1328, 2605, 30, 7488, 1887, 3, 18, 8, 711, 2309, 9517, 89, 355, 5, 3966, 1954, 9233, 15, 6, 113, 293, 7, 8, 16548, 13363, 106, 14022, 84, 47, 14621, 4161, 6, 243, 255, 228, 59, 7828, 8, 1249, 18, 545, 11298, 1773, 728, 8, 8347, 1560, 5, 611, 6, 255, 243, 72, 1709, 1528, 161, 228, 43, 118, 4006, 91, 12, 766, 8, 3, 19513, 1481, 410, 59, 5124, 5, 96, 196, 17, 19, 1256, 68, 27, 103, 317, 132

In [45]:
# Apply the preprocessing to every split; batched=True hands the function
# batches of examples rather than single rows.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

# 微调模型

In [46]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained seq2seq model for the chosen checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [47]:
# Training configuration: one epoch, evaluated at the end of each epoch, with
# the same per-device batch size for training and evaluation.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
batch_size = 16
args = Seq2SeqTrainingArguments(
    output_dir="test-summarization",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
    run_name="chenwenjie"  # set a unique run name here
)



In [48]:
# Batch collator built from the tokenizer and the model; passed to the trainer below.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [49]:
import nltk
import numpy as np

def compute_metrics(eval_pred):
    """Compute ROUGE scores (as percentages) and the mean generated length.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays. The
            predictions may also arrive as a tuple whose first element holds
            the generated token ids.

    Returns:
        Dict with every ROUGE score multiplied by 100 plus ``"gen_len"``, the
        mean number of non-pad tokens per prediction; all values are rounded
        to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Replace -100 with the pad token id since -100 cannot be decoded. This is
    # done for the predictions as well as the labels so a padded prediction
    # array neither breaks decoding nor inflates gen_len.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Report scores as percentages
    result = {key: value * 100 for key, value in result.items()}

    # Add mean generated length (pad tokens excluded)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [53]:

# Assemble the trainer from the model, training arguments, tokenized
# train/validation splits, collator, tokenizer and metric function.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [54]:
import nltk
# Download the 'punkt' package; presumably needed by the nltk.sent_tokenize
# calls in compute_metrics, so it must be in place before evaluation runs.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [57]:
import os

# Fine-tune, then save the model and the tokenizer to the same directory so
# both can be reloaded from it later.
trainer.train()
save_path = "/content/t5-small-summarization-model"

# exist_ok=True creates the directory if missing and is a no-op otherwise,
# avoiding the separate existence check.
os.makedirs(save_path, exist_ok=True)

trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,2.609,2.388964,29.5028,8.6703,23.367,23.3699,18.8259




('/content/t5-small-summarization-model/tokenizer_config.json',
 '/content/t5-small-summarization-model/special_tokens_map.json',
 '/content/t5-small-summarization-model/spiece.model',
 '/content/t5-small-summarization-model/added_tokens.json',
 '/content/t5-small-summarization-model/tokenizer.json')

In [58]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the saved model and tokenizer.
# NOTE: this rebinds the global `tokenizer` and `model` names used by earlier cells.
tokenizer = T5Tokenizer.from_pretrained("/content/t5-small-summarization-model")
model = T5ForConditionalGeneration.from_pretrained("/content/t5-small-summarization-model")

def generate_answer(context, question, max_length=50):
    """Generate text from the reloaded model for a context/question pair.

    Args:
        context: Background passage placed first in the model input.
        question: Question text appended after the context, separated by a space.
        max_length: Maximum number of newly generated tokens (forwarded to
            ``generate`` as ``max_new_tokens``).

    Returns:
        The decoded first generated sequence with special tokens removed.
    """
    # Format the input
    input_text = f"{context} {question}"
    # Keep the encoded inputs on the model's device so this also works when
    # the model has been moved off the CPU.
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Generate the answer; forwarding the whole encoding passes the attention
    # mask along with the input ids.
    outputs = model.generate(**inputs, max_new_tokens=max_length)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return answer

# Try one example.
# NOTE(review): the model was fine-tuned on XSum documents prefixed with
# "summarize: ", while this input is a Chinese context/question pair without
# that prefix. The printed result keeps only punctuation and digits, which
# suggests the tokenizer vocabulary does not cover the Chinese text — confirm.
context = "违规分为:一般违规扣分、严重违规扣分、出售假冒商品违规扣分,淘宝网每年12月31日24:00点会对符合条件的扣分做清零处理,详情如下..."
question = "淘宝扣分什么时候清零"
answer = generate_answer(context, question)
print("生成的答案:", answer)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


生成的答案: :,123124:00,...


In [59]:
# Archive the saved model directory into a single zip file.
!zip -r t5-small-summarization-model.zip /content/t5-small-summarization-model

  adding: content/t5-small-summarization-model/ (stored 0%)
  adding: content/t5-small-summarization-model/model.safetensors (deflated 8%)
  adding: content/t5-small-summarization-model/spiece.model (deflated 48%)
  adding: content/t5-small-summarization-model/tokenizer_config.json (deflated 95%)
  adding: content/t5-small-summarization-model/training_args.bin (deflated 51%)
  adding: content/t5-small-summarization-model/config.json (deflated 62%)
  adding: content/t5-small-summarization-model/generation_config.json (deflated 29%)
  adding: content/t5-small-summarization-model/special_tokens_map.json (deflated 85%)
  adding: content/t5-small-summarization-model/tokenizer.json (deflated 74%)


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive.
drive.mount('/content/drive')