In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers datasets peft accelerate bitsandbytes

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-

说明
替换 texts 和 summaries 为你自己的英文清洗内容（课件 + 摘要）

{"input": "Cleaned courseware text block 1", "output": "Summary 1"}

{"input": "Cleaned courseware text block 2", "output": "Summary 2"}

直接上传你现有的 JSONL 文件（如已生成）

若训练数据较多，将 per_device_train_batch_size 调小，如 4

In [None]:
import json
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, pipeline
from peft import prepare_model_for_kbit_training

# 配置参数
model_name = "facebook/bart-large"  # 使用 BART 模型
jsonl_path = "/content/drive/My Drive/Colab Notebooks/EBAC5004/training_data_t51.jsonl"  # 你的JSONL文件路径
output_dir = "/content/drive/My Drive/bart-summary"  # 输出目录

# 加载数据
with open(jsonl_path, 'r') as f:
    data = [json.loads(line) for line in f]

dataset = Dataset.from_list(data)
print("Loaded dataset size:", len(dataset))

# 加载 tokenizer 和模型
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)  # 加载BART模型

# 编码数据
def preprocess(example):
    inputs = tokenizer(example['input'], truncation=True, padding='max_length', max_length=512)
    outputs = tokenizer(example['output'], truncation=True, padding='max_length', max_length=128)
    labels = outputs['input_ids']  # 动态生成 input_ids，而不是依赖 JSONL 里直接存
    labels = [label if label != tokenizer.pad_token_id else -100 for label in labels]  # 处理padding
    inputs['labels'] = labels
    return inputs


dataset = dataset.map(preprocess, remove_columns=dataset.column_names)
split = dataset.train_test_split(test_size=0.1)
train_dataset, eval_dataset = split['train'], split['test']

# 配置训练参数
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    eval_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    num_train_epochs=10,
    weight_decay=0.01,
    save_total_limit=2,
    predict_with_generate=True,
    fp16=False,
    bf16=False,
    logging_steps=10,
    push_to_hub=False,  # 如果不想推送到 Hub 上
    save_steps=500,  # 每 500 步保存模型
    save_strategy="steps"  # 设置为“steps”进行按步保存
)

# 初始化 Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer
)



Loaded dataset size: 272


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

Map:   0%|          | 0/272 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


In [None]:
# 开始训练
trainer.train()

# 保存模型
trainer.save_model(output_dir)


Epoch,Training Loss,Validation Loss
1,2.352,1.900522
2,1.983,1.744545
3,1.793,1.68988
4,1.6116,1.644786
5,1.4525,1.653253
6,1.4905,1.637907
7,1.2745,1.651004
8,1.2985,1.652004
9,1.1878,1.654465
10,1.1901,1.657981




In [None]:
!pip install PyPDF2



In [None]:
import nltk
nltk.download('punkt')  # 下载必要的资源

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

测试：目前单独根据一个PDF文件生成摘要

In [None]:
import PyPDF2
import re
import nltk
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

nltk.download('punkt')

# Step 1: 提取 PDF 文本
def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
    return text

# Step 2: 清洗文本

def clean_courseware_text(text):
    # 去除页码、版权、学校名等信息
    text = re.sub(r'\bPage \d+ of \d+\b', '', text)
    text = re.sub(r'\b(Copyright|©|All Rights Reserved).*\n?', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\b(University|Institute|School).*\n?', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\b(COPYRIGHT|CONFIDENTIAL|DRAFT|VERSION \d+)\b', '', text, flags=re.IGNORECASE)
    text = re.sub(r'Figure\s*\d+:.*', '', text, flags=re.IGNORECASE)

    # 替换公式为标记
    text = re.sub(r'(\$.*?\$|\\\[.*?\\\])', '[FORMULA]', text)

    # 去除奇怪的 Unicode 字符（保留英文、标点）
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    # 去除典型“表格标题 + 数值行”结构
    text = re.sub(r'(\d{1,3}(?:,\d{3})*\.?\d*\s+[A-Za-z ]+\s+(?:-?\d{1,3}(?:,\d{3})*\.?\d*\s*){2,})', '', text)

    # 删除数字密度过高的段落（数字比例 > 50%）
    lines = text.split('\n')
    filtered_lines = []
    for line in lines:
        tokens = line.strip().split()
        if not tokens:
            continue
        num_tokens = sum(1 for tok in tokens if re.match(r'-?\d{1,3}(?:,\d{3})*(?:\.\d+)?$', tok))
        if num_tokens / len(tokens) < 0.5:
            filtered_lines.append(line.strip())

    # 去重 + 去空行
    unique_lines = list(dict.fromkeys([l for l in filtered_lines if l]))
    text = '\n'.join(unique_lines)

    return text.strip()


# Step 3: 分块（用 tokenizer 控制分块长度）
def chunk_text_by_sentence(text, max_tokens=1024, tokenizer=None):
    sentences = nltk.sent_tokenize(text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(tokenizer.tokenize(current_chunk + sentence)) <= max_tokens:
            current_chunk += sentence + " "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + " "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


使用原BART模型测试：

In [None]:

# Step 4: 加载 BART 模型和 tokenizer
model_name = "facebook/bart-large"  # or "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
pipe = pipeline("summarization", model=model, tokenizer=tokenizer, device_map="auto")

# Step 5: 读取并清洗 PDF 文本
pdf_path = "/content/drive/My Drive/Colab Notebooks/EBAC5004/EBAC5004/CNI/CUI_Day1_v3.pdf"
raw_text = extract_text_from_pdf(pdf_path)
cleaned_text = clean_courseware_text(raw_text)

# Step 6: 分块
chunks = chunk_text_by_sentence(cleaned_text, max_tokens=1024, tokenizer=tokenizer)

# Step 7: 生成摘要
for chunk in chunks:
    if len(tokenizer(chunk)['input_ids']) > 1024:
        chunk = tokenizer.decode(tokenizer(chunk)['input_ids'][:1024], skip_special_tokens=True)
    result = pipe(chunk, max_length=128, do_sample=False)
    summaries.append(result[0]['summary_text'])  # <-- 修改这里

# Step 8: 合并摘要
final_summary = " ".join(summaries)
print("Final summary:", final_summary)



Device set to use cuda:0
Token indices sequence length is longer than the specified maximum sequence length for this model (1154 > 1024). Running this sequence through the model will result in indexing errors


Final summary:  The lecture slide discusses the use of conversational interfaces (UIs) in various fields, focusing on the introduction to conversational UI (text or speech) and its role in field applications. The objective of this module is to learn skills to design and implement systems that can interact with users using spoken or written natural language, such as chatbots and virtual assistants.

Conversational UI can play a variety of roles in fields, including sales support, commerce, marketing, and enterprise productivity. Sales support provides sales support to support salespeople in their work by giving them support in the sales cycle. Marketing
as part  The lecture slide discusses the use of Natural Language Processing (NLP) and Deep Learning (GPT) in conversational systems, specifically for task-oriented and end-to-end models, such as Claude, Lena, and Sparrow, as well as for general conversation. NLP and intent matching are often tightly integrated in applications, and senten

使用训练后模型测试：

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Step 4: 加载你自己保存的 BART 模型和 tokenizer
model_dir = "/content/drive/My Drive/bart-summary"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
pipe = pipeline("summarization", model=model, tokenizer=tokenizer, device_map="auto")

# Step 5: 读取并清洗 PDF 文本
pdf_path = "/content/drive/My Drive/Colab Notebooks/EBAC5004/EBAC5004/CNI/CUI_Day1_v3.pdf"
raw_text = extract_text_from_pdf(pdf_path)
cleaned_text = clean_courseware_text(raw_text)  # ⬅️ 用你之前定义的 preprocess_text() 更彻底

# Step 6: 分块（BART 支持最多 1024 tokens）
chunks = chunk_text_by_sentence(cleaned_text, max_tokens=1024, tokenizer=tokenizer)

# Step 7: 生成摘要（可设置较长 max_length，例如 256）
summaries = []
for chunk in chunks:
    if len(tokenizer(chunk)['input_ids']) > 1024:
        chunk = tokenizer.decode(tokenizer(chunk)['input_ids'][:1024], skip_special_tokens=True)
    result = pipe(chunk, max_length=128, do_sample=False)
    summaries.append(result[0]['summary_text'])  # <-- 修改这里

# Step 8: 合并摘要
final_summary = " ".join(summaries)
print("Final summary:\n", final_summary)


Device set to use cuda:0
Token indices sequence length is longer than the specified maximum sequence length for this model (1154 > 1024). Running this sequence through the model will result in indexing errors


Final summary:
  The lecture slide discusses the use of conversational interfaces (UIs) in various fields, focusing on the introduction to conversational UI (text or speech) and its role in field applications. The objective of this module is to learn skills to design and implement systems that can interact with users using spoken or written natural language, such as chatbots and virtual assistants.

Conversational UI can play a variety of roles in fields, including sales support, commerce, marketing, and enterprise productivity. Sales support provides sales support to support salespeople in their work by giving them support in the sales cycle. Marketing
as part  The lecture slide discusses the use of Natural Language Processing (NLP) and Deep Learning (GPT) in conversational systems, specifically for task-oriented and end-to-end models, such as Claude, Lena, and Sparrow, as well as for general conversation. NLP and intent matching are often tightly integrated in applications, and sente

In [None]:
!pip install nltk rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=05370acbbcb60165239cd3270e24d6377cbc4ccd789b16efa623cb5ce6c14586
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


测试（评估指标）


在计算 BLEU 分数时，我们添加了 smoothing_function 参数，并将其设置为 nltk.translate.bleu_score.SmoothingFunction().method1。这将使用平滑函数 method1 来处理零计数问题。理由：

在 BLEU 分数计算中，如果生成的摘要与参考摘要之间没有共同的 n-gram，则 BLEU 分数会为 0，这会导致评估结果不准确。为了解决这个问题，我们引入了平滑函数。平滑函数通过对零计数进行修正，可以更全面地考虑不同长度的 n-gram 匹配情况，从而提高 BLEU 分数的可靠性。

更专业的描述：

引入平滑函数以提高 BLEU 评估的鲁棒性：针对部分样本 BLEU 分数为 0 的情况，我们采用了平滑函数来处理 n-gram 匹配中的零计数问题。平滑函数可以有效缓解数据稀疏性对 BLEU 评估的影响，使评估结果更具鲁棒性和可靠性。



In [6]:
import json
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
import nltk

nltk.download('punkt')

# 加载模型和分词器
model_name = "facebook/bart-large"  # or "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
pipe = pipeline("summarization", model=model, tokenizer=tokenizer, device_map="auto")

# 加载JSONL数据集
jsonl_path = "/content/drive/My Drive/Colab Notebooks/EBAC5004/training_data_t51.jsonl"  # 替换为你的JSONL文件路径
with open(jsonl_path, 'r') as f:
    data = [json.loads(line) for line in f]

dataset = Dataset.from_list(data)

# 初始化指标计算器
rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# 批量生成摘要并计算指标
rouge_scores = []
bleu_scores = []

for example in dataset:
    input_text = example['input']  # 假设你的JSONL文件使用'input'作为输入文本的键
    reference_summary = example['output']  # 假设你的JSONL文件使用'output'作为参考摘要的键

    # 生成摘要
    summary = pipe(input_text, max_length=128, do_sample=False)[0]['summary_text']

    # 计算ROUGE分数
    rouge_scores.append(rouge_scorer.score(reference_summary, summary))

    # 计算BLEU分数
    # 使用平滑函数
    smoothing_function = nltk.translate.bleu_score.SmoothingFunction().method1
    bleu_scores.append(sentence_bleu([reference_summary.split()], summary.split(), smoothing_function=smoothing_function))

# 计算平均指标
avg_rouge1 = sum([score['rouge1'].fmeasure for score in rouge_scores]) / len(rouge_scores)
avg_rouge2 = sum([score['rouge2'].fmeasure for score in rouge_scores]) / len(rouge_scores)
avg_rougeL = sum([score['rougeL'].fmeasure for score in rouge_scores]) / len(rouge_scores)
avg_bleu = sum(bleu_scores) / len(bleu_scores)

# 打印结果
print(f"平均 ROUGE-1: {avg_rouge1:.4f}")
print(f"平均 ROUGE-2: {avg_rouge2:.4f}")
print(f"平均 ROUGE-L: {avg_rougeL:.4f}")
print(f"平均 BLEU: {avg_bleu:.4f}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

Device set to use cpu
Your max_length is set to 128, but your input_length is only 12. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=6)
Your max_length is set to 128, but your input_length is only 127. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=63)
Your max_length is set to 128, but your input_length is only 60. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=30)
Your max_length is set to 128, but your input_length is only 47. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...

平均 ROUGE-1: 0.3263
平均 ROUGE-2: 0.1494
平均 ROUGE-L: 0.2187
平均 BLEU: 0.0270


In [None]:
import json
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
import nltk

nltk.download('punkt')

# 加载模型和分词器
model_dir = "/content/drive/My Drive/bart-summary"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
pipe = pipeline("summarization", model=model, tokenizer=tokenizer, device_map="auto")

# 加载JSONL数据集
jsonl_path = "/content/drive/My Drive/Colab Notebooks/EBAC5004/training_data_t51.jsonl"  # 替换为你的JSONL文件路径
with open(jsonl_path, 'r') as f:
    data = [json.loads(line) for line in f]

dataset = Dataset.from_list(data)

# 初始化指标计算器
rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# 批量生成摘要并计算指标
rouge_scores = []
bleu_scores = []

for example in dataset:
    input_text = example['input']  # 假设你的JSONL文件使用'input'作为输入文本的键
    reference_summary = example['output']  # 假设你的JSONL文件使用'output'作为参考摘要的键

    # 生成摘要
    summary = pipe(input_text, max_length=128, do_sample=False)[0]['summary_text']

    # 计算ROUGE分数
    rouge_scores.append(rouge_scorer.score(reference_summary, summary))

    # 计算BLEU分数
    # 使用平滑函数
    smoothing_function = nltk.translate.bleu_score.SmoothingFunction().method1
    bleu_scores.append(sentence_bleu([reference_summary.split()], summary.split(), smoothing_function=smoothing_function))

# 计算平均指标
avg_rouge1 = sum([score['rouge1'].fmeasure for score in rouge_scores]) / len(rouge_scores)
avg_rouge2 = sum([score['rouge2'].fmeasure for score in rouge_scores]) / len(rouge_scores)
avg_rougeL = sum([score['rougeL'].fmeasure for score in rouge_scores]) / len(rouge_scores)
avg_bleu = sum(bleu_scores) / len(bleu_scores)

# 打印结果
print(f"平均 ROUGE-1: {avg_rouge1:.4f}")
print(f"平均 ROUGE-2: {avg_rouge2:.4f}")
print(f"平均 ROUGE-L: {avg_rougeL:.4f}")
print(f"平均 BLEU: {avg_bleu:.4f}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Device set to use cuda:0
Your max_length is set to 128, but your input_length is only 12. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=6)
Your max_length is set to 128, but your input_length is only 127. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=63)
Your max_length is set to 128, but your input_length is only 60. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=30)
Your max_length is set to 128, but your input_length is only 47. Since this is a summarization task, where outputs sh

平均 ROUGE-1: 0.4891
平均 ROUGE-2: 0.2702
平均 ROUGE-L: 0.3278
平均 BLEU: 0.0980


Your max_length is set to 128, but your input_length is only... 这些警告信息与 摘要长度有关。它们表明在某些情况下，输入文本的长度很短，而 max_length 设置得过长，导致生成的摘要长度可能超过了输入文本的长度。
原因：在摘要任务中，通常希望生成的摘要比输入文本短。
建议： 手动降低 max_length 参数的值，使其与输入文本的长度相匹配。例如，如果输入文本长度为 12，则可以将 max_length 设置为 6。

评估结果分析

ROUGE 分数： ROUGE-1、ROUGE-2 和 ROUGE-L 分数分别为 0.4891、0.2702 和 0.3278。这些分数表明生成的摘要与参考摘要之间存在一定的重叠，但重叠程度不算很高。ROUGE-2 分数较低，说明生成的摘要在捕捉参考摘要的连续短语方面表现不佳。
BLEU 分数： BLEU 分数为 0.0980，这表明生成的摘要与参考摘要之间 n-gram 的重叠程度较低。

总结

生成的摘要在一定程度上能够捕捉到参考摘要的信息，但仍有改进的空间。
需要关注 BLEU 分数为 0 的情况，并尝试使用更低阶的 n-gram 或平滑函数来解决这个问题。
需要根据输入文本的长度调整 max_length 参数的值，以避免生成的摘要过长。

在本地 8 GB GPU 环境下，不依赖远程 LLM，将已提取的关键词与 BART 摘要结果转化为结构化思维导图图形的完整代码示意。