<a href="https://colab.research.google.com/github/CHEN-886a/bart_pretrain02/blob/main/bart_pretrain_712.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#first existing model args
import torch
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
import json

class FinancialNewsDataset(torch.utils.data.Dataset):
    def __init__(self, file_path, tokenizer, max_length=512):
        self.data = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                self.data.append(json.loads(line))
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data[idx]
        company_name = record["input"]["company_name"]
        news_content = record["input"]["content"]
        combined_input = f"{company_name}: {news_content}"
        summary = record["output"]

        inputs = self.tokenizer(combined_input, return_tensors="pt", max_length=self.max_length, truncation=True, padding="max_length")
        outputs = self.tokenizer(summary, return_tensors="pt", max_length=self.max_length, truncation=True, padding="max_length")

        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "labels": outputs["input_ids"].squeeze()
        }

# 加载分词器和数据集
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
train_file_path = "/content/drive/MyDrive/colab notebook/data/train_dataset_bart_02.jsonl"
test_file_path = "/content/drive/MyDrive/colab notebook/data/test_dataset_bart_02.jsonl"

train_dataset = FinancialNewsDataset(train_file_path, tokenizer)
test_dataset = FinancialNewsDataset(test_file_path, tokenizer)

# 检查 GPU 可用性
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 加载预训练模型并调整层数
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base', num_hidden_layers=12)  # 增加Decoder层数
model.to(device)

# 设置训练参数
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/colab notebook/results',
    num_train_epochs=5,  # 增加训练轮数
    per_device_train_batch_size=4,  # 根据GPU内存调整批量大小
    per_device_eval_batch_size=4,
    save_steps=1000,  # 调整保存频率
    save_total_limit=3,
    logging_dir='/content/drive/MyDrive/colab notebook/logs',
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=500,  # 调整评估频率
    load_best_model_at_end=True,
    metric_for_best_model="loss"
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# 定义训练器
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

# 开始训练
trainer.train()

# 保存模型和分词器
model.save_pretrained('/content/drive/MyDrive/colab notebook/results/trained_model')
tokenizer.save_pretrained('/content/drive/MyDrive/colab notebook/results/trained_model')

In [2]:
#model02 第二次调参训练
import torch
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
import json

# 定义数据集类
class FinancialNewsDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_length=1024):
        self.data = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                self.data.append(json.loads(line))
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data[idx]
        company_name = record["input"]["company_name"]
        news_content = record["input"]["content"]
        combined_input = f"{company_name}: {news_content}"
        summary = record["output"]

        inputs = self.tokenizer(combined_input, return_tensors="pt", max_length=self.max_length, truncation=True, padding="max_length")
        outputs = self.tokenizer(summary, return_tensors="pt", max_length=self.max_length, truncation=True, padding="max_length")

        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "labels": outputs["input_ids"].squeeze()
        }

# 加载分词器和数据集
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
train_file_path = "/content/drive/MyDrive/colab notebook_02/dataset02/train_dataset_bart_02.jsonl"
test_file_path = "/content/drive/MyDrive/colab notebook_02/dataset02/test_dataset_bart_02.jsonl"

train_dataset = FinancialNewsDataset(train_file_path, tokenizer)
test_dataset = FinancialNewsDataset(test_file_path, tokenizer)

# 检查 GPU 可用性
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 加载预训练模型并调整层数
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base', num_hidden_layers=16)  # 增加Decoder层数到16层
model.to(device)

# 设置训练参数

training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/colab notebook_02/result02',
    num_train_epochs=6,                   # 训练轮数
    per_device_train_batch_size=8,        # 每个GPU设备的训练批量大小
    per_device_eval_batch_size=8,         # 每个GPU设备的评估批量大小
    save_steps=500,                      # 模型保存频率（每多少个步骤保存一次）
    save_total_limit=3,                   # 最多保存的模型数量
    logging_dir='/content/drive/MyDrive/colab notebook_02/logs02',
    logging_steps=100,                    # 日志记录频率（每多少个步骤记录一次）
    eval_strategy="steps",
    eval_steps=500,                       # 评估频率（每多少个步骤评估一次）
    load_best_model_at_end=True,          # 训练结束时加载最佳模型
    metric_for_best_model="loss",         # 选择最佳模型的评估指标（损失函数）
    learning_rate=5e-5,                   # 学习率
    weight_decay=0.01,                    # 权重衰减
    gradient_accumulation_steps=1,        # 梯度累积步数（根据GPU内存调整）
    fp16=True,                            # 混合精度训练
    fp16_opt_level='O1'                   # 混合精度优化级别
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# 定义训练器
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

# 开始训练
trainer.train()

# 保存模型和分词器
model.save_pretrained('/content/drive/MyDrive/colab notebook_02/result02/trained_model')
tokenizer.save_pretrained('/content/drive/MyDrive/colab notebook_02/result02/trained_model')


Using device: cuda


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Step,Training Loss,Validation Loss
500,0.0289,0.024708
1000,0.0261,0.021496
1500,0.0225,0.01958
2000,0.0181,0.018832
2500,0.0171,0.017696
3000,0.0169,0.017107
3500,0.0133,0.016921
4000,0.0128,0.016282
4500,0.0122,0.015762
5000,0.0095,0.01596


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams

('/content/drive/MyDrive/colab notebook_02/result02/trained_model/tokenizer_config.json',
 '/content/drive/MyDrive/colab notebook_02/result02/trained_model/special_tokens_map.json',
 '/content/drive/MyDrive/colab notebook_02/result02/trained_model/vocab.json',
 '/content/drive/MyDrive/colab notebook_02/result02/trained_model/merges.txt',
 '/content/drive/MyDrive/colab notebook_02/result02/trained_model/added_tokens.json')

In [None]:
!pip install transformers[torch] -U

In [None]:
!pip install accelerate -U

In [None]:
#use pretrained model to generate summary
import torch
from transformers import BartTokenizer, BartForConditionalGeneration
import json

# 挂载Google Drive


# 定义路径
model_path = '/content/drive/MyDrive/colab notebook/results/trained_model'
data_file_path = '/content/drive/MyDrive/colab notebook/data/total_dataset.jsonl'

# 加载分词器和模型
tokenizer = BartTokenizer.from_pretrained(model_path)
model = BartForConditionalGeneration.from_pretrained(model_path)

# 检查是否有GPU可用，并将模型移动到GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model.to(device)

# 定义生成摘要的函数
def generate_summary(company_name, news_content):
    combined_input = f"{company_name}: {news_content}"
    inputs = tokenizer(combined_input, return_tensors="pt", max_length=1024, truncation=True, padding="max_length")
    inputs = inputs.to(device)
    summary_ids = model.generate(inputs.input_ids, max_length=600, num_beams=4, length_penalty=2.0, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# 处理数据集并生成摘要
output_summaries = []
with open(data_file_path, 'r', encoding='utf-8') as f:
    for line in f:
        record = json.loads(line)
        company_name = record["input"]["company_name"]
        news_content = record["input"]["content"]
        summary = generate_summary(company_name, news_content)
        output_summaries.append({
            "input": {
                "company_name": company_name,
                "content": news_content
            },
            "generated_summary": summary
        })

# 保存生成的摘要到新的JSON文件
output_file_path = '/content/drive/MyDrive/colab notebook/data/result_summary/bart_generated_summary02.jsonl'
with open(output_file_path, 'w', encoding='utf-8') as f:
    for summary_record in output_summaries:
        f.write(json.dumps(summary_record, ensure_ascii=False) + '\n')

print("Summaries generated and saved successfully.")


Using device: cuda


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/colab notebook/data/total_dataset.jsonl'

In [None]:
#extract generated_summary data to a new jsonl file
import json

# 输入和输出文件路径
input_jsonl_path = '/content/drive/MyDrive/colab notebook/data/result_summary/02/bart_generated_summary02.jsonl'
output_json_path = '/content/drive/MyDrive/colab notebook/data/result_summary/02/bart_final_generated_summary02.jsonl'

# 读取JSONL文件，并提取generated_summary到新的JSON文件
generated_summaries = []

with open(input_jsonl_path, 'r', encoding='utf-8') as f:
    for line in f:
        record = json.loads(line.strip())
        generated_summary = record['generated_summary']
        generated_summaries.append({
            'generated_summary': generated_summary
        })

# 将提取的数据写入新的JSON文件
with open(output_json_path, 'w', encoding='utf-8') as fout:
    json.dump(generated_summaries, fout, ensure_ascii=False, indent=4)

print(f"Generated summaries extracted to {output_json_path}")


Generated summaries extracted to /content/drive/MyDrive/colab notebook/data/result_summary/02/bart_final_generated_summary02.jsonl


In [None]:
#bleu
from nltk.translate.bleu_score import sentence_bleu
import json
generated_summary_filepath=
# 加载生成的摘要和参考摘要
with open('generated_summaries.json', 'r', encoding='utf-8') as f:
    generated_summaries = json.load(f)

with open('reference_summaries.json', 'r', encoding='utf-8') as f:
    reference_summaries = json.load(f)

# 计算每个生成摘要的BLEU分数
bleu_scores = []
for gen_summary, ref_summary in zip(generated_summaries, reference_summaries):
    gen_text = gen_summary['generated_summary']
    ref_text = ref_summary['reference_summary']

    # 这里假设参考摘要可以是多个，按需调整
    bleu = sentence_bleu([ref_text.split()], gen_text.split())
    bleu_scores.append(bleu)

    # 输出每个摘要的BLEU分数
    print(f"Generated Summary: {gen_text}")
    print(f"Reference Summary: {ref_text}")
    print(f"BLEU Score: {bleu}")

# 计算平均BLEU分数
average_bleu = sum(bleu_scores) / len(bleu_scores)
print(f"Average BLEU Score: {average_bleu}")


In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration

# 加载模型和分词器
model_path = '/content/drive/MyDrive/colab notebook/results/trained_model'
tokenizer = BartTokenizer.from_pretrained(model_path)
model = BartForConditionalGeneration.from_pretrained(model_path)
# Move the model to the device
model.to(device)

# 输入文本示例
input_text = "Mainland China and Hong Kong stocks ended lower, with a key index logging its fifth straight losing session. Investors were disappointed by a lack of policy stimulus measures amid a weak economic recovery, rising geopolitical tensions and foreign outflows.In France, a leftist alliance unexpectedly took top spot ahead of the far right in Sunday's election, a major upset that was set to prevent Marine Le Pen's National Rally from running the government.The weaker than expected showing for the far right was something of a relief for investors, though they also have concerns the left s plans could unwind many of President Emmanuel Macrons pro-market reforms"

# 处理输入并生成摘要
inputs = tokenizer(input_text, return_tensors="pt").to(device) # Move the input tensors to the device
summary_ids = model.generate(inputs['input_ids'])

# 解码摘要并打印结果
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("Generated Summary:", summary)



Generated Summary: China Hong Kong stocks fall for 5th session
