## 使用Bert模型和自定义数据训练chat模型

使用自定义数据前，需要先将其转换为模型所需的格式。对于BERT模型，即把文本编码为BERT的输入形式：token id（input_ids）、句段标记（token_type_ids，对应segment embeddings）以及注意力掩码（attention_mask）。

In [None]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel

class BertForQuestionAnswering(nn.Module):
    """BERT encoder with a linear head that scores answer-span boundaries.

    The head maps the hidden state of every token to two logits: one for the
    token being the start of the answer span and one for it being the end.
    """

    def __init__(self, config, pretrained_name='bert-base-uncased'):
        """Load the pretrained encoder and create the span head.

        Args:
            config: Configuration forwarded to ``BertModel.from_pretrained``;
                its ``hidden_size`` sets the input width of the head.
            pretrained_name: Checkpoint name or path the encoder is loaded
                from (for example ``'bert-base-chinese'`` for Chinese text).
                It should be the checkpoint ``config`` was created for.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_name, config=config)
        self.qa_outputs = nn.Linear(config.hidden_size, 2)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
        """Return ``(start_logits, end_logits)`` for the given encoded inputs.

        Both results have the shape of the head output with its last
        (size-2) axis removed.
        """
        outputs = self.bert(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
        )
        # The first encoder output holds the per-token hidden states.
        sequence_output = outputs[0]
        logits = self.qa_outputs(sequence_output)
        # Split the two head outputs apart and drop the now size-1 last axis.
        start_logits, end_logits = logits.split(1, dim=-1)
        return start_logits.squeeze(-1), end_logits.squeeze(-1)
    
# Module-level tokenizer; QAData.__getitem__ reads this global when it encodes
# a question/answer pair.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# For Chinese text, load the Chinese checkpoint instead:
# tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

class QAData(torch.utils.data.Dataset):
    """Map-style dataset that encodes question/answer dicts for BERT.

    Each element of ``data`` is a dict with the keys ``'question'``,
    ``'answer'``, ``'start_position'`` and ``'end_position'``. The two
    positions are returned unchanged as integer targets; presumably they are
    meant to index tokens of the encoded pair -- TODO confirm with the data.
    """

    def __init__(self, data, max_length=512):
        """Store the raw examples.

        Args:
            data: Sequence of example dicts (see the class docstring).
            max_length: Length every encoded example is padded or truncated
                to, so that the default ``DataLoader`` collation can stack
                examples of different lengths into one batch tensor.
        """
        self.data = data
        self.max_length = max_length

    def __getitem__(self, index):
        """Return ``(inputs, start_position, end_position)`` for one example.

        ``inputs`` is a dict of tensors without a batch axis; the positions
        are 0-dim tensors, so a batch of them collates to shape ``(batch,)``.
        """
        example = self.data[index]
        # Encodes with the module-level ``tokenizer``.
        encoded = tokenizer(
            example['question'],
            example['answer'],
            return_tensors='pt',
            # padding=True only pads to the longest item of the *current*
            # call, which is a no-op for a single example.
            padding='max_length',
            max_length=self.max_length,
            truncation=True,
        )
        # return_tensors='pt' adds a leading axis of size 1; drop it so the
        # DataLoader produces (batch, seq) instead of (batch, 1, seq).
        inputs = {key: value.squeeze(0) for key, value in encoded.items()}
        start_positions = torch.tensor(example['start_position'])
        end_positions = torch.tensor(example['end_position'])
        return inputs, start_positions, end_positions

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)
    
# BertConfig is not among the names imported at the top of this cell.
from transformers import BertConfig

config = BertConfig.from_pretrained('bert-base-uncased')
# For Chinese text: config = BertConfig.from_pretrained('bert-base-chinese')

model = BertForQuestionAnswering(config)

optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
# Start and end prediction are each a classification over sequence positions.
loss_fn = nn.CrossEntropyLoss()

train_data = [...]  # placeholder: replace with your own training examples

train_dataset = QAData(train_data)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8)

# Nothing inside the loop leaves training mode, so switching once is enough.
model.train()
for epoch in range(10):
    for batch in train_loader:
        optimizer.zero_grad()
        inputs, start_positions, end_positions = batch
        # Collapse any per-example size-1 axis so every input is (batch, seq).
        inputs = {key: value.view(-1, value.size(-1)) for key, value in inputs.items()}
        start_logits, end_logits = model(**inputs)
        # CrossEntropyLoss expects class-index targets of shape (batch,).
        loss = (
            loss_fn(start_logits, start_positions.view(-1))
            + loss_fn(end_logits, end_positions.view(-1))
        )
        loss.backward()
        optimizer.step()
model.eval()


test_data = [...]  # placeholder: replace with your own test examples
test_dataset = QAData(test_data)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=8)

with torch.no_grad():
    for batch in test_loader:
        inputs, start_positions, end_positions = batch
        inputs = {key: value.view(-1, value.size(-1)) for key, value in inputs.items()}
        start_logits, end_logits = model(**inputs)
        # Most likely start / end position for every example in the batch.
        start_preds = torch.argmax(start_logits, dim=1)
        end_preds = torch.argmax(end_logits, dim=1)
        # Compute accuracy or other metrics here by comparing the predictions
        # with start_positions / end_positions.

自定义数据train_data通常由一系列问题及其对应的答案组成；每条样本除问题和答案文本外，还需要给出答案的起始位置和结束位置，用于训练模型预测答案所在的位置。下面是一个示例：

In [1]:
# Example training records. 'start_position' and 'end_position' are the
# inclusive character offsets of the answer span inside the 'answer' text, so
# answer[start_position:end_position + 1] is the span itself.
# NOTE(review): a model that predicts token positions needs these converted to
# token indices of the encoded input -- confirm how the consumer reads them.
train_data = [
    {
        'question': 'What is the capital of China?',
        'answer': 'Beijing is the capital of China.',
        # Span: 'Beijing'
        'start_position': 0,
        'end_position': 6,
    },
    {
        'question': 'Who is the author of The Great Gatsby?',
        'answer': 'The author of The Great Gatsby is F. Scott Fitzgerald.',
        # Span: 'F. Scott Fitzgerald'
        'start_position': 34,
        'end_position': 52,
    },
    {
        'question': 'What is the highest mountain in the world?',
        'answer': 'Mount Everest is the highest mountain in the world.',
        # Span: 'Mount Everest'
        'start_position': 0,
        'end_position': 12,
    },
]

如果使用中文的SQuAD格式数据集进行训练，可以使用Hugging Face的transformers库中的SQuAD处理工具（SquadV2Processor和squad_convert_examples_to_features）来加载和转换数据集。同时，为了加入自定义的训练数据，需要先将自定义数据转换为与SQuAD样本相同的格式，再合并到SQuAD训练集中。下面是修改后的代码示例：

In [None]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel
from transformers import InputExample
from transformers.data.processors.squad import SquadV2Processor, squad_convert_examples_to_features
from transformers.data.datasets.squad import SquadDataset


class BertForQuestionAnswering(nn.Module):
    """Chinese BERT encoder with a linear head that scores span boundaries.

    The head maps the hidden state of every token to two logits: one for the
    token being the start of the answer span and one for it being the end.
    """

    def __init__(self, config, pretrained_name='bert-base-chinese'):
        """Load the pretrained encoder and create the span head.

        Args:
            config: Configuration forwarded to ``BertModel.from_pretrained``;
                its ``hidden_size`` sets the input width of the head.
            pretrained_name: Checkpoint name or path the encoder is loaded
                from. It should be the checkpoint ``config`` was created for.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_name, config=config)
        self.qa_outputs = nn.Linear(config.hidden_size, 2)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
        """Return ``(start_logits, end_logits)`` for the given encoded inputs.

        Both results have the shape of the head output with its last
        (size-2) axis removed.
        """
        outputs = self.bert(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
        )
        # The first encoder output holds the per-token hidden states.
        sequence_output = outputs[0]
        logits = self.qa_outputs(sequence_output)
        # Split the two head outputs apart and drop the now size-1 last axis.
        start_logits, end_logits = logits.split(1, dim=-1)
        return start_logits.squeeze(-1), end_logits.squeeze(-1)
    
    
# BertConfig and SquadExample are not among the names imported at the top of
# this cell.
from transformers import BertConfig
from transformers.data.processors.squad import SquadExample

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
processor = SquadV2Processor()
train_examples = processor.get_train_examples('data/', filename='train-v2.0.json')

custom_train_data = [
    {
        'question': '什么是Python?',
        'answer': 'Python是一种解释型、高级的、通用的编程语言。',
        'start_position': 0,
        'end_position': 6
    },
    {
        'question': '什么是深度学习?',
        'answer': '深度学习是一种机器学习技术，它允许计算机模拟人类大脑进行学习和理解。',
        'start_position': 0,
        'end_position': 4
    }
]

# The SQuAD converter only understands SquadExample objects. Each custom
# 'answer' sentence serves as the context, and
# answer[start_position:end_position] is the answer span inside it.
custom_train_examples = [
    SquadExample(
        qas_id=f'custom-{i}',
        question_text=example['question'],
        context_text=example['answer'],
        answer_text=example['answer'][example['start_position']:example['end_position']],
        start_position_character=example['start_position'],
        title='custom',
    )
    for i, example in enumerate(custom_train_data)
]

train_examples += custom_train_examples
# return_dataset='pt' makes the converter return (features, TensorDataset);
# without it only the list of features is returned.
train_features, train_dataset = squad_convert_examples_to_features(
    train_examples,
    tokenizer,
    max_seq_length=512,
    doc_stride=128,
    max_query_length=64,
    is_training=True,
    return_dataset='pt',
)

config = BertConfig.from_pretrained('bert-base-chinese')

model = BertForQuestionAnswering(config)

optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
# Start and end prediction are each a classification over sequence positions.
loss_fn = nn.CrossEntropyLoss()

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8)

# Nothing inside the loop leaves training mode, so switching once is enough.
model.train()
for epoch in range(10):
    for batch in train_loader:
        optimizer.zero_grad()
        # Training dataset columns: input_ids, attention_mask, token_type_ids,
        # start_positions, end_positions, followed by auxiliary columns.
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'token_type_ids': batch[2],
        }
        start_positions = batch[3]
        end_positions = batch[4]
        start_logits, end_logits = model(**inputs)
        loss = loss_fn(start_logits, start_positions) + loss_fn(end_logits, end_positions)
        loss.backward()
        optimizer.step()
model.eval()


test_examples = processor.get_dev_examples('data/', filename='dev-v2.0.json')
test_features, test_dataset = squad_convert_examples_to_features(
    test_examples,
    tokenizer,
    max_seq_length=512,
    doc_stride=128,
    max_query_length=64,
    is_training=False,
    return_dataset='pt',
)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=8)

with torch.no_grad():
    for batch in test_loader:
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'token_type_ids': batch[2],
        }
        # With is_training=False the dataset carries no gold positions;
        # column 3 is the index of each row's entry in test_features.
        feature_indices = batch[3]
        start_logits, end_logits = model(**inputs)
        # Most likely start / end token for every feature in the batch.
        start_preds = torch.argmax(start_logits, dim=1)
        end_preds = torch.argmax(end_logits, dim=1)
        # Compute accuracy or other metrics here, using feature_indices to
        # map predictions back to test_features / test_examples.

在这个示例中，我们首先使用SquadV2Processor从文件中加载中文SQuAD数据集的训练数据；然后将自定义的训练数据转换为样本对象并加入其中，再调用squad_convert_examples_to_features函数将全部样本转换为模型需要的特征格式，并得到PyTorch的Dataset对象。在训练和测试阶段，我们使用PyTorch的DataLoader将数据分批加载，并按照BERT模型的输入格式（input_ids、attention_mask、token_type_ids）将数据传入模型中进行训练和预测。