In [19]:
    import pandas as pd
    import json
    import re
    import spacy
    import nltk
    from nltk.tokenize import word_tokenize
    from rank_bm25 import BM25Okapi
    from collections import defaultdict


In [20]:
# 加载声明数据
claim_file_path = "data/train-claims.json"
with open(claim_file_path, "r") as file:
    claims_data = json.load(file)

# 加载证据数据
evidence_file_path = "data/evidence.json"
with open(evidence_file_path, "r") as file:
    evidence_data = json.load(file)


In [21]:
# 转换声明数据为DataFrame
claims = []
for claim_id, details in claims_data.items():
    claims.append({'claim_id': claim_id, **details})
claims_df = pd.DataFrame(claims)

# 转换证据数据为DataFrame
evidences = []
for ev_id, text in evidence_data.items():
    evidences.append({'evidence_id': ev_id, 'text': text})
evidence_df = pd.DataFrame(evidences)



In [22]:
# 加载Spacy英语模型
nlp = spacy.load("en_core_web_sm")

# 创建NER映射表并进行实体提取
ner_map = defaultdict(list)

for index, row in evidence_df.iterrows():
    doc = nlp(row['text'])
    entities = []
    for ent in doc.ents:
        entities.append(ent.text)
    ner_map[row['evidence_id']] = entities  # 保存每个证据的实体列表

# 将实体信息添加到DataFrame
evidence_df['entities'] = evidence_df['evidence_id'].map(ner_map)


In [23]:
def filter_text(text, entities):
    """保留英文单词和特定模式（如化学符号、数字与字母的组合等），同时保留已识别的实体"""
    pattern = re.compile(r'\b[a-zA-Z0-9]+\b')
    tokens = word_tokenize(text)
    entities = set(entities)  # 将实体列表转换为集合以快速检查
    filtered_tokens = [token for token in tokens if pattern.match(token) or token in entities]
    return " ".join(filtered_tokens)

# 应用文本过滤，同时考虑实体
claims_df['claim_text'] = claims_df.apply(lambda row: filter_text(row['claim_text'], row.get('entities', [])), axis=1)
evidence_df['text'] = evidence_df.apply(lambda row: filter_text(row['text'], row['entities']), axis=1)


In [24]:
def tokenize_with_entities(text, entities):
    """使用Spacy进行分词，并对实体进行加权处理"""
    doc = nlp(text)
    words = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
    # 增加实体的出现次数
    words.extend([entity.lower() for entity in entities for _ in range(3)])  # 实体权重增加，出现3次
    return words

# 应用分词并考虑实体
evidence_df['tokens'] = evidence_df.apply(lambda row: tokenize_with_entities(row['text'], row['entities']), axis=1)
bm25 = BM25Okapi(evidence_df['tokens'].tolist())


In [67]:
# 检查evidence_df['tokens']列是否不存在大写字母
def has_upper(tokens):
    for token in tokens:
        if token.isupper():
            return True
    return False
 

# 检查是否存在大写字母的证据
evidence_df['has_upper'] = evidence_df['tokens'].apply(has_upper)




In [68]:
print(evidence_df[evidence_df['has_upper'] == True])


Empty DataFrame
Columns: [evidence_id, text, entities, tokens, has_upper]
Index: []


In [27]:
def filter_text(text):
    """保留英文单词和特定模式（如化学符号、数字与字母的组合等）"""
    pattern = re.compile(r'\b[a-zA-Z0-9]+\b')  # 识别字母和数字的组合
    tokens = word_tokenize(text)
    filtered_tokens = [token for token in tokens if pattern.match(token)]
    return " ".join(filtered_tokens)
# 应用文本过滤函数到正确的列
dev_claims_df['claim_text'] = dev_claims_df['claim_text'].apply(filter_text)


In [28]:
# 加载开发集数据
dev_claim_file_path = "data/dev-claims.json"
with open(dev_claim_file_path, "r") as file:
    dev_claims_data = json.load(file)

dev_claims = []
for claim_id, details in dev_claims_data.items():
    dev_claims.append({'claim_id': claim_id, **details})
dev_claims_df = pd.DataFrame(dev_claims)

# 应用文本过滤函数到正确的列
dev_claims_df['claim_text'] = dev_claims_df['claim_text'].apply(filter_text)


In [54]:
import pickle

# 保存ner_map
with open('ner_map.pkl', 'wb') as f:
    pickle.dump(ner_map, f)
# 保存处理后的DataFrame
claims_df.to_pickle("processed_claims_df.pkl")
evidence_df.to_pickle("processed_evidence_df.pkl")
# 保存令牌列表
with open('tokens.pkl', 'wb') as f:
    pickle.dump(evidence_df['tokens'].tolist(), f)


In [55]:
# 读取测试集数据
test_claim_file_path = "data/test-claims-unlabelled.json"

# 处理测试集数据
with open(test_claim_file_path, "r") as file:
    test_claims_data = json.load(file)

test_claims = []
for claim_id, details in test_claims_data.items():
    test_claims.append({'claim_id': claim_id, **details})
test_claims_df = pd.DataFrame(test_claims)

# 应用文本过滤函数到正确的列
test_claims_df['claim_text'] = test_claims_df['claim_text'].apply(filter_text)
# 保存测试集DataFrame
test_claims_df.to_pickle("processed_test_claims_df.pkl")


In [29]:
def get_top_n_evidence(claim_text, entities, top_n=20):
    # 为声明文本和相关实体构建查询令牌
    claim_tokens = tokenize_with_entities(claim_text, entities)
    scores = bm25.get_scores(claim_tokens)
    top_indexes = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_n]
    return [evidence_df.iloc[i]['evidence_id'] for i in top_indexes]

# 检索证据并计算准确率
correct_hits = 0
total_evidences = 0

for index, row in dev_claims_df.iterrows():
    predicted_evidences = get_top_n_evidence(row['claim_text'], row.get('entities', []))
    actual_evidences = row['evidences']
    correct_hits += len(set(predicted_evidences) & set(actual_evidences))
    total_evidences += len(actual_evidences)

accuracy = correct_hits / total_evidences
print(f"Accuracy: {accuracy}")


Accuracy: 0.2973523421588595


In [30]:
def get_top_n_evidence(claim_text, entities, top_n=40):
    claim_tokens = tokenize_with_entities(claim_text, entities)
    scores = bm25.get_scores(claim_tokens)
    top_indexes = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_n]
    return [evidence_df.iloc[i]['evidence_id'] for i in top_indexes]
predicted_results = {}
correct_hits = 0
total_evidences = 0

for index, row in dev_claims_df.iterrows():
    predicted_evidences = get_top_n_evidence(row['claim_text'], row.get('entities', []))
    actual_evidences = row['evidences']
    correct_hits += len(set(predicted_evidences) & set(actual_evidences))
    total_evidences += len(actual_evidences)
    predicted_results[row['claim_id']] = {
        "claim_text": row['claim_text'],
        "evidences": predicted_evidences
    }

accuracy = correct_hits / total_evidences
print(f"Accuracy: {accuracy}")
import json

# 保存预测结果到JSON文件
with open('data/predicted_evidences_dev.json', 'w') as f:
    json.dump(predicted_results, f, indent=4)


Accuracy: 0.3890020366598778


In [31]:
def get_top_n_evidence(claim_text, entities, top_n=60):
    claim_tokens = tokenize_with_entities(claim_text, entities)
    scores = bm25.get_scores(claim_tokens)
    top_indexes = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_n]
    return [evidence_df.iloc[i]['evidence_id'] for i in top_indexes]
predicted_results = {}
correct_hits = 0
total_evidences = 0

for index, row in dev_claims_df.iterrows():
    predicted_evidences = get_top_n_evidence(row['claim_text'], row.get('entities', []))
    actual_evidences = row['evidences']
    correct_hits += len(set(predicted_evidences) & set(actual_evidences))
    total_evidences += len(actual_evidences)
    predicted_results[row['claim_id']] = {
        "claim_text": row['claim_text'],
        "evidences": predicted_evidences
    }

accuracy = correct_hits / total_evidences
print(f"Accuracy: {accuracy}")
import json

# 保存预测结果到JSON文件
with open('data/predicted_evidences_dev_60.json', 'w') as f:
    json.dump(predicted_results, f, indent=4)


Accuracy: 0.42973523421588594


In [32]:
def get_top_n_evidence(claim_text, entities, top_n=80):
    claim_tokens = tokenize_with_entities(claim_text, entities)
    scores = bm25.get_scores(claim_tokens)
    top_indexes = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_n]
    return [evidence_df.iloc[i]['evidence_id'] for i in top_indexes]
predicted_results = {}
correct_hits = 0
total_evidences = 0

for index, row in dev_claims_df.iterrows():
    predicted_evidences = get_top_n_evidence(row['claim_text'], row.get('entities', []))
    actual_evidences = row['evidences']
    correct_hits += len(set(predicted_evidences) & set(actual_evidences))
    total_evidences += len(actual_evidences)
    predicted_results[row['claim_id']] = {
        "claim_text": row['claim_text'],
        "evidences": predicted_evidences
    }

accuracy = correct_hits / total_evidences
print(f"Accuracy: {accuracy}")
import json

# 保存预测结果到JSON文件
with open('data/predicted_evidences_dev_80.json', 'w') as f:
    json.dump(predicted_results, f, indent=4)


Accuracy: 0.4623217922606925


In [33]:
def get_top_n_evidence(claim_text, entities, top_n=100):
    claim_tokens = tokenize_with_entities(claim_text, entities)
    scores = bm25.get_scores(claim_tokens)
    top_indexes = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_n]
    return [evidence_df.iloc[i]['evidence_id'] for i in top_indexes]
predicted_results = {}
correct_hits = 0
total_evidences = 0

for index, row in dev_claims_df.iterrows():
    predicted_evidences = get_top_n_evidence(row['claim_text'], row.get('entities', []))
    actual_evidences = row['evidences']
    correct_hits += len(set(predicted_evidences) & set(actual_evidences))
    total_evidences += len(actual_evidences)
    predicted_results[row['claim_id']] = {
        "claim_text": row['claim_text'],
        "evidences": predicted_evidences
    }

accuracy = correct_hits / total_evidences
print(f"Accuracy: {accuracy}")
import json

# 保存预测结果到JSON文件
with open('data/predicted_evidences_dev_100.json', 'w') as f:
    json.dump(predicted_results, f, indent=4)


Accuracy: 0.4908350305498982


In [34]:
def get_top_n_evidence(claim_text, entities, top_n=150):
    claim_tokens = tokenize_with_entities(claim_text, entities)
    scores = bm25.get_scores(claim_tokens)
    top_indexes = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_n]
    return [evidence_df.iloc[i]['evidence_id'] for i in top_indexes]
predicted_results = {}
correct_hits = 0
total_evidences = 0

for index, row in dev_claims_df.iterrows():
    predicted_evidences = get_top_n_evidence(row['claim_text'], row.get('entities', []))
    actual_evidences = row['evidences']
    correct_hits += len(set(predicted_evidences) & set(actual_evidences))
    total_evidences += len(actual_evidences)
    predicted_results[row['claim_id']] = {
        "claim_text": row['claim_text'],
        "evidences": predicted_evidences
    }

accuracy = correct_hits / total_evidences
print(f"Accuracy: {accuracy}")
import json

# 保存预测结果到JSON文件
with open('data/predicted_evidences_dev_150.json', 'w') as f:
    json.dump(predicted_results, f, indent=4)


Accuracy: 0.5193482688391039


In [53]:
def get_top_n_evidence(claim_text, entities, top_n=200):
    claim_tokens = tokenize_with_entities(claim_text, entities)
    scores = bm25.get_scores(claim_tokens)
    top_indexes = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_n]
    return [evidence_df.iloc[i]['evidence_id'] for i in top_indexes]
predicted_results = {}
correct_hits = 0
total_evidences = 0

for index, row in dev_claims_df.iterrows():
    predicted_evidences = get_top_n_evidence(row['claim_text'], row.get('entities', []))
    actual_evidences = row['evidences']
    correct_hits += len(set(predicted_evidences) & set(actual_evidences))
    total_evidences += len(actual_evidences)
    predicted_results[row['claim_id']] = {
        "claim_text": row['claim_text'],
        "evidences": predicted_evidences
    }

accuracy = correct_hits / total_evidences
print(f"Accuracy: {accuracy}")
import json

# 保存预测结果到JSON文件
with open('data/predicted_evidences_dev_200.json', 'w') as f:
    json.dump(predicted_results, f, indent=4)


Accuracy: 0.5560081466395111


In [35]:
def get_top_n_evidence(claim_text, entities, top_n=500):
    claim_tokens = tokenize_with_entities(claim_text, entities)
    scores = bm25.get_scores(claim_tokens)
    top_indexes = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_n]
    return [evidence_df.iloc[i]['evidence_id'] for i in top_indexes]
predicted_results = {}
correct_hits = 0
total_evidences = 0

for index, row in dev_claims_df.iterrows():
    predicted_evidences = get_top_n_evidence(row['claim_text'], row.get('entities', []))
    actual_evidences = row['evidences']
    correct_hits += len(set(predicted_evidences) & set(actual_evidences))
    total_evidences += len(actual_evidences)
    predicted_results[row['claim_id']] = {
        "claim_text": row['claim_text'],
        "evidences": predicted_evidences
    }

accuracy = correct_hits / total_evidences
print(f"Accuracy: {accuracy}")
import json

# 保存预测结果到JSON文件
with open('data/predicted_evidences_dev_500.json', 'w') as f:
    json.dump(predicted_results, f, indent=4)


Accuracy: 0.6476578411405295


In [48]:
def get_top_n_evidence(claim_text, entities, top_n=5):
    claim_tokens = tokenize_with_entities(claim_text, entities)
    scores = bm25.get_scores(claim_tokens)
    top_indexes = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_n]
    return [evidence_df.iloc[i]['evidence_id'] for i in top_indexes]
predicted_results = {}
correct_hits = 0
total_evidences = 0

for index, row in dev_claims_df.iterrows():
    predicted_evidences = get_top_n_evidence(row['claim_text'], row.get('entities', []))
    actual_evidences = row['evidences']
    correct_hits += len(set(predicted_evidences) & set(actual_evidences))
    total_evidences += len(actual_evidences)
    predicted_results[row['claim_id']] = {
        "claim_text": row['claim_text'],
        "evidences": predicted_evidences
    }

accuracy = correct_hits / total_evidences
print(f"Accuracy: {accuracy}")
import json

# 保存预测结果到JSON文件
with open('data/predicted_evidences_dev_5.json', 'w') as f:
    json.dump(predicted_results, f, indent=4)


Accuracy: 0.15478615071283094


In [50]:
def get_top_n_evidence(claim_text, entities, top_n=4):
    claim_tokens = tokenize_with_entities(claim_text, entities)
    scores = bm25.get_scores(claim_tokens)
    top_indexes = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_n]
    return [evidence_df.iloc[i]['evidence_id'] for i in top_indexes]
predicted_results = {}
correct_hits = 0
total_evidences = 0

for index, row in dev_claims_df.iterrows():
    predicted_evidences = get_top_n_evidence(row['claim_text'], row.get('entities', []))
    actual_evidences = row['evidences']
    correct_hits += len(set(predicted_evidences) & set(actual_evidences))
    total_evidences += len(actual_evidences)
    predicted_results[row['claim_id']] = {
        "claim_text": row['claim_text'],
        "evidences": predicted_evidences
    }

accuracy = correct_hits / total_evidences
print(f"Accuracy: {accuracy}")
import json

# 保存预测结果到JSON文件
with open('data/predicted_evidences_dev_4.json', 'w') as f:
    json.dump(predicted_results, f, indent=4)


Accuracy: 0.1384928716904277


In [52]:
def get_top_n_evidence(claim_text, entities, top_n=1):
    claim_tokens = tokenize_with_entities(claim_text, entities)
    scores = bm25.get_scores(claim_tokens)
    top_indexes = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_n]
    return [evidence_df.iloc[i]['evidence_id'] for i in top_indexes]
predicted_results = {}
correct_hits = 0
total_evidences = 0

for index, row in dev_claims_df.iterrows():
    predicted_evidences = get_top_n_evidence(row['claim_text'], row.get('entities', []))
    actual_evidences = row['evidences']
    correct_hits += len(set(predicted_evidences) & set(actual_evidences))
    total_evidences += len(actual_evidences)
    predicted_results[row['claim_id']] = {
        "claim_text": row['claim_text'],
        "evidences": predicted_evidences
    }

accuracy = correct_hits / total_evidences
print(f"Accuracy: {accuracy}")
import json

# 保存预测结果到JSON文件
with open('data/predicted_evidences_dev_1.json', 'w') as f:
    json.dump(predicted_results, f, indent=4)


Accuracy: 0.04684317718940937


In [37]:
def get_top_100_evidence(claim_text, entities):
    # 为声明文本和相关实体构建查询令牌
    claim_tokens = tokenize_with_entities(claim_text, entities)
    scores = bm25.get_scores(claim_tokens)
    top_indexes = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:100]
    return [evidence_df.iloc[i]['evidence_id'] for i in top_indexes]


In [38]:
# 创建机器学习训练数据集
ml_training_data = []

for index, row in claims_df.iterrows():
    predicted_evidences = get_top_100_evidence(row['claim_text'], row.get('entities', []))
    actual_evidences = row['evidences']
    true_evidences = [ev for ev in predicted_evidences if ev in actual_evidences]
    false_evidences = [ev for ev in predicted_evidences if ev not in actual_evidences][:5 * len(true_evidences)]

    # 将每个真实证据作为一个正样本
    for ev in true_evidences:
        ml_training_data.append({
            'claim_id': row['claim_id'],
            'evidence_id': ev,
            'label': 1  # 正样本
        })

    # 将选定的假证据作为负样本
    for ev in false_evidences:
        ml_training_data.append({
            'claim_id': row['claim_id'],
            'evidence_id': ev,
            'label': 0  # 负样本
        })

# 转换为DataFrame
ml_training_df = pd.DataFrame(ml_training_data)
print(ml_training_df.head())


     claim_id       evidence_id  label
0  claim-1937   evidence-442946      1
1  claim-1937    evidence-55991      0
2  claim-1937  evidence-1167485      0
3  claim-1937   evidence-180631      0
4  claim-1937    evidence-66273      0


In [45]:
# 创建机器学习训练数据集
ml_training_data2 = []

for index, row in claims_df.iterrows():
    predicted_evidences = get_top_100_evidence(row['claim_text'], row.get('entities', []))
    actual_evidences = row['evidences']
    true_evidences = [ev for ev in predicted_evidences if ev in actual_evidences]
    false_evidences = [ev for ev in predicted_evidences if ev not in actual_evidences][:1 * len(true_evidences)]

    # 将每个真实证据作为一个正样本
    for ev in true_evidences:
        ml_training_data.append({
            'claim_id': row['claim_id'],
            'evidence_id': ev,
            'label': 1  # 正样本
        })

    # 将选定的假证据作为负样本
    for ev in false_evidences:
        ml_training_data.append({
            'claim_id': row['claim_id'],
            'evidence_id': ev,
            'label': 0  # 负样本
        })

# 转换为DataFrame
ml_training_df2 = pd.DataFrame(ml_training_data2)
print(ml_training_df2.head())


Empty DataFrame
Columns: []
Index: []


In [43]:
# 向训练数据集添加文本内容
ml_training_df['claim_text'] = ml_training_df['claim_id'].apply(lambda x: claims_df.loc[claims_df['claim_id'] == x, 'claim_text'].values[0])
ml_training_df['evidence_text'] = ml_training_df['evidence_id'].apply(lambda x: evidence_df.loc[evidence_df['evidence_id'] == x, 'text'].values[0])

print(ml_training_df.head())


     claim_id       evidence_id  label  \
0  claim-1937   evidence-442946      1   
1  claim-1937    evidence-55991      0   
2  claim-1937  evidence-1167485      0   
3  claim-1937   evidence-180631      0   
4  claim-1937    evidence-66273      0   

                                          claim_text  \
0  Not only is there no scientific evidence that ...   
1  Not only is there no scientific evidence that ...   
2  Not only is there no scientific evidence that ...   
3  Not only is there no scientific evidence that ...   
4  Not only is there no scientific evidence that ...   

                                       evidence_text  
0  At very high concentrations 100 times atmosphe...  
1  On July 21 2011 while a guest on the show he s...  
2  Less direct geological evidence indicates that...  
3  Fossil fuel power plants cause the emission of...  
4  Higher atmospheric CO2 concentrations have led...  


In [44]:
# 将训练和验证数据集保存为CSV文件
ml_training_df.to_csv('data/ml_training_data.csv', index=False)

In [49]:
# 计算train-claim中平均证据数量
total_evidence_count = 0
for index, row in claims_df.iterrows():
    total_evidence_count += len(row['evidences'])
average_evidence_count = total_evidence_count / len(claims_df)
print(f"Average evidence count: {average_evidence_count}")

Average evidence count: 3.3566775244299674


In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from collections import Counter
from torchtext.vocab import Vocab
from torchtext.data.utils import get_tokenizer
import pandas as pd


In [None]:
class TextDataset(Dataset):
    def __init__(self, claims, evidences, labels, vocab=None):
        self.tokenizer = get_tokenizer("basic_english")
        self.claims = [self.tokenizer(claim) for claim in claims]
        self.evidences = [self.tokenizer(evidence) for evidence in evidences]
        self.labels = labels
        self.vocab = vocab if vocab else self.build_vocab(self.claims + self.evidences)
        
    def build_vocab(self, data):
        counter = Counter()
        for text in data:
            counter.update(text)
        return Vocab(counter, specials=['<unk>', '<pad>'])

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        claim = [self.vocab.stoi[word] for word in self.claims[idx]]
        evidence = [self.vocab.stoi[word] for word in self.evidences[idx]]
        return torch.tensor(claim), torch.tensor(evidence), torch.tensor(self.labels[idx])

    def collate_fn(batch):
        claims, evidences, labels = zip(*batch)
        claims_pad = nn.utils.rnn.pad_sequence(claims, padding_value=vocab.stoi['<pad>'])
        evidences_pad = nn.utils.rnn.pad_sequence(evidences, padding_value=vocab.stoi['<pad>'])
        return claims_pad, evidences_pad, torch.stack(labels)


In [None]:
class BiLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(BiLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, claims, evidences):
        embedded_claims = self.embedding(claims)
        embedded_evidences = self.embedding(evidences)
        lstm_out_claims, _ = self.lstm(embedded_claims)
        lstm_out_evidences, _ = self.lstm(embedded_evidences)
        combined_features = torch.cat((lstm_out_claims[:, -1, :], lstm_out_evidences[:, -1, :]), dim=1)
        output = self.fc(combined_features)
        return output


In [None]:
# 分割数据
claims_train, claims_val, evidences_train, evidences_val, labels_train, labels_val = train_test_split(
    ml_training_df['claim_text'], ml_training_df['evidence_text'], ml_training_df['label'], test_size=0.2, random_state=42)

# 创建数据集
train_dataset = TextDataset(claims_train, evidences_train, labels_train)
val_dataset = TextDataset(claims_val, evidences_val, labels_val, vocab=train_dataset.vocab)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=TextDataset.collate_fn)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=TextDataset.collate_fn)

# 定义模型
model = BiLSTM(len(train_dataset.vocab), embedding_dim=100, hidden_dim=128, output_dim=1)
if torch.cuda.is_available():
    model.cuda()

# 训练
optimizer = Adam(model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss()

for epoch in range(10):  # 进行10个epoch的训练
    model.train()
    for claims, evidences, labels in train_loader:
        if torch.cuda.is_available():
            claims, evidences, labels = claims.cuda(), evidences.cuda(), labels.cuda()
        optimizer.zero_grad()
        outputs = model(claims, evidences)
        loss = criterion(outputs.squeeze(), labels.float())
        loss.backward()
        optimizer.step()

    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for claims, evidences, labels in val_loader:
            if torch.cuda.is_available():
                claims, evidences, labels = claims.cuda(), evidences.cuda(), labels.cuda()
            outputs = model(claims, evidences)
            predictions = torch.round(torch.sigmoid(outputs.squeeze()))
            all_preds.extend(predictions.tolist())
            all_labels.extend(labels.tolist())

# 输出报告
print(classification_report(all_labels, all_preds))
# 保存模型
torch.save(model.state_dict(), 'data/bilstm_model.pth')

In [47]:
print(evidence_df.head())
# 保存证据数据
evidence_df.to_csv('data/evidence_data.csv', index=False)

  evidence_id                                               text  \
0  evidence-0  John Bennet Lawes English entrepreneur and agr...   
1  evidence-1  Lindberg began his professional career at the ...   
2  evidence-2      Boston Ladies of Cambridge by Vampire Weekend   
3  evidence-3  Gerald Francis Goyer born October 20 1936 was ...   
4  evidence-4  He detected abnormalities of oxytocinergic fun...   

                                            entities  \
0                       [John Bennet Lawes, English]   
1     [Lindberg, the age of 16, New York City, 1977]   
2   [``Boston (Ladies of Cambridge, Vampire Weekend]   
3  [Gerald Francis Goyer, October 20, 1936, 40, t...   
4                                              [ECT]   

                                              tokens  
0  [john, bennet, lawes, english, entrepreneur, a...  
1  [lindberg, begin, professional, career, age, 1...  
2  [boston, ladies, cambridge, vampire, weekend, ...  
3  [gerald, francis, goyer, bear, 