In [None]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
from nltk.tokenize import word_tokenize
import re
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
nltk.download('punkt')

# Load the training claims and the evidence corpus.
with open('/kaggle/input/train-dataset/train-claims.json', 'r') as file:
    data = json.load(file)
with open('/kaggle/input/train-dataset/evidence.json', 'r') as file:
    evidence_data = json.load(file)

# Claim labels that are kept, mapped to the label given to every
# (claim, evidence) pair of that claim. "DISPUTED" is deliberately absent,
# so disputed claims contribute no rows.
label_mapping = {
    "SUPPORTS": "support",
    "REFUTES": "refute",
    "NOT_ENOUGH_INFO": "irrelevant"
}

# Build one record per (claim, evidence) pair.
output_data = []
for claim_id, claim_info in data.items():
    pair_label = label_mapping.get(claim_info['claim_label'])
    if pair_label is None:
        continue  # disputed (or otherwise unmapped) claim: skip it
    output_data.extend(
        {
            "claim_id": claim_id,
            "claim_text": claim_info['claim_text'],
            "evidence_id": evidence_id,
            "label": pair_label,
            # An evidence id missing from the corpus yields empty text.
            "evidence_text": evidence_data.get(evidence_id, "")
        }
        for evidence_id in claim_info['evidences']
    )

# Convert to a DataFrame and make a fixed-seed 80/20 train/validation split.
df_output = pd.DataFrame(output_data)
train_df, valid_df = train_test_split(df_output, test_size=0.2, random_state=42)

# 函数来预处理和向量化文本
def preprocess_text(text):
    """Lower-case and tokenize ``text`` and drop tokens that do not start with a word.

    A token is kept when the pattern matches at its start, i.e. when it begins
    with a run of ASCII letters/digits that ends at a word boundary. Because
    ``match`` only anchors at the start, a token such as ``"u.s."`` is kept whole.

    Args:
        text: The raw string to tokenize.

    Returns:
        The list of retained tokens, in their original order.
    """
    starts_with_word = re.compile(r'\b[a-zA-Z0-9]+\b').match
    return [tok for tok in word_tokenize(text.lower()) if starts_with_word(tok)]

# Prepare the Doc2Vec documents.
def _tagged_documents(frame):
    """Return one TaggedDocument per row of ``frame``.

    The words are the claim tokens followed by the evidence tokens; the single
    tag is the row's DataFrame index label converted to ``str``.
    """
    documents = []
    for row_label, row in frame.iterrows():
        tokens = preprocess_text(row['claim_text']) + preprocess_text(row['evidence_text'])
        documents.append(TaggedDocument(words=tokens, tags=[str(row_label)]))
    return documents


train_documents = _tagged_documents(train_df)
valid_documents = _tagged_documents(valid_df)


In [None]:
# Initialise the Doc2Vec model (200-dimensional vectors, 50 training epochs).
model_d2v = Doc2Vec(
    vector_size=200,
    window=5,
    min_count=1,
    workers=4,
    epochs=50,
)

# Build the vocabulary from the training documents only, then train on them.
model_d2v.build_vocab(train_documents)
model_d2v.train(
    train_documents,
    total_examples=model_d2v.corpus_count,
    epochs=model_d2v.epochs,
)


In [None]:
# Pick the compute device: CUDA when available, otherwise CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def get_vectors(documents):
    """Infer a Doc2Vec vector for every document and stack them for the LSTM.

    Args:
        documents: Iterable of objects exposing a ``words`` token list.

    Returns:
        A float tensor on ``device`` of shape ``(len(documents), 1, dim)``:
        each document becomes a length-1 sequence.
    """
    inferred = [model_d2v.infer_vector(doc.words) for doc in documents]
    batch = torch.tensor(np.array(inferred), dtype=torch.float)
    return batch.view(batch.shape[0], 1, -1).to(device)


# Vectorise the training and validation documents.
train_vectors = get_vectors(train_documents)
valid_vectors = get_vectors(valid_documents)

# Encode the string labels as integer class ids and move them to `device`.
# Both splits are encoded against ONE shared, sorted category list. Encoding
# each split independently (as `.astype('category')` on each frame does) would
# assign different integers to the same label whenever a split happened to be
# missing one of the classes. With every label present in both splits the
# resulting codes are identical to the per-split encoding.
label_categories = sorted(df_output['label'].unique())


def _encode_labels(label_series):
    """Return a long tensor of category codes for ``label_series`` on ``device``."""
    codes = pd.Categorical(label_series, categories=label_categories).codes
    return torch.tensor(codes, dtype=torch.long).to(device)


train_labels = _encode_labels(train_df['label'])
valid_labels = _encode_labels(valid_df['label'])


In [None]:


# LSTM classifier definition
class LSTMModel(nn.Module):
    """Two-layer bidirectional LSTM followed by a linear classification head.

    Args:
        embedding_dim: Size of each input feature vector.
        hidden_dim: Hidden size of each LSTM direction.
        output_dim: Number of output logits.
    """

    def __init__(self, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=2,
            bidirectional=True,
            dropout=0.5,
            batch_first=True,
        )
        # The head sees two concatenated hidden states, hence 2 * hidden_dim.
        self.fc = nn.Linear(in_features=2 * hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Return logits for a batch-first input ``x``.

        The last two slices of the final hidden state (per the ``nn.LSTM``
        layout these are the top layer's two directions) are concatenated
        along the feature axis and passed through the linear head.
        """
        _, (h_n, _) = self.lstm(x)
        second_last, last = h_n[-2], h_n[-1]
        return self.fc(torch.cat([second_last, last], dim=1))


# Instantiate the model: 200-dim inputs (the Doc2Vec vector size), 256 hidden
# units per direction and 3 output classes; Adam with default settings and a
# cross-entropy loss.
model = LSTMModel(embedding_dim=200, hidden_dim=256, output_dim=3).to(device)
optimizer = optim.Adam(params=model.parameters())
criterion = nn.CrossEntropyLoss()
    
# Training function
def train(model, train_vectors, train_labels, valid_vectors, valid_labels, optimizer, criterion, n_epochs, patience):
    """Train ``model`` full-batch with early stopping on the validation loss.

    Each epoch performs a single optimisation step on the whole training set
    and then scores the validation set with ``evaluate``. Training stops once
    the validation loss has failed to improve for ``patience`` consecutive
    epochs. When training ends (normally or early) the weights from the epoch
    with the lowest validation loss are loaded back into ``model``; previously
    the model was left at whatever state the last epoch produced, which with a
    large ``patience`` can be far past the best point.

    Args:
        model: The network to optimise (modified in place).
        train_vectors, train_labels: Training inputs and integer targets.
        valid_vectors, valid_labels: Validation inputs and integer targets.
        optimizer: Optimiser bound to ``model``'s parameters.
        criterion: Loss used for the training step.
        n_epochs: Maximum number of epochs.
        patience: Epochs without validation improvement tolerated before stopping.

    Returns:
        None.
    """
    import copy  # local import so the cell does not depend on another import cell

    min_loss = float('inf')
    patience_counter = 0
    best_state = None  # snapshot of the weights at the best validation loss

    for epoch in range(n_epochs):
        # evaluate() switches the model to eval mode, so re-enable training
        # mode (dropout active) at the top of every epoch.
        model.train()
        optimizer.zero_grad()
        outputs = model(train_vectors)
        loss = criterion(outputs, train_labels)
        loss.backward()
        optimizer.step()

        # Validation loss and accuracy for this epoch.
        valid_loss, valid_accuracy = evaluate(model, valid_vectors, valid_labels)

        # Log every tenth epoch.
        if epoch % 10 == 0:
            print(f'Epoch {epoch + 1}, Loss: {loss.item()}, Valid Loss: {valid_loss}, Valid Acc: {valid_accuracy:.2f}')

        # Early-stopping bookkeeping.
        if valid_loss < min_loss:
            min_loss = valid_loss
            patience_counter = 0
            # Deep copy: state_dict() returns references to the live tensors.
            best_state = copy.deepcopy(model.state_dict())
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print("Stopping early due to lack of improvement.")
            break

    if best_state is not None:
        model.load_state_dict(best_state)


def evaluate(model, valid_vectors, valid_labels, loss_fn=None):
    """Compute loss and accuracy of ``model`` on a labelled set, without gradients.

    Args:
        model: The network to score; it is switched to eval mode.
        valid_vectors: Input tensor accepted by ``model``.
        valid_labels: Integer class targets, one per input row.
        loss_fn: Loss callable ``(outputs, targets) -> tensor``. When omitted,
            the module-level ``criterion`` is used, which is what this function
            always relied on implicitly.

    Returns:
        ``(loss, accuracy)`` where ``loss`` is a Python float and ``accuracy``
        is the fraction of rows whose arg-max logit equals the target.
    """
    if loss_fn is None:
        loss_fn = criterion  # notebook-level default loss
    model.eval()
    with torch.no_grad():
        outputs = model(valid_vectors)
        loss = loss_fn(outputs, valid_labels)
        _, predicted = torch.max(outputs, 1)
        correct = (predicted == valid_labels).sum().item()
        accuracy = correct / valid_labels.size(0)
    return loss.item(), accuracy



# Run training.
# `n_epochs` used to be declared as 800 while the call below hard-coded 400,
# so the constant was dead and misleading. It now holds the value that was
# actually used and is the single source of truth for the epoch budget.
n_epochs = 400
train(model, train_vectors, train_labels, valid_vectors, valid_labels,
      optimizer, criterion, n_epochs=n_epochs, patience=200)

# Final evaluation: evaluate() returns (loss, accuracy); report the accuracy.
validation_loss, validation_accuracy = evaluate(model, valid_vectors, valid_labels)
print(f'Validation Accuracy: {validation_accuracy:.2f}')
 

In [None]:
# Save the trained LSTM's parameters (its state_dict, not the module object)
# to 'lstm_model.pth' in the current working directory.
torch.save(model.state_dict(), 'lstm_model.pth')


In [None]:
# 打印混淆矩阵
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# The integer targets were produced from category codes, and pandas assigns
# codes to string categories in sorted order: 0=irrelevant, 1=refute,
# 2=support. The tick labels must follow that same order; the previous
# hard-coded ['support', 'refute', 'irrelevant'] swapped the first and last
# classes on both axes.
class_names = sorted(df_output['label'].unique())

model.eval()
with torch.no_grad():
    outputs = model(valid_vectors)
    _, predicted = torch.max(outputs, 1)

# Pin `labels` so the matrix always has one row/column per class, in code
# order, even if a class is absent from the validation targets/predictions.
cm = confusion_matrix(valid_labels.cpu(), predicted.cpu(),
                      labels=list(range(len(class_names))))
sns.heatmap(cm, annot=True, fmt='d', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
import nltk

# Point NLTK at the uploaded data directory.
nltk.data.path.append('/kaggle/input/nlp111/nltk_data')  # make sure this path is correct

# Check that WordNet loads correctly.
from nltk.corpus import wordnet
try:
    # Look up the synsets of one word to confirm the corpus data is readable.
    synsets = wordnet.synsets('dog')
    print("WordNet加载成功，找到'狗'的同义词集：", synsets)
except Exception as e:
    # Any failure is reported rather than raised, so the cell always completes.
    print("加载WordNet数据时发生错误：", e)


In [None]:
import nltk
from nltk.corpus import wordnet
import random
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

import nltk

# Add the uploaded data directory to NLTK's search path.
# NOTE(review): the same path is also appended in the previous cell, so after
# a full run it appears twice in nltk.data.path.
nltk.data.path.append('/kaggle/input/nlp111/nltk_data')

# Check that WordNet can be loaded.
from nltk.corpus import wordnet
print(wordnet.synsets('dog'))  # look up a sample word to confirm WordNet is usable


def synonym_replacement(text, num_replacements):
    """Augment ``text`` by replacing up to ``num_replacements`` words with synonyms.

    The text is tokenized, the distinct alphabetic tokens are shuffled, and
    each one in turn is replaced (at every position it occurs) by a randomly
    chosen synonym from ``get_synonyms`` until the requested number of distinct
    words has been replaced. Words without synonyms are skipped.

    Changes from the previous version:
      * ``num_replacements <= 0`` now performs no replacement (the limit used
        to be checked only after the first replacement had been made).
      * Candidate words and synonyms are sorted before the random draw. They
        come from sets, whose iteration order for strings varies between
        interpreter runs, so results were not reproducible even with a seeded
        ``random``.

    Args:
        text: Input sentence.
        num_replacements: Maximum number of distinct words to replace.

    Returns:
        The tokens joined with single spaces.
    """
    words = nltk.word_tokenize(text)
    new_text = words[:]
    candidates = sorted({word for word in words if word.isalpha()})
    random.shuffle(candidates)
    num_replaced = 0
    for word in candidates:
        if num_replaced >= num_replacements:  # replacement budget exhausted
            break
        synonyms = sorted(get_synonyms(word))
        if synonyms:
            synonym = random.choice(synonyms)
            new_text = [synonym if w == word else w for w in new_text]
            num_replaced += 1

    return ' '.join(new_text)

def get_synonyms(word):
    """Return the set of WordNet synonyms of ``word``, excluding the word itself.

    Every lemma name of every synset of ``word`` is collected, with
    underscores and hyphens turned into spaces and the result lower-cased.

    The word itself is excluded in its raw form and in the same normalised
    form as the lemmas. Previously only the raw form was removed, so a
    capitalised input (e.g. "Dog") kept its own lower-case spelling as a
    "synonym" whenever the lookup returned it.

    Args:
        word: The word to look up.

    Returns:
        A ``set`` of normalised synonym strings (possibly empty).
    """
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name().replace('_', ' ').replace('-', ' ').lower())
    synonyms.discard(word)
    synonyms.discard(word.replace('_', ' ').replace('-', ' ').lower())
    return synonyms

# 接下来修改数据读取和预处理的部分，增加数据增强

import pandas as pd
import numpy as np

def augment_data(data, augment_rate=1):
    """Return ``data`` with synonym-augmented copies inserted after each row.

    Every original row is kept. Rows whose ``claim_label`` is ``'DISPUTED'``
    or ``'REFUTES'`` get two augmented copies, all other rows get one. A copy
    differs from its source only in ``claim_text``, which is rewritten by
    ``synonym_replacement`` with ``num_replacements=2``.

    Args:
        data: DataFrame with at least ``claim_label`` and ``claim_text`` columns.
        augment_rate: Accepted for interface compatibility; not used by the body.

    Returns:
        A new DataFrame built from the original and augmented rows (index
        labels of the source rows are repeated on their copies).
    """
    heavily_augmented = ('DISPUTED', 'REFUTES')
    expanded = []
    for _, original in data.iterrows():
        expanded.append(original)  # keep the source row first
        copies = 2 if original['claim_label'] in heavily_augmented else 1
        for _ in range(copies):
            variant = original.copy()
            variant['claim_text'] = synonym_replacement(original['claim_text'], num_replacements=2)
            expanded.append(variant)
    return pd.DataFrame(expanded)

# Read the top-50-evidence tables and augment them in one step.
# NOTE(review): the development set is augmented too, so validation metrics
# are computed on partly synthetic rows — confirm this is intended.
train_data = augment_data(pd.read_csv('/kaggle/input/nlp111/training_data_top_50_evidences.csv'))
dev_data = augment_data(pd.read_csv('/kaggle/input/nlp111/dev_data_top_50_evidences.csv'))





In [None]:
# 后续步骤（训练和验证模型）保持不变

# 继续使用之前定义的Doc2Vec模型和LSTM模型
# 假设这些模型都已经被定义和训练过了，并且已经加载到当前的环境中

def preprocess_and_vectorize(text):
    """Tokenize ``text`` with ``preprocess_text`` and infer its Doc2Vec vector.

    Uses the notebook-level ``model_d2v``; returns whatever
    ``model_d2v.infer_vector`` returns for the token list.
    """
    return model_d2v.infer_vector(preprocess_text(text))

def predict_evidence(model, evidence_vector):
    """Predict the class index of a single evidence vector.

    The vector is reshaped to ``(1, 1, dim)`` (batch of one, sequence of one)
    and moved to the device that holds ``model``'s parameters. Previously it
    was always moved to the notebook-level ``device``, which fails if the
    model lives elsewhere and makes the function unusable without that global.
    A model without parameters still falls back to ``device``.

    Args:
        model: Callable ``nn.Module`` returning logits of shape ``(1, n_classes)``.
        evidence_vector: 1-D array-like of floats.

    Returns:
        The arg-max class index as a Python ``int``.
    """
    try:
        target_device = next(model.parameters()).device
    except StopIteration:
        target_device = device  # parameter-free model: keep the old behaviour
    vector_tensor = torch.tensor(evidence_vector, dtype=torch.float).view(1, 1, -1).to(target_device)
    model.eval()
    with torch.no_grad():
        output = model(vector_tensor)
        _, predicted = torch.max(output, 1)
    return predicted.item()

def feature_engineering(predictions):
    """Turn a list of predicted class indices into relative class frequencies.

    Args:
        predictions: Sequence of non-negative integer class indices.

    Returns:
        A float array with at least three entries (one per class index) that
        sums to 1.
    """
    label_counts = np.bincount(predictions, minlength=3)
    return label_counts / label_counts.sum()

# Turn per-evidence predictions into one feature vector and one label per claim.
def process_claims(data, model):
    """Build claim-level features and labels from an evidence-level table.

    Rows are grouped by ``claim_id``. Every row's ``evidence_text`` is
    vectorised and classified by ``model``; the predicted class indices of a
    group are summarised by ``feature_engineering``. The group's label is the
    ``claim_label`` of its last row.

    NOTE(review): only ``evidence_text`` is vectorised here, whereas the
    Doc2Vec documents used to train the LSTM were claim tokens followed by
    evidence tokens — confirm this difference is intended.

    Args:
        data: DataFrame with ``claim_id``, ``evidence_text`` and ``claim_label``.
        model: Evidence classifier passed to ``predict_evidence``.

    Returns:
        ``(features, labels)`` as NumPy arrays, one entry per claim, in
        ``groupby('claim_id')`` order.
    """
    features, labels = [], []
    for _, group in data.groupby('claim_id'):
        predictions = [
            predict_evidence(model, preprocess_and_vectorize(evidence_text))
            for evidence_text in group['evidence_text']
        ]
        features.append(feature_engineering(predictions))
        labels.append(group['claim_label'].iloc[-1])  # label taken from the group's last row
    return np.array(features), np.array(labels)

# Generate the claim-level training and validation features.
# NOTE: this rebinds `train_labels` and `valid_labels`, which earlier cells
# bound to the integer label tensors used for the LSTM; from here on they are
# the NumPy arrays of claim labels returned by process_claims.
train_features, train_labels = process_claims(train_data, model)
valid_features, valid_labels = process_claims(dev_data, model)


In [None]:
# Train a logistic-regression classifier on the claim-level features.
lr_model = LogisticRegression(random_state=42)
lr_model.fit(train_features, train_labels)

# Score it on the validation claims.
valid_predictions = lr_model.predict(valid_features)
accuracy = accuracy_score(valid_labels, valid_predictions)
print(f'Validation Accuracy: {accuracy:.2f}')

# Plot the confusion matrix.
# `labels` is pinned to lr_model.classes_ so that rows/columns are guaranteed
# to line up with the tick labels. Without it the matrix is built from the
# labels that happen to occur in the validation targets/predictions, which can
# be a different set (and size) than the classes seen during training.
cm = confusion_matrix(valid_labels, valid_predictions, labels=lr_model.classes_)
sns.heatmap(cm, annot=True, fmt='d', xticklabels=lr_model.classes_, yticklabels=lr_model.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1. `labels` is passed together with
# `target_names` so that each name is bound to its class explicitly; with
# `target_names` alone the report mislabels rows or raises when the classes
# present in the validation data differ from lr_model.classes_.
print(classification_report(valid_labels, valid_predictions,
                            labels=lr_model.classes_, target_names=lr_model.classes_))
# Learned coefficients and the number of validation claims scored.
print(lr_model.coef_)
print(len(valid_predictions))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# 加载数据，这里假设你已经有了处理好的特征和标签
# train_features, train_labels
# valid_features, valid_labels

# Define the base models of the stack; every estimator that accepts a seed is
# given random_state=42.
base_models = [
    ('lr', LogisticRegression(random_state=42)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('svm', SVC(kernel='linear', probability=True, random_state=42)),
    ('mlp', MLPClassifier(hidden_layer_sizes=(50,), random_state=42)),
    ('gnb', GaussianNB()),
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42))
]

# Define the meta-learner that combines the base models' outputs.
meta_learner = LGBMClassifier(random_state=42)

# Build the stacking classifier: 5-fold CV, with the base models'
# predict_proba outputs fed to the meta-learner.
stack = StackingClassifier(estimators=base_models, final_estimator=meta_learner, cv=5, stack_method='predict_proba')

# Fit on the claim-level training features.
stack.fit(train_features, train_labels)

# Score the stacking classifier on the validation claims.
valid_predictions = stack.predict(valid_features)
accuracy = accuracy_score(valid_labels, valid_predictions)
print(f'Validation Accuracy: {accuracy:.2f}')

# Classification report and confusion matrix.
print("\nClassification Report:\n", classification_report(valid_labels, valid_predictions))
# Pin `labels` to stack.classes_ so matrix rows/columns always match the tick
# labels, even when a class is missing from the validation targets/predictions.
cm = confusion_matrix(valid_labels, valid_predictions, labels=stack.classes_)
sns.heatmap(cm, annot=True, fmt='d', xticklabels=stack.classes_, yticklabels=stack.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()


In [None]:
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier, PassiveAggressiveClassifier, Perceptron
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Define the base models. Some of them are wrapped in a pipeline with a
# StandardScaler because they need scaled inputs.
# NOTE: this rebinds `base_models`, `meta_learner` and `stack` from the
# previous stacking cell.
base_models = [
    ('lr', LogisticRegression(random_state=42)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('svm', make_pipeline(StandardScaler(), LinearSVC(random_state=42))),
    ('mlp', MLPClassifier(hidden_layer_sizes=(50,), random_state=42)),
    # The key says 'gnb' but the estimator is MultinomialNB.
    ('gnb', MultinomialNB()),
    ('sgd', make_pipeline(StandardScaler(), SGDClassifier(random_state=42))),
    ('lgbm', LGBMClassifier(random_state=42)),
    ('ridge', RidgeClassifier(random_state=42)),
    ('perceptron', make_pipeline(StandardScaler(), Perceptron(random_state=42))),
    ('pac', make_pipeline(StandardScaler(), PassiveAggressiveClassifier(random_state=42))),
    ('gbc', GradientBoostingClassifier(random_state=42)),
    ('etc', ExtraTreesClassifier(n_estimators=100, random_state=42)),
    ('catboost', CatBoostClassifier(verbose=0, random_state=42)),
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)),
    ('knn', make_pipeline(StandardScaler(), KNeighborsClassifier()))
]

# Define the meta-learner.
meta_learner = LGBMClassifier(random_state=42)

# Build the stacking classifier: 5-fold CV; stack_method='auto' lets each base
# model choose its own output method.
stack = StackingClassifier(estimators=base_models, final_estimator=meta_learner, cv=5, stack_method='auto')

# Fit on the claim-level training features.
stack.fit(train_features, train_labels)

# Score the stacking classifier on the validation claims.
valid_predictions = stack.predict(valid_features)
accuracy = accuracy_score(valid_labels, valid_predictions)
print(f'Validation Accuracy: {accuracy:.2f}')

# Classification report and confusion matrix.
print("\nClassification Report:\n", classification_report(valid_labels, valid_predictions))
# Pin `labels` to stack.classes_ so matrix rows/columns always match the tick
# labels, even when a class is missing from the validation targets/predictions.
cm = confusion_matrix(valid_labels, valid_predictions, labels=stack.classes_)
sns.heatmap(cm, annot=True, fmt='d', xticklabels=stack.classes_, yticklabels=stack.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()


In [None]:
# Persist the fitted stacking classifier to 'stacking_model.joblib' in the
# current working directory.
import joblib
joblib.dump(stack, 'stacking_model.joblib')


In [None]:


def preprocess_and_vectorize(text):
    """Tokenize ``text`` with ``preprocess_text`` and infer its Doc2Vec vector.

    Uses the notebook-level ``model_d2v``; returns whatever
    ``model_d2v.infer_vector`` returns for the token list.
    This repeats a definition from an earlier cell of the notebook.
    """
    return model_d2v.infer_vector(preprocess_text(text))

def predict_evidence(model, evidence_vector):
    """Predict the class index of a single evidence vector.

    The vector is reshaped to ``(1, 1, dim)`` (batch of one, sequence of one)
    and moved to the device that holds ``model``'s parameters. Previously it
    was always moved to the notebook-level ``device``, which fails if the
    model lives elsewhere and makes the function unusable without that global.
    A model without parameters still falls back to ``device``.

    Args:
        model: Callable ``nn.Module`` returning logits of shape ``(1, n_classes)``.
        evidence_vector: 1-D array-like of floats.

    Returns:
        The arg-max class index as a Python ``int``.
    """
    try:
        target_device = next(model.parameters()).device
    except StopIteration:
        target_device = device  # parameter-free model: keep the old behaviour
    vector_tensor = torch.tensor(evidence_vector, dtype=torch.float).view(1, 1, -1).to(target_device)
    model.eval()
    with torch.no_grad():
        output = model(vector_tensor)
        _, predicted = torch.max(output, 1)
    return predicted.item()

def feature_engineering(predictions):
    """Turn a list of predicted class indices into relative class frequencies.

    Args:
        predictions: Sequence of non-negative integer class indices.

    Returns:
        A float array with at least three entries (one per class index) that
        sums to 1.
    """
    label_counts = np.bincount(predictions, minlength=3)
    return label_counts / label_counts.sum()

# Turn per-evidence predictions into one feature vector and one label per claim.
def process_claims(data, model):
    """Build claim-level features and labels from an evidence-level table.

    Rows are grouped by ``claim_id``. Every row's ``evidence_text`` is
    vectorised and classified by ``model``; the predicted class indices of a
    group are summarised by ``feature_engineering``. The group's label is the
    ``claim_label`` of its last row.

    NOTE(review): only ``evidence_text`` is vectorised here, whereas the
    Doc2Vec documents used to train the LSTM were claim tokens followed by
    evidence tokens — confirm this difference is intended.

    Args:
        data: DataFrame with ``claim_id``, ``evidence_text`` and ``claim_label``.
        model: Evidence classifier passed to ``predict_evidence``.

    Returns:
        ``(features, labels)`` as NumPy arrays, one entry per claim, in
        ``groupby('claim_id')`` order.
    """
    features, labels = [], []
    for _, group in data.groupby('claim_id'):
        predictions = [
            predict_evidence(model, preprocess_and_vectorize(evidence_text))
            for evidence_text in group['evidence_text']
        ]
        features.append(feature_engineering(predictions))
        labels.append(group['claim_label'].iloc[-1])  # label taken from the group's last row
    return np.array(features), np.array(labels)

# Load the test table and expose its `label` column under the name
# `claim_label`, which is the column process_claims reads.
test_data = (
    pd.read_csv('/kaggle/input/nlp111/test_data_top_50_evidences.csv')
    .rename(columns={'label': 'claim_label'})
)

# Claim-level features for the test set (the returned labels come from the
# renamed column).
test_features, test_labels = process_claims(test_data, model)


In [None]:
# Predict one claim label per test claim with the fitted stacking classifier.
test_predictions = stack.predict(test_features)


In [None]:
# Show the predicted labels for a quick visual check.
print(test_predictions)

In [None]:
import pandas as pd
import json

# 假设 test_data 是你的测试数据集 DataFrame，加载方式类似于之前的加载方法
# 假设 test_predictions 包含按顺序的预测结果，如你所提供的列表

def format_output(data, predictions, num_evidences):
    """Assemble the per-claim submission dictionary.

    Claims are visited in ``data.groupby('claim_id')`` order and the n-th
    claim receives ``predictions[n]``, so ``predictions`` must have been
    produced in that same order.

    Args:
        data: DataFrame with ``claim_id``, ``claim_text`` and ``evidence_id``.
        predictions: Indexable sequence of predicted claim labels.
        num_evidences: Maximum number of evidence ids kept per claim.

    Returns:
        ``{claim_id: {"claim_text", "claim_label", "evidences"}}`` where the
        text comes from the claim's first row and the evidences are its first
        ``num_evidences`` evidence ids.
    """
    results = {}
    for position, (claim_id, rows) in enumerate(data.groupby('claim_id')):
        first_row = rows.iloc[0]
        kept_evidence = rows['evidence_id'].tolist()[:num_evidences]
        results[claim_id] = {
            "claim_text": first_row['claim_text'],
            "claim_label": predictions[position],
            "evidences": kept_evidence
        }
    return results

# Build the submission dictionaries keeping the top 3, 4 and 5 evidences per claim.
results_3, results_4, results_5 = (
    format_output(test_data, test_predictions, top_k) for top_k in (3, 4, 5)
)

# Write a results dictionary to disk as JSON.
def save_results(results, filename):
    """Serialise ``results`` to ``filename`` as JSON indented by four spaces.

    The file is opened in text write mode, so existing content is replaced.
    """
    with open(filename, 'w') as out_handle:
        json.dump(results, out_handle, indent=4)

# Write the three variants (top 3/4/5 evidences) to the working directory.
save_results(results_3, 'predictions_3.json')
save_results(results_4, 'predictions_4.json')
save_results(results_5, 'predictions_5.json')


In [None]:
import json
import os

# Load the original (unprocessed) test claims, keyed by claim id.
with open('/kaggle/input/train-dataset/test-claims-unlabelled.json', 'r') as file:
    original_claims = json.load(file)

# Read back a previously saved predictions file.
def load_predictions(filename):
    """Return the parsed JSON content of ``filename``."""
    with open(filename, 'r') as in_handle:
        parsed = json.load(in_handle)
    return parsed

# Reload the three prediction files written earlier.
# NOTE: this rebinds results_3/4/5 to the JSON-decoded versions.
results_3 = load_predictions('/kaggle/working/predictions_3.json')
results_4 = load_predictions('/kaggle/working/predictions_4.json')
results_5 = load_predictions('/kaggle/working/predictions_5.json')

# Combine the original claim text with the predicted label and evidences.
def merge_results(original_claims, predictions):
    """Attach the original claim text to each prediction.

    Only claims present in both mappings are returned, in the iteration order
    of ``original_claims``.

    The value stored for a claim in ``original_claims`` may be either the text
    itself or a dict holding it under ``"claim_text"`` (the layout the training
    claims file uses; presumably the unlabelled test file matches — verify).
    Previously the whole value was written as ``claim_text``, which nests a
    dict inside the output when entries are dicts. Plain-string entries and
    dicts without a ``"claim_text"`` key are passed through unchanged.

    Args:
        original_claims: ``{claim_id: text}`` or ``{claim_id: {"claim_text": text, ...}}``.
        predictions: ``{claim_id: {"claim_label": ..., "evidences": ...}}``.

    Returns:
        ``{claim_id: {"claim_text", "claim_label", "evidences"}}``.
    """
    final_results = {}
    for claim_id, claim_entry in original_claims.items():
        if claim_id not in predictions:
            continue
        if isinstance(claim_entry, dict):
            claim_text = claim_entry.get('claim_text', claim_entry)
        else:
            claim_text = claim_entry
        final_results[claim_id] = {
            "claim_text": claim_text,
            "claim_label": predictions[claim_id]['claim_label'],
            "evidences": predictions[claim_id]['evidences']
        }
    return final_results

# Write a results dictionary to disk as JSON.
def save_results(results, filename):
    """Serialise ``results`` to ``filename`` as JSON indented by four spaces.

    The file is opened in text write mode, so existing content is replaced.
    This repeats a definition from an earlier cell of the notebook.
    """
    with open(filename, 'w') as out_handle:
        json.dump(results, out_handle, indent=4)

# Merge the original claim text into each prediction set.
final_results_3 = merge_results(original_claims, results_3)
final_results_4 = merge_results(original_claims, results_4)
final_results_5 = merge_results(original_claims, results_5)

# Save the merged results. These are the same paths the predictions were
# loaded from above, so the earlier files are overwritten.
save_results(final_results_3, '/kaggle/working/predictions_3.json')
save_results(final_results_4, '/kaggle/working/predictions_4.json')
save_results(final_results_5, '/kaggle/working/predictions_5.json')
