In [None]:
# 导入必要的库
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel

# Select the compute device: CUDA when it is available, CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f'Using device: {device}')

# Read the training and development splits from their CSV files
train_df, dev_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/train_top/training_data_top_50_evidences.csv',
        'data/dev_top//dev_data_top_50_evidences.csv',
    )
)


In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Gather NLTK's English stop-word list into a set for fast membership tests
stop_words = {*stopwords.words('english')}

def preprocess_text(text):
    """Normalise raw text into a space-joined string of non-stop-word tokens.

    Steps: lowercase, drop every character that is not a-z, 0-9 or
    whitespace, tokenise with ``word_tokenize``, then remove tokens that
    appear in ``stop_words``.

    Args:
        text: The raw text. Non-string values (for example the NaN that
            results from concatenating a missing cell) are treated as empty.

    Returns:
        The remaining tokens joined by single spaces; ``""`` when ``text``
        is not a string.
    """
    # A non-string (NaN / None) has no .lower(); treat it as empty text
    # instead of aborting the whole DataFrame.apply with AttributeError.
    if not isinstance(text, str):
        return ""

    text = text.lower()  # lowercase
    text = re.sub(r'[^a-z0-9\s]', '', text)  # keep only letters, digits and whitespace
    tokens = word_tokenize(text)
    # Drop stop words
    return " ".join(token for token in tokens if token not in stop_words)

# Build the 'filtered_text' column for both splits: claim and evidence are
# joined with a single space and the result is passed through preprocess_text
train_df['filtered_text'] = (
    train_df['claim_text'] + " " + train_df['evidence_text']
).apply(preprocess_text)
dev_df['filtered_text'] = (
    dev_df['claim_text'] + " " + dev_df['evidence_text']
).apply(preprocess_text)


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from gensim.models import Word2Vec


class CustomDataset(Dataset):
    """Dataset of (word-vector sequence, label) pairs built from a DataFrame.

    Each row's ``filtered_text`` is split on whitespace and every word found
    in ``word2vec_model.wv`` is replaced by its vector, giving a float tensor
    of shape ``(num_known_words, vector_size)``. Labels are the integer codes
    of ``pd.Categorical(dataframe['claim_label'])``.

    Note: the label codes are derived from the labels present in the given
    dataframe, so two datasets only share a label mapping when they contain
    the same set of label values.

    Args:
        dataframe: DataFrame with ``claim_label`` and ``filtered_text`` columns.
        word2vec_model: Object exposing ``.wv`` with membership test, item
            lookup and a ``vector_size`` attribute (e.g. a gensim Word2Vec).
    """

    def __init__(self, dataframe, word2vec_model):
        keyed_vectors = word2vec_model.wv
        self.labels = pd.Categorical(dataframe['claim_label']).codes
        self.texts = [self._embed(text, keyed_vectors)
                      for text in dataframe['filtered_text']]

    @staticmethod
    def _embed(text, keyed_vectors):
        """Return a (seq_len, vector_size) float32 tensor for ``text``."""
        vectors = [torch.as_tensor(keyed_vectors[word], dtype=torch.float32)
                   for word in text.split() if word in keyed_vectors]
        if not vectors:
            # No in-vocabulary word: use one zero vector so every sample keeps
            # the same 2-D layout and the sequence is never empty.
            return torch.zeros((1, keyed_vectors.vector_size), dtype=torch.float32)
        return torch.stack(vectors)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return {
            # The sequence is already a single tensor; calling torch.stack on
            # it (as before) raises TypeError because stack expects a sequence
            # of tensors.
            'text': self.texts[idx],
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Load the Word2Vec model
word2vec_model = Word2Vec.load("custom_word2vec.model")


def pad_collate(batch):
    """Collate dataset items whose 'text' tensors differ in sequence length.

    The default DataLoader collate stacks tensors and fails when the samples
    in a batch have different lengths, so the sequences are zero-padded to the
    longest one in the batch instead.

    Args:
        batch: List of dicts with a 'text' tensor of shape (seq_len, dim) and
            a scalar 'label' tensor.

    Returns:
        Dict with 'text' of shape (batch, max_seq_len, dim) and 'label' of
        shape (batch,).
    """
    from torch.nn.utils.rnn import pad_sequence

    texts = [item['text'] for item in batch]
    labels = torch.stack([item['label'] for item in batch])
    # NOTE: padded positions are all-zero vectors; no mask is produced here.
    return {'text': pad_sequence(texts, batch_first=True), 'label': labels}


# Create the Datasets and DataLoaders
train_dataset = CustomDataset(train_df, word2vec_model)
dev_dataset = CustomDataset(dev_df, word2vec_model)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=pad_collate)
dev_loader = DataLoader(dev_dataset, batch_size=16, shuffle=False, collate_fn=pad_collate)


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Attention(nn.Module):
    """Attention pooling over the time dimension.

    A single linear layer scores each time step; the scores are normalised
    with a softmax across the sequence and used to form a weighted sum of
    the input states.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.linear = nn.Linear(hidden_dim, 1)

    def forward(self, outputs):
        """Pool ``outputs`` (batch_size, seq_len, hidden_dim) to (batch_size, hidden_dim)."""
        # One scalar score per time step: (batch_size, seq_len)
        raw_scores = self.linear(outputs).squeeze(2)
        weights = torch.softmax(raw_scores, dim=1)
        # (batch_size, 1, seq_len) @ (batch_size, seq_len, hidden_dim)
        pooled = torch.matmul(weights.unsqueeze(1), outputs)
        return pooled.squeeze(1)

class TextClassifier(nn.Module):
    """Transformer encoder layer -> LSTM -> attention pooling -> linear classifier.

    Args:
        embedding_layer: Module applied to the input first; its output must
            have shape (batch_size, seq_len, embedding_dim).
        hidden_dim: Hidden size of the LSTM (per direction).
        output_dim: Number of output classes.
        bidirectional: Whether the LSTM is bidirectional.
        embedding_dim: Feature size fed to the transformer layer and the LSTM.
            Defaults to 100, the value that was previously hard-coded.
        num_heads: Number of attention heads in the transformer layer.
        dim_feedforward: Feed-forward width of the transformer layer.
    """

    def __init__(self, embedding_layer, hidden_dim, output_dim, bidirectional=True,
                 embedding_dim=100, num_heads=2, dim_feedforward=2048):
        super(TextClassifier, self).__init__()
        self.embedding = embedding_layer
        self.transformer = nn.TransformerEncoderLayer(
            d_model=embedding_dim, nhead=num_heads, dim_feedforward=dim_feedforward)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=bidirectional)
        # A bidirectional LSTM concatenates both directions, doubling the feature size
        feature_dim = hidden_dim * 2 if bidirectional else hidden_dim
        self.attention = Attention(feature_dim)
        self.fc = nn.Linear(feature_dim, output_dim)

    def forward(self, text):
        """Return class logits of shape (batch_size, output_dim)."""
        embedded = self.embedding(text)  # (batch_size, seq_len, emb_dim)
        # The transformer layer is built with its default sequence-first layout,
        # so swap batch and sequence axes around the call.
        transformed = self.transformer(embedded.permute(1, 0, 2)).permute(1, 0, 2)
        lstm_out, _ = self.lstm(transformed)  # (batch_size, seq_len, feature_dim)
        attn_out = self.attention(lstm_out)
        return self.fc(attn_out)

# CustomDataset already yields pre-computed Word2Vec vectors (float tensors),
# so the model's embedding stage only has to pass them through unchanged.
# Without this definition `embedding_layer` is undefined on a fresh kernel.
embedding_layer = nn.Identity()

# Initialise the model
output_dim = 4  # assumes there are 4 classes
hidden_dim = 256
model = TextClassifier(embedding_layer, hidden_dim, output_dim).to(device)

print(model)


In [None]:
from torch.optim import Adam

# Cross-entropy loss (suited to multi-class classification), optimised with Adam
criterion = nn.CrossEntropyLoss()
optimizer = Adam(params=model.parameters(), lr=1e-3)

# Training routine (simplified version)
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator`` and return the mean batch loss.

    Each batch is a dict with 'text' and 'label' entries; both are moved to
    the module-level ``device`` before the forward pass.
    """
    model.train()
    batch_losses = []
    for batch in iterator:
        inputs = batch['text'].to(device)
        targets = batch['label'].to(device)

        optimizer.zero_grad()
        logits = model(inputs)
        loss = criterion(logits, targets)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())
    return sum(batch_losses) / len(iterator)

# Validation routine (simplified version)
def evaluate(model, iterator, criterion):
    """Compute the mean batch loss over ``iterator`` without updating the model.

    The model is switched to eval mode and gradients are disabled for the
    whole pass; tensors are moved to the module-level ``device``.
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for batch in iterator:
            inputs = batch['text'].to(device)
            targets = batch['label'].to(device)
            loss = criterion(model(inputs), targets)
            batch_losses.append(loss.item())
    return sum(batch_losses) / len(iterator)


In [None]:
# Train the model, reporting the training and validation loss after each epoch
N_EPOCHS = 10
for epoch in range(N_EPOCHS):
    train_loss = train(model, train_loader, optimizer, criterion)
    valid_loss = evaluate(model, dev_loader, criterion)
    print(
        f'Epoch: {epoch + 1:02}',
        f'\tTrain Loss: {train_loss:.3f}',
        f'\t Val. Loss: {valid_loss:.3f}',
        sep='\n',
    )

# Save the trained weights
torch.save(model.state_dict(), 'model.pth')


In [1]:
import os

import nltk

# Directory in which the NLTK data will be stored
download_directory = "./nltk_data/"

# Make sure the directory exists (no error if it is already there)
os.makedirs(download_directory, exist_ok=True)

# Register the directory with NLTK's search path; a custom download_dir is not
# searched by default, so the downloaded resources would otherwise not be found.
# The absolute path keeps the lookup valid if the working directory changes.
nltk_data_path = os.path.abspath(download_directory)
if nltk_data_path not in nltk.data.path:
    nltk.data.path.append(nltk_data_path)

# Download the required datasets
nltk.download('wordnet', download_dir=download_directory)
nltk.download('omw-1.4', download_dir=download_directory)

print("下载完成，数据存放在：", download_directory)


[nltk_data] Downloading package wordnet to ./nltk_data/...
[nltk_data] Downloading package omw-1.4 to ./nltk_data/...


下载完成，数据存放在： ./nltk_data/
