cnn rnn 文字分类

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Custom dataset: pairs each sample with the label at the same position
class TextDataset(Dataset):
    """Index-aligned view over two parallel containers: samples and labels.

    Args:
        texts: Indexable container of samples; its length defines the dataset size.
        labels: Indexable container of labels, indexed with the same positions.
    """

    def __init__(self, texts, labels):
        # Both containers are stored as given; nothing is copied or validated.
        self.texts, self.labels = texts, labels

    def __len__(self):
        """Return the number of samples (the length of ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` pair stored at position ``idx``."""
        return self.texts[idx], self.labels[idx]


In [21]:
# Custom model definition
class Net(nn.Module):
    """1-D CNN feature extractor followed by a small fully connected head.

    Args:
        input_dim: Length of each input feature vector, i.e. the size of the
            last dimension of the 2-D batch passed to ``forward``.
        num_classes: Number of output logits. The default of 1 keeps the
            original single-logit head.

    Note:
        ``input_dim`` must be long enough to pass through both conv/pool
        stages; otherwise PyTorch raises from the shape probe in ``__init__``.
    """

    def __init__(self, input_dim, num_classes=1):
        super(Net, self).__init__()

        # The model is split into two sub-modules: a conv stack and a dense head.
        self.conv = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=4, kernel_size=5, stride=2),  # out = (I - k + 2p) // s + 1
            nn.MaxPool1d(kernel_size=2),
            nn.ReLU(),
            nn.Conv1d(in_channels=4, out_channels=8, kernel_size=2, stride=2),
            nn.MaxPool1d(kernel_size=2),
            nn.ReLU(),
        )

        # Measure the flattened feature size by sending one all-zero sample
        # through the conv stack. The previous hand-written expression
        # `8 * X // 2` evaluated as `(8 * X) // 2`, which disagrees with the
        # real size `8 * (X // 2)` whenever the second conv's output length X
        # is odd, so the first Linear layer was built with the wrong width.
        with torch.no_grad():
            flat_features = self.conv(torch.zeros(1, 1, input_dim)).flatten(1).shape[1]

        self.dense = nn.Sequential(
            nn.Flatten(),
            nn.Linear(flat_features, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes),
        )

    def forward(self, x):
        """Map a ``(batch, input_dim)`` tensor to ``(batch, num_classes)`` logits."""
        x = x.unsqueeze(1)  # add the channel dimension expected by Conv1d
        x = self.conv(x)
        y = self.dense(x)
        return y


In [12]:

# Load the tab-separated training file and replace every missing value with ''.
# NOTE(review): only two column names are supplied, yet the rendered frame
# shows two extra leading columns; presumably pandas turned the file's first
# two columns into the index. Confirm against the file's header.
dftrain = pd.read_csv(
    "data/train.tsv",
    sep="\t",
    header=0,
    names=["text", "label"],
).fillna('')

In [10]:
# Display the loaded frame as a quick sanity check of the parsed columns
dftrain

Unnamed: 0,Unnamed: 1,text,label
1,1,A series of escapades demonstrating the adage ...,1
2,1,A series of escapades demonstrating the adage ...,2
3,1,A series,2
4,1,A,2
5,1,series,2
...,...,...,...
156056,8544,Hearst 's,2
156057,8544,forced avuncular chortles,1
156058,8544,avuncular chortles,3
156059,8544,avuncular,2


In [18]:
# Build bag-of-words count features (unigrams only; the 1-2-gram variant is kept below for reference)
# vectorizer = CountVectorizer(ngram_range=(1, 2))  # use 1-gram and 2-gram features
vectorizer = CountVectorizer()  # use 1-gram features
X = vectorizer.fit_transform(dftrain['text'])
y = dftrain['label']

# Split into training and test sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert the sparse matrices to dense float32 PyTorch tensors.
# The cast happens while the data is still sparse so that toarray() allocates
# float32 directly (CountVectorizer's documented default dtype is int64), and
# from_numpy wraps that array without a second dense copy.
X_train = torch.from_numpy(X_train.astype("float32").toarray())
X_test = torch.from_numpy(X_test.astype("float32").toarray())
y_train = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)
y_test = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)

# Define the model and the loss function
input_dim = X_train.shape[1]
model = Net(input_dim=input_dim)
# NOTE(review): the rendered dftrain rows contain labels 1, 2 and 3, which lie
# outside the [0, 1] target range BCEWithLogitsLoss is meant for. Presumably
# this is a multi-class task that needs CrossEntropyLoss, integer labels, a
# multi-logit head and argmax at evaluation time - confirm before trusting
# the reported loss/accuracy.
criterion = nn.BCEWithLogitsLoss()  # binary cross-entropy on raw logits


In [22]:
# Adam over all parameters of `model`, learning rate 1e-3
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [24]:
# Train the model
num_epochs = 10
batch_size = 32

train_dataset = TextDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

from tqdm import tqdm

for epoch in range(num_epochs):
    model.train()
    # Accumulate a sample-weighted loss so the epoch summary reflects every
    # batch instead of only the final (possibly smaller, noisy) mini-batch.
    loss_sum = 0.0
    samples_seen = 0
    for batch_x, batch_y in tqdm(train_loader):
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        # `loss` is the mean over the batch, so weight it by the batch size.
        n_in_batch = batch_x.size(0)
        loss_sum += loss.item() * n_in_batch
        samples_seen += n_in_batch
    # An empty loader yields NaN rather than a NameError on an unset `loss`.
    epoch_loss = loss_sum / samples_seen if samples_seen else float("nan")
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")

# Evaluate the model
model.eval()
with torch.no_grad():
    # Run the test set through the model in chunks of `batch_size` rows and
    # stitch the logits back together; a single model(X_test) call would
    # materialize the conv activations for every test row at once.
    outputs = torch.cat([
        model(X_test[start:start + batch_size])
        for start in range(0, len(X_test), batch_size)
    ])
    # NOTE(review): thresholding one sigmoid output can only predict 0 or 1;
    # the rendered labels include 2 and 3, so this accuracy is presumably not
    # meaningful until the task is set up as multi-class - confirm.
    predicted = (torch.sigmoid(outputs) > 0.5).float()
    accuracy = (predicted == y_test).float().mean()
    print(f"Accuracy: {accuracy:.4f}")

  2%|▏         | 71/3902 [00:11<10:19,  6.18it/s]


KeyboardInterrupt: 