# 神经网络分类器测试

In [10]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

# 1. 加载数据集
data = pd.read_csv('train.csv')

# 2. 数据预处理
data['text'] = data['text'].apply(lambda x: x.lower())

# 3. 分词
data['tokens'] = data['text'].apply(word_tokenize)

# 4. 向量化
vectorizer = CountVectorizer(tokenizer=lambda x: x, preprocessor=lambda x: x)
X = vectorizer.fit_transform(data['tokens'])
y = data['target']

# 5. 划分数据集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. 创建浅层神经网络
class ShallowNN(nn.Module):
    def __init__(self, input_size, output_size):
        super(ShallowNN, self).__init__()
        self.fc1 = nn.Linear(input_size, output_size)

    def forward(self, x):
        x = self.fc1(x)
        return x

input_size = X_train.shape[1]
output_size = 2

model = ShallowNN(input_size, output_size)

# 7. 训练模型
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

def one_hot_encode(labels, num_classes):
    one_hot = torch.zeros(labels.size(0), num_classes)
    one_hot.scatter_(1, labels.unsqueeze(1), 1)
    return one_hot

num_classes = 2
y_train_one_hot = one_hot_encode(torch.tensor(y_train), num_classes)
y_test_one_hot = one_hot_encode(torch.tensor(y_test), num_classes)


epochs = 10
for epoch in range(epochs):
    for i, (inputs, labels) in enumerate(zip(X_train, y_train_one_hot)):
        inputs = torch.tensor(inputs.toarray(), dtype=torch.float32)
        labels = labels.to(dtype=torch.long)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.unsqueeze(0), labels.unsqueeze(0))
        loss.backward()
        optimizer.step()

# 8. 评估模型性能
correct = 0
total = 0
with torch.no_grad():
    for i, (inputs, labels) in enumerate(zip(X_test, y_test)):
        inputs = torch.tensor(inputs.toarray(), dtype=torch.float32)
        labels = torch.tensor(labels, dtype=torch.long)

        outputs = model(inputs)
        _, predicted = torch.max(outputs, 0)
        total += 1
        correct += (predicted == labels).sum().item()

print(f'Accuracy: {100 * correct / total}%')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dustella\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


ValueError: could not determine the shape of object type 'Series'