In [15]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

# 1. Load the dataset; its columns are id, keyword, location, text, target.
data = pd.read_csv('train.csv')

# 2. Preprocessing: tokenise each tweet with NLTK, re-join the tokens with single
# spaces and lower-case the result (one pass instead of three separate applies).
data['text'] = data['text'].apply(lambda x: ' '.join(word_tokenize(x)).lower())

# Keep the NLTK tokens of the normalised text in their own column.
data['tokens'] = data['text'].apply(word_tokenize)

# Vectorisation: the CountVectorizer is fitted to build the word -> id vocabulary.
vectorizer = CountVectorizer()
vectorizer.fit(data['text'])

# The model passes `text` through nn.Embedding, which looks up *token ids* laid out
# as [batch size, sent len]. Bag-of-words count vectors (one entry per vocabulary
# word) are therefore the wrong representation: the counts are not ids and the
# "sentence length" becomes the vocabulary size. Encode every tweet as a
# fixed-length sequence of vocabulary ids instead.
PAD_TOKEN = '<pad>'  # cannot collide with a real word: the analyzer never emits '<' or '>'
pad_index = len(vectorizer.vocabulary_)
# Register the padding id in the vocabulary so that len(vectorizer.vocabulary_),
# which is used as the embedding size further down, also covers it.
vectorizer.vocabulary_[PAD_TOKEN] = pad_index

analyzer = vectorizer.build_analyzer()  # same tokenisation the vocabulary was built with
token_ids = data['text'].apply(
    lambda x: [vectorizer.vocabulary_[tok] for tok in analyzer(x) if tok in vectorizer.vocabulary_]
)

# Pad every sequence to the longest tweet so the default DataLoader collation can
# stack the rows into a single [batch size, sent len] tensor.
MAX_LEN = max(max(token_ids.map(len), default=1), 1)
data['text'] = token_ids.apply(
    lambda ids: np.array(ids + [pad_index] * (MAX_LEN - len(ids)), dtype=np.int64)
)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\17100\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
# 3. Split into a training set and a validation set (20% held out, fixed seed).
# NOTE(review): a later cell defines a function that is also called `train`, which
# rebinds this name; anything that needs this frame must run before that cell.
train, val = train_test_split(
    data,
    random_state=42,
    test_size=0.2,
)


In [17]:
# Text classification with a CNN: start by wrapping the data in a Dataset.
class MyDataset(Dataset):
    """Map-style dataset over a DataFrame that has `text` and `target` columns."""

    def __init__(self, data):
        # The frame is stored as-is; rows are looked up by position on demand.
        self.data = data

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the `(text, target)` pair of the row at position `index`."""
        row = self.data.iloc[index]
        return row['text'], row['target']

# Wrap both splits so that a DataLoader can index them.
train_dataset, val_dataset = MyDataset(train), MyDataset(val)


In [18]:

# 4. Build the model
class CNN(nn.Module):
    """Convolutional text classifier over batches of token-id sequences.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of each token embedding.
        n_filters: number of output channels of every convolution.
        filter_sizes: iterable of window heights (in tokens), one convolution each.
        output_dim: number of output logits per sample.
        dropout: dropout probability applied to the pooled features.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Each kernel spans the full embedding width, so it slides over tokens only.
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(fs, embedding_dim))
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        # Shortest sequence every convolution can still slide over.
        self.min_sent_len = max(filter_sizes, default=1)

    def forward(self, text):
        """Map token ids of shape [batch size, sent len] to logits [batch size, output_dim]."""
        # text = [batch size, sent len]
        # The input is already batch-first. Permuting it (as the previous version
        # did) made Conv2d treat the sentence axis as the batch axis and returned
        # one row per token position instead of one row per sample.
        embedded = self.embedding(text)
        # embedded = [batch size, sent len, emb dim]
        shortfall = self.min_sent_len - embedded.shape[1]
        if shortfall > 0:
            # Zero-pad sequences shorter than the largest filter so every
            # convolution has at least one valid window.
            padding = embedded.new_zeros(embedded.shape[0], shortfall, embedded.shape[2])
            embedded = torch.cat([embedded, padding], dim=1)
        embedded = embedded.unsqueeze(1)
        # embedded = [batch size, 1, sent len, emb dim]
        conved = [nn.functional.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        # conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
        pooled = [nn.functional.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        # pooled_n = [batch size, n_filters]
        cat = self.dropout(torch.cat(pooled, dim=1))
        # cat = [batch size, n_filters * len(filter_sizes)]
        return self.fc(cat)


In [19]:

# 5. Training and evaluation helpers
def _forward_batch(model, text, target):
    """Run `model` on one batch and return `(logits, labels)` as matching 1-D float tensors.

    The batch is moved to the device of the model's parameters. The logits are
    flattened and the labels cast to float because BCEWithLogitsLoss needs input
    and target of the same shape and a floating-point target, whereas the model
    emits [batch size, 1] and the loader yields integer labels of shape [batch size].
    """
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else text.device
    logits = model(text.to(device)).reshape(-1)
    labels = target.to(device).float().reshape(-1)
    return logits, labels


def train(model, iterator, optimizer, criterion):
    """Train `model` for one pass over `iterator`.

    Args:
        model: module mapping a text batch to logits.
        iterator: sized iterable of `(text, target)` tensor batches.
        optimizer: optimizer over the model's parameters.
        criterion: loss called as `criterion(logits, labels)`.

    Returns:
        `(mean batch loss, mean batch accuracy)` as Python floats.
    """
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for text, target in iterator:
        optimizer.zero_grad()
        predictions, labels = _forward_batch(model, text, target)
        loss = criterion(predictions, labels)
        acc = binary_accuracy(predictions, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


def evaluate(model, iterator, criterion):
    """Evaluate `model` over `iterator` without updating any weights.

    Args:
        model: module mapping a text batch to logits.
        iterator: sized iterable of `(text, target)` tensor batches.
        criterion: loss called as `criterion(logits, labels)`.

    Returns:
        `(mean batch loss, mean batch accuracy)` as Python floats.
    """
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for text, target in iterator:
            predictions, labels = _forward_batch(model, text, target)
            loss = criterion(predictions, labels)
            acc = binary_accuracy(predictions, labels)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


def binary_accuracy(predictions, target):
    """Return the fraction of samples whose thresholded sigmoid output equals `target`.

    Both tensors are flattened first. Comparing [batch size, 1] logits with
    [batch size] labels directly would broadcast to a [batch size, batch size]
    matrix and produce a meaningless score.
    """
    rounded_preds = torch.round(torch.sigmoid(predictions)).reshape(-1)
    labels = target.reshape(-1).to(device=rounded_preds.device, dtype=rounded_preds.dtype)
    correct = (rounded_preds == labels).float()
    acc = correct.sum() / len(correct)
    return acc


In [None]:
# 6. Train the model
class _DeviceBatches:
    """Sized iterable that yields the wrapped loader's batches moved to `device`."""

    def __init__(self, loader, device):
        self.loader = loader
        self.device = device

    def __iter__(self):
        for text, target in self.loader:
            yield text.to(self.device), target.to(self.device)

    def __len__(self):
        return len(self.loader)


# Use CUDA when it is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Fix the seed so weight initialisation and batch shuffling are reproducible.
torch.manual_seed(42)
INPUT_DIM = len(vectorizer.vocabulary_)
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [2, 3, 4]
OUTPUT_DIM = 1
DROPOUT = 0.5
# A mini-batch size. The previous value (21628) put the whole training set into a
# single batch, i.e. one optimizer step per epoch and a huge activation footprint.
BATCH_SIZE = 64
N_EPOCHS = 5
# `device` used to be computed but never applied; the model now lives on it and
# the loaders below deliver their batches on the same device.
model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT).to(device)

optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()
# Shuffle the training data every epoch; keep the validation order fixed.
train_iterator = _DeviceBatches(DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True), device)
val_iterator = _DeviceBatches(DataLoader(val_dataset, batch_size=BATCH_SIZE), device)
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    val_loss, val_acc = evaluate(model, val_iterator, criterion)
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}%, Val Loss: {val_loss:.3f}, Val Acc: {val_acc*100:.2f}%')


KeyboardInterrupt: 

In [None]:
# 7. Evaluate the model and plot the confusion matrix
from sklearn.metrics import confusion_matrix
import seaborn as sns

# confusion_matrix expects (y_true, y_pred) label sequences, not a model, a loader
# and a loss, so collect the labels and predictions over the validation set first.
model.eval()
first_param = next(model.parameters(), None)
y_true, y_pred = [], []
with torch.no_grad():
    for text, target in val_iterator:
        if first_param is not None:
            text = text.to(first_param.device)
        logits = model(text).reshape(-1)
        # Same decision rule as binary_accuracy: round the sigmoid of the logit.
        y_pred.extend(torch.round(torch.sigmoid(logits)).long().cpu().tolist())
        y_true.extend(target.reshape(-1).long().cpu().tolist())

matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])

ax = sns.heatmap(matrix, annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label', title='Validation confusion matrix');