# 导包

In [1]:
import torch
import torch.nn as nn
from torch.nn import init
import torch.optim as optim
import torch.utils.data as Data

from torchtext import data
from torchtext.vocab import Vectors
from torchtext.data import Iterator, BucketIterator

from sklearn.model_selection import train_test_split

import csv
import numpy as np
import pandas as pd

from google.colab import drive
import time
from tqdm import tqdm

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


# 定义超参数

In [16]:
BATCH_SIZE = 8
EPOCHS = 10
FIX_LENGTH = 2500
N_CLASS=14
LR=0.001
N_HIDDEN = 100
OUT_CHANNEL=1000
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 划分训练集验证集

In [4]:
train_df = pd.read_csv('/content/drive/MyDrive/data/News classification/train_set.csv', sep='\t', nrows=2000)

train, val = train_test_split(train_df, test_size=0.1)
train.to_csv("/content/drive/MyDrive/data/News classification/tmp/train.csv", index=False)
val.to_csv("/content/drive/MyDrive/data/News classification/tmp/val.csv", index=False)

# 构建Dataset

In [5]:
train_data = pd.read_csv('/content/drive/MyDrive/data/News classification/tmp/train.csv')
valid_data = pd.read_csv('/content/drive/MyDrive/data/News classification/tmp/val.csv')
test_data = pd.read_csv("/content/drive/MyDrive/data/News classification/test_a.csv", nrows=200)

tokenize = lambda x: x.split()
# fix_length指定了每条文本的长度，截断补长
TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True, fix_length=FIX_LENGTH)
LABEL = data.Field(sequential=False, use_vocab=False)

def get_dataset(csv_data, text_field, label_field, test=False):
	# id数据对训练在训练过程中没用，使用None指定其对应的field
    fields = [("id", None), # we won't be needing the id, so we pass in None as the field
                 ("text", text_field), ("label", label_field)]       
    examples = []

    if test:
        # 如果为测试集，则不加载label
        for text in tqdm(csv_data['text']):
            examples.append(data.Example.fromlist([None, text, None], fields))
    else:
        for text, label in tqdm(zip(csv_data['text'], csv_data['label'])):
            examples.append(data.Example.fromlist([None, text, label], fields))
    return examples, fields


# 得到构建Dataset所需的examples和fields
train_examples, train_fields = get_dataset(train_data, TEXT, LABEL)
valid_examples, valid_fields = get_dataset(valid_data, TEXT, LABEL)
test_examples, test_fields = get_dataset(test_data, TEXT, None, test=True)

# 构建Dataset数据集
train = data.Dataset(train_examples, train_fields)
valid = data.Dataset(valid_examples, valid_fields)
test = data.Dataset(test_examples, test_fields)

1800it [00:00, 4233.32it/s]
200it [00:00, 4432.16it/s]
100%|██████████| 200/200 [00:00<00:00, 3970.53it/s]


# 构建迭代器

In [6]:
train_iter, val_iter = BucketIterator.splits(
        (train, valid),
        batch_sizes=(BATCH_SIZE, BATCH_SIZE),
        device = DEVICE, # 如果使用gpu，此处将-1更换为GPU的编号
        sort_key=lambda x: len(x.text), # the BucketIterator needs to be told what function it should use to group the data.
        sort_within_batch=False,
        repeat=False # we pass repeat=False because we want to wrap this Iterator layer.
)

test_iter = Iterator(test, batch_size=BATCH_SIZE, device=DEVICE, sort=False, train=False, sort_within_batch=False, repeat=False) # train=False可以保证顺序不变

# 加载词向量

In [7]:
vectors = Vectors(name='/content/drive/MyDrive/data/News classification/tmp/mymodel.txt')
TEXT.build_vocab(train, vectors=vectors)
weight_matrix = TEXT.vocab.vectors

  0%|          | 0/6151 [00:00<?, ?it/s]Skipping token b'6151' with 1-dimensional vector [b'100']; likely a header
 77%|███████▋  | 4710/6151 [00:00<00:00, 18266.57it/s]


# 定义模型

In [21]:
class BiLSTM(nn.Module):
    def __init__(self):
        super(BiLSTM, self).__init__()
        self.embedding = nn.Embedding(len(TEXT.vocab), 100)     
        self.embedding.weight.data.copy_(weight_matrix)

        self.lstm = nn.LSTM(input_size=100, hidden_size=N_HIDDEN, bidirectional=True) # 是否双向
        
        self.conv = nn.Sequential(
            # (8, 1, n_hidden*2, 1)
            nn.Conv2d(1, OUT_CHANNEL, (2, 1)), # input_channel(=1), output_channel, (filter_height, filter_width), stride=1]
            # (8, 1000, n_hidden*2-1, 1)
            nn.ReLU(),
            nn.MaxPool2d((N_HIDDEN*2-1, 1)), # ((filter_height, filter_width))
        )
        
        self.out = nn.Linear(OUT_CHANNEL, N_CLASS)

    def forward(self, X): # X: [fix_length, batch_size]
        input = self.embedding(X)

        hidden_state = torch.randn(1*2, BATCH_SIZE, N_HIDDEN).to(DEVICE)   # [num_layers(=1) * num_directions(=2), batch_size, n_hidden]
        cell_state = torch.randn(1*2, BATCH_SIZE, N_HIDDEN).to(DEVICE)     # [num_layers(=1) * num_directions(=2), batch_size, n_hidden]

        outputs, (_, _) = self.lstm(input, (hidden_state, cell_state))
        outputs = outputs[-1]  # [batch_size, n_hidden * 2]
        
        embeds = outputs.unsqueeze(1) # [batch, channel(=1), sequence_length, embedding_size]
        embeds = embeds.unsqueeze(3)
        
        conved = self.conv(embeds)
        flatten = conved.view(BATCH_SIZE, -1)
        y = self.out(flatten)

        return y

model = BiLSTM().to(DEVICE)
loss_fc = nn.CrossEntropyLoss().to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=LR)

# 训练

In [22]:
start = time.perf_counter()
for epoch in range(EPOCHS):
    for i, batch in enumerate(train_iter):
        optimizer.zero_grad()
        pred = model(batch.text)
	
        loss = loss_fc(pred, batch.label)
        loss.backward()
        optimizer.step()
        # if (i+1) % 50 == 0:
        #     end = time.perf_counter()
        #     print('Epoch: ', epoch, '| batch: ', (i+1)*8, '| train loss: %.4f' % loss, '| time: %.2f s' % (end-start))
end = time.perf_counter()
print('time: %.2f s' % (end-start))


time: 260.44 s


# 评估

In [23]:
true = 0.0
all = 0.0
for i, val_batch in enumerate(val_iter):
    pred_y = torch.max(model(val_batch.text), 1)[1].cpu().data.numpy()
    real_y = val_batch.label.cpu().data.numpy()
    true += float((pred_y == real_y).astype(int).sum())
    all += float(len(real_y))
accuracy = true / all
print(accuracy)


0.255


# 预测

In [None]:
# result=[['label']]
# for i, test_batch in enumerate(test_iter):
#     for label in torch.max(model(test_batch.text), 1)[1].cpu().data.numpy():
#        result.append([label]) 
# with open("/content/drive/MyDrive/data/News classification/tmp/result.csv", "a", newline='', encoding='utf-8') as file:
#     writer = csv.writer(file ,delimiter=',')
#     writer.writerows(result)