In [1]:
from collections import Counter
import os
import random
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

## Special tokens

It is important that they don't appear in the actual vocab, hence this weird look.

In [2]:
PAD = "@@PAD@@"#补零
UNK = "@@UNK@@"#填充

## Hyperparameters 



In [3]:
MAX_SEQ_LEN = 20  # -1 for no truncation
UNK_THRESHOLD = 5
BATCH_SIZE = 128
N_EPOCHS = 20
LEARNING_RATE = 1e-3
EMBEDDING_DIM = 128
HIDDEN_DIM = 256
N_RNN_LAYERS = 2

## Seeding Utilities

In [4]:
def seed_everything(seed=1):
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

## Data Utilities

In [5]:

def split_data(train_data, num_split=2000):
    """Splits the training data into training and development sets."""
    random.shuffle(train_data)
    return train_data[:-num_split], train_data[-num_split:]

In [6]:
import pandas as pd
train_file = r"D:\PycharmProjects\nlp\data\中文文本分类数据集\data\train.txt"
df = pd.read_csv(train_file, sep="\t", header=None, names=["id", "category", "sentence"])

rows = []
for index, row in df[['id', 'category', 'sentence']].iterrows():
    rows.append({
            'sentence': row['sentence'],
            'label': row['category'],
            })

In [7]:
rows[0]

{'sentence': '(2)若伴便秘者符合罗马Ⅳ功能性便秘诊断标准，若伴夜尿症者符合夜尿症的诊断标准；', 'label': 'Diagnostic'}

In [8]:
import jieba

def tokenize(data, max_seq_len=MAX_SEQ_LEN):
    for example in data:
        example['text']=[word for word in jieba.cut(example['sentence'])][:max_seq_len]
    # """
    # Here we use nltk to tokenize data. There are many othe possibilities. We also truncate the
    # sequences so that the training time and memory is more manageable. You can think of truncation
    # as making a decision only looking at the first X words.
    # """
      


def create_vocab(data, unk_threshold=UNK_THRESHOLD):
    counter=Counter(token for example in data for token in example['text'])
    vocab = {token for token in counter if counter[token]>unk_threshold}
    print(f'vocab size:{len(vocab)+2}')
    print(f'Most common tokens:{counter.most_common(10)}')
    token_to_idx={PAD:0,UNK:1}
    for token in vocab:
        token_to_idx[token]=len(token_to_idx)
        return token_to_idx

    # """
    # Creates a vocabulary with tokens that have frequency above unk_threshold and assigns each token
    # a unique index, including the special tokens.
    # """
    
    


def apply_vocab(data, token_to_idx):
    for example in data:
        example['text']=[token_to_idx.get(token,token_to_idx[UNK])for token in example['text']]
    # """
    # Applies the vocabulary to the data and maps the tokenized sentences to vocab indices as the
    # model input.
    # """
    

def apply_label_map(data, label_to_idx):
    for example in data:
        example['label']=label_to_idx[example['label']]
    # """Converts string labels to indices."""
     

In [9]:
tokenize(rows)
token_to_idx = create_vocab(rows)

Building prefix dict from the default dictionary ...
Loading model from cache D:\LOCALT~1\jieba.cache
Loading model cost 0.581 seconds.
Prefix dict has been built successfully.


vocab size:1452
Most common tokens:[('；', 3545), ('.', 2818), (' ', 2556), ('、', 2451), ('）', 2342), ('（', 2059), ('的', 1979), ('或', 1718), ('。', 1544), ('，', 1521)]


In [10]:
token_to_idx 

{'@@PAD@@': 0, '@@UNK@@': 1, '循环': 2}

In [11]:
rows[0]

{'sentence': '(2)若伴便秘者符合罗马Ⅳ功能性便秘诊断标准，若伴夜尿症者符合夜尿症的诊断标准；',
 'label': 'Diagnostic',
 'text': ['(',
  '2',
  ')',
  '若伴',
  '便秘',
  '者',
  '符合',
  '罗马',
  'Ⅳ',
  '功能性',
  '便秘',
  '诊断',
  '标准',
  '，',
  '若伴',
  '夜尿症',
  '者',
  '符合',
  '夜尿症',
  '的']}

In [12]:
class SentimentDataset(Dataset):
    def __init__(self, data, pad_idx):
        data = sorted(data, key=lambda example: len(example["text"]))
        self.texts = [example["text"] for example in data]
        self.labels = [example["label"] for example in data]
        self.pad_idx = pad_idx

    def __getitem__(self, index):
        return [self.texts[index], self.labels[index]]

    def __len__(self):
        return len(self.texts)

    def collate_fn(self, batch):
        def tensorize(elements, dtype):
            return [torch.tensor(element, dtype=dtype) for element in elements]
             
        def pad(tensors):
            max_len = max(len(tensor) for tensor in tensors)
            padded_tensors = [
                F.pad(tensor, (0, max_len - len(tensor)), value=self.pad_idx)
                for tensor in tensors
            ]
            return padded_tensors

        texts, labels = zip(*batch)
        return [
            torch.stack(pad(tensorize(texts, torch.long)), dim=0),
            torch.stack(tensorize(labels, torch.long), dim=0),
        ]

            

         

## Model

In [13]:
class SequenceClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_labels, n_rnn_layers, pad_idx):
        super().__init__()

        self.pad_idx = pad_idx

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.GRU(
            embedding_dim, hidden_dim, num_layers=n_rnn_layers, batch_first=True, bidirectional=True
        )
        # We take the final hidden state at all GRU layers as the sequence representation.
        # 2 because bidirectional.
        layered_hidden_dim = hidden_dim * n_rnn_layers * 2
        self.output = nn.Linear(layered_hidden_dim, n_labels)

    def forward(self, text):
        # text shape: (batch_size, max_seq_len) where max_seq_len is the max length
        # len shape: (batch_size,)
        non_padded_positions = text != self.pad_idx
        lens = non_padded_positions.sum(dim=1)

        # embedded shape: (batch_size, max_seq_len, embedding_dim)
        embedded = self.embedding(text)

        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, lens.cpu(), batch_first=True, enforce_sorted=False
        )#利用PyTorch提供的pack_padded_sequence()函数，将嵌入向量打包成压缩形式，便于GRU处理变长输入数据


        packed_output, hidden = self.rnn(packed_embedded)
        # shape: (batch_size, n_layers * n_directions * hidden_dim)
        hidden = hidden.transpose(0, 1).reshape(hidden.shape[1], -1)

        return self.output(hidden)

    # def forward(self, text):
    #     # text: [batch_size, seq_len]
    #
    #     # Masking for padding tokens.
    #     # Create a binary mask of shape [batch_size, seq_len], where padding tokens have value 0 and others have value 1.
    #     mask = (text != self.pad_idx).float()  # mask: [batch_size, seq_len]
    #
    #     # Word embedding layer.
    #     embedded = self.embedding(text)  # embedded: [batch_size, seq_len, embedding_dim]
    #
    #     # Bidirectional GRU layer.
    #     rnn_output, _ = self.rnn(embedded)  # rnn_output: [batch_size, seq_len, hidden_dim * 2]
    #
    #     # Apply the mask to the output of GRU.
    #     # The output of padding tokens are masked out, so they have no effect on the following operations.
    #     masked_output = rnn_output * mask.unsqueeze(-1)  # masked_output: [batch_size, seq_len, hidden_dim * 2]
    #
    #     # Sequence representation layer.
    #     # We take the final hidden state at all GRU layers as the sequence representation.
    #     # Concatenate the final hidden states from the forward and backward directions, then flatten them.
    #     # This results in a tensor of shape [batch_size, hidden_dim * 2 * n_rnn_layers].
    #     sequence_rep = torch.cat((
    #         masked_output[:, -1, :self.rnn.hidden_size],
    #         masked_output[:, 0, self.rnn.hidden_size:]
    #     ), dim=-1).view(-1, self.rnn.hidden_size * 2 * self.rnn.num_layers)
    #
    #     # Output layer.
    #     logits = self.output(sequence_rep)  # logits: [batch_size, n_labels]
    #
    #     return logits


In [14]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

## Training and Evaluation Functions

In [15]:
def train(model, dataloader, optimizer, device):
    model.train()
    for texts, labels in tqdm(dataloader):
        texts, labels = texts.to(device), labels.to(device)
        output = model(texts)
        loss = F.cross_entropy(output, labels)
        model.zero_grad()
        loss.backward()
        optimizer.step()

def evaluate(model, dataloader, device):
    count = correct = 0.0
    model.eval()
    with torch.no_grad():
        for texts, labels in tqdm(dataloader):
            texts, labels = texts.to(device), labels.to(device)
            # shape: (batch_size, n_labels)
            output = model(texts)
            # shape: (batch_size,)
            predicted = output.argmax(dim=-1)
            count += len(predicted)
            correct += (predicted == labels).sum().item()
    print(f"Auuracy:{correct / count}")
     
             

## Running training and evaluation

In [16]:
import pandas as pd
train_file = r"D:\PycharmProjects\nlp\data\中文文本分类数据集\data\train.txt"
df = pd.read_csv(train_file, sep="\t", header=None, names=["id", "category", "sentence"])

rows = []
for index, row in df[['id', 'category', 'sentence']].iterrows():
    rows.append({
            'sentence': row['sentence'],
            'label': row['category'],
            })
    

train_data, test_data = split_data(rows, int(0.1*len(rows)))
train_data, dev_data = split_data(train_data, int(0.1*len(train_data)))

for data in (train_data, dev_data, test_data):
    tokenize(data)

In [17]:
token_to_idx = create_vocab(train_data)

label_to_idx = {
    "Addictive Behavior": 0, 
    "Age": 1,
    "Allergy Intolerance": 2,
    "Compliance with Protocol": 3,
    "Consent": 4,
    "Diagnostic": 5,
    "Disease": 6,
    "Enrollment in other studies": 7,
    "Laboratory Examinations": 8,
    "Life Expectancy": 9,
    "Organ or Tissue Status": 10,
    "Pharmaceutical Substance or Drug": 11,
    "Risk Assessment": 12,
    "Smoking Status": 13,
    "Therapy or Surgery": 14,
}
for data in (train_data, dev_data, test_data):
    apply_vocab(data, token_to_idx)
    apply_label_map(data, label_to_idx)



vocab size:1263
Most common tokens:[('；', 2913), ('.', 2286), (' ', 2059), ('、', 1962), ('）', 1910), ('（', 1669), ('的', 1624), ('或', 1383), ('。', 1259), ('，', 1231)]


In [18]:
pad_idx = token_to_idx[PAD]
train_dataset = SentimentDataset(train_data, pad_idx)
dev_dataset = SentimentDataset(dev_data, pad_idx)
test_dataset = SentimentDataset(test_data, pad_idx)

In [19]:
train_dataloader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=train_dataset.collate_fn
)
dev_dataloader = DataLoader(
    dev_dataset, batch_size=BATCH_SIZE, collate_fn=dev_dataset.collate_fn
)
test_dataloader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, collate_fn=test_dataset.collate_fn
)

model = SequenceClassifier(
    len(token_to_idx), EMBEDDING_DIM, HIDDEN_DIM, len(label_to_idx), N_RNN_LAYERS, pad_idx
)
print(f"Model has {count_parameters(model)} parameters.")


Model has 1791375 parameters.


In [20]:
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print(f"Random baseline")
evaluate(model, dev_dataloader, device)
for epoch in range(N_EPOCHS):
    print(f"Epoch {epoch + 1}")  # 0-based -> 1-based
    train(model, train_dataloader, optimizer, device)
    evaluate(model, dev_dataloader, device)
print(f"Test set performance")
evaluate(model, test_dataloader, device)

Random baseline


100%|██████████| 6/6 [00:00<00:00, 33.79it/s]


Auuracy:0.02361111111111111
Epoch 1


100%|██████████| 51/51 [00:01<00:00, 43.47it/s]
100%|██████████| 6/6 [00:00<00:00, 176.95it/s]


Auuracy:0.15555555555555556
Epoch 2


100%|██████████| 51/51 [00:01<00:00, 47.29it/s]
100%|██████████| 6/6 [00:00<00:00, 171.88it/s]


Auuracy:0.15138888888888888
Epoch 3


100%|██████████| 51/51 [00:01<00:00, 47.82it/s]
100%|██████████| 6/6 [00:00<00:00, 182.29it/s]


Auuracy:0.13194444444444445
Epoch 4


100%|██████████| 51/51 [00:01<00:00, 48.21it/s]
100%|██████████| 6/6 [00:00<00:00, 176.93it/s]


Auuracy:0.1527777777777778
Epoch 5


100%|██████████| 51/51 [00:01<00:00, 47.25it/s]
100%|██████████| 6/6 [00:00<00:00, 174.30it/s]


Auuracy:0.15833333333333333
Epoch 6


100%|██████████| 51/51 [00:01<00:00, 47.18it/s]
100%|██████████| 6/6 [00:00<00:00, 179.52it/s]


Auuracy:0.15555555555555556
Epoch 7


100%|██████████| 51/51 [00:01<00:00, 48.39it/s]
100%|██████████| 6/6 [00:00<00:00, 174.30it/s]


Auuracy:0.14166666666666666
Epoch 8


100%|██████████| 51/51 [00:01<00:00, 46.95it/s]
100%|██████████| 6/6 [00:00<00:00, 174.30it/s]


Auuracy:0.15
Epoch 9


100%|██████████| 51/51 [00:01<00:00, 48.00it/s]
100%|██████████| 6/6 [00:00<00:00, 160.39it/s]


Auuracy:0.17083333333333334
Epoch 10


100%|██████████| 51/51 [00:01<00:00, 48.36it/s]
100%|██████████| 6/6 [00:00<00:00, 162.36it/s]


Auuracy:0.14722222222222223
Epoch 11


100%|██████████| 51/51 [00:01<00:00, 48.30it/s]
100%|██████████| 6/6 [00:00<00:00, 174.30it/s]


Auuracy:0.15833333333333333
Epoch 12


100%|██████████| 51/51 [00:01<00:00, 47.86it/s]
100%|██████████| 6/6 [00:00<00:00, 160.37it/s]


Auuracy:0.16111111111111112
Epoch 13


100%|██████████| 51/51 [00:01<00:00, 48.00it/s]
100%|██████████| 6/6 [00:00<00:00, 169.40it/s]


Auuracy:0.15416666666666667
Epoch 14


100%|██████████| 51/51 [00:01<00:00, 48.16it/s]
100%|██████████| 6/6 [00:00<00:00, 174.30it/s]


Auuracy:0.16805555555555557
Epoch 15


100%|██████████| 51/51 [00:01<00:00, 47.80it/s]
100%|██████████| 6/6 [00:00<00:00, 174.36it/s]


Auuracy:0.15694444444444444
Epoch 16


100%|██████████| 51/51 [00:01<00:00, 48.21it/s]
100%|██████████| 6/6 [00:00<00:00, 169.26it/s]


Auuracy:0.15555555555555556
Epoch 17


100%|██████████| 51/51 [00:01<00:00, 48.30it/s]
100%|██████████| 6/6 [00:00<00:00, 164.76it/s]


Auuracy:0.15138888888888888
Epoch 18


100%|██████████| 51/51 [00:01<00:00, 47.56it/s]
100%|██████████| 6/6 [00:00<00:00, 174.30it/s]


Auuracy:0.1597222222222222
Epoch 19


100%|██████████| 51/51 [00:01<00:00, 47.92it/s]
100%|██████████| 6/6 [00:00<00:00, 179.50it/s]


Auuracy:0.16944444444444445
Epoch 20


100%|██████████| 51/51 [00:01<00:00, 47.50it/s]
100%|██████████| 6/6 [00:00<00:00, 179.54it/s]


Auuracy:0.15833333333333333
Test set performance


100%|██████████| 7/7 [00:00<00:00, 171.19it/s]

Auuracy:0.19125



