text classifiction(
-----------

In [1]:
import torch
from torchtext.legacy import data

SEED = 2021

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=float)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")



In [2]:
from torchtext.legacy import datasets
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

In [3]:
print(len(train_data))
print(len(test_data))

25000
25000


In [4]:
print(vars(train_data.examples[0]))

{'text': ['I', 'like', 'this', 'film', 'a', 'lot', '.', 'It', 'has', 'a', 'wonderful', 'chemistry', 'between', 'the', 'actors', 'and', 'tells', 'a', 'story', 'that', 'is', 'pretty', 'universal', ',', 'the', 'story', 'of', 'the', 'prodigal', 'son', '.', 'The', 'aspect', 'I', 'like', 'the', 'best', 'however', 'was', 'the', 'way', 'that', 'the', 'bath', 'house', 'was', 'more', 'than', 'just', 'a', 'background', 'for', 'the', 'story', '.', 'As', 'the', 'father', 'told', 'the', 'son', 'the', 'story', 'of', 'his', 'wife', "'s", 'family', 'in', 'the', 'northern', 'deserts', 'of', 'china', ',', 'the', 'element', 'of', 'water', 'and', 'bathing', 'becomes', 'an', 'almost', 'sacred', 'ritual', '.', 'Water', 'was', 'so', 'scarce', 'that', 'a', 'simple', 'bath', 'had', 'profound', 'depth', 'and', 'meaning.<br', '/><br', '/>Overall', 'the', 'film', 'was', 'very', 'effective', '.', 'There', 'were', 'moments', ',', 'however', ',', 'when', 'it', 'verged', 'on', '"', 'too', '"', 'sweet', '...', 'borderi

In [5]:
import random
train_data, val_data = train_data.split(random_state=random.seed(SEED))

In [6]:
print(len(train_data))
print(len(val_data))
print(len(test_data))

17500
7500
25000


- word2idx & idx2word
- idx2one-hot

In [7]:
# glove是斯坦福训练的一些高质量词向量
# 通常把embedding这层初始化成预训练的一些向量
TEXT.build_vocab(train_data, max_size=25000, vectors="glove.6B.100d", unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)

In [8]:
print(TEXT.vocab.freqs.most_common(20))

[('the', 203725), (',', 193518), ('.', 166335), ('a', 110155), ('and', 110050), ('of', 101236), ('to', 94443), ('is', 76935), ('in', 61664), ('I', 54376), ('it', 53996), ('that', 49554), ('"', 44878), ("'s", 43639), ('this', 42540), ('-', 37461), ('/><br', 35862), ('was', 35140), ('as', 30678), ('with', 30114)]


In [9]:
BATCH_SIZE = 32
train_iter, val_iter, test_iter = data.BucketIterator.splits((train_data, val_data, test_data),
                          batch_size=BATCH_SIZE,
                          device=device,)

In [10]:
batch = next(iter(val_iter))
print(batch.text[:,0])
[TEXT.vocab.itos[i] for i in batch.text[:,0]]

tensor([1303,    2, 1147,  466,    7,  394,   68,    3,  304,    6,    2, 1060,
          28,    2,  418, 1216, 3982,    4,  172,  330, 3537,   31,   80,   16,
          22,    3,  575, 2256,   10,    2,  100,  517,    4, 4237,    3,    7,
         284,    3,   31,  204,   94,  561,   17, 2747,    4], device='cuda:0')


['Perhaps',
 'the',
 'biggest',
 'waste',
 'of',
 'production',
 'time',
 ',',
 'money',
 'and',
 'the',
 'space',
 'on',
 'the',
 'video',
 'store',
 'shelf',
 '.',
 'If',
 'someone',
 'suggests',
 'you',
 'see',
 'this',
 'movie',
 ',',
 'run',
 'screaming',
 'in',
 'the',
 'other',
 'direction',
 '.',
 'Unless',
 ',',
 'of',
 'course',
 ',',
 'you',
 "'re",
 'into',
 'self',
 '-',
 'abuse',
 '.']

weight averaging
---------
做句子里各个词向量的平均，然后做分类
- 训练出词向量
- 做平均，得到sentence vector
- 训练出sentence vector的分类器

In [11]:
import torch.nn as nn
import torch.nn.functional as F

class WordAVGModel(nn.Module):
    def __init__(self, vocab_size, embed_size, output_size, pad_idx):
        super(WordAVGModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size, padding_idx=pad_idx)
        self.linear = nn.Linear(embed_size, output_size)
    
    def forward(self, text):
        # text:[seq_length, batch_size]
        embedded = self.embed(text) # embedded:[seq_length, batch_size, embed_size]
        embedded = embedded.permute(1,0,2)# embedded:[batch_size, seq_length, embed_size]
        pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze() # pooled:[bsz, embed_size]
        return self.linear(pooled) # ->here we get logits

In [12]:
VOCAB_SIZE = len(TEXT.vocab)
EMBED_SIZE = 100
OUTPUT_SIZE = 1 # 阈值分类
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = WordAVGModel(vocab_size=VOCAB_SIZE,
                     embed_size=EMBED_SIZE,
                     output_size=OUTPUT_SIZE,
                     pad_idx=PAD_IDX)

In [13]:
model

WordAVGModel(
  (embed): Embedding(25002, 100, padding_idx=1)
  (linear): Linear(in_features=100, out_features=1, bias=True)
)

In [14]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

count_parameters(model)

2500301

In [15]:
pretrained_embedding = TEXT.vocab.vectors
model.embed.weight.data.copy_(pretrained_embedding)

UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embed.weight.data[PAD_IDX] = torch.zeros(EMBED_SIZE)
model.embed.weight.data[UNK_IDX] = torch.zeros(EMBED_SIZE)

训练模型
----


In [16]:
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.5) # lr decay

model = model.to(device)
criterion = criterion.to(device)

In [17]:
def binary_accuracy(preds, y):
    rounded = torch.round(torch.sigmoid(preds))
    correct = (rounded == y).float()
    acc = correct.sum()/len(correct)
    return acc

In [18]:
def train(model, iterator, optimizer, criterion):
    epoch_loss, epoch_acc, total_len = 0., 0., 0.
    model.train()
    for it in iterator:
        preds = model(it.text).squeeze() # original preds:[bsz, 1]
        loss = criterion(preds, it.label)
        acc = binary_accuracy(preds, it.label)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item() * len(batch.label)
        epoch_acc += acc.item() * len(batch.label)
        total_len += len(batch.label)
 
    return epoch_loss / total_len, epoch_acc / total_len

In [19]:
def evaluate(model, iterator, optimizer, criterion):
    epoch_loss, epoch_acc, total_len = 0., 0., 0.
    model.eval()
    for it in iterator:
        preds = model(it.text).squeeze()
        loss = criterion(preds, it.label)
        acc = binary_accuracy(preds, it.label)
        
        epoch_loss += loss.item() * len(batch.label)
        epoch_acc += acc.item() * len(batch.label)
        total_len += len(batch.label)
        
    model.train()
    
    return epoch_loss / total_len, epoch_acc / total_len

In [20]:
NUM_EPOCHS = 10
best_valid_acc = 0.
for epoch in range(NUM_EPOCHS):
    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    val_loss, val_acc = evaluate(model, val_iter, optimizer, criterion)
    
    print("Epoch", epoch, "Train Loss", train_loss, "Train Acc", train_acc)
    print("Epoch", epoch, "Valid Loss", val_loss, "Valid Acc", val_acc)
    
    if val_acc > best_valid_acc:
        best_valid_acc = val_acc
        torch.save(model.state_dict(), "wordavg_model.pth")
        print("best model saved to wordavg_model.pth, val_loss=", val_acc)

Epoch 0 Train Loss 0.6749341328372044 Train Acc 0.6569029121119971
Epoch 0 Valid Loss 0.5570298828189971 Valid Acc 0.7408687944107867
best model saved to wordavg_model.pth, val_loss= 0.7408687944107867
Epoch 1 Train Loss 0.5821926220603701 Train Acc 0.7796585271083895
Epoch 1 Valid Loss 0.4138156174300788 Valid Acc 0.813874113559723
best model saved to wordavg_model.pth, val_loss= 0.813874113559723
Epoch 2 Train Loss 0.46654784509904834 Train Acc 0.8414566466751656
Epoch 2 Valid Loss 0.35676279022230456 Valid Acc 0.8549645390916378
best model saved to wordavg_model.pth, val_loss= 0.8549645390916378
Epoch 3 Train Loss 0.38706389309972644 Train Acc 0.8767547010285763
Epoch 3 Valid Loss 0.3557774397370671 Valid Acc 0.8691932624958931
best model saved to wordavg_model.pth, val_loss= 0.8691932624958931
Epoch 4 Train Loss 0.3324609484885088 Train Acc 0.8932488901741544
Epoch 4 Valid Loss 0.36928652644013593 Valid Acc 0.8781028369639782
best model saved to wordavg_model.pth, val_loss= 0.87810

In [21]:
model.load_state_dict(torch.load("wordavg_model.pth"))

<All keys matched successfully>

In [22]:
import spacy
nlp = spacy.load("en_core_web_sm")

def predict_sentiment(sen):
    tokenized = [tok.text for tok in nlp.tokenizer(sen)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1) # ->[seq_length, bsz]
    pred = torch.sigmoid(model(tensor))
    return pred.item()

In [23]:
predict_sentiment("i want to see more")

6.584218413541176e-12

In [24]:
predict_sentiment("the film is terrible")

1.0

RNN模型
-----
用RNN模型encode这个句子，把传到最后的隐状态作为encode

In [29]:
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embed_size, output_size, pad_idx, hidden_size, dropout):
        super(RNNModel, self).__init__()     
        self.embed = nn.Embedding(vocab_size, embed_size, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers=2, bidirectional=True)
        self.linear = nn.Linear(hidden_size*2, output_size)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, text):
        # text:[seq_length, batch_size]
        embedded = self.embed(text) # embedded:[seq_length, batch_size, embed_size]
        embedded = self.dropout(embedded)
        output, (hidden, cell) = self.lstm(embedded)
        
        # hidden:[num_layers*num_directions, batch_size, hidden_size]
        # 两层各一个前向一个反向的hidden state，所以取最后两个（第二层的）
        hidden = torch.cat([hidden[-1], hidden[-2]], dim=1)
        hidden = self.dropout(hidden.squeeze())
        return self.linear(hidden) # ->here we get logits

In [30]:
model = RNNModel(vocab_size=VOCAB_SIZE,
                 embed_size=EMBED_SIZE,
                 output_size=OUTPUT_SIZE,
                 pad_idx=PAD_IDX,
                 hidden_size=100,
                 dropout=0.5)

In [31]:
pretrained_embedding = TEXT.vocab.vectors
model.embed.weight.data.copy_(pretrained_embedding)

UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embed.weight.data[PAD_IDX] = torch.zeros(EMBED_SIZE)
model.embed.weight.data[UNK_IDX] = torch.zeros(EMBED_SIZE)

optimizer = torch.optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.5) # lr decay

model = model.to(device)
criterion = criterion.to(device)

In [32]:
NUM_EPOCHS = 10
for epoch in range(NUM_EPOCHS):
    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    val_loss, val_acc = evaluate(model, val_iter, optimizer, criterion)
    
    print("Epoch", epoch, "Train Loss", train_loss, "Train Acc", train_acc)
    print("Epoch", epoch, "Valid Loss", val_loss, "Valid Acc", val_acc)
    
    if val_acc > best_valid_acc:
        best_valid_acc = val_acc
        torch.save(model.state_dict(), "LSTM_model.pth")
        print("best model saved to LSTM_model.pth, val_loss=", val_acc)

Epoch 0 Train Loss 0.6664477060409713 Train Acc 0.5916851658707781
Epoch 0 Valid Loss 0.6132248257504815 Valid Acc 0.6838209220703612
Epoch 1 Train Loss 0.6061158321576084 Train Acc 0.6854759729322611
Epoch 1 Valid Loss 0.6833972782938549 Valid Acc 0.5235372340425531
Epoch 2 Train Loss 0.4740264137214915 Train Acc 0.7776916297305873
Epoch 2 Valid Loss 0.3365465347378528 Valid Acc 0.8560283688788718
Epoch 3 Train Loss 0.30914569001010567 Train Acc 0.874461347703742
Epoch 3 Valid Loss 0.3086086222090026 Valid Acc 0.8757092199427017
Epoch 4 Train Loss 0.2577675924705586 Train Acc 0.899378101394207
Epoch 4 Valid Loss 0.2846900516757779 Valid Acc 0.884751773134191
Epoch 5 Train Loss 0.2143628503715951 Train Acc 0.9197897623400365
Epoch 5 Valid Loss 0.2739277087139929 Valid Acc 0.8968528369639782
Epoch 6 Train Loss 0.18410167872320696 Train Acc 0.9315421781766567
Epoch 6 Valid Loss 0.28143334175580637 Valid Acc 0.8957890071767441
Epoch 7 Train Loss 0.15773891922953404 Train Acc 0.94096043362

CNN模型
----

In [43]:
class CNNModel(nn.Module):
    def __init__(self, vocab_size, embed_size, output_size, pad_idx, num_filters, filter_sizes, dropout):
        super(CNNModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size, padding_idx=pad_idx)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=num_filters,
                      kernel_size=(fsz, embed_size))
            for fsz in filter_sizes
        ])
#         self.conv = nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(filter_size, embed_size))
        self.linear = nn.Linear(num_filters*len(filter_sizes), output_size)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, text):
        text = text.permute(1, 0) # text:[batch_size, seq_length]
        embedded = self.embed(text) # embedded:[batch_size, seq_length, embed_size]
        
        # [N, C, H, W]
        embedded = embedded.unsqueeze(1) # embedded:[batch_size, 1, seq_length, embed_size]
#         conved = F.relu(self.conv(embedded)) # conved:[batch_size, num_filters, seq_length-filter_size+1, 1]
#         conved = conved.squeeze(3)
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
    
        # maxpooling
#         pooled = F.max_pool1d(conved, conved.shape[2]) # pooled:[batch_size, num_filters, 1]
#         pooled = pooled.squeeze(2)
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        pooled = torch.cat(pooled, dim=1) # pooled:[batch_size, 3*num_filters]
        pooled = self.dropout(pooled)
        
        return self.linear(pooled) # ->here we get logits

In [44]:
model = CNNModel(vocab_size=VOCAB_SIZE,
                 embed_size=EMBED_SIZE,
                 output_size=OUTPUT_SIZE,
                 pad_idx=PAD_IDX,
                 num_filters=100,
                 filter_sizes=[3, 4, 5],
                 dropout=0.5)

In [45]:
pretrained_embedding = TEXT.vocab.vectors
model.embed.weight.data.copy_(pretrained_embedding)

UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embed.weight.data[PAD_IDX] = torch.zeros(EMBED_SIZE)
model.embed.weight.data[UNK_IDX] = torch.zeros(EMBED_SIZE)

optimizer = torch.optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.5) # lr decay

model = model.to(device)
criterion = criterion.to(device)

In [46]:
NUM_EPOCHS = 10
for epoch in range(NUM_EPOCHS):
    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    val_loss, val_acc = evaluate(model, val_iter, optimizer, criterion)
    
    print("Epoch", epoch, "Train Loss", train_loss, "Train Acc", train_acc)
    print("Epoch", epoch, "Valid Loss", val_loss, "Valid Acc", val_acc)
    
    if val_acc > best_valid_acc:
        best_valid_acc = val_acc
        torch.save(model.state_dict(), "CNN_model.pth")
        print("best model saved to CNN_model.pth, val_loss=", val_acc)

Epoch 0 Train Loss 0.607937832928573 Train Acc 0.6585351920650908
Epoch 0 Valid Loss 0.41745231992751686 Valid Acc 0.8156028369639782
Epoch 1 Train Loss 0.3730899749583265 Train Acc 0.8323893314959581
Epoch 1 Valid Loss 0.3314560703259949 Valid Acc 0.8618351063829788
Epoch 2 Train Loss 0.2560321971560451 Train Acc 0.8958442152309244
Epoch 2 Valid Loss 0.31487216588940503 Valid Acc 0.8733156029214251
Epoch 3 Train Loss 0.17023584130432842 Train Acc 0.9365777618488402
Epoch 3 Valid Loss 0.3302223778614733 Valid Acc 0.876374113559723
Epoch 4 Train Loss 0.11057369865897392 Train Acc 0.9607518281535649
Epoch 4 Valid Loss 0.35572421963634865 Valid Acc 0.8731826241980207
Epoch 5 Train Loss 0.07154549534732031 Train Acc 0.9767400104777251
Epoch 5 Valid Loss 0.4056132808631031 Valid Acc 0.8713209220703612
Epoch 6 Train Loss 0.0500120494589295 Train Acc 0.9833099373515825
Epoch 6 Valid Loss 0.45166121195074616 Valid Acc 0.8685283688788719
Epoch 7 Train Loss 0.04202591477006022 Train Acc 0.986680