情感分析
=
模型从简单到复杂，依次构建：
-
* Word Averaging 模型
* RNN/LSTM 模型
* CNN 模型

In [2]:
import torch
from torchtext import data

SEED = 1234
# Seed PyTorch's CPU and CUDA random number generators with the same value.
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Ask cuDNN to pick deterministic algorithms so repeated GPU runs with the
# same seed give the same numbers (this can cost some speed).
torch.backends.cudnn.deterministic = True

# TEXT describes how review text is processed (tokenized with spaCy's en_core_web_sm).
# LABEL holds the sentiment label, stored as a float tensor.
TEXT = data.Field(tokenize = 'spacy', tokenizer_language = 'en_core_web_sm')
LABEL = data.LabelField(dtype = torch.float)


In [3]:
from torchtext import datasets
# Load the IMDB train and test splits, processing text with TEXT and labels with LABEL.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

In [3]:
# Report how many examples each raw split contains (typo "Numbere" fixed in the message).
print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')

Numbere of training examples: 25000
Number of testing examples: 25000


In [4]:
# Dump the attributes of the first training example (its tokenized 'text' among them).
print(vars(train_data.examples[0]))

{'text': ['While', 'Urban', 'Cowboy', 'did', 'not', 'ooze', 'with', 'the', 'same', 'testosterone', 'you', 'might', 'find', 'at', 'a', 'rodeo', ',', 'it', 'did', 'provide', 'an', 'accurate', 'glimpse', 'of', 'that', 'day', 'and', 'age', ',', 'in', 'urban', 'Texas', '.', 'I', 'also', 'think', 'that', 'to', 'truly', 'critique', 'this', 'movie', ',', 'one', 'would', 'have', 'to', 'have', 'lived', 'in', 'the', 'time', 'and', 'relative', 'place', 'that', 'it', 'was', 'made', '.', 'There', 'was', 'good', 'music', ',', 'fun', 'times', 'and', ',', 'yes', ',', 'a', 'few', '"', 'rough', 'and', 'tumbles', '"', 'at', 'the', 'honky', 'tonk', 'roadhouses', '.', 'The', 'relationship', 'of', 'Bud', 'and', 'Sissy', ',', 'like', '"', 'two', 'ships', 'passing', 'in', 'the', 'night', '"', ',', 'was', 'well', 'conceived', '.', 'When', 'Pam', 'tore', 'up', 'the', 'note', 'that', 'Sissy', 'had', 'written', 'to', 'Bud', ',', 'it', 'echoed', 'the', 'tragedy', 'of', 'many', 'true', 'life', 'romances', '.', 'The'

In [4]:
import random
# random.seed() returns None, so passing its result as random_state only worked
# through its side effect: the split falls back to the global `random` state when
# random_state is None. Seed first, then hand over that state explicitly.
# NOTE: this rebinds train_data, so re-running the cell splits the already-split data.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state = random.getstate())

检查一下每部分有多少数据

In [6]:
# Report the size of every split after carving out the validation set.
print(
    f'Number of training examples: {len(train_data)}',
    f'Number of validation examples: {len(valid_data)}',
    f'Number of testing examples: {len(test_data)}',
    sep = '\n',
)

Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000


* 下一步我们需要创建 vocabulary。vocabulary 就是把每个单词一一映射到一个整数索引（这个索引相当于 one-hot 向量中取值为 1 的位置）
* 我们使用最常见的 25k 个单词来构建我们的单词表，用 `max_size` 这个参数可以做到这一点
* 所有其他的单词都用 `<unk>` 来表示

In [5]:
MAX_VOCAB_SIZE = 25_000

# Build the text vocabulary from the training split only, capped at MAX_VOCAB_SIZE
# tokens, and attach the pretrained "glove.6B.100d" vectors.
# unk_init is the initializer torchtext applies to tokens that have no pretrained
# vector (here: in-place normal initialization) — see the torchtext Vocab docs.
TEXT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)

# The label vocabulary is likewise built from the training split.
LABEL.build_vocab(train_data)

定义迭代器以及各种参数
-
note: BucketIterator 会把长度相近的样本放进同一个 batch，从而尽量减少 padding；`sort_within_batch = True` 表示每个 batch 内部的样本再按长度排序（它并不是对 vocabulary 排序）

In [6]:
BATCH_SIZE = 64

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split, each producing batches of BATCH_SIZE examples on `device`.
# sort_within_batch = True asks torchtext to sort the examples inside each batch
# by the dataset's sort key.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    device = device)

构建模型
-


In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class WordAVGModel(nn.Module):
    """Bag-of-embeddings classifier: average the word vectors, then apply a linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_size: dimensionality of each word vector.
        output_size: number of logits produced per example.
        pad_idx: index of the padding token; nn.Embedding initializes that row to
            zeros and does not update it during training.
    """

    def __init__(self, vocab_size, embedding_size, output_size, pad_idx):
        super(WordAVGModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size, padding_idx = pad_idx)
        self.linear = nn.Linear(embedding_size, output_size)

    def forward(self, text):
        """Map token indices of shape [seq_len, batch_size] to logits of shape [batch_size, output_size]."""
        embedded = self.embed(text)  # [seq_len, batch_size, embedding_size]
        embedded = embedded.permute(1, 0, 2)  # [batch_size, seq_len, embedding_size]
        # A (seq_len, 1) window averages over the whole sequence axis and leaves the
        # embedding axis untouched. Padding positions are part of that average.
        pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1))  # [batch_size, 1, embedding_size]
        # Remove only the pooled axis. A bare squeeze() would also drop the batch
        # axis when batch_size == 1 and return logits without a batch dimension.
        pooled = pooled.squeeze(1)  # [batch_size, embedding_size]
        return self.linear(pooled)

        

In [8]:
# Model hyper-parameters derived from the vocabulary built on TEXT.
VOCAB_SIZE = len(TEXT.vocab)
EMBEDDING_SIZE = 100  # must equal the width of the pretrained vectors copied into the model
OUTPUT_SIZE = 1  # a single logit per review
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]  # vocabulary index of the padding token

model = WordAVGModel(vocab_size = VOCAB_SIZE,
                     embedding_size = EMBEDDING_SIZE,
                     output_size = OUTPUT_SIZE,
                     pad_idx = PAD_IDX)


In [22]:
def count_parameters(model):
    """Return the total element count of the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad == False) are not counted.
        if param.requires_grad:
            total += param.numel()
    return total


count_parameters(model)

2500301

In [23]:
# Element count of the model's first registered parameter
# (for WordAVGModel that is the embedding matrix, since `embed` is created first).
next(model.parameters()).numel()

2500200

模型词向量的初始化，利用stanford的glove.6B.100d

In [9]:
# Overwrite the randomly initialized embedding table with the pretrained vectors
# attached to the vocabulary.
pretrained_embedding = TEXT.vocab.vectors
model.embed.weight.data.copy_(pretrained_embedding)

# Zero the rows of the two special tokens so they start with no signal.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embed.weight.data[PAD_IDX].zero_()
model.embed.weight.data[UNK_IDX].zero_()

训练模型
-

In [10]:
# Move the model to the target device before building the optimizer over its parameters.
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters())
# Binary cross-entropy applied directly to raw logits (the sigmoid is folded into the loss).
criterion = nn.BCEWithLogitsLoss()
# The original assigned to a misspelled name (`crierion`); keep `criterion` as the single handle.
criterion = criterion.to(device)


In [11]:
def binary_accuracy(preds, y):
    """Return the fraction of predictions that match `y`, as a 0-d tensor.

    `preds` are raw logits; they are passed through a sigmoid and rounded to
    0/1 before being compared element-wise with the targets `y`.
    """
    probabilities = torch.sigmoid(preds)
    hard_labels = torch.round(probabilities)
    # The boolean comparison is cast to float so that hits can be summed.
    hits = (hard_labels == y).float()
    return hits.sum() / len(hits)
    

In [12]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch over `iterator`.

    Args:
        model: module called as model(batch.text) to produce one logit per example.
        iterator: iterable of batches exposing `.text` and `.label`.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as criterion(preds, batch.label).

    Returns:
        (mean loss, mean accuracy), both averaged per example over the epoch.
    """
    epoch_loss, epoch_acc = 0., 0.
    total_len = 0.
    model.train()
    for batch in iterator:
        # Drop only the trailing logit axis. A bare squeeze() would also collapse
        # the batch axis for a single-example batch, and the 0-d result would no
        # longer match the shape of batch.label.
        preds = model(batch.text).squeeze(-1)
        loss = criterion(preds, batch.label)
        acc = binary_accuracy(preds, batch.label)

        # One gradient step per batch.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight per-batch means by batch size so a smaller final batch does not
        # skew the epoch averages.
        batch_len = len(batch.label)
        epoch_loss += loss.item() * batch_len
        epoch_acc += acc.item() * batch_len
        total_len += batch_len

    return epoch_loss / total_len, epoch_acc / total_len
        

In [40]:
len(train_iterator)

274

In [13]:
def evaluate(model, iterator, criterion):
    """Compute the per-example mean loss and accuracy of `model` over `iterator`.

    The model is put in eval mode for the pass and switched back to train mode
    before returning (as the original did), so callers can resume training.

    Returns:
        (mean loss, mean accuracy), both averaged per example.
    """
    epoch_loss, epoch_acc = 0., 0.
    total_len = 0.
    model.eval()
    # No gradients are needed for evaluation; skipping autograd bookkeeping saves
    # memory and time without changing the computed values.
    with torch.no_grad():
        for batch in iterator:
            # Drop only the trailing logit axis so a single-example batch keeps
            # its batch dimension and still matches batch.label.
            preds = model(batch.text).squeeze(-1)
            loss = criterion(preds, batch.label)
            acc = binary_accuracy(preds, batch.label)

            batch_len = len(batch.label)
            epoch_loss += loss.item() * batch_len
            epoch_acc += acc.item() * batch_len
            total_len += batch_len

    model.train()

    return epoch_loss / total_len, epoch_acc / total_len


In [44]:
# Train the word-averaging model, checkpointing whenever validation accuracy improves.
N_EPOCHS = 10
best_valid_acc = 0.
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    # Keep only the weights of the best epoch seen so far.
    if best_valid_acc < valid_acc:
        best_valid_acc = valid_acc
        torch.save(model.state_dict(), "wordavg-model.pth")

    print(f"Epoch {epoch} Train Loss {train_loss} Train Acc {train_acc}")
    print(f"Epoch {epoch} Valid Loss {valid_loss} Valid Acc {valid_acc}")
    

Epoch 0 Train Loss 0.5512726911408561 Train Acc 0.7856571429116386
Epoch 0 Valid Loss 0.4839723803520203 Valid Acc 0.8262666666984558
Epoch 1 Train Loss 0.4278651227542332 Train Acc 0.8459428571973528
Epoch 1 Valid Loss 0.39040902093251545 Valid Acc 0.8608000000317891
Epoch 2 Train Loss 0.34556904196739197 Train Acc 0.876685714326586
Epoch 2 Valid Loss 0.33838990990320844 Valid Acc 0.8756000000317892
Epoch 3 Train Loss 0.2923483277797699 Train Acc 0.89828571434021
Epoch 3 Valid Loss 0.30775027653376263 Valid Acc 0.8817333333651225
Epoch 4 Train Loss 0.2545447506427765 Train Acc 0.9115428572382246
Epoch 4 Valid Loss 0.28786284338633217 Valid Acc 0.8886666666984558
Epoch 5 Train Loss 0.2248978919846671 Train Acc 0.9250285714558193
Epoch 5 Valid Loss 0.2757629540920258 Valid Acc 0.8918666666984558
Epoch 6 Train Loss 0.20098931844575066 Train Acc 0.9345142857960292
Epoch 6 Valid Loss 0.26628181886672975 Valid Acc 0.8934666666984558
Epoch 7 Train Loss 0.18019415139470782 Train Acc 0.9418857

In [45]:
# Restore the best checkpoint. map_location keeps the load working when the
# checkpoint was written on a different device than the current one.
model.load_state_dict(torch.load("wordavg-model.pth", map_location = device))

In [46]:
import spacy
nlp = spacy.load("en_core_web_sm")

def predict_sentiment(sentence):
    """Return the sigmoid of the global `model`'s logit for a raw sentence.

    The sentence is tokenized with the spaCy pipeline `nlp`, mapped to indices
    through TEXT.vocab.stoi, and fed to the model as a batch of one. Which class
    a value near 1.0 stands for depends on the label vocabulary.
    """
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)  # [seq_len]
    tensor = tensor.unsqueeze(1)  # [seq_len, batch_size] with batch_size = 1

    # Predict in eval mode and without autograd, so dropout (used by the later
    # models bound to `model`) is disabled; restore the previous mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            pred = torch.sigmoid(model(tensor))
    finally:
        model.train(was_training)
    return pred.item()

In [49]:
# The score is surprisingly close to 1.0 for this sentence.
# NOTE(review): which sentiment 1.0 stands for depends on LABEL.vocab.stoi — check it before interpreting.
predict_sentiment("This film is dangerous!")

0.9999974966049194

WordAVG结束
-

RNN模型
-


In [80]:
class RNNModel(nn.Module):
    """Two-layer bidirectional LSTM classifier over word embeddings.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_size: dimensionality of each word vector.
        output_size: number of logits produced per example.
        pad_idx: index of the padding token (its embedding row is kept at zero).
        hidden_size: hidden state width of each LSTM direction.
        dropout: dropout probability applied to the embeddings and to the final
            hidden representation.
    """

    def __init__(self, vocab_size, embedding_size, output_size, pad_idx, hidden_size, dropout):
        super(RNNModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size, padding_idx = pad_idx)
        self.lstm = nn.LSTM(embedding_size, hidden_size, bidirectional = True, num_layers = 2)
        # Both directions of the top layer are concatenated, hence hidden_size * 2.
        self.linear = nn.Linear(hidden_size * 2, output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Map token indices of shape [seq_len, batch_size] to logits of shape [batch_size, output_size].

        NOTE: sequences are not packed, so padding tokens are fed through the
        LSTM like any other token.
        """
        embedded = self.dropout(self.embed(text))  # [seq_len, batch_size, embedding_size]
        # hidden holds the final hidden state of every layer and direction:
        # [num_layers * 2, batch_size, hidden_size] = [4, batch_size, hidden_size].
        _, (hidden, _) = self.lstm(embedded)

        # The last two entries belong to the top layer (forward at -2, backward at -1);
        # concatenate them into one vector per example.
        hidden = torch.cat([hidden[-1], hidden[-2]], dim = 1)  # [batch_size, hidden_size * 2]
        # No squeeze here: the tensor is already 2-D, and a bare squeeze() would
        # drop the batch axis whenever batch_size == 1.
        hidden = self.dropout(hidden)
        return self.linear(hidden)
        
   
        
    

In [81]:
# Rebind `model` to the LSTM classifier; it reuses the vocabulary-derived
# constants and adds a hidden width of 100 per direction and 50% dropout.
model = RNNModel(vocab_size = VOCAB_SIZE,
                 embedding_size = EMBEDDING_SIZE,
                 output_size = OUTPUT_SIZE,
                 pad_idx = PAD_IDX,
                 hidden_size = 100,
                 dropout = 0.5)

In [82]:
# Seed the LSTM model's embedding table with the pretrained vectors from the vocabulary.
pretrained_embedding = TEXT.vocab.vectors
model.embed.weight.data.copy_(pretrained_embedding)

# Reset the padding and unknown-token rows to all zeros.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embed.weight.data[PAD_IDX].zero_()
model.embed.weight.data[UNK_IDX].zero_()

In [83]:
# Move the model to the target device before building the optimizer over its parameters.
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters())
# Binary cross-entropy applied directly to raw logits (the sigmoid is folded into the loss).
criterion = nn.BCEWithLogitsLoss()
# The original assigned to a misspelled name (`crierion`); keep `criterion` as the single handle.
criterion = criterion.to(device)


In [84]:
# Train the LSTM model, saving a checkpoint each time validation accuracy sets a new best.
N_EPOCHS = 10
best_valid_acc = 0.
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    # Only the best-performing epoch's weights are kept on disk.
    if best_valid_acc < valid_acc:
        best_valid_acc = valid_acc
        torch.save(model.state_dict(), "lstm-model.pth")

    print(f"Epoch {epoch} Train Loss {train_loss} Train Acc {train_acc}")
    print(f"Epoch {epoch} Valid Loss {valid_loss} Valid Acc {valid_acc}")
    

Epoch 0 Train Loss 0.6586226647921971 Train Acc 0.597371428666796
Epoch 0 Valid Loss 0.6078889149983724 Valid Acc 0.6725333333651224
Epoch 1 Train Loss 0.597791870376042 Train Acc 0.6787428572246007
Epoch 1 Valid Loss 0.48876981029510497 Valid Acc 0.7682666666666667
Epoch 2 Train Loss 0.5519022122383118 Train Acc 0.7268000000953674
Epoch 2 Valid Loss 0.61093304438591 Valid Acc 0.6617333333333333
Epoch 3 Train Loss 0.4441910290036883 Train Acc 0.8013142857824053
Epoch 3 Valid Loss 0.42243079606691997 Valid Acc 0.799066666730245
Epoch 4 Train Loss 0.4548588158743722 Train Acc 0.7856000000681196
Epoch 4 Valid Loss 0.3499969251314799 Valid Acc 0.8554666666984558
Epoch 5 Train Loss 0.31490092885153637 Train Acc 0.8705142857687814
Epoch 5 Valid Loss 0.29885362601280213 Valid Acc 0.8794666666984559
Epoch 6 Train Loss 0.2671865736280169 Train Acc 0.8939428571837289
Epoch 6 Valid Loss 0.3194438885132472 Valid Acc 0.8702666666666666
Epoch 7 Train Loss 0.28326265663419453 Train Acc 0.891142857183

CNN模型
-

Yoon Kim, Convolutional Neural Networks for Sentence Classification, 2014
--

这里定义的模型并行使用 3 种 filter_size 不同的卷积（用 nn.ModuleList 保存），各自做 max-over-time pooling 之后再拼接到一起；理论上 K 折检验中的多个模型也可以直接用这种 ModuleList 的写法组织起来
-


In [27]:
class CNN(nn.Module):
    """Sentence-classification CNN with several parallel filter widths (Kim, 2014).

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_size: dimensionality of each word vector.
        output_size: number of logits produced per example.
        pad_idx: index of the padding token (its embedding row is kept at zero).
        num_filters: number of output channels of each convolution.
        filter_sizes: sequence of filter heights (in tokens); one convolution per entry.
        dropout: dropout probability applied to the pooled features.
    """

    def __init__(self, vocab_size, embedding_size, output_size, pad_idx, num_filters, filter_sizes, dropout):
        super(CNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size, padding_idx = pad_idx)
        # Each filter spans `fs` tokens and the full embedding width.
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels = 1, out_channels = num_filters,
                      kernel_size = (fs, embedding_size))
            for fs in filter_sizes
        ])
        self.linear = nn.Linear(num_filters * len(filter_sizes), output_size)
        self.dropout = nn.Dropout(dropout)
        # A sequence shorter than the tallest filter cannot be convolved at all.
        self.min_seq_len = max(filter_sizes, default = 0)

    def forward(self, text):
        """Map token indices of shape [seq_len, batch_size] to logits of shape [batch_size, output_size]."""
        text = text.permute(1, 0)  # [batch_size, seq_len]
        embedded = self.embed(text)  # [batch_size, seq_len, embedding_size]
        embedded = embedded.unsqueeze(1)  # [batch_size, 1, seq_len, embedding_size]

        # Zero-pad the end of short sequences up to the tallest filter. The extra
        # rows are all zeros, like the padding token's embedding row. Inputs that
        # are already long enough pass through unchanged.
        shortfall = self.min_seq_len - embedded.shape[2]
        if shortfall > 0:
            embedded = F.pad(embedded, (0, 0, 0, shortfall))

        # Per filter size: [batch_size, num_filters, seq_len - fs + 1]
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        # Max-over-time pooling, per filter size: [batch_size, num_filters]
        pooled = [F.max_pool1d(feature_map, feature_map.shape[2]).squeeze(2) for feature_map in conved]
        pooled = torch.cat(pooled, dim = 1)  # [batch_size, len(filter_sizes) * num_filters]
        pooled = self.dropout(pooled)

        return self.linear(pooled)

In [28]:
# Rebind `model` to the CNN classifier: 100 filters for each of the
# filter heights 3, 4 and 5, with 50% dropout on the pooled features.
model = CNN(vocab_size = VOCAB_SIZE,
            embedding_size = EMBEDDING_SIZE,
            output_size = OUTPUT_SIZE,
            pad_idx = PAD_IDX,
            num_filters = 100, 
            filter_sizes =[3, 4, 5],
            dropout = 0.5)

In [29]:
# Seed the CNN's embedding table with the pretrained vectors from the vocabulary.
pretrained_embedding = TEXT.vocab.vectors
model.embed.weight.data.copy_(pretrained_embedding)

# Reset the padding and unknown-token rows to all zeros.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embed.weight.data[PAD_IDX].zero_()
model.embed.weight.data[UNK_IDX].zero_()

# Adam with default settings; binary cross-entropy on raw logits as the loss.
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

# Place both the model and the loss module on the selected device.
model = model.to(device)
criterion = criterion.to(device)


In [30]:
# Train the CNN, saving a checkpoint each time validation accuracy sets a new best.
N_EPOCHS = 10
best_valid_acc = 0.
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    # Only the best-performing epoch's weights are kept on disk.
    if best_valid_acc < valid_acc:
        best_valid_acc = valid_acc
        torch.save(model.state_dict(), "cnn-model.pth")

    print(f"Epoch {epoch} Train Loss {train_loss} Train Acc {train_acc}")
    print(f"Epoch {epoch} Valid Loss {valid_loss} Valid Acc {valid_acc}")
    

Epoch 0 Train Loss 0.6529268033300127 Train Acc 0.610971428666796
Epoch 0 Valid Loss 0.5137638531684875 Valid Acc 0.7816
Epoch 1 Train Loss 0.43114568466459 Train Acc 0.797485714326586
Epoch 1 Valid Loss 0.35205441596508025 Valid Acc 0.8484
Epoch 2 Train Loss 0.3050284328392574 Train Acc 0.8746857143947057
Epoch 2 Valid Loss 0.3147250477353732 Valid Acc 0.8672
Epoch 3 Train Loss 0.22456137347902572 Train Acc 0.9123428572109767
Epoch 3 Valid Loss 0.3078390346844991 Valid Acc 0.8684000000317892
Epoch 4 Train Loss 0.16277963508878435 Train Acc 0.9375428572518485
Epoch 4 Valid Loss 0.30748442580302554 Valid Acc 0.8754666666666666
Epoch 5 Train Loss 0.11796935871669224 Train Acc 0.9583428572518485
Epoch 5 Valid Loss 0.3260279593884945 Valid Acc 0.8728
Epoch 6 Train Loss 0.08232758175475256 Train Acc 0.9712000000272478
Epoch 6 Valid Loss 0.3485170791953802 Valid Acc 0.8741333333333333
Epoch 7 Train Loss 0.054858031742913385 Train Acc 0.9836000000272478
Epoch 7 Valid Loss 0.37702733241418995 