In [19]:
import torch 
import torch.nn as nn 
import torch.optim as optim 
import torch.nn.functional as F 
from torch.utils.data import * 
from keras.preprocessing.sequence import pad_sequences 
from keras.datasets import imdb 
import random
import numpy as np

torch.__version__

'1.10.2+cu102'

# LSTM sentiment classification

In [2]:
MAX_WORDS = 10000  # imdb’s vocab_size 即词汇表大小
MAX_LEN = 250      # max length
BATCH_SIZE = 256
EMB_SIZE = 128   # embedding size
HID_SIZE = 128   # lstm hidden size
DROPOUT = 0.2 
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
# 借助Keras加载imdb数据集
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=MAX_WORDS)

  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [4]:
word_index = imdb.get_word_index() # 获取字典，字典值从0开始
index_word = dict([(value,key) for (key,value) in word_index.items()]) # 倒转字典
# 字典在转化为序列时每个值都加了3，为0：padding，1：序列起始和2：未知位置，这里减回来。
index_word[-3]='<PAD>'
index_word[-2]='<BOS>'
index_word[-1]='<UNK>'

In [5]:
x_train = pad_sequences(x_train, maxlen=MAX_LEN, padding="post", truncating="post")
x_test = pad_sequences(x_test, maxlen=MAX_LEN, padding="post", truncating="post")
print(x_train.shape, x_test.shape)

(25000, 250) (25000, 250)


In [6]:
x_train

array([[   1,   14,   22, ...,    0,    0,    0],
       [   1,  194, 1153, ...,    0,    0,    0],
       [   1,   14,   47, ...,    0,    0,    0],
       ...,
       [   1,   11,    6, ...,    0,    0,    0],
       [   1, 1446, 7079, ...,    0,    0,    0],
       [   1,   17,    6, ...,    0,    0,    0]], dtype=int32)

In [7]:
# id2word
decode_review = ' '.join([index_word.get(i-3) for i in x_train[2]])
decode_review

"<BOS> this has to be one of the worst films of the 1990s when my friends i were watching this film being the target audience it was aimed at we just sat watched the first half an hour with our jaws touching the floor at how bad it really was the rest of the time everyone else in the theatre just started talking to each other leaving or generally crying into their popcorn that they actually paid money they had <UNK> working to watch this feeble excuse for a film it must have looked like a great idea on paper but on film it looks like no one in the film has a clue what is going on crap acting crap costumes i can't get across how <UNK> this is to watch save yourself an hour a bit of your life <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

In [8]:
y_train

array([1, 0, 0, ..., 0, 1, 0])

In [9]:
# 转化为TensorDataset
train_data = TensorDataset(torch.LongTensor(x_train), torch.LongTensor(y_train))
test_data = TensorDataset(torch.LongTensor(x_test), torch.LongTensor(y_test))

In [10]:
# 转化为 DataLoader
train_sampler = RandomSampler(train_data)
train_loader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

test_sampler = SequentialSampler(test_data)
test_loader = DataLoader(test_data, sampler=test_sampler, batch_size=BATCH_SIZE)

In [11]:
# 定义lstm模型用于文本分类
class Model(nn.Module):
    def __init__(self, max_words, emb_size, hid_size, dropout):
        super(Model, self).__init__()
        self.max_words = max_words
        self.emb_size = emb_size
        self.hid_size = hid_size
        self.dropout = dropout
        self.Embedding = nn.Embedding(self.max_words, self.emb_size)
        self.LSTM = nn.LSTM(self.emb_size, self.hid_size, num_layers=2,
                            batch_first=True, bidirectional=True)   # 2层单向LSTM
        self.dp = nn.Dropout(self.dropout)
        self.fc1 = nn.Linear(self.hid_size*2, self.hid_size)
        self.fc2 = nn.Linear(self.hid_size, 2)
    
    def forward(self, x):
        """
        input : [bs, maxlen]
        output: [bs, 2] 
        """
        x = x.long()
        x = self.Embedding(x)  # [bs, ml, emb_size]
        # print("After embedding, x shape:",x.shape)
        x = self.dp(x)
        x, _ = self.LSTM(x)  # [bs, ml, 2*hid_size]
        # print("After lstm, x shape:",x.shape)
        x = self.dp(x)
        x = F.relu(self.fc1(x))   # [bs, ml, hid_size]
        # print("After relu, x shape:",x.shape)
        x = F.avg_pool2d(x, (x.shape[1], 1)).squeeze()  # [bs, 1, hid_size] => [bs, hid_size]
        out = self.fc2(x)    # [bs, 2]
        out = out.reshape([-1,2])
        out = nn.Softmax(dim=1)(out)
        return out  # [bs, 2]

In [12]:
def train(model, device, train_loader, optimizer, epoch):   # 训练模型
    model.train()
    criterion = nn.CrossEntropyLoss()
    for batch_idx, (x, y) in enumerate(train_loader):
        x, y = x.to(DEVICE), y.to(DEVICE)
        optimizer.zero_grad()
        y_ = model(x)
        loss = criterion(y_, y)  # 得到loss
        loss.backward()
        optimizer.step()
        if(batch_idx + 1) % 10 == 0:    # 打印loss
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(x), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

In [13]:
def test(model, device, test_loader):    # 测试模型
    model.eval()
    criterion = nn.CrossEntropyLoss(reduction='sum')  # 累加loss
    test_loss = 0.0 
    acc = 0 
    for batch_idx, (x, y) in enumerate(test_loader):
        x, y = x.to(DEVICE), y.to(DEVICE)
        with torch.no_grad():
            y_ = model(x)
        test_loss += criterion(y_, y)
        pred = y_.max(-1, keepdim=True)[1]   # .max() 2输出，分别为最大值和最大值的index
        acc += pred.eq(y.view_as(pred)).sum().item()    # 记得加item()
    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)'.format(
        test_loss, acc, len(test_loader.dataset),
        100. * acc / len(test_loader.dataset)))
    return acc / len(test_loader.dataset)

In [15]:
model = Model(MAX_WORDS, EMB_SIZE, HID_SIZE, DROPOUT).to(DEVICE)
print(model)
optimizer = optim.Adam(model.parameters())

best_acc = 0.0 
PATH = 'Model/model.pth'  # 定义模型保存路径

Model(
  (Embedding): Embedding(10000, 128)
  (LSTM): LSTM(128, 128, num_layers=2, batch_first=True, bidirectional=True)
  (dp): Dropout(p=0.2, inplace=False)
  (fc1): Linear(in_features=256, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=2, bias=True)
)


In [18]:
for epoch in range(1, 11):  # 10个epoch
    train(model, DEVICE, train_loader, optimizer, epoch)
    acc = test(model, DEVICE, test_loader)
    if best_acc < acc: 
        best_acc = acc 
        torch.save(model.state_dict(), PATH)
    print("acc is: {:.4f}, best acc is {:.4f}\n".format(acc, best_acc)) 

Model(
  (Embedding): Embedding(10000, 128)
  (LSTM): LSTM(128, 128, num_layers=2, batch_first=True, bidirectional=True)
  (dp): Dropout(p=0.2, inplace=False)
  (fc1): Linear(in_features=256, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=2, bias=True)
)

Test set: Average loss: 0.5780, Accuracy: 17980/25000 (72%)
acc is: 0.7192, best acc is 0.7192


Test set: Average loss: 0.5351, Accuracy: 19109/25000 (76%)
acc is: 0.7644, best acc is 0.7644


Test set: Average loss: 0.4828, Accuracy: 20532/25000 (82%)
acc is: 0.8213, best acc is 0.8213


Test set: Average loss: 0.4711, Accuracy: 20820/25000 (83%)
acc is: 0.8328, best acc is 0.8328


Test set: Average loss: 0.4632, Accuracy: 21058/25000 (84%)
acc is: 0.8423, best acc is 0.8423


Test set: Average loss: 0.4496, Accuracy: 21431/25000 (86%)
acc is: 0.8572, best acc is 0.8572


Test set: Average loss: 0.4543, Accuracy: 21313/25000 (85%)
acc is: 0.8525, best acc is 0.8572


Test set: Average loss: 0.4421, Accur

In [140]:
# 检验保存的模型
best_model = Model(MAX_WORDS, EMB_SIZE, HID_SIZE, DROPOUT).to(DEVICE)
best_model.load_state_dict(torch.load(PATH))
test(best_model, DEVICE, test_loader)


Test set: Average loss: 0.4374, Accuracy: 21769/25000 (87%)


0.87076

# IntegeratedGradients

## select 10 items from test data

In [307]:
random.seed(2022)
idxs_selected=random.sample(range(0,x_test.shape[0]),5)
idxs_selected

[17420, 9467, 14503, 17883, 10159]

In [302]:
x_test_selected=x_test[idxs_selected,:]
y_test_selected=y_test[idxs_selected]

In [303]:
#  '''
# 用于计算NLP模型中的积分梯度；
# 1、由于NLP是离散型输入，因此只能通过对embedding layer的权重进行线性插值来实现输入的线性插值
# 2、计算之后得到的结果是（input_len,dim），计算每一个词向量累加和当做词的重要性
# :return:
# '''
def getIntegeratedGrads(x,y,steps,model):
    # x转tensor
    # 增加一维 1*max_len: 1*250
    x=torch.tensor(x)
    x=torch.unsqueeze(x,dim=0).long()
    
    # 除embedding层外，固定住所有的模型参数
    for name, weight in model.named_parameters():
        if "Embedding" not in name:
            weight.requires_grad = False
    # 获取原始的embedding权重
    # max_words*embedding_dim =10000*128
    init_embed_weight = model.Embedding.weight
    # 获取输入之后的embedding
    # max_len*embedding_dim: 250*128
    init_word_embedding = init_embed_weight[x[0]]
    
    # 获取baseline
    baseline = 0 * init_embed_weight
    baseline_word_embedding = baseline[x[0]]
    
    # 对目标权重进行线性缩放计算的路径
    gradient_list = []
    for i in range(steps + 1):
        # 进行线性缩放
        scale_weight = baseline + float(i / steps) * (init_embed_weight - baseline)
        # 更换模型embedding的权重
        model.state_dict()['Embedding.weight'] = scale_weight
        # 前馈计算
        pred = model(x.to(DEVICE))
        # 直接取对应维度的输出(没经过softmax)
        target_pred = pred[:, y]
        # 计算梯度
        target_pred.backward()
        # 获取输入变量的梯度
        gradient_list.append(model.Embedding.weight.grad[x[0]].cpu().numpy())
        # print(gradient_list[-1])
        # 梯度清零，防止累加
        best_model.zero_grad()
    
    # steps,max_len,dim: 50*250*128
    gradient_list = np.asarray(gradient_list)
    # max_len,dim:250*128
    avg_gradient = np.average(gradient_list, axis=0)
    # x-baseline
    delta_x = init_word_embedding - baseline_word_embedding
    delta_x = delta_x.detach().cpu().numpy()
    # print(delta_x.shape)
    # 获取积分梯度
    # maxlen: 250 获得了每个词的attribution
    ig = avg_gradient * delta_x
    # 对每一行进行相加得到(input_len,)
    word_ig = np.sum(ig, axis=1)
    return word_ig

In [304]:
# 翻译回原始句子并给出情感
def getOriginSentence(x,y,index_word,words_exact):
    x=x[words_exact]
    sentence=' '.join([index_word.get(j-3) for j in x])
    if y==1:
        sentiment="positive"
    else:
        sentiment="negative"
    return sentence,sentiment

In [305]:
# 可视化attributions
# red (very positive attributions) to blue (very negative), while gray color is no attribution
from IPython.display import display, HTML

def visualize_token_attrs(x,y,index_word,words_exact,ig):
    x=x[words_exact]
    tokens=[index_word.get(j-3) for j in x]
    attrs=ig[words_exact]
#   """
#   Visualize attributions for given set of tokens.
#   Args:
#   - tokens: An array of tokens
#   - attrs: An array of attributions, of same size as 'tokens',
#     with attrs[i] being the attribution to tokens[i]
  
#   Returns:
#   - visualization: An IPython.core.display.HTML object showing
#     tokens color-coded based on strength of their attribution.
#   """
    
    def get_color(attr):
        if attr > 0:
            r = int(128*attr) + 127
            g = 128 - int(64*attr)
            b = 128 - int(64*attr) 
        else:
            r = 128 + int(64*attr)
            g = 128 + int(64*attr) 
            b = int(-128*attr) + 127
        return r,g,b

  # normalize attributions for visualization.
    # bound = max(abs(attrs.max()), abs(attrs.min()))
    bound = np.percentile(abs(attrs),75)
    attrs = attrs/bound
    attrs[attrs > 1] = 1
    html_text = ""
    for i, tok in enumerate(tokens):
        r,g,b = get_color(attrs[i])
        html_text += " <span style='color:rgb(%d,%d,%d)'>%s</span>" % (r, g, b, tok)
    return HTML(html_text)

In [306]:
steps=70
for i in range(0,5):
    xi=x_test_selected[i,:]
    yi=y_test_selected[i]
    words_exact=(xi!=0)
    ig=getIntegeratedGrads(xi,yi,steps,best_model)
    sentence,sentiment=getOriginSentence(xi,yi,index_word,words_exact)
    print("Sentiment:",sentiment)
    print("Sentence:",sentence)
    display(visualize_token_attrs(xi,yi,index_word,words_exact,ig))
    # red (very positive attributions) to blue (very negative), while gray color is no attribution
    print("\n")

Sentiment: positive
Sentence: <BOS> now i myself had previously seen a few episodes of the of gentleman which i found hilarious when i brought the film i was not sure if i knew enough about the series to get it boy was i wrong this is one of the best comedy films i have seen ever and the clever acting of the makes the film it has a very good and funny plot as well as using only a few characters at any one time helps because it doesn't make it too confusing which would have <UNK> the film even if you have never seen the of gentleman get this film it will make you laugh and this is a film that can be watched more then once and is an excellent film to watch with your mates it truly deserves it review a definite 10 out of 10




Sentiment: negative
Sentence: <BOS> this show is pretty alright and fun to watch its a great disney channel shows and sometimes entertaining br br i really enjoyed the first season but i hated the second and third seasons this show has completely changed around in the first season it was more about science and animals all that is gone now in the season and third season its more about her life and dating ever since that gay kid ben came along this show has sucked the writers took a perfectly good kid show and changed it to a crappy teen comedy disney took a turn for the worse i cant stand to watch the <UNK> episodes anymore they're all garbage




Sentiment: positive
Sentence: <BOS> the title refers not to a questionable poker hand but to six comic players they come in <UNK> charles and mary <UNK> as a couple driving to california for a second honeymoon george burns and <UNK> allen as another couple who go along to share <UNK> and w c fields and alison as a sheriff and a hotel owner in a tiny <UNK> town no attempt is made to fashion a coherent a collection of comic bits strung together all the first couple want to do is spend time together but burns and allen's characters aren't married so the men <UNK> together as do the women there is a bit of a plot a bad guy plants <UNK> in the <UNK> <UNK> is taking out of town but because the expedition is being guided by <UNK> the <UNK> cannot be found the bad guy shows up in <UNK> and fields accidentally captures him a bunch of pleasant bits confused expression <UNK> <UNK> <UNK> talk and fields playing <UNK> with a <UNK> cue and doing a <UNK> craven backwards stepping double take when h



Sentiment: positive
Sentence: <BOS> i was fortunate enough to be an extra in this movie when i was about 13 during the roller <UNK> scenes my junior high school drama class was invited to participate it was a fantastic experience br br gary busey charles martin smith and don <UNK> played the music live all day as a musician i can appreciate the <UNK> work and dedication these guys put in to their roles they must have played those songs 20 times it's very difficult to maintain <UNK> and energy under those conditions this is visible during a cut to a close up on that'll be the day but fortunately the unsuspecting public probably wouldn't have picked it up br br <UNK> around all day getting the day off from school and being transported back in time was a incredible thrill i also had my first date on film i had to walk a girl up to the ticket booth woo <UNK> even with an out of date <UNK> and hot lights melting the <UNK> in my hair it was still worth it fun stuff br br the movie is top n



Sentiment: negative
Sentence: <BOS> i don't know why all the previous comments are <UNK> of this movie it is well not by far but the <UNK> movie i've seen lately full of clichés bad acting actually no very bad acting and has a silly plot if i would have seen it in a cinema i would have walked out after the first 20 minutes i f you hate somebody make him her watch this movie that's how bad it is a girl who has an imaginary boy friend that gives up a relationship with a real one because her imagination is jealous but i think it figures she takes after her parents who also have some mental issues plus the character who is supposed to be i think the laughing stock of it the thing that should make you laugh <UNK> room mate is a serious nut case and just makes me feel sorry for him and the whole movie






In [308]:
a=ig[words_exact]
b=np.percentile(abs(a),75)
a=a/b
a[a>1]=1
df=pd.DataFrame({'word_id':xi[words_exact],
                'attrs':a,
                'sentiment':yi})
df['word']=df['word_id'].map(lambda x:index_word[x-3])
df.groupby('word').mean().sort_values(by=['attrs'],ascending=False)

Unnamed: 0_level_0,word_id,attrs,sentiment
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
of,7,1.0,0
her,41,1.0,0
supposed,424,1.0,0
just,43,1.0,0
sorry,806,1.0,0
mental,1752,1.0,0
bad,78,1.0,0
minutes,234,1.0,0
for,18,0.972146,0
mate,3678,0.811338,0
