# How to apply pretrained RNNs to sentiment classification

In [201]:
import re
import torch
from torch.autograd import Variable

import sys
# git clone https://github.com/Cadene/skip-thoughts.torch your-path/skip-thoughts.torch 
sys.path.append('/Users/remicadene/Documents/skip-thoughts.torch/pytorch')
import skipthoughts

In [212]:
# Toy corpus of four short product reviews, kept in a fixed order because
# later cells refer to the sentences by position.
list_text = [
    """This product is so cool!""",
    """This product is awesome!.""",
    """This product is the worst.""",
    """This product is bad.""",
]

In [202]:
def tokenize_mcb(s):
    """Lowercase a sentence and split it into word tokens.

    The characters ? ! ' " $ : @ ( ) , . ; are deleted outright, while
    ``-`` and ``/`` are replaced by a space so they act as word separators.
    The text is then split on any run of whitespace, so tabs and newlines
    (e.g. coming from multi-line triple-quoted strings) separate words just
    like spaces do.

    Args:
        s: The raw sentence, as a string.

    Returns:
        A list of non-empty lowercase tokens in order of appearance
        (an empty list when nothing is left after cleaning).
    """
    t_str = s.lower()
    # Punctuation that is simply dropped (so "don't" becomes "dont").
    t_str = re.sub(r'[?!\'"$:@(),.;]', '', t_str)
    # Hyphens and slashes glue words together; turn them into separators.
    t_str = re.sub(r'[-/]', ' ', t_str)
    # split() without an argument splits on any whitespace and never yields
    # empty strings, so no additional filtering is required.
    return t_str.split()

In [203]:
# Tokenize every sentence: list_words[i] holds the tokens of list_text[i].
list_words = [tokenize_mcb(sentence) for sentence in list_text]

# Show the first sentence next to its tokens as a quick sanity check.
print("Sentence: ", list_text[0])
print()
print("Tokenized: ", list_words[0])

Sentence:  This product is so cool!

Tokenized:  ['this', 'product', 'is', 'so', 'cool']


In [204]:
# Vocabulary: every distinct token across all sentences, sorted so that the
# word ids are the same from one run to the next.
# The set comprehension flattens the nested token lists in a single pass;
# sum(list_words, []) would re-copy the accumulated list at every step.
vocab = sorted({word for words in list_words for word in words})

# Word ids start at 1, so id 0 is never assigned to a real word.
wid_to_word = dict(enumerate(vocab, start=1))
word_to_wid = {word: wid for wid, word in wid_to_word.items()}

print("Vocab: ", vocab)
print()
print("wid_to_word: ", wid_to_word)
print()
print("word_to_wid: ", word_to_wid)

Vocab:  ['awesome', 'bad', 'cool', 'is', 'product', 'so', 'the', 'this', 'worst']

wid_to_word:  {1: 'awesome', 2: 'bad', 3: 'cool', 4: 'is', 5: 'product', 6: 'so', 7: 'the', 8: 'this', 9: 'worst'}

word_to_wid:  {'awesome': 1, 'bad': 2, 'cool': 3, 'is': 4, 'product': 5, 'so': 6, 'the': 7, 'this': 8, 'worst': 9}


In [206]:
# Machine-specific directory handed to UniSkip as `dir_st`.
# NOTE(review): presumably where the pretrained skip-thoughts files live — confirm against the skipthoughts module.
skipthoughts_dir = '/local/cadene/data/skip-thoughts'

# Build the encoder restricted to this notebook's vocabulary.
model = skipthoughts.UniSkip(dir_st=skipthoughts_dir, vocab=vocab)
# Put the model in evaluation mode before using it for feature extraction.
model.eval()
print(model)

UniSkip (
  (embedding): Embedding(10, 620, padding_idx=0)
  (rnn): GRU(620, 2400, batch_first=True, dropout=0.25)
)


In [207]:
# Pack the tokenized sentences into one zero-filled LongTensor of word ids,
# one row per sentence; positions past the end of a sentence stay at 0.
# Rows are at least 30 columns wide and grow when a sentence has more than
# 30 tokens, instead of failing with an out-of-range index.
min_seq_len = 30
seq_len = max([min_seq_len] + [len(words) for words in list_words])
input_data = torch.zeros(len(list_words), seq_len).long()
for i, words in enumerate(list_words):
    for j, word in enumerate(words):
        input_data[i, j] = word_to_wid[word]

print('Text:', list_text[0])
print()
print('Tokens/words:', list_words[0])
print()
print('Words indices (wid):', input_data[0])

Text: This product is so cool!

Tokens/words: ['this', 'product', 'is', 'so', 'cool']

Words indices (wid): 
 8
 5
 4
 6
 3
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
[torch.LongTensor of size 30]



In [208]:
# Wrap the word-id tensor in a Variable (no gradient is needed, we only
# extract features) and run it through the model.
# The variable is deliberately not called `input`: that name would shadow
# the Python builtin for the rest of the kernel session.
input_var = Variable(input_data, requires_grad=False)
output = model(input_var)

print(output)

Variable containing:
 1.1492e-01 -1.3318e-01 -3.1362e-02  ...  -1.0996e-01 -1.5351e-01  1.1315e-01
 1.1731e-01 -1.2809e-01  1.8069e-01  ...  -3.5556e-03 -7.3061e-02  8.4156e-02
 6.8998e-02 -8.0519e-02 -3.5652e-02  ...  -2.2748e-04  5.7240e-02  8.8556e-02
 9.8589e-02 -1.2645e-01  9.2250e-02  ...  -2.8879e-03  9.3828e-03  8.0683e-02
[torch.FloatTensor of size 4x2400]



In [209]:
# Print each sentence with its hand-assigned sentiment label, separated by
# blank lines (no blank line after the last one).
captions = (
    "1. Good sentence:",
    "2. Good sentence:",
    "3. Bad sentence:",
    "4. Bad sentence:",
)
for position, caption in enumerate(captions):
    if position:
        print()
    print(caption, list_text[position])

1. Good sentence: This product is so cool!

2. Good sentence: This product is awesome!.

3. Bad sentence: This product is the worst.

4. Bad sentence: This product is bad.


In [210]:
def print_distances(ref_idx, embeddings):
    """Print the distance between one row of `embeddings` and every row.

    Distances are computed with torch.dist using its default norm.

    Args:
        ref_idx: 0-based index of the reference row; it is shown 1-based in
            the printed text.
        embeddings: 2-D tensor whose rows are compared with the reference row.
    """
    # Iterate over the actual number of rows rather than a hard-coded count,
    # so the cell keeps working when sentences are added or removed.
    for i in range(embeddings.size(0)):
        print("Distance between "+str(ref_idx+1)+". and "+str(i+1)+". :", torch.dist(embeddings[ref_idx], embeddings[i]))


# Sentence 1 against every sentence.
print_distances(0, output.data)

print()

# Sentence 3 against every sentence.
print_distances(2, output.data)

Distance between 1. and 1. : 0.0
Distance between 1. and 2. : 6.261634373944153
Distance between 1. and 3. : 7.528564480834965
Distance between 1. and 4. : 6.926180566075845

Distance between 3. and 1. : 7.528564480834965
Distance between 3. and 2. : 7.2155281572546555
Distance between 3. and 3. : 0.0
Distance between 3. and 4. : 6.143091070593966
