In [23]:
import re
import tensorflow as tf
import random
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers

## Load data

In [2]:
# Read the positive reviews, one sentence per line. The encoding is given
# explicitly so decoding does not depend on the platform's default locale.
with open('./data/rt-polaritydata/rt-polarity-utf8.pos', 'r', encoding='utf-8') as f:
    raw_positive_sentences = f.readlines()

# Drop the " \n" (space + newline) terminator from each line; done after the
# file is closed because it no longer needs the handle.
raw_positive_sentences = [raw_sentence.replace(' \n', '') for raw_sentence in raw_positive_sentences]

In [3]:
# Read the negative reviews, one sentence per line. The encoding is given
# explicitly so decoding does not depend on the platform's default locale.
with open('./data/rt-polaritydata/rt-polarity-utf8.neg', 'r', encoding='utf-8') as f:
    raw_negative_sentences = f.readlines()

# Drop the " \n" (space + newline) terminator from each line; done after the
# file is closed because it no longer needs the handle.
raw_negative_sentences = [raw_sentence.replace(' \n', '') for raw_sentence in raw_negative_sentences]

## Pre-processing

In [4]:
def _string_cleaner(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [5]:
# Run every sentence of both corpora through the same cleaner so that the
# vocabulary built later sees consistently tokenised, lower-cased text.
raw_positive_sentences = list(map(_string_cleaner, raw_positive_sentences))
raw_negative_sentences = list(map(_string_cleaner, raw_negative_sentences))

In [6]:
# Vocabulary as an ordered list: a word's position is its integer id.
# Index 0 is reserved for the '<unk>' placeholder.
vocab = ['<unk>']

# Membership is tracked in a set so each lookup is O(1); testing
# `word not in vocab` against the growing list made the build quadratic.
_seen_words = set(vocab)

# Positive sentences are scanned first, then negative ones, so ids are
# assigned in first-occurrence order across that sequence.
for corpus in (raw_positive_sentences, raw_negative_sentences):
    for sentence in corpus:
        for word in sentence.split(" "):
            if word not in _seen_words:
                _seen_words.add(word)
                vocab.append(word)

In [7]:
# Number of distinct tokens, counting the '<unk>' placeholder.
len(vocab)

18765

In [8]:
# Word -> id lookup built once from `vocab`. Resolving ids with
# `vocab.index(word)` scanned the whole list for every token.
# `setdefault` keeps the first position of a word, matching `list.index`.
word_to_index = {}
for index, word in enumerate(vocab):
    word_to_index.setdefault(word, index)

# Encode each sentence as a list of vocabulary ids.
positive_sentences = [
    [word_to_index[word] for word in sentence.split(" ")]
    for sentence in raw_positive_sentences
]
negative_sentences = [
    [word_to_index[word] for word in sentence.split(" ")]
    for sentence in raw_negative_sentences
]

# Label convention: 1 = positive, 0 = negative.
positive_labels = [1] * len(positive_sentences)
negative_labels = [0] * len(negative_sentences)

# Combined corpus: all positives followed by all negatives. The inner lists
# are the same objects as in positive_sentences / negative_sentences, so an
# in-place change to one is visible through the other.
all_sentences = positive_sentences + negative_sentences
all_labels = positive_labels + negative_labels

# Longest encoded sentence; used as the padding target. 0 for an empty corpus.
sentence_length = max((len(sentence) for sentence in all_sentences), default=0)

In [9]:
# Sanity check: first encoded sentence, its cleaned text, and the maximum
# sentence length, one per line.
print(positive_sentences[0], raw_positive_sentences[0], sentence_length, sep="\n")

[1, 2, 3, 4, 5, 6, 1, 7, 8, 9, 10, 11, 12, 13, 14, 9, 15, 5, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
the rock is destined to be the 21st century 's new conan and that he 's going to make a splash even greater than arnold schwarzenegger , jean claud van damme or steven segal
56


In [10]:
def paddinger(sentence_list, sentence_length):
    """
    Right-pad every sentence with zeros up to ``sentence_length``.

    Each inner list is extended in place; sentences that already have
    ``sentence_length`` or more items are left untouched (nothing is
    truncated).

    Args:
        sentence_list: List of lists of token ids.
        sentence_length: Target length for each inner list.

    Returns:
        The same ``sentence_list`` object that was passed in.
    """
    for tokens in sentence_list:
        missing = sentence_length - len(tokens)
        if missing > 0:
            tokens.extend([0] * missing)
    return sentence_list

In [11]:
# Pad all three collections to the common length, in the order
# positive -> negative -> combined.
padded_groups = [
    paddinger(group, sentence_length)
    for group in (positive_sentences, negative_sentences, all_sentences)
]
positive_sentences, negative_sentences, all_sentences = padded_groups

In [27]:
# Shuffle sentences and labels together by permuting a shared index list.
# The fixed seed makes the permutation repeatable.
index_list = list(range(len(all_sentences)))

random.seed(0)
random.shuffle(index_list)

shuffled_sentence = [all_sentences[index] for index in index_list]
shuffled_labels = [all_labels[index] for index in index_list]

# Positions strictly below this (possibly fractional) cutoff go to the
# training split; the remaining ~1% form the test split.
train_cutoff = len(shuffled_sentence) - (len(shuffled_sentence) * 0.01)

train_data, test_data = [], []
train_label, test_label = [], []

for position, (sentence, label) in enumerate(zip(shuffled_sentence, shuffled_labels)):
    if position < train_cutoff:
        train_data.append(sentence)
        train_label.append(label)
    else:
        test_data.append(sentence)
        test_label.append(label)

# Index 0 holds the training part, index 1 the test part.
sample_data = [train_data, test_data]
sample_label = [train_label, test_label]

In [28]:
# Convert the training split (element 0 of each pair) to NumPy arrays and
# report how many samples and labels there are.
train_sentences, train_targets = sample_data[0], sample_label[0]
train_data = np.array(train_sentences)
train_label = np.array(train_targets)
print("data: {}, label: {}".format(len(train_data), len(train_label)))

data: 10556, label: 10556


## Word embeddings

In [29]:
# Size of each learned word vector, and number of rows in the embedding table.
embedding_dim = 128
vocab_size = len(vocab)

# Embedding -> GRU (full sequence) -> GRU (last state) -> sigmoid unit.
model = keras.Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, input_length=sentence_length))
# The first GRU returns its output at every timestep so the second GRU
# receives a sequence.
model.add(layers.GRU(units=32, return_sequences=True))
model.add(layers.GRU(units=32))
# Single sigmoid output for the binary positive/negative label.
model.add(layers.Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 56, 128)           2401920   
_________________________________________________________________
unified_gru_6 (UnifiedGRU)   (None, 56, 32)            15552     
_________________________________________________________________
unified_gru_7 (UnifiedGRU)   (None, 32)                6336      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 2,423,841
Trainable params: 2,423,841
Non-trainable params: 0
_________________________________________________________________


In [30]:
# Binary classification setup: Adam optimizer, binary cross-entropy loss,
# accuracy reported during training.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 30 epochs in batches of 256, with 20% of the training arrays
# set aside for validation.
history = model.fit(
    x=train_data,
    y=train_label,
    batch_size=256,
    epochs=30,
    validation_split=0.2,
)

Train on 8444 samples, validate on 2112 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [31]:
# The embedding layer is the first layer of the model; its first weight
# array is the embedding matrix.
e = model.layers[0]
layer_weights = e.get_weights()
weights = layer_weights[0]
print(weights.shape)  # shape: (vocab_size, embedding_dim)

(18765, 128)


In [32]:
import io

# Export the learned embeddings as two parallel TSV files:
#   vecs.tsv - one row per word, tab-separated vector components
#   meta.tsv - one row per word, the word itself
# Row i of both files refers to vocab[i]. The `with` block closes both
# files even if a write raises part-way through.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
    for word, embeddings in zip(vocab, weights):
        out_m.write(word + "\n")
        out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")