In [112]:
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.manifold import TSNE

from tensorflow.keras import layers
import tensorflow_datasets as tfds

import matplotlib.pyplot as plt
from highlight_text import ax_text, fig_text

import altair as alt

In [2]:
# Tab-separated sentence file; later cells read its `text` column.
SENTIMENT_TSV = './combined_sentiment_labelled.tsv'
data = pd.read_csv(SENTIMENT_TSV, sep='\t')

In [40]:
# data

In [21]:
# len(set([x for l in data.text.apply(lambda x: x.split(" ")).values for x in l]))

8018

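Reference for the model and input pipeline below (TensorFlow tutorial, text classification with an RNN): https://www.tensorflow.org/tutorials/text/text_classification_rnn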

In [3]:
# as_supervised=True yields (text, label) pairs, which the later
# `lambda text, label: ...` mapping relies on.
dataset = tfds.load(name='imdb_reviews', as_supervised=True)



In [4]:
# Pull the two splits out of the loaded dataset mapping.
train_dataset = dataset['train']
test_dataset = dataset['test']

In [8]:
# for example, label in train_dataset.take(1):
#   print('text: ', example.numpy())
#   print('label: ', label.numpy())

In [5]:
BUFFER_SIZE = 10000  # number of elements held in the shuffle buffer
BATCH_SIZE = 64

# Build both pipelines from the untouched `dataset` splits instead of from
# `train_dataset` / `test_dataset` themselves. Rebinding a name from its own
# previous value made this cell non-idempotent: running it twice would batch
# already-batched data.
train_dataset = (
    dataset['train']
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)
test_dataset = (
    dataset['test']
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [6]:
VOCAB_SIZE = 1000  # max_tokens for the vectorizer's vocabulary

# The layer is named explicitly because later cells fetch it with
# get_layer(name="text_vectorization"). The auto-generated name is the same
# on a fresh kernel but gains a numeric suffix whenever this cell is re-run,
# which would break those lookups.
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=VOCAB_SIZE,
    name="text_vectorization")

# Fit the vocabulary on the training texts only (labels are dropped).
encoder.adapt(train_dataset.map(lambda text, label: text))

In [7]:
# Sentiment classifier: raw string -> token ids -> embeddings -> LSTM ->
# dense head producing a single logit (no sigmoid; see from_logits below).
#
# Every layer is named explicitly. Later cells retrieve 'embedding', 'lstm'
# and 'dense' via model.get_layer(name=...); the auto-generated names match
# these on a fresh kernel but pick up numeric suffixes when this cell is
# re-run. All Dense layers must be named once one of them is, otherwise the
# unnamed one would be auto-named 'dense' as well and clash.
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary())+2,
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True,
        name='embedding'),
    tf.keras.layers.LSTM(64, name='lstm'),
    tf.keras.layers.Dense(64, activation='relu', name='dense'),
    tf.keras.layers.Dense(1, name='dense_1')
])
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [8]:
# with tf.device('/GPU:0'):
#     model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
#                   optimizer=tf.keras.optimizers.Adam(1e-4),
#                   metrics=['accuracy'])
#     history = model.fit(train_dataset, epochs=10,
#                         validation_data=test_dataset, 
#                         validation_steps=30)

In [147]:
# model.save_weights('first_model/')

In [9]:
# Restore the weights saved under this prefix instead of retraining.
# Kept as the cell's last expression so the load status object is displayed.
WEIGHTS_PREFIX = 'first_model/'
model.load_weights(WEIGHTS_PREFIX)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x1bccf0b7a20>

----

In [10]:
class bcolors:
    """ANSI escape sequences for styling text printed to a terminal.

    Prefix a string with one of these codes and append ``ENDC`` to reset the
    styling afterwards.
    """

    # Reset all attributes.
    ENDC = '\033[0m'

    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Foreground colours (SGR codes 91-96).
    RED = '\033[91m'
    GREEN = '\033[92m'
    WARNING = '\033[93m'
    OKBLUE = '\033[94m'
    HEADER = '\033[95m'
    OKCYAN = '\033[96m'

In [11]:
# Example sentence; note that the loop in a later cell rebinds `text`.
text = data.text[7]


def color_text(text, model=model, threshold=0.1):
    """Colour each token of ``text`` by how much it shifts the model's logit.

    The model is evaluated on every prefix of the sentence ("w1", "w1 w2",
    ...). A token's contribution is the change in logit caused by appending
    it, measured against a baseline of 0 before the first token.

    Parameters
    ----------
    text : str
        Sentence to explain; tokens are obtained by splitting on single spaces.
    model : object with ``predict``
        Called once on a 1-D array of prefix strings; must return an array of
        shape (n_prefixes, 1). Defaults to the notebook's trained model.
    threshold : float, default 0.1
        Tokens whose contribution is below this fraction of the largest
        absolute contribution in the sentence are left uncoloured.

    Returns
    -------
    (str, float)
        The sentence with ANSI colour codes around each token (red = pushes
        the logit down, green = pushes it up), and the logit for the full
        sentence.
    """
    tokens = text.split(" ")

    # Score all prefixes in one batched call instead of one predict() per
    # token. Shorter prefixes get padded inside the batch; the notebook's
    # model masks padding (mask_zero=True), so each prefix is scored as if
    # it were predicted on its own.
    prefixes = np.array([" ".join(tokens[:k + 1]) for k in range(len(tokens))])
    logits = model.predict(prefixes)[:, 0]
    pred = logits[-1]

    # Contribution of token k = logit(prefix k) - logit(prefix k-1).
    deltas = np.diff(np.concatenate(([0], logits)))

    # Hoisted out of the per-token loop; guards the 0/0 case where no token
    # changes the logit at all.
    scale = np.max(np.abs(deltas))

    colors = []
    for delta in deltas:
        if scale == 0 or abs(delta / scale) < threshold:
            colors.append(bcolors.ENDC)
        elif delta < 0:
            colors.append(bcolors.RED)
        else:
            colors.append(bcolors.GREEN)

    colored = " ".join(color + token + bcolors.ENDC
                       for color, token in zip(colors, tokens))
    return colored, pred

Source for the ANSI colour codes in `bcolors`: https://stackoverflow.com/questions/287871/how-to-print-colored-text-in-python

In [12]:
# Print the predicted class and the token-coloured text for the first 20 rows.
for text in data.text[:20]:
    colored, logit = color_text(text)
    # A negative logit means the negative class; anything else is positive.
    verdict = "NEGATIVE" if logit < 0 else "POSITIVE"
    print(verdict, "|", colored)

NEGATIVE | [0mSo[0m [0mthere[0m [0mis[0m [91mno[0m [0mway[0m [0mfor[0m [0mme[0m [0mto[0m [0mplug[0m [92mit[0m [0min[0m [91mhere[0m [0min[0m [0mthe[0m [92mUS[0m [91munless[0m [0mI[0m [0mgo[0m [0mby[0m [0ma[0m [0mconverter.[0m
POSITIVE | [92mGood[0m [0mcase,[0m [92mExcellent[0m [0mvalue.[0m
POSITIVE | [92mGreat[0m [0mfor[0m [0mthe[0m [0mjawbone.[0m
POSITIVE | [91mTied[0m [91mto[0m [0mcharger[0m [0mfor[0m [0mconversations[0m [0mlasting[0m [92mmore[0m [0mthan[0m [0m45[0m [0mminutes.MAJOR[0m [92mPROBLEMS!![0m
POSITIVE | [0mThe[0m [0mmic[0m [0mis[0m [92mgreat.[0m
NEGATIVE | [0mI[0m [91mhave[0m [91mto[0m [0mjiggle[0m [0mthe[0m [0mplug[0m [91mto[0m [92mget[0m [92mit[0m [0mto[0m [91mline[0m [0mup[0m [92mright[0m [0mto[0m [92mget[0m [91mdecent[0m [0mvolume.[0m
POSITIVE | [91mIf[0m [92myou[0m [0mhave[0m [92mseveral[0m [0mdozen[0m [0mor[0m [92mseveral[0m [0mhundr

-----------

### Average of word embeddings

In [173]:
def sen2vec(x):
    """Return per-token embeddings for the sentences in ``x``.

    ``x`` is passed through the trained model's text-vectorization layer and
    the resulting token ids through its embedding layer; the LSTM and dense
    layers are not applied.
    """
    embedder = model.get_layer(name='embedding')
    vectorizer = model.get_layer(name="text_vectorization")
    token_ids = vectorizer(x)
    return embedder(token_ids)

In [174]:
# Embed the first three sentences; each one is wrapped in its own
# single-element list so the input has one row per sentence.
first_sentences = data.text.values[:3]
a = sen2vec([[sentence] for sentence in first_sentences])

In [175]:
# Axes of the embedded batch: (samples, words per padded sentence, embedding size)
a.shape

TensorShape([3, 21, 64])

### Processed sentences

In [217]:
# Sub-model that reuses the trained layers from raw text up to and including
# the layer named 'dense', i.e. everything except the model's last layer.
sen2vec_model = tf.keras.Sequential(
    [model.get_layer(name=layer_name)
     for layer_name in ("text_vectorization", 'embedding', 'lstm', 'dense')]
)

In [219]:
# One seed for both the sentence sample and the t-SNE layouts, so the plots
# are identical after Restart & Run All.
SEED = 42

sentences = data.sample(n=50, random_state=SEED).text.values

tsne = TSNE(random_state=SEED)

# "Raw" sentence vector = mean of the sentence's word embeddings.
# Sentences are padded with token id 0 to a common length; those padding
# positions are not words, so they are excluded from the mean. Averaging over
# them would pull short sentences toward the padding embedding.
token_ids = model.get_layer(name="text_vectorization")([[x] for x in sentences])
word_vectors = model.get_layer(name='embedding')(token_ids).numpy()
is_word = (token_ids.numpy() != 0)[..., np.newaxis]
# Clamp to 1 so a sentence with no tokens yields a zero vector, not 0/0.
word_counts = np.maximum(is_word.sum(axis=1), 1)
sentence_means = (word_vectors * is_word).sum(axis=1) / word_counts
tsned_space_raw = tsne.fit_transform(sentence_means)

# "Processed" sentence vector = output of the trained sub-model.
tsned_space_proc = tsne.fit_transform(sen2vec_model.predict(sentences))

In [220]:
# Flatten the (n, 1) model output to one logit per sentence and label by sign.
sentence_logits = model.predict(sentences).reshape(-1)
pred_labels = ['Positive' if is_positive else 'Negative'
               for is_positive in (sentence_logits > 0)]

# One row per sampled sentence: 2-D coordinates in both t-SNE spaces, the
# sentence text (for tooltips), and the predicted class (for colouring).
tsne_plot_data = pd.DataFrame({
    'x_raw': tsned_space_raw[:, 0],
    'y_raw': tsned_space_raw[:, 1],
    'x_proc': tsned_space_proc[:, 0],
    'y_proc': tsned_space_proc[:, 1],
    'sentence': sentences,
    'pred': pred_labels,
})

In [221]:
# Two independent rectangular brushes, one per chart. empty='none' means an
# empty brush selects nothing rather than everything.
brush_options = dict(empty='none', encodings=['x', 'y'])
selector_raw = alt.selection_interval(**brush_options)
selector_proc = alt.selection_interval(**brush_options)

In [225]:
# Scatter of the "raw" t-SNE space. This chart owns the `selector_raw` brush,
# while its point opacity follows the brush drawn on the other chart
# (`selector_proc`), so brushing one panel highlights points in the other.
raw_color = alt.Color('pred', scale=alt.Scale(domain=['Negative', 'Positive'],
                                              range=['red', 'green']))
raw_opacity = alt.condition(selector_proc, alt.value(1), alt.value(0.1))

words_tsned = (
    alt.Chart(tsne_plot_data)
    .mark_circle(size=200)
    .encode(
        x='x_raw',
        y='y_raw',
        tooltip=alt.Tooltip('sentence'),
        color=raw_color,
        opacity=raw_opacity,
    )
    .properties(title='Raw sentences')
    .add_selection(selector_raw)
)

In [226]:
# Scatter of the "processed" t-SNE space. This chart owns the `selector_proc`
# brush, while its point opacity follows the brush drawn on the other chart
# (`selector_raw`), so brushing one panel highlights points in the other.
proc_color = alt.Color('pred', scale=alt.Scale(domain=['Negative', 'Positive'],
                                               range=['red', 'green']))
proc_opacity = alt.condition(selector_raw, alt.value(1), alt.value(0.1))

sentences_tsned = (
    alt.Chart(tsne_plot_data)
    .mark_circle(size=200)
    .encode(
        x='x_proc',
        y='y_proc',
        tooltip=alt.Tooltip('sentence'),
        color=proc_color,
        opacity=proc_opacity,
    )
    .properties(title='Processed sentences')
    .add_selection(selector_proc)
)

In [227]:
# Show the two linked scatter plots side by side.
alt.hconcat(words_tsned, sentences_tsned)