In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.manifold import TSNE

from tensorflow.keras import layers
import tensorflow_datasets as tfds

import matplotlib.pyplot as plt
from highlight_text import ax_text, fig_text

import altair as alt

from tqdm import tqdm

In [2]:
def probability(x):
    return np.round(np.abs(2 * (1 / (1 + np.exp(-x)) - 0.5)), 2)

In [3]:
data = pd.read_csv('./combined_sentiment_labelled.tsv', sep='\t')

In [4]:
# data

In [5]:
# len(set([x for l in data.text.apply(lambda x: x.split(" ")).values for x in l]))

https://www.tensorflow.org/tutorials/text/text_classification_rnn

In [6]:
dataset = tfds.load('imdb_reviews',
                    as_supervised=True)



In [7]:
train_dataset, test_dataset = dataset['train'], dataset['test']

In [8]:
# for example, label in train_dataset.take(1):
#   print('text: ', example.numpy())
#   print('label: ', label.numpy())

In [9]:
BUFFER_SIZE = 10000
BATCH_SIZE = 64

train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)

In [10]:
VOCAB_SIZE=5000
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(train_dataset.map(lambda text, label: text))

In [11]:
# sum([1 for i in train_dataset.as_numpy_iterator()])

In [12]:
# checkpoint_path = "training/cp-{epoch:04d}.ckpt"

# cp_callback = tf.keras.callbacks.ModelCheckpoint(
#     filepath=checkpoint_path, 
#     verbose=1, 
#     save_weights_only=True)

In [13]:
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()) + 2,
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1)
])
model.predict(train_dataset.take(1))
print()




In [14]:
# with tf.device('/GPU:0'):
#     model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
#                   optimizer=tf.keras.optimizers.Adam(1e-4),
#                   metrics=['accuracy'])
    
#     model.predict(train_dataset.take(1))
#     model.save_weights(checkpoint_path.format(epoch=0))
    
#     history = model.fit(train_dataset, 
#                         epochs=5,
#                         validation_data=test_dataset, 
#                         validation_steps=30,
#                         callbacks=[cp_callback])

In [15]:
# model.save_weights('trained_model/')

In [16]:
model.load_weights('trained_model/')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x26cbd083ef0>

In [None]:
history.history

In [None]:
plt.plot(history.history['accuracy'], label='train accuracy')
plt.plot(history.history['val_accuracy'], label='test accuracy')
plt.title('Model Accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend()
plt.show()

----

In [53]:
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    GREEN = '\033[92m'
    WARNING = '\033[93m'
    RED = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

In [54]:
def color_text(text, model=model):
    tokens = text.split(" ")
    probs = [0]
    for k in range(0,len(tokens)):
        probs.append(model.predict(np.array([" ".join(tokens[:k+1])]))[0][0])
    pred = probs[-1]
    probs = np.diff(probs)
    colors = [bcolors.ENDC if abs(p / max(np.abs(probs))) < 0.1 
                  else (bcolors.RED if p < 0 else bcolors.GREEN) 
              for p in probs]
    ends = [bcolors.ENDC] * len(probs)
    return " ".join([c+t+e for c,t,e in zip(colors, tokens, ends)]), pred

https://stackoverflow.com/questions/287871/how-to-print-colored-text-in-python

In [98]:
reviews = data.text[np.random.randint(0, len(data), size=5)]
model.load_weights('./training/cp-0005.ckpt')

for text in reviews:
    pred = color_text(text, model=model)
    print("NEGATIVE" if pred[1] < 0 else "POSITIVE", "|", probability(pred[1]), "|", pred[0])

NEGATIVE | 0.58 | [91mThen,[0m [91mas[0m [91mif[0m [0mI[0m [0mhadn't[0m [91mwasted[0m [91menough[0m [0mof[0m [0mmy[0m [0mlife[0m [91mthere,[0m [0mthey[0m [91mpoured[0m [0msalt[0m [0min[0m [0mthe[0m [0mwound[0m [0mby[0m [91mdrawing[0m [0mout[0m [0mthe[0m [0mtime[0m [0mit[0m [0mtook[0m [0mto[0m [0mbring[0m [0mthe[0m [0mcheck.[0m
POSITIVE | 0.48 | [0mThis[0m [92mwonderful[0m [92mexperience[0m [0mmade[0m [0mthis[0m [0mplace[0m [0ma[0m [0mmust-stop[0m [0mwhenever[0m [0mwe[0m [0mare[0m [0min[0m [0mtown[0m [0magain.[0m
POSITIVE | 0.35 | [0mThis[0m [0mwas[0m [91msuch[0m [0man[0m [92mawesome[0m [0mmovie[0m [0mthat[0m [0mi[0m [92mbought[0m [92mit[0m [91moff[0m [0mof[0m [91mEbay.[0m [0m[0m [0m[0m
NEGATIVE | 0.4 | [92mThe[0m [92mlast[0m [0m15[0m [91mminutes[0m [0mof[0m [91mmovie[0m [91mare[0m [92malso[0m [91mnot[0m [91mbad[0m [91mas[0m [92mwell.[0m [0m[0m [0m

In [None]:
text = data.text[np.random.randint(0, len(data))]

for i in range(6):
    model.load_weights(f'./training/cp-000{i}.ckpt')
    pred = color_text(text, model=model)
    print(f"Epoch {i}", "|", "NEGATIVE" if pred[1] < 0 else "POSITIVE", "|", probability(pred[1]), "|", pred[0])

In [128]:
import pickle

colored = []

def color_text(text, model):
    def make_colored_text(text, p, probs):
        if abs(p / max(np.abs(probs))) < 0.1:
            return f"<span style='color:grey; opacity:0.3'>{text}</span>"
        elif p < 0:
            return f"<span style='color:red; opacity:{abs(p / max(np.abs(probs))) + 0.2}'>{text}</span>"
        else:
            return f"<span style='color:green; opacity:{abs(p / max(np.abs(probs))) + 0.2}'>{text}</span>"

    tokens = text.split(" ")
    probs = [0]
    for k in range(0, len(tokens)):
        probs.append(model.predict(np.array([" ".join(tokens[:k+1])]))[0][0])
    fin_prob = probs[-1]
    probs = np.diff(probs)
    colored_texts = [make_colored_text(token, p, probs)
                     for token, p in zip(tokens, probs)]
    return " ".join(colored_texts), fin_prob

for text in tqdm(data.text):
    try:
        pred = color_text(text, model=model)
        colored.append(("NEG" if pred[1] < 0 else "POS") + " | " +
                                str(probability(pred[1])) + " | " +
                                pred[0])
    except:
        pass
    
with open('colored.txt', 'wb') as f:
    pickle.dump(colored, f)

In [127]:
display(Markdown(color_text(data.iloc[0].text, model)[0]))

<span style='color:grey; opacity:0.3'>So</span> <span style='color:red; opacity:0.4014657391256363'>there</span> <span style='color:grey; opacity:0.3'>is</span> <span style='color:red; opacity:0.7757818412859647'>no</span> <span style='color:grey; opacity:0.3'>way</span> <span style='color:grey; opacity:0.3'>for</span> <span style='color:grey; opacity:0.3'>me</span> <span style='color:grey; opacity:0.3'>to</span> <span style='color:red; opacity:0.3662201889201698'>plug</span> <span style='color:green; opacity:0.40451070797823185'>it</span> <span style='color:grey; opacity:0.3'>in</span> <span style='color:red; opacity:0.3899029169566821'>here</span> <span style='color:grey; opacity:0.3'>in</span> <span style='color:grey; opacity:0.3'>the</span> <span style='color:green; opacity:0.5628545339016471'>US</span> <span style='color:red; opacity:1.2'>unless</span> <span style='color:grey; opacity:0.3'>I</span> <span style='color:grey; opacity:0.3'>go</span> <span style='color:grey; opacity:0.3'>by</span> <span style='color:grey; opacity:0.3'>a</span> <span style='color:grey; opacity:0.3'>converter.</span>

In [120]:
from IPython.display import display, Markdown
display(Markdown("<span style='color:red; opacity:0.3'>there</span>"))

<span style='color:red; opacity:0.3'>there</span>

-----------

### Average of word embeddings

In [20]:
def sen2vec(x):
    return model.get_layer(name='embedding')(model.get_layer(name="text_vectorization")(x))

In [21]:
a = sen2vec([[x] for x in data.text.values[:3]])

In [None]:
# samples, words, embedding
a.shape

### Processed sentences

In [22]:
sen2vec_model = tf.keras.Sequential([
    model.get_layer(name="text_vectorization"),
    model.get_layer(name='embedding'),
    model.get_layer(name='lstm'),
    model.get_layer(name='dense')
])

In [24]:
sen2vec_model_interm = tf.keras.Sequential([
    model.get_layer(name="text_vectorization"),
    model.get_layer(name='embedding'),
    model.get_layer(name='lstm')
])

In [25]:
sentences = data.sample(n=50).text.values

tsne = TSNE()
tsned_space_raw = tsne.fit_transform(sen2vec([[x] for x in sentences]).numpy().mean(axis=1))

tsned_space_proc = tsne.fit_transform(sen2vec_model.predict(sentences))

tsned_space_intermediate = tsne.fit_transform(sen2vec_model_interm.predict(sentences))

In [26]:
tsne_plot_data = pd.DataFrame({'x_raw': tsned_space_raw[:,0], 
                               'y_raw': tsned_space_raw[:,1],
                               'x_interm': tsned_space_intermediate[:,0], 
                               'y_interm': tsned_space_intermediate[:,1],
                               'x_proc': tsned_space_proc[:,0], 
                               'y_proc': tsned_space_proc[:,1],
                               'sentence': sentences, 
                               'opacity': np.abs(model.predict(sentences).reshape(-1)),
                               'prob': model.predict(sentences).reshape(-1).round(2).astype(str),
                               'pred': ['Positive' if x else 'Negative' 
                                        for x in (model.predict(sentences).reshape(-1) > 0)]})

In [27]:
selector_embs = alt.selection_interval(empty='all', encodings=['x', 'y'])

In [76]:
words_tsned = alt.Chart(tsne_plot_data).mark_circle(size=200).encode(
    x = 'x_raw',
    y = 'y_raw',
    tooltip =[alt.Tooltip('sentence'), alt.Tooltip('prob')],
    color = alt.Color('pred', 
                      scale=alt.Scale(domain=['Negative', 'Positive'], 
                                      range=['red', 'green']), 
                      legend=None),
    opacity=alt.condition(selector_embs, 'opacity', alt.value(0.05), legend=None)
).properties(
    title='Raw sentences'
).add_selection(
    selector_embs
)

In [77]:
interm_tsned = alt.Chart(tsne_plot_data).mark_circle(size=200).encode(
    x = 'x_interm',
    y = 'y_interm',
    tooltip =[alt.Tooltip('sentence'), alt.Tooltip('prob')],
    color = alt.Color('pred', 
                      scale=alt.Scale(domain=['Negative', 'Positive'], 
                                      range=['red', 'green']), 
                      legend=None),
    opacity=alt.condition(selector_embs, 'opacity', alt.value(0.05), legend=None)
).properties(
    title='Intermediate state sentences'
).add_selection(
    selector_embs
)

In [101]:
sentences_tsned = alt.Chart(data=tsne_plot_data).mark_circle(size=200).encode(
    x = 'x_proc',
    y = 'y_proc',
    tooltip =[alt.Tooltip('sentence'), alt.Tooltip('prob')],
    color = alt.Color('pred', 
                      scale=alt.Scale(domain=['Negative', 'Positive'], 
                                      range=['red', 'green']), 
                      legend=alt.Legend(symbolOpacity=1)),
    opacity=alt.condition(selector_embs, 'opacity', alt.value(0.05), legend=None)
).properties(
    title='Processed sentences'
).add_selection(
    selector_embs
)

In [105]:
import vega

In [125]:
alt.Chart(data=tsne_plot_data).mark_circle(size=200).encode(
    x = 'x_proc',
    color = alt.Color('pred', 
                      legend=alt.Legend(symbolOpacity=1)),
    opacity=alt.value(0.05)
)

In [89]:
dummy_leg = alt.Chart(data=pd.DataFrame([[0,0,'Negative']], columns=['x', 'y', 'Prediction'])).mark_circle(size=0).encode(
    x = 'x',
    y = 'y',
    color = alt.Color('Prediction', 
                      scale=alt.Scale(domain=['Negative', 'Positive'], 
                                      range=['red', 'green']))
)

In [91]:
(words_tsned + dummy_leg) & interm_tsned & sentences_tsned