In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.manifold import TSNE

from tensorflow.keras import layers
import tensorflow_datasets as tfds

import matplotlib.pyplot as plt
from highlight_text import ax_text, fig_text

import altair as alt

In [2]:
train = pd.read_csv('./combined_sentiment_labelled.tsv', sep='\t')
dataset = tfds.load('imdb_reviews',
                    as_supervised=True)
train_dataset, test_dataset = dataset['train'], dataset['test']

In [3]:

original_data, targets = [], []
for example, label in train_dataset.take(-1):
    original_data.append(example.numpy().decode("utf-8"))
    targets.append(label)
    
original_data = np.array(original_data)
targets = np.array(targets)

original_data[:2]

array(["This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.",
       'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plot de

In [4]:
import re
import nltk
import string
from nltk.stem.wordnet import WordNetLemmatizer 

nltk.download("stopwords", quiet = True)
nltk.download("wordnet", quiet = True)
nltk.download("punkt", quiet = True)
nltk.download('averaged_perceptron_tagger', quiet = True)

lemmatizer = WordNetLemmatizer()
english_stopwords = set(nltk.corpus.stopwords.words('english'))

def process_text(text):
    def get_pos(tag):
        if tag.startswith("J"):
            return "a"
        elif tag.startswith("V"):
            return "v"
        elif tag.startswith("R"):
            return "r"
        else:
            return "n"
        
    text = text.replace("<br />", "")
    text = text.replace("\'", "'")
    
    text = re.sub(r"'s", "", text.lower())
    text = re.sub(r"([a-z0-9]+)'([^s])", r"\1\2", text)
    text = re.sub(rf"[^{string.ascii_letters}0-9]", " ", text)
    
    
    tokenized = []
    for token in nltk.word_tokenize(text):
        token, tag = nltk.pos_tag([token])[0]
        t = lemmatizer.lemmatize(token, pos=get_pos(tag))
        if t not in english_stopwords and len(t) > 1:
            tokenized.append(t)
    return " ".join(tokenized)

In [5]:
processed_data = np.array([process_text(t) for t in original_data])
processed_data[:10]

array(['absolutely terrible movie dont lure christopher walken michael ironside great actor must simply bad role history even great act could redeem movie ridiculous storyline movie early ninety propaganda piece pathetic scene columbian rebel make case revolution maria conchita alonso appear phony pseudo love affair walken nothing pathetic emotional plug movie devoid real meaning disappointed movie like ruin actor like christopher walken good name could barely sit',
       'know fall asleep film usually due combination thing include really tire warm comfortable sette eat lot however occasion fell asleep film rubbish plot development constant constantly slow boring thing seem happen explanation cause admit may miss part film watch majority everything seem happen accord without real concern anything else cant recommend film',
       'mann photograph alberta rocky mountain superb fashion jimmy stewart walter brennan give enjoyable performance always seem come hollywood mountie tell people

In [6]:
np.save("processed_imdb_train.npy", processed_data)
np.save("processed_imdb_target.npy", targets)

In [7]:
ixs = list(range(len(processed_data)))
np.random.shuffle(ixs)
processed_data = processed_data[ixs]
targets = targets[ixs]

X_train = processed_data[10000:]
X_test = processed_data[:10000]

y_train = targets[10000:]
y_test = targets[:10000]

In [68]:
save_model(X_train, "X_train.pkl")

https://www.tensorflow.org/tutorials/text/text_classification_rnn

In [8]:
VOCAB_SIZE=10000
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(X_train)

In [9]:
len(encoder.get_vocabulary())

9999

In [15]:
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()) + 2,
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1)
])

In [16]:
checkpoint_path = "checkpoints/cp-{epoch:04d}.ckpt"

cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path, 
    verbose=1, 
    save_weights_only=True)

with tf.device('/GPU:0'):
    model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                  optimizer=tf.keras.optimizers.Adam(1e-4),
                  metrics=['accuracy'])
    
    model.predict((X_train[0], y_train[0]))
    #model.predict(train_dataset.take(1))
    model.save_weights(checkpoint_path.format(epoch=0))
    
    history = model.fit(X_train, y_train,
                        epochs=14,
                        batch_size=64,
                        validation_data=(X_test, y_test),
                        validation_steps=30,
                        callbacks=[cp_callback])

Epoch 1/14
Epoch 00001: saving model to checkpoints/cp-0001.ckpt
Epoch 2/14
Epoch 00002: saving model to checkpoints/cp-0002.ckpt
Epoch 3/14
Epoch 00003: saving model to checkpoints/cp-0003.ckpt
Epoch 4/14
Epoch 00004: saving model to checkpoints/cp-0004.ckpt
Epoch 5/14
Epoch 00005: saving model to checkpoints/cp-0005.ckpt
Epoch 6/14
Epoch 00006: saving model to checkpoints/cp-0006.ckpt
Epoch 7/14
Epoch 00007: saving model to checkpoints/cp-0007.ckpt
Epoch 8/14
Epoch 00008: saving model to checkpoints/cp-0008.ckpt
Epoch 9/14
Epoch 00009: saving model to checkpoints/cp-0009.ckpt
Epoch 10/14
Epoch 00010: saving model to checkpoints/cp-0010.ckpt
Epoch 11/14
Epoch 00011: saving model to checkpoints/cp-0011.ckpt
Epoch 12/14
Epoch 00012: saving model to checkpoints/cp-0012.ckpt
Epoch 13/14
Epoch 00013: saving model to checkpoints/cp-0013.ckpt
Epoch 14/14
Epoch 00014: saving model to checkpoints/cp-0014.ckpt


In [21]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization (TextVect multiple                  0         
_________________________________________________________________
embedding_2 (Embedding)      multiple                  640064    
_________________________________________________________________
lstm_2 (LSTM)                multiple                  33024     
_________________________________________________________________
dropout_4 (Dropout)          multiple                  0         
_________________________________________________________________
dense_4 (Dense)              multiple                  4160      
_________________________________________________________________
dropout_5 (Dropout)          multiple                  0         
_________________________________________________________________
dense_5 (Dense)              multiple                 

In [29]:
processed = train["text"].apply(process_text).values
processed[0:3]

array(['way plug unless go converter', 'good case excellent value',
       'great jawbone'], dtype=object)

In [53]:
sen2vec_model = tf.keras.Sequential([                                                                   
    model.get_layer(name="text_vectorization"),                                                
    model.get_layer(name='embedding_2'),                                                         
    model.get_layer(name='lstm_2'),                                                              
    model.get_layer(name='dense_4')  
])

sen2vec_model_interm = tf.keras.Sequential([
    model.get_layer(name="text_vectorization"),                                                
    model.get_layer(name='embedding_2'),                                                         
    model.get_layer(name='lstm_2'),                                                              
])

def sen2vec(x):
    return model.get_layer(name='embedding_2')(model.get_layer(name="text_vectorization")(x))

sen2vec_model.predict(processed[0:2]).shape

(2, 64)

In [65]:
#sen2vec([["hoho"]])
#processed.reshape(3000, 1)

sen2vec_model.predict(processed).shape
sen2vec(processed.reshape(len(processed), 1)).numpy().mean(axis=1)

array([[-0.01332534, -0.00558405, -0.0346331 , ..., -0.02683705,
        -0.02472805,  0.00096498],
       [-0.01564137, -0.00621819, -0.03694702, ..., -0.03018918,
        -0.02193866,  0.00052662],
       [-0.01300378, -0.00523942, -0.0398652 , ..., -0.03211461,
        -0.02640506,  0.00077586],
       ...,
       [-0.01149158, -0.00635796, -0.03401456, ..., -0.02991116,
        -0.02379806,  0.00208982],
       [-0.01230131, -0.00636091, -0.03203618, ..., -0.02609388,
        -0.01960468, -0.00163094],
       [-0.0140381 , -0.00547017, -0.02717642, ..., -0.02144883,
        -0.01998052, -0.00238675]], dtype=float32)

In [66]:
import umap
import pickle
def save_model(model, path):
    with open(path, "wb") as w_obj:
        pickle.dump(model, w_obj)
        
for i in range(11):
    fname = f'./training/cp-000{i}.ckpt' if i < 10 else f'./training/cp-00{i}.ckpt'
    
    model.load_weights(fname)
    emb_proc = sen2vec_model.predict(processed)
    umap_ = umap.UMAP(n_neighbors=5, random_state=100100)
    X_proc = umap_.fit(emb_proc)
    outpath = f'./embedding/umap-proc-{i}.pkl'
    save_model(umap_, outpath)
    
    emb_interm = sen2vec_model_interm.predict(processed)
    X_proc = umap_.fit(emb_interm)
    outpath = f'./embedding/umap-intermediate-{i}.pkl'
    save_model(umap_, outpath)
    
    emb_raw = sen2vec(processed.reshape(len(processed), 1)).numpy().mean(axis=1)
    X_proc = umap_.fit(emb_raw)
    outpath = f'./embedding/umap-raw-{i}.pkl'
    save_model(umap_, outpath)

In [37]:
umap_ = umap.UMAP(n_neighbors=5, random_state=100100)
X = umap_.fit(emb)

In [40]:
umap_.transform([[0]*64])

array([[8.97181 , 8.403928]], dtype=float32)

----

In [12]:
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    GREEN = '\033[92m'
    WARNING = '\033[93m'
    RED = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

In [25]:
original_data[:10][1]

'The film was shot at Movie Flats, just off route 395, near Lone Pine, California, north of the road to Whitney Portals. You can still find splashes of cement and iron joists plastered across the rocks where the sets were built. And you\'ll recognize the area from any Randolph Scott movie.<br /><br />I won\'t bother with the plot, since I\'m sure it\'s covered elsewhere. The movie stars three athletes -- Fairbanks fils, who must have learned a good deal from his Dad -- Grant, an acrobat in his youth -- and MacLaughlin, a professional boxer from South Africa. Their physical skills are all on display.<br /><br />Not a moment of this movie is to be taken seriously. It\'s about Thugees, a sect in India, whence our English word "thug." I can\'t go through all the felicities of this movie but probably ought to point out that the director, George Stevens, was a polymath with a background in Laurel and Hardy movies -- see his choreography of the fight scenes -- and went on to the infinitely lo

In [26]:
def color_text(text, model=model):
    tokens = text.split(" ")
    probs = [0]
    for k in range(0,len(tokens)):
        pred = model.predict(np.array([process_text(" ".join(tokens[:k+1]))]))[0][0]
        probs.append(pred)
    pred = probs[-1]
    probs = np.diff(probs)
    colors = [bcolors.ENDC if abs(p / max(np.abs(probs))) < 0.1 
                  else (bcolors.RED if p < 0 else bcolors.GREEN) 
              for p in probs]
    ends = [bcolors.ENDC] * len(probs)
    return " ".join([c+t+e for c,t,e in zip(colors, tokens, ends)]), pred

https://stackoverflow.com/questions/287871/how-to-print-colored-text-in-python

In [27]:
for text in original_data[:10]:
    pred = color_text(text)
    print("NEGATIVE" if pred[1] < 0 else "POSITIVE", "|", pred[0])

['set', 'japan', 'ashura', 'story', 'demon', 'take', 'earth', 'premise', 'far', 'complicate', 'arch', 'storyline', 'forgotten', 'japan', 'turmoil', 'demon', 'occupy', 'human', 'form', 'roam', 'land', 'generally', 'speak', 'demon', 'look', 'act', 'like', 'human', 'evil', 'japanese', 'word', 'use', 'demon', 'rather', 'classical', 'form', 'ogre', 'mythological', 'creature', 'historic', 'stature', 'talk', 'creature', 'would', 'appear', 'like', 'god', 'simple', 'ugly', 'child', 'eat', 'monster', 'however', 'human', 'form', 'remains', 'green', 'eye', 'green', 'teeth', 'appear', 'put', 'sort', 'stress', 'order', 'save', 'world', 'demon', 'demon', 'slayer', 'train', 'skilled', 'warrior', 'spot', 'defeat', 'every', 'kind', 'demon', 'guard', 'passage', 'way', 'realm', 'hell', 'real', 'world', 'basic', 'premise', 'story', 'begin', 'festival', 'local', 'town', 'amid', 'festivity', 'men', 'ride', 'dress', 'black', 'seemingly', 'intent', 'harm', 'villager', 'run', 'except', 'demonic', 'nature', 'tur

KeyboardInterrupt: 

-----------

### Average of word embeddings

In [28]:
def sen2vec(x):
    return model.get_layer(name='embedding_1')(model.get_layer(name="text_vectorization")(x))

In [29]:
a = sen2vec([[x] for x in data.text.values[:3]])

In [30]:
# samples, words, embedding
a.shape

TensorShape([3, 21, 64])

In [52]:
X = data["text"].apply(sen2vec)

In [54]:
np.save("combined_sentiment_labelled_embedding", X.values)

In [56]:
import os
os.listdir(".")

['IDS_Final-Copy1.ipynb',
 'data',
 'Proposal.md',
 'jupyter',
 'combined_sentiment_labelled.tsv',
 'TOPHTO',
 'combined_sentiment_labelled_embedding.npy',
 'LICENSE',
 're_first_model',
 '.ipynb_checkpoints',
 '.git',
 'requirements.txt',
 'first_model',
 'README.md',
 'Report.md',
 'IDS_Final.ipynb']

### Processed sentences

In [36]:
sen2vec_model = tf.keras.Sequential([
    model.get_layer(name="text_vectorization"),
    model.get_layer(name='embedding_1'),
    model.get_layer(name='lstm_1'),
    model.get_layer(name='dense_2')
])

In [37]:
sen2vec_model_interm = tf.keras.Sequential([
    model.get_layer(name="text_vectorization"),
    model.get_layer(name='embedding_1'),
    model.get_layer(name='lstm_1')
])

In [38]:
sentences = data.sample(n=50).text.values

tsne = TSNE()
tsned_space_raw = tsne.fit_transform(sen2vec([[x] for x in sentences]).numpy().mean(axis=1))

tsned_space_proc = tsne.fit_transform(sen2vec_model.predict(sentences))

tsned_space_intermediate = tsne.fit_transform(sen2vec_model_interm.predict(sentences))