# Imports

In [1]:
import typing
import itertools
import json
import PIL
import PIL.Image
import matplotlib.pyplot as plt
import os
import os.path
import numpy as np
import pandas as pd
from tensorflow import keras
import csv
import ipywidgets
from IPython.display import display
import math
import html
import re
import random

import tensorflow.keras.utils as ku
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers

2023-05-30 20:02:05.083544: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Data

## Twitter

In [2]:
positive_json = pd.read_json("./data/positive.json")
negative_json = pd.read_json("./data/negative.json")

data_twitter = pd.concat(
    [
        negative_json.assign(emotion=0),
        positive_json.assign(emotion=1),
    ],
    ignore_index=True,
)

data_twitter = data_twitter[["user", "emotion", "created_at", "text"]]
data_twitter["user"] = data_twitter["user"].map(lambda a: a["id"])

RE_TWITTER_USERNAME = re.compile("@[A-Za-z0-9_]{,15}")
RE_TWITTER_URL = re.compile("https?://t[.]co/[A-Za-z0-9]{10}")

text_normalized = (
    data_twitter["text"]
        .map(lambda a: html.unescape(a))
        .replace(RE_TWITTER_USERNAME, " ")
        .replace(RE_TWITTER_URL, " ")
        .replace(re.compile("\\s"), " ")
        .replace(re.compile("[^A-Za-z0-9 ]"), " ")
        .map(lambda a: a.lower())
        .replace(re.compile("\\s+"), " ")
        .rename("text_normalized")
)

data_twitter = pd.concat(
    [
        data_twitter,
        text_normalized,
    ],
    axis=1,
)

data_twitter

Unnamed: 0,user,emotion,created_at,text,text_normalized
0,3078803375,0,2015-07-24 10:42:49+00:00,hopeless for tmr :(,hopeless for tmr
1,383849833,0,2015-07-24 10:42:48+00:00,Everything in the kids section of IKEA is so c...,everything in the kids section of ikea is so c...
2,486942332,0,2015-07-24 10:42:48+00:00,@Hegelbon That heart sliding into the waste ba...,that heart sliding into the waste basket
3,359645394,0,2015-07-24 10:42:48+00:00,"“@ketchBurning: I hate Japanese call him ""bani...",i hate japanese call him bani me too
4,490280208,0,2015-07-24 10:42:47+00:00,"Dang starting next week I have ""work"" :(",dang starting next week i have work
...,...,...,...,...,...
9995,2399336389,1,2015-07-24 08:11:16+00:00,"@chriswiggin3 Chris, that's great to hear :) D...",chris that s great to hear due times reminder...
9996,16451669,1,2015-07-24 08:11:16+00:00,@RachelLiskeard Thanks for the shout-out :) It...,thanks for the shout out it s great to have y...
9997,2528349649,1,2015-07-24 08:11:16+00:00,@side556 Hey! :) Long time no talk...,hey long time no talk
9998,3065747142,1,2015-07-24 08:11:16+00:00,@staybubbly69 as Matt would say. WELCOME TO AD...,as matt would say welcome to adulthood


In [3]:
vocabulary = " ".join(data_twitter["text_normalized"]).split()
tokenizer = Tokenizer()
tokenizer.fit_on_texts(vocabulary)

In [4]:
a = data_twitter.groupby("user", as_index=False).agg(
    size=pd.NamedAgg(column="emotion", aggfunc="size"),
    emo=pd.NamedAgg(column="emotion", aggfunc="sum"),
).sort_values("size")

a[(a["emo"] > 0) & (a["size"] - a["emo"] > 0)]

Unnamed: 0,user,size,emo
3846,569749124,2,1
4076,618850249,2,1
8299,3242573418,2,1
4539,898630568,2,1
4046,613416758,2,1
264,19970375,2,1
8072,3179744365,2,1
8068,3179220806,2,1
4448,843288175,2,1
8053,3170716296,2,1


In [5]:
data_twitter.groupby("user", as_index=False).get_group(1463512856)

Unnamed: 0,user,emotion,created_at,text,text_normalized
1089,1463512856,0,2015-07-24 10:31:16+00:00,5SOS Calum5SOS Luke5SOS Ashton5SOS I bet $20 t...,5sos calum5sos luke5sos ashton5sos i bet 20 to...
1159,1463512856,0,2015-07-24 10:30:37+00:00,James_Yammouni I bet $20 that you will follow ...,james yammouni i bet 20 that you will follow c...
2659,1463512856,0,2015-07-24 10:15:33+00:00,ladygaga I bet $20 to a friend that you will f...,ladygaga i bet 20 to a friend that you will fo...
2682,1463512856,0,2015-07-24 10:15:21+00:00,justinbieber I bet $20 to a friend that you wi...,justinbieber i bet 20 to a friend that you wil...
8032,1463512856,1,2015-07-24 08:16:10+00:00,5SOS Calum5SOS Luke5SOS Ashton5SOS Hey guys! P...,5sos calum5sos luke5sos ashton5sos hey guys pl...
8104,1463512856,1,2015-07-24 08:15:59+00:00,James_Yammouni Hey JAMES! Thanks for the follo...,james yammouni hey james thanks for the follow...


In [6]:
SPECIAL_TOKEN = {
    "NULL": 0,
    "UNKNOWN": 1,
    "BEGIN": 2,
    "POSITIVE": 3,
    "NEGATIVE": 4,
}

In [7]:
tokens_len = len(SPECIAL_TOKEN) + len(tokenizer.word_counts)

In [8]:
data_twitter_tokenized = (
    data_twitter
        .groupby("user", as_index=True)
        .apply(lambda group:
               list(itertools.chain.from_iterable(
                   (
                         [SPECIAL_TOKEN["BEGIN"], SPECIAL_TOKEN["POSITIVE"] if emotion == 1 else SPECIAL_TOKEN["NEGATIVE"]]
                       + [word-1+len(SPECIAL_TOKEN) for word in sequence]
                       for sequence, emotion, _
                       in zip(
                           tokenizer.texts_to_sequences(group["text_normalized"]),
                           group["emotion"],
                           range(4),
                       )
                    )
               ))
        )
)

data_twitter_tokenized

user
28513                               [2, 3, 6, 199, 11, 42, 163]
75493                        [2, 3, 5, 60, 10225, 1259, 17, 54]
634553        [2, 3, 111, 25, 42, 98, 1001, 1052, 54, 7, 88,...
666743        [2, 4, 132, 64, 5, 299, 7, 7546, 5, 527, 18, 2...
675253        [2, 3, 4170, 9, 83, 680, 1105, 8622, 8, 8623, ...
                                    ...                        
3388656496         [2, 4, 5, 141, 5, 129, 8, 395, 7, 1248, 110]
3389146985                                          [2, 4, 165]
3389164456                                   [2, 3, 779, 59, 6]
3390537291              [2, 3, 402, 23, 17, 104, 4403, 79, 107]
3390756107    [2, 4, 6261, 1731, 135, 7, 6262, 6263, 6264, 1...
Length: 8559, dtype: object

In [9]:
max_sequence_len = data_twitter_tokenized.map(lambda a: len(a)).max()
max_sequence_len

109

In [52]:
max_sequence_len = 4
ngrams = []
for text in data_twitter_tokenized:
    if len(text) < max_sequence_len:
        ngrams.append(text + [SPECIAL_TOKEN["NULL"]] * (max_sequence_len - len(text)))
    else:
        for i in range(len(text) - max_sequence_len + 1):
            ngrams.append(text[i:i+max_sequence_len])

ngrams = np.array(ngrams, dtype=int)

data_in = ngrams[:,:max_sequence_len-1]
data_out = ngrams[:,max_sequence_len-1]
label = ku.to_categorical(data_out, num_classes=tokens_len)



# Solution

## Algo1

Plan:

Encode the text as follows:

`begin_text` `text_emotion` `tokenized_words` `end_text`

The model will predict the next token.

You take its prediction, only look at the probabilities of the emotion signifiers, and return the likeliest one.

In [53]:
TWITTER_CHECKPOINT_PATH = "./model_2/cp.ckpt"

In [54]:
model = Sequential()
model.add(Embedding(tokens_len, 100,
                    input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(150, return_sequences=True)))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(tokens_len/20, activation='relu',
                kernel_regularizer=regularizers.l2(0.01)))
# model.add(Dense(tokens_len/20, activation='relu'))
model.add(Dense(tokens_len, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])
print(model.summary())

model.load_weights(TWITTER_CHECKPOINT_PATH)

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 3, 100)            1207200   
                                                                 
 bidirectional_6 (Bidirectio  (None, 3, 300)           301200    
 nal)                                                            
                                                                 
 dropout_6 (Dropout)         (None, 3, 300)            0         
                                                                 
 lstm_13 (LSTM)              (None, 100)               160400    
                                                                 
 dense_12 (Dense)            (None, 603)               60903     
                                                                 
 dense_13 (Dense)            (None, 12072)             7291488   
                                                      

NotFoundError: Unsuccessful TensorSliceReader constructor: Failed to find any matching files for ./model_2/cp.ckpt

In [55]:
cp_callback = keras.callbacks.ModelCheckpoint(
    filepath=TWITTER_CHECKPOINT_PATH,
    save_weights_only=True,
    verbose=1,
)

history = model.fit(
    data_in,
    label,
    epochs=2,
    verbose=1,
    callbacks=[cp_callback],
)

Epoch 1/2

KeyboardInterrupt: 

In [None]:
print(tokenizer.index_word[1])

In [38]:
def twitter_predict(normalized_sentences: list[str], emotions: list[str], token_count) -> list[int]:
    texts = [sentence.split() for sentence in normalized_sentences]
    sequences = [
        [word-1+len(SPECIAL_TOKEN) for word in sequence]
        for sequence in tokenizer.texts_to_sequences(texts)
    ]
    data_in = []
    for sequence, emotion in zip(sequences, emotions):
        data_in += [SPECIAL_TOKEN["BEGIN"]]
        data_in += [SPECIAL_TOKEN[emotion]]
        data_in += sequence
    data_in += [SPECIAL_TOKEN["BEGIN"]]
    
    tokens_index = len(data_in)
    for token_index in range(token_count):
        data_in_array = np.full((1, max_sequence_len), SPECIAL_TOKEN["NULL"])
        data_in_array[0,:len(data_in)] = data_in
        prediction = model.predict(data_in_array)[0]
        
        if token_index == 0:
            token = None
            for i in [SPECIAL_TOKEN["POSITIVE"], SPECIAL_TOKEN["NEGATIVE"]]:
                if token is None or prediction[i] > prediction[token]:
                    token = i
        else:
            token = None
            for i in range(len(prediction)):
                if i in {SPECIAL_TOKEN["BEGIN"], SPECIAL_TOKEN["POSITIVE"], SPECIAL_TOKEN["NEGATIVE"], SPECIAL_TOKEN["NULL"], SPECIAL_TOKEN["UNKNOWN"]}:
                    continue

                if token is None or prediction[i] > prediction[token]:
                    token = i
        
        data_in.append(token)
    return data_in[tokens_index:]
            
    
def twitter_predict_emotion(normalized_sentences: list[str], emotions: list[str]) -> str:
    tokens = twitter_predict(normalized_sentences, emotions, 1)
    
    if tokens[0] == SPECIAL_TOKEN["POSITIVE"]:
        return "POSITIVE"
    elif tokens[0] == SPECIAL_TOKEN["NEGATIVE"]:
        return "NEGATIVE"
    else:
        assert False


def twitter_predict_text(normalized_sentences: list[str], emotions: list[str], words: int) -> str:
    tokens = twitter_predict(normalized_sentences, emotions, words)
    tokens = [token+1-len(SPECIAL_TOKEN) for token in tokens]
    text = tokenizer.sequences_to_texts([tokens])[0]
    return " ".join(text)


print(twitter_predict_emotion(
    ["hopeless for tmr", "shame i m nearly 19"],
    ["NEGATIVE", "NEGATIVE"],
))

print(twitter_predict_emotion(
    ["for being top engaged members in my community this week", "hey james many thanks"],
    ["POSITIVE", "POSITIVE"],
))

print(twitter_predict_text(
    ["hopeless for tmr", "shame i m nearly 19"],
    ["NEGATIVE", "NEGATIVE"],
    10,
))

print(twitter_predict_text(
    ["for being top engaged members in my community this week", "hey james many thanks"],
    ["POSITIVE", "POSITIVE"],
    10,
))

NEGATIVE
NEGATIVE
i   i   i   i   i   i   i   i   i
i   i   i   i   i   i   i   i   i


In [None]:
print(tokenizer.texts_to_sequences([["i"]]))