# Imports

In [25]:
import typing
import itertools
import json
import PIL
import PIL.Image
import matplotlib.pyplot as plt
import os
import os.path
import numpy as np
import pandas as pd
from tensorflow import keras
import csv
import ipywidgets
from IPython.display import display
import math
import html
import re
import random
import time

import tensorflow.keras.utils as ku
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers

from tensorflow.keras import Input
from tensorflow.keras.layers import Dense, Reshape, LeakyReLU, Conv2D, Conv2DTranspose, Flatten, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import RMSprop

# Data

## Twitter

In [2]:
positive_json = pd.read_json("./data/positive.json")
negative_json = pd.read_json("./data/negative.json")

data_twitter = pd.concat(
    [
        negative_json.assign(emotion=0),
        positive_json.assign(emotion=1),
    ],
    ignore_index=True,
)

data_twitter = data_twitter[["user", "emotion", "created_at", "text"]]
data_twitter["user"] = data_twitter["user"].map(lambda a: a["id"])

RE_TWITTER_USERNAME = re.compile("@[A-Za-z0-9_]{,15}")
RE_TWITTER_URL = re.compile("https?://t[.]co/[A-Za-z0-9]{10}")

text_normalized = (
    data_twitter["text"]
        .map(lambda a: html.unescape(a))
        .replace(RE_TWITTER_USERNAME, " ")
        .replace(RE_TWITTER_URL, " ")
        .replace(re.compile("\\s"), " ")
        .replace(re.compile("[^A-Za-z0-9 ]"), " ")
        .map(lambda a: a.lower())
        .replace(re.compile("\\s+"), " ")
        .rename("text_normalized")
)

data_twitter = pd.concat(
    [
        data_twitter,
        text_normalized,
    ],
    axis=1,
)

data_twitter

Unnamed: 0,user,emotion,created_at,text,text_normalized
0,3078803375,0,2015-07-24 10:42:49+00:00,hopeless for tmr :(,hopeless for tmr
1,383849833,0,2015-07-24 10:42:48+00:00,Everything in the kids section of IKEA is so c...,everything in the kids section of ikea is so c...
2,486942332,0,2015-07-24 10:42:48+00:00,@Hegelbon That heart sliding into the waste ba...,that heart sliding into the waste basket
3,359645394,0,2015-07-24 10:42:48+00:00,"“@ketchBurning: I hate Japanese call him ""bani...",i hate japanese call him bani me too
4,490280208,0,2015-07-24 10:42:47+00:00,"Dang starting next week I have ""work"" :(",dang starting next week i have work
...,...,...,...,...,...
9995,2399336389,1,2015-07-24 08:11:16+00:00,"@chriswiggin3 Chris, that's great to hear :) D...",chris that s great to hear due times reminder...
9996,16451669,1,2015-07-24 08:11:16+00:00,@RachelLiskeard Thanks for the shout-out :) It...,thanks for the shout out it s great to have y...
9997,2528349649,1,2015-07-24 08:11:16+00:00,@side556 Hey! :) Long time no talk...,hey long time no talk
9998,3065747142,1,2015-07-24 08:11:16+00:00,@staybubbly69 as Matt would say. WELCOME TO AD...,as matt would say welcome to adulthood


In [3]:
vocabulary = " ".join(data_twitter["text_normalized"]).split()
tokenizer = Tokenizer()
tokenizer.fit_on_texts(vocabulary)

In [4]:
vocabulary = sorted(list(tokenizer.word_counts.items()), key=lambda a: a[1], reverse=True)
vocabulary = [word for word,_ in vocabulary]
vocabulary = vocabulary[:1200]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(vocabulary)

In [5]:
a = data_twitter.groupby("user", as_index=False).agg(
    size=pd.NamedAgg(column="emotion", aggfunc="size"),
    emo=pd.NamedAgg(column="emotion", aggfunc="sum"),
).sort_values("size")

a[(a["emo"] > 0) & (a["size"] - a["emo"] > 0)]

Unnamed: 0,user,size,emo
3846,569749124,2,1
4076,618850249,2,1
8299,3242573418,2,1
4539,898630568,2,1
4046,613416758,2,1
264,19970375,2,1
8072,3179744365,2,1
8068,3179220806,2,1
4448,843288175,2,1
8053,3170716296,2,1


In [6]:
data_twitter.groupby("user", as_index=False).get_group(1463512856)

Unnamed: 0,user,emotion,created_at,text,text_normalized
1089,1463512856,0,2015-07-24 10:31:16+00:00,5SOS Calum5SOS Luke5SOS Ashton5SOS I bet $20 t...,5sos calum5sos luke5sos ashton5sos i bet 20 to...
1159,1463512856,0,2015-07-24 10:30:37+00:00,James_Yammouni I bet $20 that you will follow ...,james yammouni i bet 20 that you will follow c...
2659,1463512856,0,2015-07-24 10:15:33+00:00,ladygaga I bet $20 to a friend that you will f...,ladygaga i bet 20 to a friend that you will fo...
2682,1463512856,0,2015-07-24 10:15:21+00:00,justinbieber I bet $20 to a friend that you wi...,justinbieber i bet 20 to a friend that you wil...
8032,1463512856,1,2015-07-24 08:16:10+00:00,5SOS Calum5SOS Luke5SOS Ashton5SOS Hey guys! P...,5sos calum5sos luke5sos ashton5sos hey guys pl...
8104,1463512856,1,2015-07-24 08:15:59+00:00,James_Yammouni Hey JAMES! Thanks for the follo...,james yammouni hey james thanks for the follow...


In [7]:
SPECIAL_TOKEN = {
    "NULL": 0,
    "UNKNOWN": 1,
    "BEGIN": 2,
    "POSITIVE": 3,
    "NEGATIVE": 4,
}

In [8]:
tokens_len = len(SPECIAL_TOKEN) + len(tokenizer.word_counts)

In [9]:
data_twitter_tokenized = (
    data_twitter
        .groupby("user", as_index=True)
        .apply(lambda group:
               list(itertools.chain.from_iterable(
                   (
                         [SPECIAL_TOKEN["BEGIN"], SPECIAL_TOKEN["POSITIVE"] if emotion == 1 else SPECIAL_TOKEN["NEGATIVE"]]
                       + [word-1+len(SPECIAL_TOKEN) for word in sequence]
                       for sequence, emotion, _
                       in zip(
                           tokenizer.texts_to_sequences(group["text_normalized"]),
                           group["emotion"],
                           range(4),
                       )
                    )
               ))
        )
)

data_twitter_tokenized

user
28513                               [2, 3, 6, 199, 11, 42, 163]
75493                                     [2, 3, 5, 60, 17, 54]
634553        [2, 3, 111, 25, 42, 98, 1001, 1052, 54, 7, 88,...
666743         [2, 4, 132, 64, 5, 299, 7, 5, 527, 18, 207, 928]
675253        [2, 3, 9, 83, 680, 1105, 8, 20, 9, 680, 133, 4...
                                    ...                        
3388656496               [2, 4, 5, 141, 5, 129, 8, 395, 7, 110]
3389146985                                          [2, 4, 165]
3389164456                                   [2, 3, 779, 59, 6]
3390537291                    [2, 3, 402, 23, 17, 104, 79, 107]
3390756107                      [2, 4, 135, 7, 10, 44, 19, 116]
Length: 8559, dtype: object

In [10]:
max_sequence_len = data_twitter_tokenized.map(lambda a: len(a)).max()
max_sequence_len

102

In [11]:
mul = 10
data_in = np.full((len(data_twitter_tokenized)*mul, max_sequence_len), SPECIAL_TOKEN["NULL"], dtype=int)
data_out = np.full(len(data_twitter_tokenized)*mul, SPECIAL_TOKEN["NULL"], dtype=int)
for i, a in enumerate(data_twitter_tokenized):
    for j in range(mul):
        index = random.randint(0, len(a)-1)
        data_in[i*mul + j,0:index] = a[:index]
        data_out[i*mul + j] = a[index]
    # data_in[i,0:len(a)] = a

indices = np.arange(len(data_twitter_tokenized)*mul)
np.random.shuffle(indices)
data_in = data_in[indices]
data_out = data_out[indices]
label = ku.to_categorical(data_out, num_classes=tokens_len)

## Face

In [21]:
# if False:
if True:
    image_shape = (48, 48)
    
    face_emotions = []

    with open("./data/fer.csv", "rb") as f:
        row_count = (
              sum(1 for _ in f)
            - 1 # subtract one to skip header
        )

    widget_progress = ipywidgets.IntProgress(max=row_count-1)
    display(widget_progress)
    
    face_images = np.zeros((row_count,) + image_shape, dtype=int)
    face_emotions = np.zeros((row_count,), dtype=int)

    with (
        open("./data/fer.csv") as f
    ):
        f = csv.reader(f)
        _ = next(f)
        for i, row in enumerate(f):
            # if i >= 500:
            #     break
            
            face_emotions[i] = int(row[0])
            image = np.fromiter((int(val) for val in row[2].split()), int)
            image = np.reshape(image, image_shape)
            face_images[i] = image

            widget_progress.value = i

IntProgress(value=0, max=35886)

# Solution

## Algo1 and Algo2

Plan:

Encode the text as follows:

`begin_text` `text_emotion` `tokenized_words`

The model will predict the next token.

You take its prediction, only look at the probabilities of the emotion signifiers, and return the likeliest one.

In [12]:
TWITTER_CHECKPOINT_PATH = "./model_3/cp.ckpt"

In [13]:
# model = Sequential()
# model.add(Embedding(tokens_len, 10,
#                     input_length=max_sequence_len))
# model.add(Bidirectional(LSTM(150, return_sequences=True)))
# model.add(Dropout(0.2))
# model.add(LSTM(100))
# model.add(Dense(tokens_len, activation='relu',
#                 kernel_regularizer=regularizers.l2(0.01)))
# # model.add(Dense(tokens_len/20, activation='relu'))
# model.add(Dense(tokens_len, activation='softmax'))
# model.compile(loss='categorical_crossentropy',
#               optimizer='adam', metrics=['accuracy'])
# print(model.summary())

# model.load_weights(TWITTER_CHECKPOINT_PATH)

In [14]:
model = Sequential()
model.add(Embedding(tokens_len, 10,
                    input_length=max_sequence_len))
# model.add(Bidirectional(LSTM(150, return_sequences=True)))
# model.add(Dropout(0.2))
# model.add(LSTM(100))
model.add(Flatten())
model.add(Dense(tokens_len, activation='relu',
                kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(tokens_len/20, activation='relu'))
model.add(Dense(tokens_len, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])
print(model.summary())

model.load_weights(TWITTER_CHECKPOINT_PATH)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 102, 10)           12050     
                                                                 
 flatten (Flatten)           (None, 1020)              0         
                                                                 
 dense (Dense)               (None, 1205)              1230305   
                                                                 
 dense_1 (Dense)             (None, 60)                72360     
                                                                 
 dense_2 (Dense)             (None, 1205)              73505     
                                                                 
Total params: 1,388,220
Trainable params: 1,388,220
Non-trainable params: 0
_________________________________________________________________
None


2023-05-31 06:39:00.272059: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7f90cf4d5f00>

In [90]:
cp_callback = keras.callbacks.ModelCheckpoint(
    filepath=TWITTER_CHECKPOINT_PATH,
    save_weights_only=True,
    verbose=1,
)

history = model.fit(
    data_in,
    label,
    epochs=10000,
    verbose=1,
    callbacks=[cp_callback],
)

Epoch 1/10000
Epoch 1: saving model to ./model_3/cp.ckpt
Epoch 2/10000
Epoch 2: saving model to ./model_3/cp.ckpt
Epoch 3/10000
Epoch 3: saving model to ./model_3/cp.ckpt
Epoch 4/10000
Epoch 4: saving model to ./model_3/cp.ckpt
Epoch 5/10000
  25/2675 [..............................] - ETA: 30s - loss: 3.4254 - accuracy: 0.3275

KeyboardInterrupt: 

In [None]:
print(tokenizer.index_word[1])

In [91]:
def twitter_predict(normalized_sentences: list[str], emotions: list[str], token_count) -> list[int]:
    texts = [sentence.split() for sentence in normalized_sentences]
    sequences = [
        [word-1+len(SPECIAL_TOKEN) for word in sequence]
        for sequence in tokenizer.texts_to_sequences(texts)
    ]
    data_in = []
    for sequence, emotion in zip(sequences, emotions):
        data_in += [SPECIAL_TOKEN["BEGIN"]]
        data_in += [SPECIAL_TOKEN[emotion]]
        data_in += sequence
    data_in += [SPECIAL_TOKEN["BEGIN"]]
    
    tokens_index = len(data_in)
    for token_index in range(token_count):
        data_in_array = np.full((1, max_sequence_len), SPECIAL_TOKEN["NULL"])
        data_in_array[0,:len(data_in)] = data_in
        prediction = model.predict(data_in_array)[0]
        
        if token_index == 0:
            token = None
            for i in [SPECIAL_TOKEN["POSITIVE"], SPECIAL_TOKEN["NEGATIVE"]]:
                if token is None or prediction[i] > prediction[token]:
                    token = i
        else:
            token = None
            for i in range(len(prediction)):
                if i in {SPECIAL_TOKEN["BEGIN"], SPECIAL_TOKEN["POSITIVE"], SPECIAL_TOKEN["NEGATIVE"], SPECIAL_TOKEN["NULL"], SPECIAL_TOKEN["UNKNOWN"]}:
                    continue

                if token is None or prediction[i] > prediction[token]:
                    token = i
        
        data_in.append(token)
    return data_in[tokens_index:]
            
    
def twitter_predict_emotion(normalized_sentences: list[str], emotions: list[str]) -> str:
    tokens = twitter_predict(normalized_sentences, emotions, 1)
    
    if tokens[0] == SPECIAL_TOKEN["POSITIVE"]:
        return "POSITIVE"
    elif tokens[0] == SPECIAL_TOKEN["NEGATIVE"]:
        return "NEGATIVE"
    else:
        assert False


def twitter_predict_text(normalized_sentences: list[str], emotions: list[str], words: int) -> str:
    tokens = twitter_predict(normalized_sentences, emotions, words)
    tokens = [token+1-len(SPECIAL_TOKEN) for token in tokens]
    text = tokenizer.sequences_to_texts([tokens])[0]
    return text


print(twitter_predict_emotion(
    ["hopeless for tmr", "shame i m nearly 19"],
    ["NEGATIVE", "NEGATIVE"],
))

print(twitter_predict_emotion(
    ["for being top engaged members in my community this week", "hey james many thanks"],
    ["POSITIVE", "POSITIVE"],
))

print(twitter_predict_text(
    ["hopeless for tmr", "shame i m nearly 19"],
    ["NEGATIVE", "NEGATIVE"],
    10,
))

print(twitter_predict_text(
    ["for being top engaged members in my community this week", "hey james many thanks"],
    ["POSITIVE", "POSITIVE"],
    10,
))

print(twitter_predict_text(
    ["we are living in america", "it s wunderbar"],
    ["POSITIVE", "POSITIVE"],
    10,
))

POSITIVE
NEGATIVE
i use i i have i m i m
snapchat snapchat is is is is is is is
thanks great d you you you you you you


## Algo3

In [56]:
FACE_HEIGHT = 48
FACE_WIDTH = 48

LATENT_DIM = 32
CHANNELS = 1

def create_generator():
    gen_in = Input(shape=(LATENT_DIM, ))

    gen_out = Dense(64 * 6 * 6)(gen_in)
    gen_out = LeakyReLU()(gen_out)
    gen_out = Reshape((6, 6, 64))(gen_out)

    gen_out = Conv2D(128, 5, padding="same")(gen_out)
    gen_out = LeakyReLU()(gen_out)

    gen_out = Conv2DTranspose(128, 4, strides=2, padding="same")(gen_out)
    gen_out = LeakyReLU()(gen_out)

    gen_out = Conv2DTranspose(128, 4, strides=2, padding="same")(gen_out)
    gen_out = LeakyReLU()(gen_out)

    gen_out = Conv2DTranspose(128, 4, strides=2, padding="same")(gen_out)
    gen_out = LeakyReLU()(gen_out)

    gen_out = Conv2D(256, 5, padding="same")(gen_out)
    gen_out = LeakyReLU()(gen_out)
    gen_out = Conv2D(256, 5, padding="same")(gen_out)
    gen_out = LeakyReLU()(gen_out)
    gen_out = Conv2D(CHANNELS, 7, activation="tanh", padding="same")(gen_out)

    generator = Model(gen_in, gen_out)
    return generator

In [57]:
generator = create_generator()
generator.summary()

AttributeError: 'Model' object has no attribute 'add'

In [58]:
def create_discriminator():
    disc_in = Input(shape=(FACE_HEIGHT, FACE_WIDTH, CHANNELS))

    disc_out = Conv2D(128, 3)(disc_in)
    disc_out = LeakyReLU()(disc_out)

    disc_out = Conv2D(128, 4, strides=2)(disc_out)
    disc_out = LeakyReLU()(disc_out)

    disc_out = Conv2D(128, 4, strides=2)(disc_out)
    disc_out = LeakyReLU()(disc_out)

    disc_out = Conv2D(128, 4, strides=2)(disc_out)
    disc_out = LeakyReLU()(disc_out)

    disc_out = Conv2D(128, 4, strides=2)(disc_out)
    disc_out = LeakyReLU()(disc_out)

    disc_out = Flatten()(disc_out)
    disc_out = Dropout(0.4)(disc_out)

    disc_out = Dense(1, activation='sigmoid')(disc_out)
    discriminator = Model(disc_in, disc_out)

    optimizer = RMSprop(
        learning_rate=.0001,
        clipvalue=1.0,
        decay=1e-8
    )

    discriminator.compile(
        optimizer=optimizer,
        loss='binary_crossentropy'
    )

    return discriminator

In [59]:
discriminator = create_discriminator()
discriminator.trainable = False
discriminator.summary()

Model: "model_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_20 (InputLayer)       [(None, 48, 48, 1)]       0         
                                                                 
 conv2d_60 (Conv2D)          (None, 46, 46, 128)       1280      
                                                                 
 leaky_re_lu_90 (LeakyReLU)  (None, 46, 46, 128)       0         
                                                                 
 conv2d_61 (Conv2D)          (None, 22, 22, 128)       262272    
                                                                 
 leaky_re_lu_91 (LeakyReLU)  (None, 22, 22, 128)       0         
                                                                 
 conv2d_62 (Conv2D)          (None, 10, 10, 128)       262272    
                                                                 
 leaky_re_lu_92 (LeakyReLU)  (None, 10, 10, 128)       0  

In [60]:
gan_input = Input(shape=(LATENT_DIM, ))
gan_output = discriminator(generator(gan_input))
gan = Model(gan_input, gan_output)

In [61]:
optimizer = RMSprop(lr=.0001, clipvalue=1.0, decay=1e-8)
gan.compile(optimizer=optimizer, loss="binary_crossentropy")

In [62]:
gan.summary()

Model: "model_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_21 (InputLayer)       [(None, 32)]              0         
                                                                 
 model_13 (Functional)       (None, 48, 48, 1)         3538433   
                                                                 
 model_14 (Functional)       (None, 1)                 1050497   
                                                                 
Total params: 4,588,930
Trainable params: 3,538,433
Non-trainable params: 1,050,497
_________________________________________________________________


In [82]:
images = face_images / 255
images = images.reshape(images.shape + (1,))

images = images[face_emotions == 0]

In [None]:
gan.load_weights("./gan.h5")

In [101]:
iterations = 15000
batch_size = 16

RES_DIR = 'res2'
FILE_PATH = '%s/generated_%d.png'
if not os.path.isdir(RES_DIR):
    os.mkdir(RES_DIR)

CONTROL_SIZE_SQRT = 6
control_vectors = np.random.normal(size=(CONTROL_SIZE_SQRT**2, LATENT_DIM)) / 2

start = 0
images_saved = 0
for step in range(iterations):
    start_time = time.time()
    latent_vectors = np.random.normal(size=(batch_size, LATENT_DIM))
    generated = generator.predict(latent_vectors)

    real = images[start:start + batch_size]
    combined_images = np.concatenate([generated, real])

    labels = np.concatenate([np.ones((batch_size, 1)), np.zeros((batch_size, 1))])
    labels += .05 * np.random.random(labels.shape)

    disc_loss = discriminator.train_on_batch(combined_images, labels)

    latent_vectors = np.random.normal(size=(batch_size, LATENT_DIM))
    misleading_targets = np.zeros((batch_size, 1))

    gan_loss = gan.train_on_batch(latent_vectors, misleading_targets)

    start += batch_size
    if start > images.shape[0] - batch_size:
        start = 0

    if step % 25 == 0:
        gan.save_weights("./gan.h5")

        print(f"{step+1}/{iterations}: disc_loss: {disc_loss:.4f}, gan_loss: {gan_loss:.4f} ({time.time() - start_time:.1f} sec)")
        
        control_image = np.zeros((FACE_WIDTH * CONTROL_SIZE_SQRT, FACE_HEIGHT * CONTROL_SIZE_SQRT, CHANNELS))
        control_generated = generator.predict(control_vectors)
        
        for i in range(CONTROL_SIZE_SQRT ** 2):
            x_off = i % CONTROL_SIZE_SQRT
            y_off = i // CONTROL_SIZE_SQRT
            control_image[x_off * FACE_WIDTH:(x_off + 1) * FACE_WIDTH, y_off * FACE_HEIGHT:(y_off + 1) * FACE_HEIGHT, :] = control_generated[i, :, :, :]
        control_image = np.uint8(control_image * 255)
        control_image = np.reshape(control_image, control_image.shape[:2])
        im = PIL.Image.fromarray(control_image)
        im.save(f"{RES_DIR}/generated_{images_saved}.png")
        images_saved += 1

1/15000: disc_loss: 0.6975, gan_loss: 0.7278 (4.5 sec)



KeyboardInterrupt

