In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa

import zipfile 
import json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

import string
import re

import pickle

In [None]:
# Mount Google Drive at /content/drive (Colab only); every data path used below
# lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
print(tf.__version__)

# Report whether TensorFlow exposes a GPU device (its name then contains "GPU").
device_name = tf.test.gpu_device_name()
if "GPU" in device_name:
    print('Found GPU at: {}'.format(device_name))
else:
    print("GPU device not found")

2.3.0
Found GPU at: /device:GPU:0


# Load the data
We have three training datasets and need to combine them into one.

In [None]:
def flatten(t):
    '''
    concatenate the items of every sub-iterable of t into one flat list (one level only)
    '''
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat

In [None]:
# load dataset 1 (EmotionPush)
# the JSON inside the archive is a nested list; flatten it one level so that
# each remaining item becomes one DataFrame row
with zipfile.ZipFile('/content/drive/MyDrive/CSC2515/part2/2019_Train_EmotionPush.zip', 'r') as z, \
        z.open('EmotionPush/emotionpush.json') as f:
    data = flatten(json.load(f))
df1 = pd.DataFrame(data)

In [None]:
# load dataset 2 (Friends)
# the JSON inside the archive is a nested list; flatten it one level so that
# each remaining item becomes one DataFrame row
with zipfile.ZipFile('/content/drive/MyDrive/CSC2515/part2/2019_Train_Friends.zip', 'r') as z, \
        z.open('Friends/friends.json') as f:
    data = flatten(json.load(f))
df2 = pd.DataFrame(data)

In [None]:
# load dataset 3: a plain CSV, read directly into a DataFrame
# (no unzip / flatten step, unlike the two JSON datasets)
df3 = pd.read_csv('/content/drive/MyDrive/CSC2515/part2/text_emotion.csv')

In [None]:
# combine all three datasets
# The inputs df1/df2/df3 are left untouched and new names are used for the
# intermediate frames, so this cell can be re-run without failing on columns
# that an earlier run already dropped.
dialogue_meta = ['speaker', 'annotation']
dialogues = pd.concat(
    [df1.drop(dialogue_meta, axis=1), df2.drop(dialogue_meta, axis=1)],
    axis=0,
)

tweets = df3.drop(['tweet_id', 'author'], axis=1)
# positional rename: relies on the CSV's two remaining columns being
# (label, text) in that order
tweets.columns = ['emotion', 'utterance']

# ignore_index gives the combined frame a unique 0..n-1 index instead of the
# repeated per-source row labels
df = pd.concat([dialogues, tweets], axis=0, ignore_index=True)
# df.to_csv('/content/drive/MyDrive/CSC2515/part2/emotion_combined.csv')

In [None]:
# df = pd.read_csv('/content/drive/MyDrive/CSC2515/part2/emotion_combined.csv')

# Data Preprocessing

In [None]:
# replace @user mentions with the placeholder word 'someone'
# - regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
#   literal string by default, which would silently disable this step
# - \w also covers digits and underscores, so handles such as @user_42 are
#   replaced whole instead of leaving a '_42' tail behind
df['utterance'] = df.utterance.str.replace(r'@\w+', 'someone', regex=True)

# drop rows with missing values (re-bound rather than mutated in place)
df = df.dropna()

In [None]:
# number of whitespace-separated tokens in each utterance
df['words'] = [len(text.split()) for text in df['utterance']]

In [None]:
# data cleaning functions
def clean_numbers(x):
    '''
    mask multi-digit numbers in the string x with '#' placeholders

    runs of three or more digits become ' ### ' and runs of exactly two digits
    become ' ## '; single digits are left unchanged.
    both placeholders are padded with a space on each side so that a number
    glued to a word (e.g. 'abc123') yields a separate '###' token instead of
    a merged 'abc###' one.
    '''
    # long runs first: afterwards only runs of one or two digits remain
    x = re.sub(r'[0-9]{3,}', ' ### ', x)
    x = re.sub(r'[0-9]{2}', ' ## ', x)
    return x

def clean_punc(x):
    '''
    convert x to a string and turn every character of string.punctuation into a single space
    '''
    punct_to_space = str.maketrans(dict.fromkeys(string.punctuation, ' '))
    return str(x).translate(punct_to_space)

def preprocess(df, feature):
    '''
    preprocess the feature column in df
    remove punctuation, change to lowercase, replace numbers with #
    fill with 'None' if nothing but whitespace is left

    the column is overwritten in df itself, and the same df is returned
    '''
    df[feature] = df[feature].map(clean_punc)
    df[feature] = df[feature].str.lower()
    df[feature] = df[feature].map(clean_numbers)
    # clean_punc turns punctuation into spaces, so a text such as '!!!' ends up
    # as '   ' rather than ''; strip before testing so it is filled as well
    df[feature] = df[feature].map(lambda x: 'None' if x.strip() == '' else x)
    return df


In [None]:
# clean the utterance text, then shuffle all rows with a fixed seed
df = preprocess(df, 'utterance').sample(frac=1, random_state=0)

In [None]:
# number of distinct emotion labels in the combined data (17 when this notebook was run)
num_cls = df['emotion'].nunique(dropna=False)

In [None]:
# map every emotion label to an integer class id
LE = LabelEncoder().fit(df['emotion'])
df['emotion_code'] = LE.transform(df['emotion'])


In [None]:
# train validation split: each row goes to train with probability 0.9, so the
# split is roughly (not exactly) 90% / 10%
# a seeded generator keeps the split identical across kernel restarts, in line
# with the seeded shuffle of df
split_rng = np.random.RandomState(0)
mask = split_rng.rand(df.shape[0]) < 0.9
df_train = df[mask]
df_val = df[~mask]

In [None]:
# load each split into a tensorflow dataset of (utterance, one-hot emotion) pairs
def _with_one_hot_label(text, code):
    return text, tf.one_hot(code, depth=num_cls)

train = tf.data.Dataset.from_tensor_slices(
    (df_train.utterance.values, df_train.emotion_code.values)
).map(_with_one_hot_label)

val = tf.data.Dataset.from_tensor_slices(
    (df_val.utterance.values, df_val.emotion_code.values)
).map(_with_one_hot_label)



In [None]:
# check how many distinct words appear in at least 3 utterances
# (min_df is a document frequency, not a total occurrence count)
vec = CountVectorizer(ngram_range=(1, 1), min_df=3).fit(df['utterance'].values)
# vocabulary_ maps each kept term to its column index and exists on every
# scikit-learn release, whereas get_feature_names() was removed in 1.2
len(vec.vocabulary_)

9210

In [None]:
# keras text vectorization layer
# max_tokens = number of words that appear in at least 3 utterances (counted by
# the CountVectorizer above) plus 2 extra slots
# (presumably for the padding and out-of-vocabulary tokens -- TODO confirm)
# vocabulary_ is used because get_feature_names() was removed in scikit-learn 1.2
max_token = len(vec.vocabulary_) + 2
# standardize=None: the text was already cleaned by preprocess()
vectorize = tf.keras.layers.experimental.preprocessing.TextVectorization(max_tokens=max_token, standardize=None)

In [None]:
# build the vectorizer vocabulary from every utterance in df
# (the whole combined frame, i.e. training and validation rows alike)
corpus = df['utterance'].values
vectorize.adapt(corpus)
words = vectorize.get_vocabulary()


In [None]:
# batch both splits (1024 samples per batch) and cache the batched datasets
cached_train, cached_val = (split.batch(1024).cache() for split in (train, val))

# word embedding functions

In [None]:
# generate embedding model
# this would be the model we are interested in
def embedding_model(regularizers):
    '''
    build a keras model mapping raw strings to sequences of 64-dimensional word embeddings

    regularizers is passed on as the Embedding layer's embeddings_regularizer (None disables it)
    the string input goes through the module-level `vectorize` layer first;
    the embedding table has len(words) + 2 rows
    '''
    text_in = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
    token_ids = vectorize(text_in)
    embedding = tf.keras.layers.Embedding(
        len(words) + 2,
        64,
        embeddings_regularizer=regularizers,
    )
    return tf.keras.Model(inputs=text_in, outputs=embedding(token_ids))

In [None]:
def get_embedding_dict(model):
    '''
    build a {word: flat numpy vector} dictionary from a trained model

    each vocabulary word is fed on its own through model.layers[0] and the result is flattened
    the first two entries of `words` are skipped
    (presumably the padding and unknown-word tokens -- TODO confirm)
    '''
    embedder = model.layers[0]
    return {
        word: embedder(tf.constant(word)).numpy().flatten()
        for word in words[2:]
    }


# Average model

In [None]:
def Average_model():
    '''
    build and compile the averaging classifier

    layers: word embedding -> mean over the sequence -> dropout(0.5) -> softmax over num_cls classes
    loss: categorical cross entropy; metric: macro F1 score; optimizer: Adam(0.3)
    '''
    # no embedding regularizer (tf.keras.regularizers.L2(1e-7) was the alternative tried)
    embedder = embedding_model(None)
    pooling = tf.keras.layers.GlobalAveragePooling1D()
    dropout = tf.keras.layers.Dropout(0.5)
    classifier = tf.keras.layers.Dense(
        num_cls,
        activation='softmax',
        kernel_regularizer=tf.keras.regularizers.l2(0.01),
    )
    model = tf.keras.Sequential([embedder, pooling, dropout, classifier])

    optimizer = tf.keras.optimizers.Adam(0.3)
    loss = tf.keras.losses.CategoricalCrossentropy()
    macro_f1 = tfa.metrics.F1Score(num_cls, average='macro')
    model.compile(optimizer=optimizer, loss=loss, metrics=[macro_f1])

    return model


In [None]:
# train the model
# stop once the validation macro F1 has not improved for 10 epochs, and roll the
# model back to the weights of its best epoch; without restore_best_weights the
# model (and the embeddings read from it afterwards) would keep the weights of
# the last, worse epoch
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_f1_score', patience=10, mode='max',
                                                  restore_best_weights=True)

ave_model = Average_model()

ave_model.fit(cached_train, epochs=1000, verbose=2, validation_data=cached_val,
              callbacks=[early_stopping],
              )

In [None]:
# word -> embedding vector lookup taken from the trained averaging model
ave_dict = get_embedding_dict(ave_model)


In [None]:
# pickle the averaging-model embedding dictionary to Drive
with open('/content/drive/MyDrive/CSC2515/part2/ave_dict', 'wb') as out_file:
    pickle.dump(ave_dict, out_file)

# LSTM model

In [None]:
def LSTM_model():
    '''
    build and compile the LSTM classifier

    layers: word embedding -> LSTM(64) -> softmax over num_cls classes
    loss: categorical cross entropy; metric: macro F1 score; optimizer: Adam(0.3)
    '''
    # no embedding regularizer (tf.keras.regularizers.l2(1e-7) was the alternative tried)
    embedder = embedding_model(None)
    # the LSTM replaces the GlobalAveragePooling1D summary used by the averaging model
    recurrent = tf.keras.layers.LSTM(64)
    classifier = tf.keras.layers.Dense(num_cls, activation='softmax')
    model = tf.keras.Sequential([embedder, recurrent, classifier])

    optimizer = tf.keras.optimizers.Adam(0.3)
    loss = tf.keras.losses.CategoricalCrossentropy()
    macro_f1 = tfa.metrics.F1Score(num_cls, average='macro')
    model.compile(optimizer=optimizer, loss=loss, metrics=[macro_f1])

    return model

In [None]:
lstm_model = LSTM_model()

# stop once the validation macro F1 has not improved for 100 epochs, and roll the
# model back to the weights of its best epoch; without restore_best_weights the
# model (and the embeddings read from it afterwards) would keep the weights of
# the last, worse epoch
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_f1_score', patience=100, mode='max',
                                                  restore_best_weights=True)

lstm_model.fit(cached_train, epochs=1000, validation_data=cached_val, callbacks=[early_stopping])

In [None]:
# word -> embedding vector lookup taken from the trained LSTM model
lstm_dict = get_embedding_dict(lstm_model)

In [None]:
# pickle the LSTM-model embedding dictionary to Drive
with open('/content/drive/MyDrive/CSC2515/part2/lstm_dict', 'wb') as out_file:
    pickle.dump(lstm_dict, out_file)

# attention functions

In [None]:
# attention model functions

def get_angles(pos, i, d_model):
    '''
    angles for the positional encoding: pos / 10000^(2*(i//2)/d_model)

    pos holds the positions, i the embedding dimension indices; they are
    combined with numpy broadcasting, so consecutive index pairs (2k, 2k+1)
    share one rate
    '''
    exponent = (2 * (i // 2)) / np.float32(d_model)
    inverse_wavelength = 1 / np.power(10000, exponent)
    return pos * inverse_wavelength

def positional_encoding(position, d_model):
    '''
    sinusoidal positional encoding as a float32 tensor of shape (1, position, d_model)

    even embedding indices hold the sine of the angle, odd indices the cosine
    '''
    positions = np.arange(position)[:, np.newaxis]
    dims = np.arange(d_model)[np.newaxis, :]
    angles = get_angles(positions, dims, d_model)

    encoding = np.empty_like(angles)
    encoding[:, 0::2] = np.sin(angles[:, 0::2])  # indices 2i
    encoding[:, 1::2] = np.cos(angles[:, 1::2])  # indices 2i+1

    # leading axis of size 1 so the encoding broadcasts over the batch
    return tf.cast(encoding[np.newaxis, ...], dtype=tf.float32)

def add_position(d_model=64, max_position=20000):
    '''
    return a layer that adds positional encoding into embeddings
    the layer takes a sequence of word embeddings of width d_model as input,
    outputs the embeddings scaled by sqrt(d_model) plus the positional encoding

    d_model: width of the incoming embeddings (default 64)
    max_position: number of positions the encoding table is built for (default 20000)
    the defaults reproduce the previously hard-coded values
    (scale 8 = sqrt(64), table of 20000 x 64)
    '''
    x = tf.keras.layers.Input(shape=(None, d_model))
    seq_len = tf.shape(x)[1]
    pos_encoding = positional_encoding(max_position, d_model)
    # only the first seq_len rows of the table are needed for a given batch
    x_positioned = x * (float(d_model) ** 0.5) + pos_encoding[:, :seq_len, :]
    return tf.keras.Model(inputs=x, outputs=x_positioned)

def attention_layer():
    '''
    return a layer that applies scaled dot-product attention
    the layer takes sequences of q, k and v as input
    outputs softmax(q k^T / sqrt(depth)) v, where depth is the size of the last axis of k

    the scale is taken from the input instead of being fixed at 8: 8 is
    sqrt(depth) only for a single 64-wide head, while with several heads each
    head is narrower and needs a smaller divisor
    '''
    q = tf.keras.layers.Input(shape=(None, None, None))
    k = tf.keras.layers.Input(shape=(None, None, None))
    v = tf.keras.layers.Input(shape=(None, None, None))

    matmul = tf.matmul(q, k, transpose_b=True)
    depth = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul / tf.math.sqrt(depth)
    # normalise over the key axis so the weights of every query sum to 1
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
    output = tf.matmul(attention_weights, v)
    return tf.keras.Model(inputs=[q, k, v],  outputs=output)

def multi_head(d_model, num_heads):
    '''
    The outer wrapper of attention model
    takes sequence of word embeddings (width 64) as input
    convert embeddings into q, k and v of width d_model
    split them into num_heads parts of width d_model // num_heads
    apply attention algorithm to every part
    outputs sequence of attentions of width d_model

    raises ValueError when d_model is not divisible by num_heads
    '''
    if d_model % num_heads != 0:
        # otherwise the reshape into heads below fails at run time
        raise ValueError(
            'd_model ({}) must be divisible by num_heads ({})'.format(d_model, num_heads))

    x = tf.keras.layers.Input(shape=(None, 64))
    input_shape = tf.shape(x)
    batch_size, seq_len = input_shape[0], input_shape[1]
    depth = d_model // num_heads

    # q.shape = (batch_size, seq_len, d_model); same as k and v
    # the projections follow d_model (they used to be fixed at 64, which only
    # matched the reshapes below when d_model was 64)
    q = tf.keras.layers.Dense(d_model)(x)
    k = tf.keras.layers.Dense(d_model)(x)
    v = tf.keras.layers.Dense(d_model)(x)

    # convert to (batch_size, num_heads, seq_len, depth)
    q = tf.transpose(tf.reshape(q, (batch_size, seq_len, num_heads, depth)), perm=[0, 2, 1, 3])
    k = tf.transpose(tf.reshape(k, (batch_size, seq_len, num_heads, depth)), perm=[0, 2, 1, 3])
    v = tf.transpose(tf.reshape(v, (batch_size, seq_len, num_heads, depth)), perm=[0, 2, 1, 3])

    # scaled_attention.shape = (batch_size, num_heads, seq_len, depth)
    scaled_attention = attention_layer()([q, k, v])

    # convert to (batch_size, seq_len, num_heads, depth)
    scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])

    # convert back to (batch_size, seq_len, d_model)
    concat_attention = tf.reshape(scaled_attention, (batch_size, seq_len, d_model))

    output = tf.keras.layers.Dense(d_model)(concat_attention)  # (batch_size, seq_len, d_model)

    return tf.keras.Model(inputs=x, outputs=output)


# Attention model

In [None]:
def attention_model():
    '''
    build and compile the single-head attention classifier

    layers: word embedding -> positional encoding -> multi_head(64, 1)
            -> mean over the sequence -> softmax over num_cls classes
    loss: categorical cross entropy; metric: macro F1 score; optimizer: Adam(0.01)
    '''
    # no embedding regularizer (tf.keras.regularizers.L2(1e-7) was the alternative tried)
    embedder = embedding_model(None)
    positioner = add_position()
    attention = multi_head(64, 1)
    pooling = tf.keras.layers.GlobalAveragePooling1D()
    classifier = tf.keras.layers.Dense(
        num_cls,
        activation='softmax',
        kernel_regularizer=tf.keras.regularizers.l2(0.01),
    )
    model = tf.keras.Sequential([embedder, positioner, attention, pooling, classifier])

    optimizer = tf.keras.optimizers.Adam(0.01)
    loss = tf.keras.losses.CategoricalCrossentropy()
    macro_f1 = tfa.metrics.F1Score(num_cls, average='macro')
    model.compile(optimizer=optimizer, loss=loss, metrics=[macro_f1])

    return model

In [None]:
att_model = attention_model()

# stop once the validation macro F1 has not improved for 10 epochs, and roll the
# model back to the weights of its best epoch; without restore_best_weights the
# model (and the embeddings read from it afterwards) would keep the weights of
# the last, worse epoch
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_f1_score', patience=10, mode='max',
                                                  restore_best_weights=True)

att_model.fit(cached_train, epochs=1000, validation_data=cached_val, callbacks=[early_stopping])

In [None]:
# word -> embedding vector lookup taken from the trained single-head attention model
att_dict = get_embedding_dict(att_model)

In [None]:
# pickle the attention-model embedding dictionary to Drive
with open('/content/drive/MyDrive/CSC2515/part2/att_dict', 'wb') as out_file:
    pickle.dump(att_dict, out_file)
    

# Multi heads attention model

In [None]:
def multi_attention_model():
    '''
    build and compile the multi-head (8 heads) attention classifier

    layers: word embedding -> positional encoding -> multi_head(64, 8)
            -> mean over the sequence -> softmax over num_cls classes
    loss: categorical cross entropy; metric: macro F1 score; optimizer: Adam(0.01)
    '''
    # no embedding regularizer (tf.keras.regularizers.L2(1e-7) was the alternative tried)
    embedder = embedding_model(None)
    positioner = add_position()
    attention = multi_head(64, 8)
    pooling = tf.keras.layers.GlobalAveragePooling1D()
    classifier = tf.keras.layers.Dense(
        num_cls,
        activation='softmax',
        kernel_regularizer=tf.keras.regularizers.l2(0.01),
    )
    model = tf.keras.Sequential([embedder, positioner, attention, pooling, classifier])

    optimizer = tf.keras.optimizers.Adam(0.01)
    loss = tf.keras.losses.CategoricalCrossentropy()
    macro_f1 = tfa.metrics.F1Score(num_cls, average='macro')
    model.compile(optimizer=optimizer, loss=loss, metrics=[macro_f1])

    return model

In [None]:
mtatt_model = multi_attention_model()
# stop once the validation macro F1 has not improved for 10 epochs, and roll the
# model back to the weights of its best epoch; without restore_best_weights the
# model (and the embeddings read from it afterwards) would keep the weights of
# the last, worse epoch
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_f1_score', patience=10, mode='max',
                                                  restore_best_weights=True)
mtatt_model.fit(cached_train, epochs=1000, validation_data=cached_val, callbacks=[early_stopping])

In [None]:
# word -> embedding vector lookup taken from the trained multi-head attention model
mtatt_dict = get_embedding_dict(mtatt_model)

In [None]:
# pickle the multi-head-attention embedding dictionary to Drive
with open('/content/drive/MyDrive/CSC2515/part2/mtatt_dict', 'wb') as out_file:
    pickle.dump(mtatt_dict, out_file)