In [3]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, Conv1D
from keras import layers
from sklearn.preprocessing import scale
from sklearn.model_selection import KFold, StratifiedKFold
from corpora_utils import CorporaHelper,CorporaDomains, CorporaProperties
import numpy as np
import os
import pickle
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session

In [4]:
config = tf.compat.v1.ConfigProto(
    gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=0.8)
    # device_count = {'GPU': 1}
)
config.gpu_options.allow_growth = True
session = tf.compat.v1.Session(config=config)
tf.compat.v1.keras.backend.set_session(session)

In [5]:
"""
Deep learning with multigenre corpus LSTM two layers and 4 emotions
"""
# K-Fold variables
num_folds = 3
fold_runs = 2
fold_no = 1

MULTIGENRE = True
TWITTER = False

In [6]:
# set wich corpora to use Multigenre or twitter
use_mg_train_corpora = MULTIGENRE

In [7]:
# train
epochs = 3
skfold = StratifiedKFold(n_splits = num_folds, random_state = 7, shuffle = True)
acc_per_fold = []
loss_per_fold = []
avg_acc_per_run = []
avg_loss_per_run = []
create_final_model = True

In [8]:
# load data
train_labels = []
train_texts = []
test_labels = []
test_texts = []

In [13]:
def load_corpora(filepath, sep=';'):
    print('Load: ', filepath)
    corpora_helper = CorporaHelper(filepath, separator=sep)
    count_joy = 0
    count_sadness = 0
    count_anger = 0
    count_fear = 0
    labels = []
    texts = []
    # preprocessing corpora
    corpora_helper.translate_urls()
    corpora_helper.translate_emoticons()
    corpora_helper.translate_emojis()
    corpora_helper.translate_email()
    #corpora_helper.translate_mention()
    corpora_helper.translate_html_tags()
    #corpora_helper.translate_camel_case()
    corpora_helper.translate_underscore()

    corpora_helper.translate_string('-LRB-','(')
    corpora_helper.translate_string('-RRB-',')')
    corpora_helper.translate_string('`',"'") # ` to '
    corpora_helper.translate_string("''",'"') # double '' to "
    #corpora_helper.translate_contractions()
    corpora_helper.translate_string("'","") # remove '
    corpora_helper.translate_string("\\n"," ") # replace new lines with space

    #corpora_helper.spell_correction()
    corpora_helper.add_space_at_special_chars()
    corpora_helper.translate_to_lower()

    # 0 anger
    # 1 fear
    # 2 joy
    # 3 sadness
    for index, corpus in corpora_helper.get_data().iterrows():
        if corpus[CorporaProperties.EMOTION.value] == 'anger':
            texts.append(corpus[CorporaProperties.CLEANED_CORPUS.value])
            labels.append(0)
            count_anger += 1
        elif corpus[CorporaProperties.EMOTION.value] == 'fear':
            texts.append(corpus[CorporaProperties.CLEANED_CORPUS.value])
            labels.append(1)
            count_fear += 1
        elif corpus[CorporaProperties.EMOTION.value] == 'joy':
            texts.append(corpus[CorporaProperties.CLEANED_CORPUS.value])
            labels.append(2)
            count_joy += 1
        elif corpus[CorporaProperties.EMOTION.value] == 'sadness':
            texts.append(corpus[CorporaProperties.CLEANED_CORPUS.value])
            labels.append(3)
            count_sadness += 1
    print('number of anger labels: ',count_anger)
    print('number of fear labels: ', count_fear)
    print('number of joy labels: ',count_joy)
    print('number of sadness labels: ', count_sadness)
    print('----------------------------------------------------------------------')
    return texts, labels
    #max_data = count_anger + count_fear + count_joy + count_sadness

In [18]:
train_file = ""
test_file = ""
sep = ';'
word_embeddings_path = ''
if use_mg_train_corpora:
    train_file = "corpora/multigenre_450_train.csv"
    test_file = "corpora/multigenre_450_test.csv"
    word_embeddings_path = 'costum_embeddings/multigenre_embedding.pkl'
    sep = ';'
else:
    train_file = "corpora/twitter_2000_train.csv"
    test_file = "corpora/twitter_2000_test.csv"
    word_embeddings_path = 'costum_embeddings/twitter_embedding.pkl'
    sep = '\t'

train_texts, train_labels = load_corpora(train_file, sep=sep)
test_texts, test_labels = load_corpora(test_file, sep=sep)


Load:  corpora/multigenre_450_train.csv
number of anger labels:  405
number of fear labels:  405
number of joy labels:  405
number of sadness labels:  405
----------------------------------------------------------------------
Load:  corpora/multigenre_450_test.csv
number of anger labels:  45
number of fear labels:  45
number of joy labels:  45
number of sadness labels:  45
----------------------------------------------------------------------


In [19]:
# Load prepared Multigenre ensemble embedding

with open(word_embeddings_path, 'rb') as word_embeddings_file:
    embedding_info = pickle.load(word_embeddings_file)
max_words = 10000


In [20]:
# Embedding helper functions
def is_active_vector_method(string):
    return int(string)

In [21]:
def get_unigram_embedding(word, word_embedding_dict, bin_string):
    
    if word in word_embedding_dict:
        word_feature_embedding_dict = word_embedding_dict[word]
        final_embedding = np.array([])
    else:
        return None
    
    for i in range(16):
        if is_active_vector_method(bin_string[i]):
            final_embedding = np.append(final_embedding, word_feature_embedding_dict[i])
    
    return final_embedding

In [22]:
pre_padding = 0
embeddings_index = embedding_info[0]
MAX_SEQUENCE_LENGTH = embedding_info[1]
maxlen = MAX_SEQUENCE_LENGTH
#MAX_NB_WORDS = 10000

In [24]:
unigram_feature_string = "1001111111100001"

In [25]:
EMBEDDING_DIM = len(get_unigram_embedding("glad", embedding_info[0], unigram_feature_string))
print("Embedding dimension:",EMBEDDING_DIM)

Embedding dimension: 1485


In [26]:
# Create train an test data set
def create_data(texts, labels, maxlen, max_words = 10000):
    ## Create one hot encoding
    #max_words = 10000
    #maxlen = 100 # max. number of words in sequences
    tokenizer = Tokenizer(num_words=max_words, filters = '')
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)

    word_i = tokenizer.word_index
    print ('%s eindeutige Tokens gefunden.' % len(word_i))

    data = pad_sequences(sequences, maxlen=maxlen)

    labels = np.asarray(labels)
    print('Shape of data:', data.shape)
    print('Shape of labels:', labels.shape)
    print('-------------------------------------------')

    # mix the data
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    data = data[indices]
    labels = labels[indices]

    # split in train and validate
    x_data = data
    y_data = labels
    return x_data, y_data, word_i

In [27]:
# Train an word index for embedding enrichment
x_train, y_train, word_index = create_data(train_texts, train_labels, maxlen)
x_test, y_test, text_word_index = create_data(test_texts, test_labels, maxlen)

4950 eindeutige Tokens gefunden.
Shape of data: (1620, 95)
Shape of labels: (1620,)
-------------------------------------------
1145 eindeutige Tokens gefunden.
Shape of data: (180, 95)
Shape of labels: (180,)
-------------------------------------------


In [28]:
# Build Matrix
word_embedding_matrix = list()
word_embedding_matrix = np.zeros((max_words, EMBEDDING_DIM))
#word_embedding_matrix.append(np.zeros(EMBEDDING_DIM))

In [29]:
for word, i in word_index.items(): # sorted(word_indices, key=word_indices.get):
    embedding_features = get_unigram_embedding(word, embedding_info[0], unigram_feature_string)
    if i < max_words:
        if embedding_features is not None:
            # Words not found in embedding index will be all-zeros.
            word_embedding_matrix[i] = embedding_features

word_embedding_matrix = np.asarray(word_embedding_matrix, dtype='f')
word_embedding_matrix = scale(word_embedding_matrix)



In [30]:
#print('word_indices_len',word_indices_len)
print('EMBEDDING_DIM',EMBEDDING_DIM)
print('input_length', MAX_SEQUENCE_LENGTH + pre_padding)
embedding = Embedding(max_words, EMBEDDING_DIM, input_length=maxlen, trainable=False)
#embedding = Embedding(word_indices_len + 1, EMBEDDING_DIM,input_length=MAX_SEQUENCE_LENGTH + pre_padding, trainable=False)

EMBEDDING_DIM 1485
input_length 95


In [31]:
def create_model():
    # Create model
    """
    model = Sequential()
    model.add(embedding)
    model.add(Conv1D(32,5, activation='relu'))
    model.add(Flatten()) #3D to 2D
    #model.add(Dense(32, activation='relu'))
    model.add(Dense(4, activation='softmax'))
    #model.summary()
    """
    model = Sequential()
    model.add(embedding)
    model.add(Conv1D(32,5, activation='relu'))
    model.add(layers.Bidirectional(layers.LSTM(32,dropout=0.4, recurrent_dropout=0.4,)))
    #model.add(Dense(8, activation='relu'))
    #model.add(Dense(32, activation='relu'))
    model.add(Dense(4, activation='softmax'))

    return model


In [36]:
# run x Times the folds
for run_num in range(1,fold_runs+1):
    # k-fold
    for train_ind, val_ind in skfold.split(x_train,y_train):

        # Create model
        model = create_model()

        # Load GloVe embedding
        model.layers[0].set_weights([word_embedding_matrix])
        model.layers[0].trainable = False

        # Train and Evaluate
        model.compile(optimizer='adam',
                    loss='sparse_categorical_crossentropy',
                    metrics=['acc'])
        print('------------------------------------------------------------------------')
        print(f'Training for fold {fold_no} ind run {run_num} ...')

        history = model.fit(x_train[train_ind], y_train[train_ind],
                            #epochs=epochs,
                            epochs=5,
                            batch_size=32,
                            verbose=1,
                            validation_data=(x_train[val_ind], y_train[val_ind]))

        # metrics
        scores = model.evaluate(x_train[val_ind], y_train[val_ind], batch_size=128)
        #print(f'Score for fold {fold_no}: {model.metrics_name[0]} of {scores[0]}; {model.metrics_name[1]} of {scores[1]*100}%')
        print(f'Score for fold {fold_no}: ... of {scores[0]}; ... of {scores[1]*100}%')
        acc_per_fold.append(scores[1]*100)
        loss_per_fold.append(scores[0])

        fold_no += 1

------------------------------------------------------------------------
Training for fold 1 ind run 1 ...
Train on 1080 samples, validate on 540 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 1: ... of 1.0363049568953338; ... of 56.48148059844971%
------------------------------------------------------------------------
Training for fold 2 ind run 1 ...
Train on 1080 samples, validate on 540 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 2: ... of 0.9999263772258052; ... of 55.925923585891724%
------------------------------------------------------------------------
Training for fold 3 ind run 1 ...
Train on 1080 samples, validate on 540 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 3: ... of 1.0416074655674121; ... of 52.77777910232544%
------------------------------------------------------------------------
Training for fold 4 ind run 2 ...
Train on 1080 samples, validate on 540 samples
Epoch 1/5
Epoch 2/5


In [33]:
    # == Provide average scores ==
    print('------------------------------------------------------------------------')
    print('Score per fold')
    for i in range(0, len(acc_per_fold)):
        print('------------------------------------------------------------------------')
        print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')
    print('------------------------------------------------------------------------')
    print('Average scores for all folds:')
    avg_acc_per_run.append(np.mean(acc_per_fold))
    avg_loss_per_run.append(np.mean(loss_per_fold))
    print(f'> Accuracy: {np.mean(acc_per_fold)} (+- {np.std(acc_per_fold)})')
    print(f'> Loss: {np.mean(loss_per_fold)}')
    print('------------------------------------------------------------------------')

    # reset fold vars
    acc_per_fold = []
    loss_per_fold = []
    fold_no = 1

------------------------------------------------------------------------
Score per fold
------------------------------------------------------------------------
> Fold 1 - Loss: 1.178701270951165 - Accuracy: 48.33333194255829%
------------------------------------------------------------------------
> Fold 2 - Loss: 1.1146059901626022 - Accuracy: 53.14815044403076%
------------------------------------------------------------------------
> Fold 3 - Loss: 1.0720727028670134 - Accuracy: 53.33333611488342%
------------------------------------------------------------------------
> Fold 4 - Loss: 1.104002708858914 - Accuracy: 54.07407283782959%
------------------------------------------------------------------------
> Fold 5 - Loss: 1.1047057814068264 - Accuracy: 50.18518567085266%
------------------------------------------------------------------------
> Fold 6 - Loss: 1.159952214912132 - Accuracy: 48.148149251937866%
------------------------------------------------------------------------
A

In [34]:
# == Provide average scores ==
print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(avg_acc_per_run)):
    print('------------------------------------------------------------------------')
    print(f'> Run {i+1} Fold averages - Loss: {avg_loss_per_run[i]} - Accuracy: {avg_acc_per_run[i]}%')
print('------------------------------------------------------------------------')
print(f'Overall average scores for all {fold_runs} runs:')

print(f'> Accuracy: {np.mean(avg_acc_per_run)} (+- {np.std(avg_acc_per_run)})')
print(f'> Loss: {np.mean(avg_loss_per_run)}')
print('------------------------------------------------------------------------')

------------------------------------------------------------------------
Score per fold
------------------------------------------------------------------------
> Run 1 Fold averages - Loss: 1.1223401115264422 - Accuracy: 51.203704377015434%
------------------------------------------------------------------------
Overall average scores for all 2 runs:
> Accuracy: 51.203704377015434 (+- 0.0)
> Loss: 1.1223401115264422
------------------------------------------------------------------------


In [35]:
# create final model #Todo sync with fold rund
if create_final_model:
    model = create_model()
    model.summary()

    # Load GloVe embedding
    model.layers[0].set_weights([word_embedding_matrix])
    model.layers[0].trainable = False

    # Train and Evaluate
    model.compile(optimizer='adam',
                loss='sparse_categorical_crossentropy',
                metrics=['acc'])
    print('------------------------------------------------------------------------')
    print('Training for final model ...')

    history = model.fit(x_train[train_ind], y_train[train_ind],
                        epochs=epochs,
                        batch_size=32,
                        verbose=1)
    model.save('model_emotion_detection.h5')   

    # Test final model
    print("Evaluate Findal Model on test data")
    results = model.evaluate(x_test, y_test, batch_size=128)
    print("test loss, test acc:", results)

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 95, 1485)          14850000  
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 91, 32)            237632    
_________________________________________________________________
bidirectional_7 (Bidirection (None, 64)                16640     
_________________________________________________________________
dense_7 (Dense)              (None, 4)                 260       
Total params: 15,104,532
Trainable params: 254,532
Non-trainable params: 14,850,000
_________________________________________________________________
------------------------------------------------------------------------
Training for final model ...
Epoch 1/3
Epoch 2/3
Epoch 3/3
Evaluate Findal Model on test data
test loss, test acc: [1.4015240563286675, 0.31111112236976624]
