In [56]:
import pandas as pd
from nltk import word_tokenize
from word_embeddings import load_embedding_weights
import numpy as np

from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, GRU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import binary_crossentropy

# Load Data

In [2]:
# Read only the tweet text and its sarcasm label, then discard rows where either is missing.
df = pd.read_csv('data/train.En.csv', usecols=['tweet', 'sarcastic'])
df = df.dropna()

# Lower-case each tweet before tokenizing it with NLTK.
df['tweet_tokens'] = df['tweet'].apply(lambda text: word_tokenize(text.lower()))

In [26]:
def create_vocabulary(sentence_tokens):
    """Build the vocabulary and the word-to-index lookup for tokenized sentences.

    Parameters
    ----------
    sentence_tokens : iterable of iterable of str
        One token sequence per sentence.

    Returns
    -------
    vocabulary : list of str
        Every distinct token, in sorted order.
    word_to_id : dict of str to int
        Maps each token to its position in ``vocabulary``.
    """
    unique_tokens = set()
    for tokens in sentence_tokens:
        unique_tokens.update(tokens)

    # Sort so that ids are identical on every run: the iteration order of a
    # set of strings depends on per-process hash randomization, which would
    # otherwise reshuffle the ids each time the kernel is restarted.
    vocabulary = sorted(unique_tokens)
    word_to_id = {word: index for index, word in enumerate(vocabulary)}

    return vocabulary, word_to_id

In [27]:
# Pull the token lists and the labels out of the frame as NumPy arrays.
sentences = df['tweet_tokens'].to_numpy()
labels = df['sarcastic'].to_numpy()

# Vocabulary over all tweets, plus the token -> integer id lookup.
vocabulary, word_to_id = create_vocabulary(sentences)

# Create Embeddings

In [19]:
# Load the embedding weights for the vocabulary. The model name and the data
# directory are two separate string arguments.
# NOTE(review): 100 is presumably the embedding dimension (it matches
# output_dim of the Embedding layers below) — confirm against
# word_embeddings.load_embedding_weights.
embeddings = load_embedding_weights(vocabulary, 100, 'word2vecSG', '/home/aleksandar/projects/NLP_2021/Exercises/2/data')

Creating embedding weights...


In [28]:
# Translate each tweet's tokens into their integer ids, skipping any token
# that has no entry in the lookup.
df['tweet_indices'] = df['tweet_tokens'].apply(
    lambda tokens: np.array([word_to_id[token] for token in tokens if token in word_to_id])
)

In [35]:
# Bring every tweet to exactly 10 ids so the tweets can be stacked into one matrix.
# NOTE(review): pad_sequences fills with its default pad value, while
# create_vocabulary starts word ids at 0 — if the default pad value is 0,
# padding is indistinguishable from the word with id 0. Confirm and consider
# reserving an id for padding.
sentence_indices = df['tweet_indices'].values
padded_sentences = pad_sequences(sentence_indices, maxlen=10)

# Defining the Models

*(The train/test split below is stratified on the label, so the training and test sets keep the same class balance.)*

In [40]:
# Hold out 10% of the tweets for testing, stratified on the label. The fixed
# random_state makes the split (and therefore the reported test metrics)
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sentences, labels, test_size=0.1, stratify=labels, random_state=42
)

## Long Short-Term Memory

In [51]:
# Two stacked LSTM layers on top of the pre-trained embeddings, ending in a
# single sigmoid unit for the binary label.
model = Sequential([
    # trainable=False keeps the previously learned word vectors as they are;
    # trainable=True would update them during backpropagation, producing
    # different embeddings.
    Embedding(input_dim=len(vocabulary), output_dim=100, weights=[embeddings], trainable=False),
    # return_sequences=True: values from all timepoints are passed on.
    LSTM(128, return_sequences=True),
    # return_sequences left at False: only the value from the last timepoint.
    LSTM(128),
    Dense(1, activation='sigmoid'),
])

# binary_crossentropy is for 2 classes; categorical_crossentropy is for n classes.
model.compile(optimizer=Adam(learning_rate=0.01), loss=binary_crossentropy, metrics=['accuracy'])
model.fit(X_train, y_train, epochs=15, batch_size=32, verbose=2)
model.evaluate(X_test, y_test)

Epoch 1/15
98/98 - 4s - loss: 0.5780 - accuracy: 0.7401
Epoch 2/15
98/98 - 2s - loss: 0.5555 - accuracy: 0.7513
Epoch 3/15
98/98 - 2s - loss: 0.5092 - accuracy: 0.7676
Epoch 4/15
98/98 - 2s - loss: 0.3857 - accuracy: 0.8285
Epoch 5/15
98/98 - 2s - loss: 0.2608 - accuracy: 0.8929
Epoch 6/15
98/98 - 2s - loss: 0.1655 - accuracy: 0.9372
Epoch 7/15
98/98 - 2s - loss: 0.1040 - accuracy: 0.9622
Epoch 8/15
98/98 - 2s - loss: 0.0927 - accuracy: 0.9647
Epoch 9/15
98/98 - 2s - loss: 0.0758 - accuracy: 0.9712
Epoch 10/15
98/98 - 2s - loss: 0.0678 - accuracy: 0.9779
Epoch 11/15
98/98 - 2s - loss: 0.0656 - accuracy: 0.9753
Epoch 12/15
98/98 - 2s - loss: 0.0777 - accuracy: 0.9702
Epoch 13/15
98/98 - 2s - loss: 0.0905 - accuracy: 0.9673
Epoch 14/15
98/98 - 2s - loss: 0.0861 - accuracy: 0.9686
Epoch 15/15
98/98 - 2s - loss: 0.0723 - accuracy: 0.9724


[1.2875099182128906, 0.6599423885345459]

## Bidirectional Long Short-Term Memory

In [53]:
# A single LSTM wrapped in Bidirectional on top of the pre-trained embeddings,
# ending in a single sigmoid unit for the binary label.
model = Sequential([
    # trainable=False keeps the previously learned word vectors as they are;
    # trainable=True would update them during backpropagation, producing
    # different embeddings.
    Embedding(input_dim=len(vocabulary), output_dim=100, weights=[embeddings], trainable=False),
    Bidirectional(LSTM(128)),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer=Adam(learning_rate=0.01), loss=binary_crossentropy, metrics=['accuracy'])
model.fit(X_train, y_train, epochs=15, batch_size=32, verbose=2)
model.evaluate(X_test, y_test)

Epoch 1/15
98/98 - 3s - loss: 0.5785 - accuracy: 0.7397
Epoch 2/15
98/98 - 1s - loss: 0.5139 - accuracy: 0.7635
Epoch 3/15
98/98 - 1s - loss: 0.3601 - accuracy: 0.8519
Epoch 4/15
98/98 - 1s - loss: 0.1580 - accuracy: 0.9474
Epoch 5/15
98/98 - 1s - loss: 0.0736 - accuracy: 0.9760
Epoch 6/15
98/98 - 1s - loss: 0.0386 - accuracy: 0.9897
Epoch 7/15
98/98 - 1s - loss: 0.0271 - accuracy: 0.9894
Epoch 8/15
98/98 - 1s - loss: 0.0223 - accuracy: 0.9913
Epoch 9/15
98/98 - 1s - loss: 0.0350 - accuracy: 0.9878
Epoch 10/15
98/98 - 1s - loss: 0.0761 - accuracy: 0.9728
Epoch 11/15
98/98 - 1s - loss: 0.0928 - accuracy: 0.9657
Epoch 12/15
98/98 - 1s - loss: 0.0784 - accuracy: 0.9705
Epoch 13/15
98/98 - 1s - loss: 0.0413 - accuracy: 0.9865
Epoch 14/15
98/98 - 1s - loss: 0.0184 - accuracy: 0.9936
Epoch 15/15
98/98 - 1s - loss: 0.0113 - accuracy: 0.9949


[1.5866525173187256, 0.6887608170509338]

## Gated Recurrent Units

In [57]:
# A single GRU layer on top of the pre-trained embeddings, ending in a single
# sigmoid unit for the binary label.
model = Sequential([
    # trainable=False keeps the previously learned word vectors as they are;
    # trainable=True would update them during backpropagation, producing
    # different embeddings.
    Embedding(input_dim=len(vocabulary), output_dim=100, weights=[embeddings], trainable=False),
    GRU(128),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer=Adam(learning_rate=0.01), loss=binary_crossentropy, metrics=['accuracy'])
model.fit(X_train, y_train, epochs=15, batch_size=32, verbose=2)
model.evaluate(X_test, y_test)

Epoch 1/15
98/98 - 2s - loss: 0.5843 - accuracy: 0.7369
Epoch 2/15
98/98 - 1s - loss: 0.5343 - accuracy: 0.7532
Epoch 3/15
98/98 - 1s - loss: 0.4283 - accuracy: 0.8067
Epoch 4/15
98/98 - 1s - loss: 0.2713 - accuracy: 0.8946
Epoch 5/15
98/98 - 1s - loss: 0.1815 - accuracy: 0.9292
Epoch 6/15
98/98 - 1s - loss: 0.1218 - accuracy: 0.9574
Epoch 7/15
98/98 - 1s - loss: 0.0964 - accuracy: 0.9702
Epoch 8/15
98/98 - 1s - loss: 0.0762 - accuracy: 0.9728
Epoch 9/15
98/98 - 1s - loss: 0.0701 - accuracy: 0.9760
Epoch 10/15
98/98 - 1s - loss: 0.0594 - accuracy: 0.9804
Epoch 11/15
98/98 - 1s - loss: 0.1157 - accuracy: 0.9612
Epoch 12/15
98/98 - 1s - loss: 0.1602 - accuracy: 0.9391
Epoch 13/15
98/98 - 1s - loss: 0.1433 - accuracy: 0.9436
Epoch 14/15
98/98 - 1s - loss: 0.0979 - accuracy: 0.9625
Epoch 15/15
98/98 - 1s - loss: 0.0830 - accuracy: 0.9676


[1.2985566854476929, 0.6974063515663147]