# <font color="grey">Imports and utility functions</font>

In [23]:
import numpy as np
import pandas as pd
from nltk import word_tokenize

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Input, Embedding, Concatenate
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.optimizers import Adam

from scripts.word_embeddings import load_embedding_weights
from sklearn.model_selection import train_test_split

In [2]:
def tokenize(data: pd.core.series):
    data['tweet_tokens'] = data['tweet'].apply(lambda x: word_tokenize(x.lower()))
    data['rephrase_tokens'] = data['rephrase'].apply(lambda x: word_tokenize(x.lower()))

In [3]:
def create_vocabulary(sentence_tokens):
    vocabulary = set()
    for tokens in sentence_tokens:
        vocabulary.update(tokens)

    vocabulary = list(vocabulary)
    return vocabulary, dict(map(lambda x: (x[1], x[0]), enumerate(vocabulary)))

In [20]:
def create_train_test_data(sentences, rephrases):
    sent1, sent2, labels = [], [], []
    
    for sentence, rephrase in zip(sentences, rephrases):
        p = np.random.randint(2)
        if p == 0:
            sent1.append(sentence)
            sent2.append(rephrase)
        else:
            sent1.append(rephrase)
            sent2.append(sentence)
        labels.append(p)
    
    return sent1, sent2, labels

# Data Loading

In [4]:
df = pd.read_csv('data/train.En.csv', usecols=['tweet', 'rephrase']).dropna()
# tweet - has sarcasam; rephrase - does not have sarcasam
df.head()

Unnamed: 0,tweet,rephrase
0,The only thing I got from college is a caffein...,"College is really difficult, expensive, tiring..."
1,I love it when professors draw a big question ...,I do not like when professors don’t write out ...
2,Remember the hundred emails from companies whe...,"I, at the bare minimum, wish companies actuall..."
3,Today my pop-pop told me I was not “forced” to...,"Today my pop-pop told me I was not ""forced"" to..."
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,I would say Ted Cruz is an asshole and doesn’t...


In [6]:
tokenize(df)
df.head()

Unnamed: 0,tweet,rephrase,tweet_tokens,rephrase_tokens
0,The only thing I got from college is a caffein...,"College is really difficult, expensive, tiring...","[the, only, thing, i, got, from, college, is, ...","[college, is, really, difficult, ,, expensive,..."
1,I love it when professors draw a big question ...,I do not like when professors don’t write out ...,"[i, love, it, when, professors, draw, a, big, ...","[i, do, not, like, when, professors, don, ’, t..."
2,Remember the hundred emails from companies whe...,"I, at the bare minimum, wish companies actuall...","[remember, the, hundred, emails, from, compani...","[i, ,, at, the, bare, minimum, ,, wish, compan..."
3,Today my pop-pop told me I was not “forced” to...,"Today my pop-pop told me I was not ""forced"" to...","[today, my, pop-pop, told, me, i, was, not, “,...","[today, my, pop-pop, told, me, i, was, not, ``..."
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,I would say Ted Cruz is an asshole and doesn’t...,"[@, volphancarol, @, littlewhitty, @, mystical...","[i, would, say, ted, cruz, is, an, asshole, an..."


# Tokenization

In [7]:
sentences = df['tweet_tokens'].values
rephrases = df['rephrase_tokens'].values

vocabulary, word_to_id = create_vocabulary(np.concatenate((sentences, rephrases)))

In [8]:
embeddings = load_embedding_weights(vocabulary, 100, 'word2vecSG',"/home/aleksandar/projects/NLP_2021/Exercises/2/data")

In [9]:
df['tweet_indices'] = df['tweet_tokens'].apply(lambda x: np.array([word_to_id[i] for i in x]))
df['rephrase_indices'] = df['rephrase_tokens'].apply(lambda x: np.array([word_to_id[i] for i in x]))

In [10]:
sentence_indices = df['tweet_indices'].values
rephrase_indices = df['rephrase_indices'].values

In [11]:
df.head()

Unnamed: 0,tweet,rephrase,tweet_tokens,rephrase_tokens,tweet_indices,rephrase_indices
0,The only thing I got from college is a caffein...,"College is really difficult, expensive, tiring...","[the, only, thing, i, got, from, college, is, ...","[college, is, really, difficult, ,, expensive,...","[3910, 418, 4627, 4292, 1887, 1298, 2038, 4557...","[2038, 4557, 665, 1849, 3756, 1493, 3756, 4002..."
1,I love it when professors draw a big question ...,I do not like when professors don’t write out ...,"[i, love, it, when, professors, draw, a, big, ...","[i, do, not, like, when, professors, don, ’, t...","[4292, 1642, 1701, 4036, 1705, 3502, 1623, 983...","[4292, 708, 266, 4523, 4036, 1705, 4559, 4576,..."
2,Remember the hundred emails from companies whe...,"I, at the bare minimum, wish companies actuall...","[remember, the, hundred, emails, from, compani...","[i, ,, at, the, bare, minimum, ,, wish, compan...","[749, 3910, 4975, 2082, 1298, 4609, 4036, 4783...","[4292, 3756, 3834, 3910, 4393, 1452, 3756, 242..."
3,Today my pop-pop told me I was not “forced” to...,"Today my pop-pop told me I was not ""forced"" to...","[today, my, pop-pop, told, me, i, was, not, “,...","[today, my, pop-pop, told, me, i, was, not, ``...","[4934, 4470, 2057, 136, 3395, 4292, 1879, 266,...","[4934, 4470, 2057, 136, 3395, 4292, 1879, 266,..."
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,I would say Ted Cruz is an asshole and doesn’t...,"[@, volphancarol, @, littlewhitty, @, mystical...","[i, would, say, ted, cruz, is, an, asshole, an...","[1396, 4340, 1396, 3246, 1396, 4816, 4292, 472...","[4292, 2150, 3295, 4743, 2368, 4557, 1662, 457..."


# Train-Test Data Preparation

In [21]:
padded_sentenes = pad_sequences(sentence_indices, 10)
padded_rephrases = pad_sequences(rephrase_indices, 10)

In [25]:
sentences1, sentences2, labels = create_train_test_data(padded_sentenes, padded_rephrases)
x1_train, x1_test, x2_train, x2_test, y_train, y_test  = train_test_split(sentences1, sentences2, labels, test_size=0.1, random_state=0, stratify=labels)

# Model definition, training, evaluation

In [13]:
input1 = Input(shape=(10,))
x1 = Embedding(input_dim=len(vocabulary), output_dim=100, weights=[embeddings], trainable=False)(input1)
x1 = LSTM(128)(x1)

input2 = Input(shape=(10,))
x2 = Embedding(input_dim=len(vocabulary), output_dim=100, weights=[embeddings], trainable=False)(input2)
x2 = LSTM(128)(x2)

x = Concatenate()([x1, x2])

output = Dense(1, activation='sigmoid')(x)

2021-11-11 07:32:28.181954: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-11-11 07:32:28.182004: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2021-11-11 07:32:28.182023: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (legion-y540): /proc/driver/nvidia/version does not exist
2021-11-11 07:32:28.182258: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [14]:
model = Model(inputs=[input1, input2], outputs=output)
model.compile(optimizer=Adam(learning_rate=0.01), loss=binary_crossentropy, metrics=['accuracy'])

In [28]:
model.fit([np.array(x1_train), np.array(x2_train)], np.array(y_train), batch_size=32, epochs=15, verbose=2)

Epoch 1/15
25/25 - 0s - loss: 8.1966e-05 - accuracy: 1.0000 - 311ms/epoch - 12ms/step
Epoch 2/15
25/25 - 0s - loss: 7.3095e-05 - accuracy: 1.0000 - 281ms/epoch - 11ms/step
Epoch 3/15
25/25 - 0s - loss: 6.5899e-05 - accuracy: 1.0000 - 279ms/epoch - 11ms/step
Epoch 4/15
25/25 - 0s - loss: 5.9824e-05 - accuracy: 1.0000 - 275ms/epoch - 11ms/step
Epoch 5/15
25/25 - 0s - loss: 5.4524e-05 - accuracy: 1.0000 - 280ms/epoch - 11ms/step
Epoch 6/15
25/25 - 0s - loss: 4.9933e-05 - accuracy: 1.0000 - 274ms/epoch - 11ms/step
Epoch 7/15
25/25 - 0s - loss: 4.5946e-05 - accuracy: 1.0000 - 285ms/epoch - 11ms/step
Epoch 8/15
25/25 - 0s - loss: 4.2469e-05 - accuracy: 1.0000 - 287ms/epoch - 11ms/step
Epoch 9/15
25/25 - 0s - loss: 3.9322e-05 - accuracy: 1.0000 - 289ms/epoch - 12ms/step
Epoch 10/15
25/25 - 0s - loss: 3.6594e-05 - accuracy: 1.0000 - 281ms/epoch - 11ms/step
Epoch 11/15
25/25 - 0s - loss: 3.4039e-05 - accuracy: 1.0000 - 288ms/epoch - 12ms/step
Epoch 12/15
25/25 - 0s - loss: 3.1872e-05 - accuracy

<keras.callbacks.History at 0x7f97d05d81c0>

In [29]:
model.evaluate([np.array(x1_test), np.array(x2_test)], np.array(y_test))



[0.7828599810600281, 0.8275862336158752]