# <font color="grey">Imports and utility functions</font>

In [114]:
import numpy as np
import pandas as pd
from nltk import word_tokenize

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Input, Embedding, Concatenate
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.optimizers import Adam

from scripts.word_embeddings import load_embedding_weights

In [35]:
def tokenize(data: pd.core.series):
    data['tweet_tokens'] = data['tweet'].apply(lambda x: word_tokenize(x.lower()))
    data['rephrase_tokens'] = data['rephrase'].apply(lambda x: word_tokenize(x.lower()))

In [63]:
def create_vocabulary(sentence_tokens):
    vocabulary = set()
    for tokens in sentence_tokens:
        vocabulary.update(tokens)

    vocabulary = list(vocabulary)
    return vocabulary, dict(map(lambda x: (x[1], x[0]), enumerate(vocabulary)))

# Data Loading

In [78]:
df = pd.read_csv('data/train.En.csv', usecols=['tweet', 'rephrase']).dropna()
# tweet - has sarcasam; rephrase - does not have sarcasam
df.head()

Unnamed: 0,tweet,rephrase
0,The only thing I got from college is a caffein...,"College is really difficult, expensive, tiring..."
1,I love it when professors draw a big question ...,I do not like when professors don’t write out ...
2,Remember the hundred emails from companies whe...,"I, at the bare minimum, wish companies actuall..."
3,Today my pop-pop told me I was not “forced” to...,"Today my pop-pop told me I was not ""forced"" to..."
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,I would say Ted Cruz is an asshole and doesn’t...


In [79]:
tokenize(df)
df.head()

Unnamed: 0,tweet,rephrase,tweet_tokens,rephrase_tokens
0,The only thing I got from college is a caffein...,"College is really difficult, expensive, tiring...","[the, only, thing, i, got, from, college, is, ...","[college, is, really, difficult, ,, expensive,..."
1,I love it when professors draw a big question ...,I do not like when professors don’t write out ...,"[i, love, it, when, professors, draw, a, big, ...","[i, do, not, like, when, professors, don, ’, t..."
2,Remember the hundred emails from companies whe...,"I, at the bare minimum, wish companies actuall...","[remember, the, hundred, emails, from, compani...","[i, ,, at, the, bare, minimum, ,, wish, compan..."
3,Today my pop-pop told me I was not “forced” to...,"Today my pop-pop told me I was not ""forced"" to...","[today, my, pop-pop, told, me, i, was, not, “,...","[today, my, pop-pop, told, me, i, was, not, ``..."
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,I would say Ted Cruz is an asshole and doesn’t...,"[@, volphancarol, @, littlewhitty, @, mystical...","[i, would, say, ted, cruz, is, an, asshole, an..."


In [80]:
sentences = df['tweet_tokens'].values
rephrases = df['rephrase_tokens'].values

vocabulary, word_to_id = create_vocabulary(np.concatenate((sentences, rephrases)))

In [81]:
embeddings = load_embedding_weights(vocabulary, 100, 'word2vecSG',"/home/aleksandar/projects/NLP_2021/Exercises/2/data")

In [82]:
df['tweet_indices'] = df['tweet_tokens'].apply(lambda x: np.array([word_to_id[i] for i in x]))
df['rephrase_indices'] = df['rephrase_tokens'].apply(lambda x: np.array([word_to_id[i] for i in x]))

In [83]:
sentence_indices = df['tweet_indices'].values
rephrase_indices = df['rephrase_indices'].values

In [84]:
df.head()

Unnamed: 0,tweet,rephrase,tweet_tokens,rephrase_tokens,tweet_indices,rephrase_indices
0,The only thing I got from college is a caffein...,"College is really difficult, expensive, tiring...","[the, only, thing, i, got, from, college, is, ...","[college, is, really, difficult, ,, expensive,...","[1769, 1457, 3537, 1205, 2715, 646, 458, 1175,...","[458, 1175, 749, 241, 3300, 9, 3300, 582, 3300..."
1,I love it when professors draw a big question ...,I do not like when professors don’t write out ...,"[i, love, it, when, professors, draw, a, big, ...","[i, do, not, like, when, professors, don, ’, t...","[1205, 2294, 702, 773, 1625, 4260, 1043, 3800,...","[1205, 4347, 1447, 1247, 773, 1625, 1627, 2993..."
2,Remember the hundred emails from companies whe...,"I, at the bare minimum, wish companies actuall...","[remember, the, hundred, emails, from, compani...","[i, ,, at, the, bare, minimum, ,, wish, compan...","[1847, 1769, 470, 5070, 646, 2120, 773, 266, 1...","[1205, 3300, 2423, 1769, 3701, 4484, 3300, 176..."
3,Today my pop-pop told me I was not “forced” to...,"Today my pop-pop told me I was not ""forced"" to...","[today, my, pop-pop, told, me, i, was, not, “,...","[today, my, pop-pop, told, me, i, was, not, ``...","[1979, 2402, 1213, 1756, 905, 1205, 2096, 1447...","[1979, 2402, 1213, 1756, 905, 1205, 2096, 1447..."
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,I would say Ted Cruz is an asshole and doesn’t...,"[@, volphancarol, @, littlewhitty, @, mystical...","[i, would, say, ted, cruz, is, an, asshole, an...","[1345, 3746, 1345, 5155, 1345, 4669, 1205, 379...","[1205, 1406, 5111, 1277, 4371, 1175, 3032, 69,..."


In [87]:
padded_sentenes = pad_sequences(sentence_indices, 10)
rephrase_sentenes = pad_sequences(rephrase_indices, 10)

In [109]:
input1 = Input(shape=(10,))
x1 = Embedding(input_dim=len(vocabulary), output_dim=100, weights=[embeddings], trainable=False)(input1)
x1 = LSTM(128)(x1)

input2 = Input(shape=(10,))
x2 = Embedding(input_dim=len(vocabulary), output_dim=100, weights=[embeddings], trainable=False)(input2)
x2 = LSTM(128)(x2)

x = Concatenate()([x1, x2])

output = Dense(1, activation='sigmoid')(x)

In [116]:
model = Model(inputs=[input1, input2], outputs=output)
model.compile(optimizer=Adam(learning_rate=0.01), loss=binary_crossentropy, metrics=['accuracy'])