In [1]:
import os
import pickle

def load_embeddings(file_name, vocabulary):
    """
    Loads word embeddings from the file with the given name.
    :param file_name: name of the file containing word embeddings
    :type file_name: str
    :param vocabulary: captions vocabulary
    :type vocabulary: numpy.array
    :return: word embeddings
    :rtype: dict
    """
    embeddings = dict()
    with open(file_name, 'r', encoding='utf-8') as doc:
        line = doc.readline()
        while line != '':
            line = line.rstrip('\n').lower()
            parts = line.split(' ')
            vals = np.array(parts[1:], dtype=np.float)
            if parts[0] in vocabulary:
                embeddings[parts[0]] = vals
            line = doc.readline()
    return embeddings


def load_embedding_weights(vocabulary, embedding_size, embedding_type, path='.'):
    print("local")
    """
    Creates and loads embedding weights.
    :param vocabulary: vocabulary
    :type vocabulary: numpy.array
    :param embedding_size: embedding size
    :type embedding_size: int
    :param embedding_type: type of the pre-trained embeddings
    :type embedding_type: string
    :return: embedding weights
    :rtype: numpy.array
    """
    if os.path.exists(f'{path}/embedding_matrix_{embedding_type}_{embedding_size}.pkl'):
        with open(f'{path}/embedding_matrix_{embedding_type}_{embedding_size}.pkl', 'rb') as f:
            embedding_matrix = pickle.load(f)
    else:
        print('Creating embedding weights...')
        if embedding_type == 'glove':
            embeddings = load_embeddings(f'{path}/glove.6B.{embedding_size}d.txt', vocabulary)
        else:
          embeddings = load_embeddings(f'{path}/word2vecSG.iSarcasamEval.{embedding_size}d.txt', vocabulary)
        embedding_matrix = np.zeros((len(vocabulary), embedding_size))
        for i in range(len(vocabulary)):
            if vocabulary[i] in embeddings.keys():
                embedding_matrix[i] = embeddings[vocabulary[i]]
            else:
                embedding_matrix[i] = np.random.standard_normal(embedding_size)
        with open(f'{path}/embedding_matrix_{embedding_type}_{embedding_size}.pkl', 'wb') as f:
            pickle.dump(embedding_matrix, f)
    return embedding_matrix

In [2]:
from nltk import word_tokenize
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences
from scripts.word_embeddings import load_embedding_weights

from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Input, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import categorical_crossentropy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split

2021-12-05 12:03:23.555210: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-05 12:03:23.555249: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [3]:
def load_data():
    return pd.read_csv('data/train.En.csv', usecols=['tweet', 'rephrase']).dropna()

In [4]:
def tokenize(data):
    data['tweet_tokens'] = data['tweet'].apply(lambda x: word_tokenize(x.lower()))
    data['rephrase_tokens'] = data['rephrase'].apply(lambda x: word_tokenize(x.lower()))

In [5]:
def append_start_end(data):
    data['tweet_tokens'] = data['tweet_tokens'].apply(lambda x: np.concatenate((['<START>'], x, ['</END>'])))
    data['rephrase_tokens'] = data['rephrase_tokens'].apply(lambda x: np.concatenate((['<START>'], x, ['</END>'])))


In [6]:
def create_vocabulary(sentence_tokens):
    vocab = set()
    for tokens in sentence_tokens:
        vocab.update(tokens)
    
    vocab = list(vocab)
    word_to_id = {word: index for word, index in zip(vocab, range(len(vocab)))}
    id_to_word = {index: word for word, index in zip(vocab, range(len(vocab)))}
    return vocab, word_to_id, id_to_word

In [7]:
def create_train_data(sentences, rephrases):
    input_sentences, input_rephrases, next_words = [], [], []

    for sentence, rephrase in zip(sentences, rephrases):
        for i in range(1, len(rephrase)):
            input_sentences.append(sentence)
            input_rephrases.append(rephrase[:i])
            next_words.append(rephrase[i])

    return input_sentences, input_rephrases, next_words

In [29]:
def create_model(padding_size, vocabulary_size, embedding_size, embeddings=None, name=""):
    # encoder
    encoder_inputs = Input(shape=(padding_size,), name='encoder_inputs')
    encoder_embedding = Embedding(input_dim=vocabulary_size, output_dim=embedding_size,
                                #   weights=[embeddings], 
                                  trainable=False)(encoder_inputs)

    encoder = LSTM(128, return_state=True, name='encoder')
    encoder(encoder_embedding)

    _, state_h, state_c = encoder(encoder_embedding)
    encoder_states = [state_h, state_c]

    # decoder
    decoder_inputs = Input(shape=(padding_size,), name='decoder_inputs')
    decoder_embedding = Embedding(input_dim=vocabulary_size, output_dim=embedding_size,
                                #   weights=[embeddings], 
                                  trainable=False)(decoder_inputs)

    decoder = LSTM(128, return_state=True, name='decoder')
    decoder_outputs, _, _ =decoder(decoder_embedding, initial_state=encoder_states)

    decoder_outputs = Dense(vocabulary_size, activation='softmax', name='decoder_dense')(decoder_outputs)

    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    model.compile(optimizer=Adam(lr=0.001), loss=categorical_crossentropy, metrics=['accuracy'])
    model._name = name
    return model

# Data Loading and Preprocessing

In [9]:
df = load_data()
df.head()

Unnamed: 0,tweet,rephrase
0,The only thing I got from college is a caffein...,"College is really difficult, expensive, tiring..."
1,I love it when professors draw a big question ...,I do not like when professors don’t write out ...
2,Remember the hundred emails from companies whe...,"I, at the bare minimum, wish companies actuall..."
3,Today my pop-pop told me I was not “forced” to...,"Today my pop-pop told me I was not ""forced"" to..."
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,I would say Ted Cruz is an asshole and doesn’t...


In [10]:
tokenize(df)
df.head()

Unnamed: 0,tweet,rephrase,tweet_tokens,rephrase_tokens
0,The only thing I got from college is a caffein...,"College is really difficult, expensive, tiring...","[the, only, thing, i, got, from, college, is, ...","[college, is, really, difficult, ,, expensive,..."
1,I love it when professors draw a big question ...,I do not like when professors don’t write out ...,"[i, love, it, when, professors, draw, a, big, ...","[i, do, not, like, when, professors, don, ’, t..."
2,Remember the hundred emails from companies whe...,"I, at the bare minimum, wish companies actuall...","[remember, the, hundred, emails, from, compani...","[i, ,, at, the, bare, minimum, ,, wish, compan..."
3,Today my pop-pop told me I was not “forced” to...,"Today my pop-pop told me I was not ""forced"" to...","[today, my, pop-pop, told, me, i, was, not, “,...","[today, my, pop-pop, told, me, i, was, not, ``..."
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,I would say Ted Cruz is an asshole and doesn’t...,"[@, volphancarol, @, littlewhitty, @, mystical...","[i, would, say, ted, cruz, is, an, asshole, an..."


In [11]:
append_start_end(df)
df.head()

Unnamed: 0,tweet,rephrase,tweet_tokens,rephrase_tokens
0,The only thing I got from college is a caffein...,"College is really difficult, expensive, tiring...","[<START>, the, only, thing, i, got, from, coll...","[<START>, college, is, really, difficult, ,, e..."
1,I love it when professors draw a big question ...,I do not like when professors don’t write out ...,"[<START>, i, love, it, when, professors, draw,...","[<START>, i, do, not, like, when, professors, ..."
2,Remember the hundred emails from companies whe...,"I, at the bare minimum, wish companies actuall...","[<START>, remember, the, hundred, emails, from...","[<START>, i, ,, at, the, bare, minimum, ,, wis..."
3,Today my pop-pop told me I was not “forced” to...,"Today my pop-pop told me I was not ""forced"" to...","[<START>, today, my, pop-pop, told, me, i, was...","[<START>, today, my, pop-pop, told, me, i, was..."
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,I would say Ted Cruz is an asshole and doesn’t...,"[<START>, @, volphancarol, @, littlewhitty, @,...","[<START>, i, would, say, ted, cruz, is, an, as..."


In [12]:
sentences = df['tweet_tokens'].values
rephrases = df['rephrase_tokens'].values

In [13]:
vocabulary, word_to_id, id_to_word = create_vocabulary(np.concatenate((sentences, rephrases)))

df['tweet_indices'] = df['tweet_tokens'].apply(lambda x: np.array([word_to_id[i] for i in x]))
sentence_indices = df['tweet_indices'].values

df['rephrase_indices'] = df['rephrase_tokens'].apply(lambda x: np.array([word_to_id[i] for i in x]))
rephrase_indices = df['rephrase_indices'].values


In [14]:
embeddings = load_embedding_weights(vocabulary, 50, 'glove', "/mnt/d/Downloads")

In [15]:
train_sentences, train_rephrases, \
    test_sentences, test_rephrases = train_test_split(sentence_indices, rephrase_indices, test_size=0.2, random_state=42)

In [16]:
input_sentences, input_rephrases, next_words = create_train_data(train_sentences, train_rephrases)

In [17]:
padded_sentences = pad_sequences(input_sentences, maxlen=10)
padded_rephrases = pad_sequences(input_rephrases, maxlen=10)

In [18]:
label_binarizer = LabelBinarizer()
label_binarizer.fit(list(word_to_id.values()))
next_words = label_binarizer.transform(next_words)

# Create Model

In [19]:
import wandb
from wandb.keras import WandbCallback

In [33]:
model = create_model(10, len(vocabulary), 50, embeddings, name='lstm_seq2seq')

model.fit([np.array(padded_sentences), np.array(padded_rephrases)],
              np.array(next_words),
              batch_size=64, epochs=10, verbose=2)

  super(Adam, self).__init__(name, **kwargs)


Epoch 1/10
63/63 - 5s - loss: 7.5857 - accuracy: 0.0387 - 5s/epoch - 73ms/step
Epoch 2/10
63/63 - 2s - loss: 6.4355 - accuracy: 0.0434 - 2s/epoch - 27ms/step
Epoch 3/10
63/63 - 2s - loss: 6.3038 - accuracy: 0.0434 - 2s/epoch - 27ms/step
Epoch 4/10
63/63 - 2s - loss: 6.2700 - accuracy: 0.0417 - 2s/epoch - 28ms/step
Epoch 5/10
63/63 - 2s - loss: 6.2546 - accuracy: 0.0434 - 2s/epoch - 27ms/step
Epoch 6/10
63/63 - 2s - loss: 6.2465 - accuracy: 0.0434 - 2s/epoch - 27ms/step
Epoch 7/10
63/63 - 2s - loss: 6.2385 - accuracy: 0.0434 - 2s/epoch - 28ms/step
Epoch 8/10
63/63 - 2s - loss: 6.2361 - accuracy: 0.0404 - 2s/epoch - 26ms/step
Epoch 9/10
63/63 - 2s - loss: 6.2304 - accuracy: 0.0434 - 2s/epoch - 26ms/step
Epoch 10/10
63/63 - 2s - loss: 6.2255 - accuracy: 0.0434 - 2s/epoch - 27ms/step


<keras.callbacks.History at 0x7ffa441a7640>

In [35]:
model.save_weights('EncoderDecoder.h5')
