In [93]:
from uuid import uuid4
import pandas as pd
import numpy as np
import wandb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model

from scripts.utils import load_data
from scripts.utils import nlp_pipeline
from scripts.utils import create_vocabulary
from scripts.loader import load_embeddings


# Utils

In [7]:
def append_start_end(data):
    data['text_tokens'] = data['text_tokens'].apply(lambda x: np.concatenate((['<START>'], x, ['</END>'])))
    data['summary_tokens'] = data['summary_tokens'].apply(lambda x: np.concatenate((['<START>'], x, ['</END>'])))

In [40]:
def create_train_data(texts, summaries):
    input_texts, input_summaries, next_words = [], [], []

    for sentence, rephrase in zip(texts, summaries):
        for i in range(1, len(rephrase)):
            input_texts.append(sentence)
            input_summaries.append(rephrase[:i])
            next_words.append(rephrase[i])

    return input_texts, input_summaries, next_words

# Data Preprocessing

In [55]:
run = wandb.init(project="[NLP] lab-05 | text-summarization", job_type="load_data")

[34m[1mwandb[0m: wandb version 0.12.7 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
2021-12-09 07:25:41.276551: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-09 07:25:41.276598: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [64]:
df = load_data()
df.head()

Unnamed: 0,id,text,summary
0,accuracy_garmin_nuvi_255W_gps,", and is very, very accurate .\r\n but for the...",This unit is generally quite accurate. \r\nSe...
1,bathroom_bestwestern_hotel_sfo,"The room was not overly big, but clean and ve...",The rooms were not large but were clean and ve...
2,battery-life_amazon_kindle,After I plugged it in to my USB hub on my com...,Battery life is exceptional.\r\nThe Kindle can...
3,battery-life_ipod_nano_8gb,short battery life I moved up from an 8gb .\...,The battery life is too short.\r\nThe time bet...
4,battery-life_netbook_1005ha,"6GHz 533FSB cpu, glossy display, 3, Cell 23Wh ...",The battery life is longer then 5 hours.\r\nBu...


Upload raw data as artifact to WANDB

In [63]:
raw_data = wandb.Artifact(
    "opinosis-raw", type="dataset",
    description="Raw OPINOSIS dataset",
    metadata={"source": "https://archive.ics.uci.edu/ml/datasets/Opinosis+Opinion+%26frasl%3B+Review",
                "sizes": len(df)}
)

complete_data = wandb.Table(data=df, columns=df.columns)
raw_data.add(complete_data, "Complete dataset")
run.log_artifact(raw_data)

<wandb.sdk.wandb_artifacts.Artifact at 0x7f0eb28433a0>

## Tokenization


In [65]:
df['text_tokens'] = df['text'].apply(lambda x: nlp_pipeline(x))
df['summary_tokens'] = df['summary'].apply(lambda x: nlp_pipeline(x))
df.head()


Unnamed: 0,id,text,summary,text_tokens,summary_tokens
0,accuracy_garmin_nuvi_255W_gps,", and is very, very accurate .\r\n but for the...",This unit is generally quite accurate. \r\nSe...,"[accurate, part, find, garmin, software, provi...","[unit, generally, quite, accurate, set-up, usa..."
1,bathroom_bestwestern_hotel_sfo,"The room was not overly big, but clean and ve...",The rooms were not large but were clean and ve...,"[room, overly, big, clean, comfortable, beds, ...","[rooms, large, clean, comfortable, bathroom, s..."
2,battery-life_amazon_kindle,After I plugged it in to my USB hub on my com...,Battery life is exceptional.\r\nThe Kindle can...,"[plugged, usb, hub, computer, charge, battery,...","[battery, life, exceptional, kindle, run, days..."
3,battery-life_ipod_nano_8gb,short battery life I moved up from an 8gb .\...,The battery life is too short.\r\nThe time bet...,"[short, battery, life, moved, 8gb, love, ipod,...","[battery, life, short, time, chargers, enough]"
4,battery-life_netbook_1005ha,"6GHz 533FSB cpu, glossy display, 3, Cell 23Wh ...",The battery life is longer then 5 hours.\r\nBu...,"[6ghz, 533fsb, cpu, glossy, display, 3, cell, ...","[battery, life, longer, 5, hours, due, battery..."


## START/END Tokens

In [66]:
append_start_end(df)
df.head()

Unnamed: 0,id,text,summary,text_tokens,summary_tokens
0,accuracy_garmin_nuvi_255W_gps,", and is very, very accurate .\r\n but for the...",This unit is generally quite accurate. \r\nSe...,"[<START>, accurate, part, find, garmin, softwa...","[<START>, unit, generally, quite, accurate, se..."
1,bathroom_bestwestern_hotel_sfo,"The room was not overly big, but clean and ve...",The rooms were not large but were clean and ve...,"[<START>, room, overly, big, clean, comfortabl...","[<START>, rooms, large, clean, comfortable, ba..."
2,battery-life_amazon_kindle,After I plugged it in to my USB hub on my com...,Battery life is exceptional.\r\nThe Kindle can...,"[<START>, plugged, usb, hub, computer, charge,...","[<START>, battery, life, exceptional, kindle, ..."
3,battery-life_ipod_nano_8gb,short battery life I moved up from an 8gb .\...,The battery life is too short.\r\nThe time bet...,"[<START>, short, battery, life, moved, 8gb, lo...","[<START>, battery, life, short, time, chargers..."
4,battery-life_netbook_1005ha,"6GHz 533FSB cpu, glossy display, 3, Cell 23Wh ...",The battery life is longer then 5 hours.\r\nBu...,"[<START>, 6ghz, 533fsb, cpu, glossy, display, ...","[<START>, battery, life, longer, 5, hours, due..."


## Create Vocabulary and Embeddings

In [67]:
texts = df['text_tokens'].values
summaries = df['summary_tokens'].values

In [68]:
vocabulary, word_to_id, id_to_word = create_vocabulary(np.concatenate((texts, summaries)))

In [69]:
df['text_indices'] = df['text_tokens'].apply(lambda tokens: np.array([word_to_id[word] for word in tokens]))
df['summary_indices'] = df['summary_tokens'].apply(lambda tokens: np.array([word_to_id[word] for word in tokens]))

text_indices = df['text_indices'].values
summary_indices = df['summary_indices'].values

df.head()

Unnamed: 0,id,text,summary,text_tokens,summary_tokens,text_indices,summary_indices
0,accuracy_garmin_nuvi_255W_gps,", and is very, very accurate .\r\n but for the...",This unit is generally quite accurate. \r\nSe...,"[<START>, accurate, part, find, garmin, softwa...","[<START>, unit, generally, quite, accurate, se...","[4775, 4944, 6706, 518, 5524, 6488, 2610, 4944...","[4775, 862, 1690, 3226, 4944, 3899, 1463, 6257..."
1,bathroom_bestwestern_hotel_sfo,"The room was not overly big, but clean and ve...",The rooms were not large but were clean and ve...,"[<START>, room, overly, big, clean, comfortabl...","[<START>, rooms, large, clean, comfortable, ba...","[4775, 4483, 6166, 6704, 6028, 6954, 5399, 452...","[4775, 4612, 3498, 6028, 6954, 3937, 4407, 955..."
2,battery-life_amazon_kindle,After I plugged it in to my USB hub on my com...,Battery life is exceptional.\r\nThe Kindle can...,"[<START>, plugged, usb, hub, computer, charge,...","[<START>, battery, life, exceptional, kindle, ...","[4775, 6697, 4520, 3262, 3253, 4189, 2465, 166...","[4775, 2465, 1679, 5061, 2841, 5885, 1630, 364..."
3,battery-life_ipod_nano_8gb,short battery life I moved up from an 8gb .\...,The battery life is too short.\r\nThe time bet...,"[<START>, short, battery, life, moved, 8gb, lo...","[<START>, battery, life, short, time, chargers...","[4775, 3034, 2465, 1679, 1332, 2154, 1036, 154...","[4775, 2465, 1679, 3034, 4748, 5418, 1943, 4943]"
4,battery-life_netbook_1005ha,"6GHz 533FSB cpu, glossy display, 3, Cell 23Wh ...",The battery life is longer then 5 hours.\r\nBu...,"[<START>, 6ghz, 533fsb, cpu, glossy, display, ...","[<START>, battery, life, longer, 5, hours, due...","[4775, 1153, 3811, 16, 903, 6029, 3447, 2620, ...","[4775, 2465, 1679, 7166, 6371, 3342, 3486, 246..."


Upload pre-processed data as artifact to WANDB

In [77]:
pre_processed_data = wandb.Artifact(
    "opinosis-preprocessed", type="dataset",
    description="Preprocessed OPINOSIS dataset",
    metadata={"sizes": len(df), "pipeline": ["tokenization", "indexing", "start/end tokens"]}
)

pre_processed_dataframe = wandb.Table(data=df, columns=df.columns, allow_mixed_types=True)
pre_processed_data.add(pre_processed_dataframe, "Preprocessed dataset")
run.log_artifact(pre_processed_data)

<wandb.sdk.wandb_artifacts.Artifact at 0x7f0e7383cc40>

In [78]:
embeddings = load_embeddings(vocabulary,embedding_size=50, embedding_type='glove', dump_path='./data')

Loading embedding matrix from file
Loading embedding matrix from file


2021-12-09 07:38:04,182 [INFO] Loading embedding matrix from file


# Create Train Test Data

In [79]:
train_texts, train_summaries, test_texts, test_summaries = train_test_split(text_indices, summary_indices, test_size=0.1)
input_texts, input_summaries, next_words = create_train_data(train_texts, train_summaries)

In [83]:
max_texts_length = max([len(text) for text in input_texts])
max_summaries_length = max([len(summary) for summary in input_summaries])

In [85]:
padded_texts = pad_sequences(input_texts, maxlen=max_texts_length)
padded_summaries = pad_sequences(input_summaries, maxlen=max_summaries_length)

In [88]:
label_binarizer = LabelBinarizer()
label_binarizer.fit(list(word_to_id.values()))
next_words = label_binarizer.transform(next_words)

# Create Model

In [96]:
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import categorical_crossentropy

In [98]:
def create_model(texts_size, summaries_size, vocabulary_size, embedding_size, embeddings=None, name=None) -> Model:
    name = f"Encoder-Decoder-{str(uuid4())}" if name is None else name

    # Encoder
    encoder_inputs = Input(shape=(texts_size,), name="encoder_inputs")
    encoder_embedding = Embedding(input_dim=vocabulary_size, output_dim=embedding_size,
                                   weights=[embeddings],
                                   trainable=False)(encoder_inputs)
    
    encoder = LSTM(128, return_state=True, name="encoder")
    encoder(encoder_embedding)
    
    _, state_h, state_c = encoder(encoder_embedding)
    encoder_states = [state_h, state_c]

    # Decoder
    decoder_inputs = Input(shape=(summaries_size,), name="decoder_inputs")
    decoder_embedding = Embedding(input_dim = vocabulary_size, output_dim=embedding_size,
                                  weights=[embeddings],
                                  trainable=False)(decoder_inputs)

    decoder = LSTM(128, return_state=True, name="decoder")
    decoder_outputs, _, _ = decoder(decoder_embedding, initial_state=encoder_states)

    decoder_outputs = Dense(vocabulary_size, activation='softmax', name='decoder_dense')(decoder_outputs)

    # Compile the model
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    model.compile(optimizer=Adam(lr=0.01), loss=categorical_crossentropy, metrics=['accuracy'])
    model._name = name
    return model

In [100]:
run.finish()

In [101]:
model = create_model(max_texts_length, max_summaries_length, len(vocabulary), 50, embeddings)

  super(Adam, self).__init__(name, **kwargs)


In [103]:
model.fit([np.array(padded_texts), np.array(padded_summaries)],
              np.array(next_words),
              batch_size=64, epochs=5, verbose=1)

Epoch 1/10
  5/139 [>.............................] - ETA: 15:13 - loss: 8.0637 - accuracy: 0.0406

KeyboardInterrupt: 

In [104]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0
