In [1]:
import numpy as np
import wandb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from scripts.utils import load_data
from scripts.utils import nlp_pipeline
from scripts.utils import create_vocabulary
from scripts.loader import load_embeddings

2021-12-10 23:31:19.467182: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-10 23:31:19.467225: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


# Utils

In [2]:
def append_start_end(data):
    data['text_tokens'] = data['text_tokens'].apply(lambda x: np.concatenate((['<START>'], x, ['</END>'])))
    data['summary_tokens'] = data['summary_tokens'].apply(lambda x: np.concatenate((['<START>'], x, ['</END>'])))

In [3]:
def create_train_data(texts, summaries):
    input_texts, input_summaries, next_words = [], [], []

    for sentence, rephrase in zip(texts, summaries):
        for i in range(1, len(rephrase)):
            input_texts.append(sentence)
            input_summaries.append(rephrase[:i])
            next_words.append(rephrase[i])

    return input_texts, input_summaries, next_words

# Data Preprocessing

In [55]:
run = wandb.init(project="[NLP] lab-05 | text-summarization", job_type="load_data")

[34m[1mwandb[0m: wandb version 0.12.7 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
2021-12-09 07:25:41.276551: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-09 07:25:41.276598: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [4]:
df = load_data()
df.head()

Unnamed: 0,id,text,summary
0,accuracy_garmin_nuvi_255W_gps,", and is very, very accurate .\r\n but for the...",This unit is generally quite accurate. \r\nSe...
1,bathroom_bestwestern_hotel_sfo,"The room was not overly big, but clean and ve...",The rooms were not large but were clean and ve...
2,battery-life_amazon_kindle,After I plugged it in to my USB hub on my com...,Battery life is exceptional.\r\nThe Kindle can...
3,battery-life_ipod_nano_8gb,short battery life I moved up from an 8gb .\...,The battery life is too short.\r\nThe time bet...
4,battery-life_netbook_1005ha,"6GHz 533FSB cpu, glossy display, 3, Cell 23Wh ...",The battery life is longer then 5 hours.\r\nBu...


In [25]:
df.to_csv('data/data.csv', index=False)

Upload raw data as artifact to WANDB

In [63]:
raw_data = wandb.Artifact(
    "opinosis-raw", type="dataset",
    description="Raw OPINOSIS dataset",
    metadata={"source": "https://archive.ics.uci.edu/ml/datasets/Opinosis+Opinion+%26frasl%3B+Review",
                "sizes": len(df)}
)

complete_data = wandb.Table(data=df, columns=df.columns)
raw_data.add(complete_data, "Complete dataset")
run.log_artifact(raw_data)

<wandb.sdk.wandb_artifacts.Artifact at 0x7f0eb28433a0>

## Tokenization


In [5]:
df['text_tokens'] = df['text'].apply(lambda x: nlp_pipeline(x))
df['summary_tokens'] = df['summary'].apply(lambda x: nlp_pipeline(x))
df.head()

Unnamed: 0,id,text,summary,text_tokens,summary_tokens
0,accuracy_garmin_nuvi_255W_gps,", and is very, very accurate .\r\n but for the...",This unit is generally quite accurate. \r\nSe...,"[accurate, part, find, garmin, software, provi...","[unit, generally, quite, accurate, set-up, usa..."
1,bathroom_bestwestern_hotel_sfo,"The room was not overly big, but clean and ve...",The rooms were not large but were clean and ve...,"[room, overly, big, clean, comfortable, beds, ...","[rooms, large, clean, comfortable, bathroom, s..."
2,battery-life_amazon_kindle,After I plugged it in to my USB hub on my com...,Battery life is exceptional.\r\nThe Kindle can...,"[plugged, usb, hub, computer, charge, battery,...","[battery, life, exceptional, kindle, run, days..."
3,battery-life_ipod_nano_8gb,short battery life I moved up from an 8gb .\...,The battery life is too short.\r\nThe time bet...,"[short, battery, life, moved, 8gb, love, ipod,...","[battery, life, short, time, chargers, enough]"
4,battery-life_netbook_1005ha,"6GHz 533FSB cpu, glossy display, 3, Cell 23Wh ...",The battery life is longer then 5 hours.\r\nBu...,"[6ghz, 533fsb, cpu, glossy, display, 3, cell, ...","[battery, life, longer, 5, hours, due, battery..."


## START/END Tokens

In [6]:
append_start_end(df)
df.head()

Unnamed: 0,id,text,summary,text_tokens,summary_tokens
0,accuracy_garmin_nuvi_255W_gps,", and is very, very accurate .\r\n but for the...",This unit is generally quite accurate. \r\nSe...,"[<START>, accurate, part, find, garmin, softwa...","[<START>, unit, generally, quite, accurate, se..."
1,bathroom_bestwestern_hotel_sfo,"The room was not overly big, but clean and ve...",The rooms were not large but were clean and ve...,"[<START>, room, overly, big, clean, comfortabl...","[<START>, rooms, large, clean, comfortable, ba..."
2,battery-life_amazon_kindle,After I plugged it in to my USB hub on my com...,Battery life is exceptional.\r\nThe Kindle can...,"[<START>, plugged, usb, hub, computer, charge,...","[<START>, battery, life, exceptional, kindle, ..."
3,battery-life_ipod_nano_8gb,short battery life I moved up from an 8gb .\...,The battery life is too short.\r\nThe time bet...,"[<START>, short, battery, life, moved, 8gb, lo...","[<START>, battery, life, short, time, chargers..."
4,battery-life_netbook_1005ha,"6GHz 533FSB cpu, glossy display, 3, Cell 23Wh ...",The battery life is longer then 5 hours.\r\nBu...,"[<START>, 6ghz, 533fsb, cpu, glossy, display, ...","[<START>, battery, life, longer, 5, hours, due..."


## Create Vocabulary and Embeddings

In [7]:
texts = df['text_tokens'].values
summaries = df['summary_tokens'].values

In [8]:
vocabulary, word_to_id, id_to_word = create_vocabulary(np.concatenate((texts, summaries)))

In [9]:
df['text_indices'] = df['text_tokens'].apply(lambda tokens: np.array([word_to_id[word] for word in tokens]))
df['summary_indices'] = df['summary_tokens'].apply(lambda tokens: np.array([word_to_id[word] for word in tokens]))

text_indices = df['text_indices'].values
summary_indices = df['summary_indices'].values

df.head()

Unnamed: 0,id,text,summary,text_tokens,summary_tokens,text_indices,summary_indices
0,accuracy_garmin_nuvi_255W_gps,", and is very, very accurate .\r\n but for the...",This unit is generally quite accurate. \r\nSe...,"[<START>, accurate, part, find, garmin, softwa...","[<START>, unit, generally, quite, accurate, se...","[440, 2649, 2035, 4156, 3937, 5707, 5747, 2649...","[440, 6079, 1713, 6056, 2649, 6617, 2629, 2610..."
1,bathroom_bestwestern_hotel_sfo,"The room was not overly big, but clean and ve...",The rooms were not large but were clean and ve...,"[<START>, room, overly, big, clean, comfortabl...","[<START>, rooms, large, clean, comfortable, ba...","[440, 984, 2141, 4129, 872, 684, 3121, 3724, 1...","[440, 3524, 1160, 872, 684, 88, 4908, 1176, 17..."
2,battery-life_amazon_kindle,After I plugged it in to my USB hub on my com...,Battery life is exceptional.\r\nThe Kindle can...,"[<START>, plugged, usb, hub, computer, charge,...","[<START>, battery, life, exceptional, kindle, ...","[440, 3183, 4169, 5482, 1609, 127, 3752, 5950,...","[440, 3752, 901, 3319, 3261, 7008, 3344, 727, ..."
3,battery-life_ipod_nano_8gb,short battery life I moved up from an 8gb .\...,The battery life is too short.\r\nThe time bet...,"[<START>, short, battery, life, moved, 8gb, lo...","[<START>, battery, life, short, time, chargers...","[440, 6839, 3752, 901, 3924, 6798, 1625, 2525,...","[440, 3752, 901, 6839, 6001, 1242, 3001, 6289]"
4,battery-life_netbook_1005ha,"6GHz 533FSB cpu, glossy display, 3, Cell 23Wh ...",The battery life is longer then 5 hours.\r\nBu...,"[<START>, 6ghz, 533fsb, cpu, glossy, display, ...","[<START>, battery, life, longer, 5, hours, due...","[440, 320, 3070, 5356, 4886, 3069, 6961, 2506,...","[440, 3752, 901, 154, 5005, 391, 5779, 3752, 6..."


Upload pre-processed data as artifact to WANDB

In [77]:
pre_processed_data = wandb.Artifact(
    "opinosis-preprocessed", type="dataset",
    description="Preprocessed OPINOSIS dataset",
    metadata={"sizes": len(df), "pipeline": ["tokenization", "indexing", "start/end tokens"]}
)

pre_processed_dataframe = wandb.Table(data=df, columns=df.columns, allow_mixed_types=True)
pre_processed_data.add(pre_processed_dataframe, "Preprocessed dataset")
run.log_artifact(pre_processed_data)

<wandb.sdk.wandb_artifacts.Artifact at 0x7f0e7383cc40>

In [10]:
embeddings = load_embeddings(vocabulary,embedding_size=50, embedding_type='glove', dump_path='./data')

# Create Train Test Data

In [11]:
train_texts, train_summaries, test_texts, test_summaries = train_test_split(text_indices, summary_indices, test_size=0.1)
input_texts, input_summaries, next_words = create_train_data(train_texts, train_summaries)

In [12]:
max_texts_length = max([len(text) for text in input_texts])
max_summaries_length = max([len(summary) for summary in input_summaries])

In [13]:
padded_texts = pad_sequences(input_texts, maxlen=max_texts_length)
padded_summaries = pad_sequences(input_summaries, maxlen=max_summaries_length)

In [14]:
label_binarizer = LabelBinarizer()
label_binarizer.fit(list(word_to_id.values()))
next_words = label_binarizer.transform(next_words)

# Create Model

This section serves as demonstration of how to create a model. It is not necessary to create a model to run the experiment. Model training will be executed on my of my GPU servers, so there will be model imports from files.

**Disclaimer:** Skip this section if you are running this notebook on low-end device.

In [17]:
from wandb.keras import WandbCallback

from scripts.model import create_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import categorical_crossentropy

In [20]:
run = wandb.init(reinit=True, name=model.name)
model = create_model(max_texts_length, max_summaries_length, len(vocabulary), 50, embeddings)
model.compile(optimizer=Adam(lr=0.01), loss=categorical_crossentropy, metrics=['accuracy'])
model.summary()

2021-12-10 06:38:21.879507: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


2021-12-10 06:38:23,565 [ERROR] Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


  super(Adam, self).__init__(name, **kwargs)
[34m[1mwandb[0m: Currently logged in as: [33maleksandar1932[0m (use `wandb login --relogin` to force relogin)
  warn("The `IPython.html` package has been deprecated since IPython 4.0. "
[34m[1mwandb[0m: wandb version 0.12.7 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
2021-12-10 06:38:25.563570: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-10 06:38:25.563620: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Model: "Encoder-Decoder-2d1474e3-60b1-43e0-9c8b-50a4b25ef11f"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, 3674)]       0           []                               
                                                                                                  
 decoder_inputs (InputLayer)    [(None, 1619)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 3674, 50)     359400      ['encoder_inputs[0][0]']         
                                                                                                  
 embedding_1 (Embedding)        (None, 1619, 50)     359400      ['decoder_inputs[0][0]']         
                                               

In [21]:
model.fit([np.array(padded_texts), np.array(padded_summaries)],
              np.array(next_words),
              batch_size=64, epochs=15, verbose=1, callbacks=[WandbCallback()])
run.finish()

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
 4/90 [>.............................] - ETA: 7:50 - loss: 0.3752 - accuracy: 0.9570

KeyboardInterrupt: 

In [22]:
model

<keras.engine.functional.Functional at 0x7fd9fc208b20>

In [23]:
run.finish()

0,1
accuracy,▁▁▂▂▃▅▆▇█
epoch,▁▂▃▄▅▅▆▇█
loss,█▇▆▅▄▃▂▁▁

0,1
accuracy,0.87487
epoch,8.0
loss,0.66317


# Evaluate Model on Test Data

For this example, the above model was pre-trained on CUDA enabled hardware, and it's going to be imported from a `/models` directory.

In [16]:
from tensorflow.keras.models import load_model

In [17]:
model = load_model('models/opinosis_model-33f8c698-7ca7-4d28-b897-71b8677185a9.h5')

2021-12-10 23:41:20.065953: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (100)
2021-12-10 23:41:20.066139: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (legion-y540): /proc/driver/nvidia/version does not exist
2021-12-10 23:41:20.066421: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [18]:
model.summary()

Model: "Encoder-Decoder-5853b83a-2923-44aa-8ba1-103074a5a4b3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, 3516)]       0           []                               
                                                                                                  
 decoder_inputs (InputLayer)    [(None, 2450)]       0           []                               
                                                                                                  
 embedding_4 (Embedding)        (None, 3516, 50)     359400      ['encoder_inputs[0][0]']         
                                                                                                  
 embedding_5 (Embedding)        (None, 2450, 50)     359400      ['decoder_inputs[0][0]']         
                                               

In [38]:
from scripts.model import decode


def decode(model, input_sent, word_to_id, padding_size, verbose=False):
    generated_sentence = []
    generated_sentence.append(word_to_id['<START>'])

    for i in range(padding_size):
        output_sent = pad_sequences([generated_sentence], padding_size)
        predictions = model.predict(
            [np.expand_dims(input_sent, axis=0), output_sent])
        next_word = np.argmax(predictions)
        if verbose:
            print(next_word)
        generated_sentence.append(next_word)

    return generated_sentence

In [32]:
padded_texts_test = pad_sequences(test_texts, maxlen=3516)
padded_summaries_test = pad_sequences(test_summaries, maxlen=max_summaries_length)

In [39]:
decode(model, padded_texts_test[0], word_to_id, 3516, True)

KeyError: '5255'