In [1]:
import tensorflow as tf
from tensorflow import keras

import numpy as np

In [2]:
def get_model(vocab_size, emb_dim, max_len=256):
    """Assemble the Conv1D + self-attention binary classifier.

    Parameters
    ----------
    vocab_size : int
        Number of rows of the embedding table.
    emb_dim : int
        Width of each embedding vector.
    max_len : int, default 256
        Fixed length of both model inputs (token ids and boolean mask).

    Returns
    -------
    tuple
        ``(model, embedding_layer)``: the Keras model, which takes
        ``[token_ids, mask]`` and ends in a single sigmoid unit, and the
        Embedding layer instance so that callers can read or overwrite its
        weights.

    Side effects
    ------------
    Prints ``model.summary()``.
    """
    def conv_relu(n_filters):
        # All convolutions in the network share kernel size, activation and padding.
        return keras.layers.Conv1D(filters=n_filters, kernel_size=3, activation='relu', padding='same')

    token_ids = tf.keras.Input(shape=(max_len,), dtype='int32')
    pad_mask = tf.keras.Input(shape=(max_len,), dtype='bool')

    embedding_layer = keras.layers.Embedding(vocab_size, emb_dim, input_length=max_len, mask_zero=True)
    features = embedding_layer(token_ids)

    # Encoder convolution, then attention with the same tensor as query and
    # value and the explicit boolean mask applied to both.
    features = conv_relu(128)(features)
    features = keras.layers.Attention(causal=False)(
        [features, features],
        mask=[pad_mask, pad_mask],
    )

    # First convolutional stage: 64 -> 32 filters, then pooling and dropout.
    for n_filters in (64, 32):
        features = conv_relu(n_filters)(features)
    features = keras.layers.MaxPooling1D(pool_size=2, padding='same')(features)
    features = keras.layers.Dropout(0.05)(features)

    # Second convolutional stage: 16 -> 8 filters, then global pooling and dropout.
    for n_filters in (16, 8):
        features = conv_relu(n_filters)(features)
    features = keras.layers.GlobalAveragePooling1D()(features)
    features = keras.layers.Dropout(0.05)(features)

    outputs = keras.layers.Dense(1, activation='sigmoid')(features)

    model = keras.models.Model(inputs=[token_ids, pad_mask], outputs=outputs)
    model.summary()

    return model, embedding_layer

In [3]:
import pandas as pd
import json

def get_word_index(sentences) :
    """Build a token -> integer id vocabulary from whitespace-separated text.

    Ids 0 and 1 are always reserved for ``"<pad>"`` and ``"<unk>"``. Every
    other distinct token receives the next consecutive id (starting at 2) in
    order of first appearance. A corpus token that is literally ``"<pad>"``
    or ``"<unk>"`` maps to the reserved id instead of taking a new one, so
    ids stay contiguous and ``len(word_index)`` is always ``max id + 1``.

    Parameters
    ----------
    sentences : iterable of str
        Each item is split on whitespace with ``str.split()``.

    Returns
    -------
    tuple
        ``(word_index, decode_review)``: ``word_index`` is the token -> id
        dict; ``decode_review(ids)`` joins the tokens for an iterable of ids
        with single spaces, rendering unknown ids as ``'?'``.
    """
    # A dict keeps insertion order, so it doubles as an ordered set with the
    # reserved tokens pinned to the first two positions.
    ordered_tokens = dict.fromkeys(["<pad>", "<unk>"])
    for txt in sentences :
        for token in txt.split() :
            ordered_tokens.setdefault(token, None)

    word_index = {word : i for i, word in enumerate(ordered_tokens)}

    # Built once here rather than on every decode call; it reflects the
    # vocabulary as it is when this function returns.
    reverse_word_index = {i : word for word, i in word_index.items()}

    def decode_review(text):
        """Turn an iterable of token ids back into a space-joined string."""
        return ' '.join([reverse_word_index.get(i, '?') for i in text])

    return word_index, decode_review

def get_dataset(dataframe, word_index, max_len=256) :
    """Convert a text/label dataframe into fixed-length id, mask and label arrays.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide a ``"text"`` column (whitespace-separated tokens) and a
        ``"class"`` column (numeric score).
    word_index : dict
        Token -> id mapping. Tokens missing from it are replaced by
        ``word_index["<unk>"]``.
    max_len : int, default 256
        Every sequence is truncated or right-padded with 0 to this length.

    Returns
    -------
    tuple
        ``(x, mask, y)``: ``x`` is an int array of shape ``(n_rows, max_len)``
        (also when ``n_rows`` is 0), ``mask`` is the boolean array ``x != 0``,
        and ``y`` is an int array of shape ``(n_rows,)`` holding 1 where the
        class score is ``>= 0.5`` and 0 otherwise.
    """
    texts = list(dataframe["text"])
    labels = list(dataframe["class"])

    # Preallocate the zero-padded matrix so the result is 2-D even when the
    # dataframe has no rows.
    x = np.zeros((len(texts), max_len), dtype=int)
    for row, txt in enumerate(texts) :
        ids = [
            word_index[tkn] if tkn in word_index else word_index["<unk>"]
            for tkn in txt.split()[ : max_len]
        ]
        if ids :
            x[row, : len(ids)] = ids

    # Id 0 is the padding value, so non-zero positions are real tokens.
    mask = x != 0

    y = np.array([1 if lb >= .5 else 0 for lb in labels], dtype=int)

    return x, mask, y

In [4]:
# Load the training split, derive the vocabulary from its text column and
# encode it; the last line displays the shapes of the three resulting arrays.
train_csv_path = "../resources/datasets/StanfordSentimentTreebank/split/SST2Processed2-train.csv"
train_dataframe = pd.read_csv(train_csv_path, index_col=0)

word_index, decode_review = get_word_index(train_dataframe["text"])

x_train, mask_train, y_train = get_dataset(train_dataframe, word_index)
x_train.shape, mask_train.shape, y_train.shape

((8544, 256), (8544, 256), (8544,))

In [5]:
# Display the encoded token ids and the matching non-padding mask of training example 1.
x_train[1], mask_train[1]

(array([ 2,  3, 32, 33, 34, 35,  3, 36, 35,  3, 37, 38,  5, 39, 40, 14, 18,
        41, 35, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 35, 53, 54, 55,
        56,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0, 

In [6]:
# Display the binarised label of training example 1.
y_train[1]

1

In [7]:
# Map the ids of training example 1 back to tokens; padding positions show up as "<pad>".
decode_review(x_train[1])

"<start> the gorgeously elaborate continuation of the lord of the rings trilogy is so huge that a column of words can not adequately describe co writerdirector peter jackson's expanded vision of jrr tolkien's middle earth <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad

In [8]:
# Load the dev split and encode it with the vocabulary built from the training
# split; the last line displays the shapes of the three resulting arrays.
val_csv_path = "../resources/datasets/StanfordSentimentTreebank/split/SST2Processed2-dev.csv"
val_dataframe = pd.read_csv(val_csv_path, index_col=0)

x_val, mask_val, y_val = get_dataset(val_dataframe, word_index)
x_val.shape, mask_val.shape, y_val.shape

((1101, 256), (1101, 256), (1101,))

In [9]:
# Vocabulary size, including the reserved "<pad>" and "<unk>" entries.
len(word_index)

16177

In [10]:
# Embedding width; it is also interpolated into the embedding CSV path and the
# checkpoint filename used by later cells.
embed_dim = 768
model, emb_layer = get_model(vocab_size=len(word_index), emb_dim=embed_dim)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 256)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 256, 300)     4853100     ['input_1[0][0]']                
                                                                                                  
 conv1d (Conv1D)                (None, 256, 128)     115328      ['embedding[0][0]']              
                                                                                                  
 input_2 (InputLayer)           [(None, 256)]        0           []                               
                                                                                              

In [11]:
# Overwrite the embedding layer's weights with vectors read from a CSV whose
# filename carries the embedding width; the first CSV column is used as the index.
embeddings_df = pd.read_csv(f"../resources/embeddings/StanfordSentimentTreebank/SST2Processed2-train_dim{embed_dim}.csv", index_col=0)
emb_wgts = emb_layer.get_weights()
# CSV row 0 is written to embedding row 0, the id that word_index gives to "<pad>".
emb_wgts[0][0] = embeddings_df.iloc[0].values
# The remaining CSV rows are written to the LAST (n_csv_rows - 1) rows of the
# matrix; any rows between index 1 and that tail keep their current values.
# NOTE(review): presumably the CSV has no "<unk>" row, so id 1 keeps its
# initial weights -- confirm against whatever produced the CSV.
# NOTE(review): assumes 2 <= n_csv_rows <= vocabulary size; otherwise the slice
# and the right-hand side no longer have matching shapes.
emb_wgts[0][1 - embeddings_df.shape[0] : ] = embeddings_df.values[1:]
emb_layer.set_weights(emb_wgts)

In [12]:
# Train with Adam on binary cross-entropy, validating on the dev split after
# every epoch.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Writes weights only, and only when the monitored value improves.
best_weights_checkpoint = keras.callbacks.ModelCheckpoint(
    f"../resources/output/SST2Processed2/checkpoint_cnn1d_embed{embed_dim}.h5",
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

history = model.fit(
    [x_train, mask_train],
    y_train,
    epochs=50,
    batch_size=64,
    validation_data=([x_val, mask_val], y_val),
    callbacks=[best_weights_checkpoint],
    verbose=1,
)

Epoch 1/50
Epoch 1: val_loss improved from inf to 0.68875, saving model to ../resources/output/SST2Processed2\checkpoint_cnn1d.h5
Epoch 2/50
Epoch 2: val_loss improved from 0.68875 to 0.66507, saving model to ../resources/output/SST2Processed2\checkpoint_cnn1d.h5
Epoch 3/50
Epoch 3: val_loss improved from 0.66507 to 0.64307, saving model to ../resources/output/SST2Processed2\checkpoint_cnn1d.h5
Epoch 4/50
Epoch 4: val_loss improved from 0.64307 to 0.63397, saving model to ../resources/output/SST2Processed2\checkpoint_cnn1d.h5
Epoch 5/50
Epoch 5: val_loss did not improve from 0.63397
Epoch 6/50
Epoch 6: val_loss did not improve from 0.63397
Epoch 7/50
Epoch 7: val_loss did not improve from 0.63397
Epoch 8/50
Epoch 8: val_loss did not improve from 0.63397
Epoch 9/50
Epoch 9: val_loss did not improve from 0.63397
Epoch 10/50
Epoch 10: val_loss did not improve from 0.63397
Epoch 11/50
Epoch 11: val_loss did not improve from 0.63397
Epoch 12/50
Epoch 12: val_loss did not improve from 0.6339