In [1]:
import tensorflow as tf
from tensorflow import keras

import numpy as np

In [2]:
def get_model(vocab_size, emb_dim, max_len=256, n_attention_heads=6, emb_trainable=True) :
    """Build a 1D-CNN binary classifier with an optional self-attention stage.

    Pipeline: Embedding -> Conv1D(128) -> [attention] -> Conv1D(64) -> Conv1D(32)
    -> MaxPooling1D -> Dropout -> Conv1D(16) -> Conv1D(8) -> GlobalAveragePooling1D
    -> Dropout -> Dense(1, sigmoid).

    Args:
        vocab_size: Number of rows of the embedding matrix.
        emb_dim: Width of each embedding vector.
        max_len: Fixed sequence length expected by both model inputs.
        n_attention_heads: 1 applies a single self-attention layer; values > 1
            sum that many self-attention outputs and batch-normalise the sum;
            values < 1 skip the attention stage entirely.
        emb_trainable: Whether the embedding weights are updated during training.

    Returns:
        (model, embedding_layer): the uncompiled Keras model taking
        ``[token_ids, boolean_mask]`` as inputs, and the embedding layer itself
        so that callers can overwrite its weights.

    Side effects:
        Prints ``model.summary()``.
    """
    inputs      = tf.keras.Input(shape=(max_len,), dtype='int32')
    inputs_mask = tf.keras.Input(shape=(max_len,), dtype='bool')

    embedding_layer = keras.layers.Embedding(vocab_size, emb_dim, input_length=max_len, mask_zero=True, trainable=emb_trainable)

    last = embedding_layer(inputs)
    last = keras.layers.Conv1D(filters=128, kernel_size=3, activation='relu', padding='same')(last)

    def self_attention(features) :
        # Self-attention: the same tensor is used as query and value, and the
        # explicit boolean input mask is applied to both.
        return keras.layers.Attention(causal=False)(
            [features, features],
            mask=[inputs_mask, inputs_mask]
        )

    if n_attention_heads == 1 :
        last = self_attention(last)
    elif n_attention_heads > 1 :
        # Fix: list.append takes no keyword arguments, so the former
        # `attentions.append(attention = ...)` raised TypeError on this branch.
        # NOTE(review): keras.layers.Attention has no trainable weights by default,
        # so these heads appear to compute identical outputs - confirm whether
        # per-head projections were intended.
        attentions = [self_attention(last) for _ in range(n_attention_heads)]
        last = keras.layers.Add()(attentions)
        last = keras.layers.BatchNormalization()(last)
    # n_attention_heads < 1: no attention stage, the convolution stack follows directly.

    last = keras.layers.Conv1D(filters=64, kernel_size=3, activation='relu', padding='same')(last)
    last = keras.layers.Conv1D(filters=32, kernel_size=3, activation='relu', padding='same')(last)
    last = keras.layers.MaxPooling1D(pool_size=2, padding='same')(last)
    last = keras.layers.Dropout(0.05)(last)

    last = keras.layers.Conv1D(filters=16, kernel_size=3, activation='relu', padding='same')(last)
    last = keras.layers.Conv1D(filters=8,  kernel_size=3, activation='relu', padding='same')(last)
    last = keras.layers.GlobalAveragePooling1D()(last)
    last = keras.layers.Dropout(0.05)(last)

    outputs = keras.layers.Dense(1, activation='sigmoid')(last)

    model = keras.models.Model(inputs=[inputs, inputs_mask], outputs=outputs)

    model.summary()

    return model, embedding_layer

In [3]:
import pandas as pd
import json

def get_word_index(sentences) :
    """Build a token -> integer id vocabulary from whitespace-tokenised sentences.

    Ids 0 and 1 are always reserved for "<pad>" and "<unk>"; every other token
    receives the next free id in order of first appearance.

    Args:
        sentences: Iterable of strings; each one is split on whitespace.

    Returns:
        (word_index, decode_review): the vocabulary dict, and a function that
        turns an iterable of ids back into a space-joined string (ids missing
        from the vocabulary are rendered as '?').
    """
    reserved = ["<pad>", "<unk>"]
    corpus_tokens = [tkn for txt in sentences for tkn in txt.split()]

    # dict.fromkeys de-duplicates while keeping first-seen order. Listing the
    # reserved tokens first guarantees they keep ids 0 and 1 even when the corpus
    # itself contains a literal "<pad>" or "<unk>" (previously such a token was
    # re-assigned a later id, leaving id 0 / 1 unmapped).
    vocabulary = dict.fromkeys(reserved + corpus_tokens)
    word_index = {word : i for i, word in enumerate(vocabulary)}

    def decode_review(text):
        """Map a sequence of ids back to tokens joined by single spaces."""
        # Rebuilt on each call so that later changes to word_index are reflected.
        reverse_word_index = {idx : word for word, idx in word_index.items()}
        return ' '.join(reverse_word_index.get(i, '?') for i in text)

    return word_index, decode_review

def get_dataset(dataframe, word_index, max_len=256) :
    """Encode a dataframe of texts and scores into fixed-length model inputs.

    Args:
        dataframe: Frame with a "text" column (whitespace-tokenisable strings)
            and a "class" column (numeric score).
        word_index: Mapping token -> integer id. Tokens missing from it are
            encoded with ``word_index["<unk>"]``.
        max_len: Every sequence is truncated or right-padded with 0 to this length.

    Returns:
        (index, x, mask, y) where
        index is the dataframe index as a list,
        x is an int array of shape (n_rows, max_len) holding token ids,
        mask is a boolean array of the same shape, True where x != 0,
        y is an int array of shape (n_rows,): 1 where "class" >= 0.5, else 0.

    Raises:
        KeyError: if a column is missing, or an unknown token is met and
            word_index has no "<unk>" entry.
    """
    index = list(dataframe.index)

    def encode(text) :
        # The "<unk>" id is only looked up when an unknown token is actually seen.
        ids = [
            word_index[tkn] if tkn in word_index else word_index["<unk>"]
            for tkn in text.split()
        ]
        ids = ids[ : max_len]
        return ids + [0] * (max_len - len(ids))

    x_seq = [encode(txt) for txt in dataframe["text"]]
    y_int = [1 if lb >= .5 else 0 for lb in dataframe["class"]]

    # The explicit reshape keeps the (n_rows, max_len) contract for an empty
    # dataframe too; np.array([]) alone would produce shape (0,).
    x = np.array(x_seq, dtype=int).reshape(len(x_seq), max_len)
    mask = x != 0

    y = np.array(y_int, dtype=int)

    return index, x, mask, y

In [4]:
# Load the training split; the vocabulary is built from its "text" column only,
# and the same word_index is reused to encode the other splits.
train_dataframe = pd.read_csv("../resources/datasets/StanfordSentimentTreebank/split/SST2Processed2-train.csv", index_col=0)
word_index, decode_review = get_word_index(train_dataframe["text"])
ids_train, x_train, mask_train, y_train = get_dataset(train_dataframe, word_index)
# Sanity check: number of examples and array shapes.
len(ids_train), x_train.shape, mask_train.shape, y_train.shape

(8544, (8544, 256), (8544, 256), (8544,))

In [5]:
# Inspect one encoded training example and its padding mask.
x_train[1], mask_train[1]

(array([ 2,  3, 32, 33, 34, 35,  3, 36, 35,  3, 37, 38,  5, 39, 40, 14, 18,
        41, 35, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 35, 53, 54, 55,
        56,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0, 

In [6]:
# Binarised label of the same example.
y_train[1]

1

In [7]:
# Round-trip check: decode the ids back to text (padding shows up as "<pad>").
decode_review(x_train[1])

"<start> the gorgeously elaborate continuation of the lord of the rings trilogy is so huge that a column of words can not adequately describe co writerdirector peter jackson's expanded vision of jrr tolkien's middle earth <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad

In [8]:
# Load the dev split and encode it with the training vocabulary
# (get_dataset maps tokens missing from word_index to "<unk>").
val_dataframe = pd.read_csv("../resources/datasets/StanfordSentimentTreebank/split/SST2Processed2-dev.csv", index_col=0)
ids_val, x_val, mask_val, y_val = get_dataset(val_dataframe, word_index)
# Sanity check: number of examples and array shapes.
len(ids_val), x_val.shape, mask_val.shape, y_val.shape

(1101, (1101, 256), (1101, 256), (1101,))

In [9]:
# Vocabulary size, including the reserved "<pad>" and "<unk>" entries.
len(word_index)

16177

In [10]:
# Experiment configuration: embedding width, pretrained vectors, attention setup, output path.
embed_dim = 768
embeddings_df = pd.read_csv(f"../resources/embeddings/StanfordSentimentTreebank/SST2Processed2-train_dim{embed_dim}.csv", index_col=0)
n_attention_heads = 1
emb_trainable = True
# NOTE(review): the variable name is misspelled ("experimet"), and the label is
# hard-coded separately from n_attention_heads, so the two must be kept in sync by hand.
experimet_name = "cnn1d_1ATTENTION"
# Despite its name, this is the path of a single .hdf5 checkpoint file, not a directory.
checkpoint_dir = f"../resources/output/SST2Processed2/checkpoint_{experimet_name}_embed{embed_dim}.hdf5"

# Build the model (get_model also prints its summary) and keep a handle on the embedding layer.
model, emb_layer = get_model(len(word_index), embed_dim, n_attention_heads=n_attention_heads, emb_trainable=emb_trainable)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 256)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 256, 768)     12423936    ['input_1[0][0]']                
                                                                                                  
 conv1d (Conv1D)                (None, 256, 128)     295040      ['embedding[0][0]']              
                                                                                                  
 conv1d_1 (Conv1D)              (None, 256, 64)      24640       ['conv1d[0][0]']                 
                                                                                              

In [11]:
# Overwrite part of the embedding matrix with the vectors read from embeddings_df.
emb_wgts = emb_layer.get_weights()
# Row 0 of the CSV goes to id 0 - presumably the "<pad>" vector; TODO confirm against the CSV layout.
emb_wgts[0][0] = embeddings_df.iloc[0].values
# The remaining CSV rows are written into the LAST (n_rows - 1) rows of the matrix
# (the slice start 1 - n_rows is negative). Any rows between id 0 and that start keep
# their initial values.
# NOTE(review): this assumes the CSV row order matches the word_index ids - verify.
emb_wgts[0][1 - embeddings_df.shape[0] : ] = embeddings_df.values[1:]
emb_layer.set_weights(emb_wgts)

In [12]:
# Compile and train. The best model (lowest validation loss) is written to
# checkpoint_dir; training stops once validation loss has not improved by at
# least min_delta for `patience` consecutive epochs.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
history = model.fit([x_train, mask_train],
                    y_train,
                    epochs=100,
                    batch_size=64,
                    validation_data=([x_val, mask_val], y_val),
                    callbacks=[
                        keras.callbacks.ModelCheckpoint(
                            checkpoint_dir,
                            # Explicit, same as the Keras default.
                            monitor='val_loss',
                            save_best_only=True,
                            save_weights_only=False,
                            verbose=1
                        ),
                        keras.callbacks.EarlyStopping(
                            # Fix: this used to monitor training 'accuracy', which keeps
                            # rising while the model overfits, so the run did not stop
                            # when validation loss stalled. Monitor the same quantity
                            # as the checkpoint instead.
                            monitor='val_loss',
                            min_delta=.001,
                            patience=30
                        )
                    ],
                    verbose=1)

Epoch 1/100
Epoch 1: val_loss improved from inf to 0.68129, saving model to ../resources/output/SST2Processed2\checkpoint_cnn1d_NO_ATTENTION_embed768.hdf5
Epoch 2/100
Epoch 2: val_loss improved from 0.68129 to 0.57933, saving model to ../resources/output/SST2Processed2\checkpoint_cnn1d_NO_ATTENTION_embed768.hdf5
Epoch 3/100
Epoch 3: val_loss improved from 0.57933 to 0.54289, saving model to ../resources/output/SST2Processed2\checkpoint_cnn1d_NO_ATTENTION_embed768.hdf5
Epoch 4/100
Epoch 4: val_loss did not improve from 0.54289
Epoch 5/100
Epoch 5: val_loss did not improve from 0.54289
Epoch 6/100
Epoch 6: val_loss did not improve from 0.54289
Epoch 7/100
Epoch 7: val_loss did not improve from 0.54289
Epoch 8/100
Epoch 8: val_loss did not improve from 0.54289
Epoch 9/100
Epoch 9: val_loss did not improve from 0.54289
Epoch 10/100
Epoch 10: val_loss did not improve from 0.54289
Epoch 11/100
Epoch 11: val_loss did not improve from 0.54289
Epoch 12/100
Epoch 12: val_loss did not improve fro

In [13]:
# Restore the weights from the checkpoint file written during training
# (the ModelCheckpoint callback was configured with save_best_only=True).
model.load_weights(checkpoint_dir)

In [14]:
# Load the test split and encode it with the training vocabulary.
test_dataframe = pd.read_csv("../resources/datasets/StanfordSentimentTreebank/split/SST2Processed2-test.csv", index_col=0)
ids_test, x_test, mask_test, y_test = get_dataset(test_dataframe, word_index)
# Sanity check: number of examples and array shapes.
len(ids_test), x_test.shape, mask_test.shape, y_test.shape

(2210, (2210, 256), (2210, 256), (2210,))

In [15]:
# Predict on the test split; the model ends in Dense(1, sigmoid), so there is one score per example.
preds = model.predict([x_test, mask_test])
preds.shape



(2210, 1)

In [16]:
# Wrap the predictions in a frame keyed by the test-set ids so they can be aligned with the source rows.
preds_df = pd.DataFrame(preds, index=ids_test, columns=["predictions"])
preds_df

Unnamed: 0,predictions
3,0.180711
4,0.723724
5,0.840086
6,0.768156
7,0.933851
...,...
11621,0.630594
11623,0.943358
11626,0.848387
11628,0.171943


In [17]:
# Column-wise concat aligns on the index, attaching each prediction to its test row.
dataset_with_preds_df = pd.concat([test_dataframe, preds_df], axis=1)
dataset_with_preds_df

Unnamed: 0,text,phrase_id,class,predictions
3,<start> effective but too tepid biopic,13995,0.513890,0.180711
4,<start> if you sometimes like to go to the mov...,14123,0.736110,0.723724
5,<start> emerges as something rare an issue mov...,13999,0.861110,0.840086
6,<start> the film provides some great insight i...,14498,0.597220,0.768156
7,<start> offers that rare combination of entert...,14351,0.833330,0.933851
...,...,...,...,...
11621,<start> an imaginative comedythriller,13851,0.777780,0.630594
11623,<start> a rare beautiful film,18182,0.916670,0.943358
11626,<start> an hilarious romantic comedy,23211,0.888890,0.848387
11628,<start> never sinks into exploitation,26177,0.625000,0.171943


In [18]:
# Accuracy at a 0.5 threshold: a row counts as a hit when the thresholded
# prediction agrees with the thresholded gold score.
predicted_positive = dataset_with_preds_df["predictions"] >= .5
labelled_positive  = dataset_with_preds_df["class"]       >= .5
hits  = int((predicted_positive == labelled_positive).sum())
total = len(dataset_with_preds_df)
accuracy = hits / total
accuracy

0.7552036199095022

In [19]:
# Mean absolute deviation between the gold score and the predicted probability.
# `hits` holds the summed absolute differences here (not a count).
hits = sum(
    abs(label - pred)
    for label, pred in zip(dataset_with_preds_df["class"], dataset_with_preds_df["predictions"])
)
total = len(dataset_with_preds_df)
avg_dev = hits / total
avg_dev

0.19856765723799943

In [20]:
# Confusion-matrix counts at a 0.5 threshold on both prediction and gold score.
predicted_positive = dataset_with_preds_df["predictions"] >= .5
labelled_positive  = dataset_with_preds_df["class"]       >= .5

# Cast to built-in int so the downstream ratios stay plain Python floats.
true_positives  = int(( predicted_positive &  labelled_positive).sum())
true_negatives  = int((~predicted_positive & ~labelled_positive).sum())
false_positives = int(( predicted_positive & ~labelled_positive).sum())
false_negatives = int((~predicted_positive &  labelled_positive).sum())
print(true_positives, true_negatives, false_positives, false_negatives)

851 818 281 260


In [21]:
# Precision = TP / (TP + FP). Raises ZeroDivisionError if nothing was predicted positive.
precision = true_positives / (true_positives + false_positives)
precision

0.7517667844522968

In [22]:
# Recall = TP / (TP + FN). Raises ZeroDivisionError if there are no positive labels.
recall = true_positives / (true_positives + false_negatives)
recall

0.765976597659766

In [23]:
# F1 = harmonic mean of precision and recall.
f1 = (2 * precision * recall) / (precision + recall)
f1

0.7588051716451182

In [24]:
# Collect the test metrics, keyed together with the checkpoint path they were computed for.
eval_dict = {
    'model'     : checkpoint_dir,
    'accuracy'  : accuracy,
    'precision' : precision,
    'recall'    : recall,
    'f1-score'  : f1
}

In [25]:
# Persist the metrics as JSON next to the checkpoint file they describe.
eval_path = f"{eval_dict['model']}-eval-dict.json"
with open(eval_path, "w") as out_file :
    json.dump(eval_dict, out_file)