In [1]:
import tensorflow as tf
from tensorflow import keras

import numpy as np

In [11]:
def get_attention(inputs, mask, n_heads, causal) :
    if n_heads == 1 :
        attention = keras.layers.Attention(causal=causal)(
            [
            inputs, 
            inputs
            ],
            mask=[ 
            mask, 
            mask
            ]
        )
    elif n_heads > 1 :
        attentions = []
        for _ in range(n_heads) :
            attentions.append(
                keras.layers.Attention(causal=causal)(
                    [
                        inputs, 
                        inputs
                    ],
                    mask=[ 
                        mask, 
                        mask
                    ]
                )
            )
        attention = keras.layers.Add()(attentions)
        attention = keras.layers.BatchNormalization()(attention)
    else :
        return inputs
    return attention

def pooling_on_mask(inputs_mask:tf.Tensor) :
    inputs_mask = tf.cast(inputs_mask, tf.int8)
    shape = inputs_mask.get_shape().as_list()
    inputs_mask = keras.layers.Reshape((shape[1], 1))(inputs_mask)
    inputs_mask = keras.layers.MaxPooling1D(pool_size=2, padding='same')(inputs_mask)
    shape = inputs_mask.get_shape().as_list()
    inputs_mask = keras.layers.Reshape((shape[1],))(inputs_mask)
    inputs_mask = tf.cast(inputs_mask, tf.bool)
    return inputs_mask

def get_model(vocab_size, 
              emb_dim, 
              max_len=256, 
              n_attention_heads=6, 
              n_end_attention_heads=0, 
              emb_trainable=True, 
              causal=False, 
              with_pooling=False,
              residual_connections=False,
              global_avg_pool=True) :
    inputs      = tf.keras.Input(shape=(max_len,), dtype='int32')
    inputs_mask = tf.keras.Input(shape=(max_len,), dtype='bool')
    mask = inputs_mask

    embedding_layer = keras.layers.Embedding(vocab_size, emb_dim, input_length=max_len, mask_zero=True, trainable=emb_trainable)
    
    embedding  = embedding_layer(inputs)
    last       = embedding
    
    reshape    = keras.layers.Reshape((max_len, emb_dim, 1))(last)
    last       = reshape
    
    last       = keras.layers.Concatenate(axis=3)([last, tf.reverse(last, axis=[1])])

    enc_conv1  = keras.layers.Conv2D(filters=32, kernel_size=3, strides=(1, 3), activation='relu', padding='same')(last)
    last       = enc_conv1
    
    drop = keras.layers.Dropout(0.05)(last)
    last  = drop
    
    # unshape    = keras.layers.Reshape((max_len, emb_dim))(last)
    # last       = unshape

    # last = get_attention(last, mask, n_attention_heads, causal)
    
    # reshape    = keras.layers.Reshape((max_len, emb_dim, 1))(last)
    # last       = reshape
    
    conv1 = keras.layers.Conv2D(filters=16, kernel_size=3, activation='relu', padding='same')(last)
    last  = conv1
    
    conv2 = keras.layers.Conv2D(filters=8, kernel_size=3, activation='relu', padding='same')(last)
    last  = conv2
    
    if n_end_attention_heads <= 0 or with_pooling:
        pool1 = keras.layers.MaxPooling2D(pool_size=2, padding='same')(last)
        last  = pool1
        if n_end_attention_heads >= 1 :
            mask = pooling_on_mask(mask)
    
    drop = keras.layers.Dropout(0.05)(last)
    last  = drop
    
    conv3 = keras.layers.Conv2D(filters=4, kernel_size=3, activation='relu', padding='same')(last)
    last  = conv3
    
    # conv4 = keras.layers.Conv2D(filters=2,  kernel_size=3, activation='relu', padding='same')(last)
    
    conv5 = keras.layers.Conv2D(filters=1,  kernel_size=3, activation='relu', padding='same')(last)
    last  = conv5
    
    unshape    = keras.layers.Reshape((max_len, int(emb_dim/3)))(last)
    last       = unshape

    pre_attention = last
    last = get_attention(last, mask, n_end_attention_heads, causal)
    
    if global_avg_pool :
        ga_pool = keras.layers.GlobalAveragePooling1D()(last)
        last  = ga_pool
        if residual_connections :
            pre_attention = keras.layers.GlobalAveragePooling1D()(pre_attention)
            last = keras.layers.Concatenate()([last, pre_attention])
    else :
        last = keras.layers.Flatten()(last)
        if residual_connections :
            pre_attention = keras.layers.Flatten()(pre_attention)
            last = keras.layers.Concatenate()([last, pre_attention])
        last = keras.layers.Dense(units=1024, activation='relu')(last)
        last = keras.layers.Dense(units=128, activation='relu')(last)
        last = keras.layers.Dense(units=16, activation='relu')(last)
    
    drop = keras.layers.Dropout(0.05)(last)
    last  = drop

    # last = keras.layers.Dense(units=int(emb_dim/2), activation='relu')(last)
    last = keras.layers.Dense(units=int(emb_dim/6), activation='relu')(last)
    outputs = keras.layers.Dense(1, activation='sigmoid')(last)

    model = keras.models.Model(inputs=[inputs, inputs_mask], outputs=outputs)

    model.summary()

    return model, embedding_layer

In [3]:
import pandas as pd
import json

def get_word_index(sentences) :
    all_tokens = []
    for txt in sentences :
        all_tokens += txt.split()

    tokens = pd.Series(all_tokens, range(len(all_tokens)), name="tokens")
    types = tokens.unique()
    word_index = {word : i for i, word in enumerate(["<pad>", "<unk>"] + list(types))}
    
    def decode_review(text):
        reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
        return ' '.join([reverse_word_index.get(i, '?') for i in text])
    
    return word_index, decode_review

def get_dataset(dataframe, word_index, max_len=256) :
    index_series, x_series, y_series = dataframe.index, dataframe["text"], dataframe["class"]

    index = list(index_series)

    x_list = [txt.split() for txt in list(x_series)]
    
    x_seq = []
    for tknlst in x_list :
        seq = []
        for tkn in tknlst :
            try :
                seq.append(word_index[tkn])
            except KeyError :
                seq.append(word_index["<unk>"])
        seq = (seq + [0] * (max_len - len(seq))) if (len(seq) < max_len) else (seq[ : max_len])
        x_seq.append(seq)
    
    y_int = [1 if lb >= .5 else 0 for lb in list(y_series)]

    x = np.array(x_seq, dtype=int) 
    mask = x != 0

    y = np.array(y_int, dtype=int) 

    return index, x, mask, y

In [4]:
train_dataframe = pd.read_csv("../resources/datasets/StanfordSentimentTreebank/split/SST2Processed2-train.csv", index_col=0)
word_index, decode_review = get_word_index(train_dataframe["text"])
ids_train, x_train, mask_train, y_train = get_dataset(train_dataframe, word_index)
len(ids_train), x_train.shape, mask_train.shape, y_train.shape

(8544, (8544, 256), (8544, 256), (8544,))

In [5]:
x_train[1], mask_train[1]

(array([ 2,  3, 32, 33, 34, 35,  3, 36, 35,  3, 37, 38,  5, 39, 40, 14, 18,
        41, 35, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 35, 53, 54, 55,
        56,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0, 

In [6]:
y_train[1]

1

In [7]:
decode_review(x_train[1])

"<start> the gorgeously elaborate continuation of the lord of the rings trilogy is so huge that a column of words can not adequately describe co writerdirector peter jackson's expanded vision of jrr tolkien's middle earth <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad

In [8]:
val_dataframe = pd.read_csv("../resources/datasets/StanfordSentimentTreebank/split/SST2Processed2-dev.csv", index_col=0)
ids_val, x_val, mask_val, y_val = get_dataset(val_dataframe, word_index)
len(ids_val), x_val.shape, mask_val.shape, y_val.shape

(1101, (1101, 256), (1101, 256), (1101,))

In [9]:
len(word_index)

16177

In [16]:
embed_dim = 768
embeddings_df = pd.read_csv(f"../resources/embeddings/StanfordSentimentTreebank/SST2Processed2-train_dim{embed_dim}.csv", index_col=0)
n_attention_heads = 0
n_end_attention_heads = 1
emb_trainable = True
causal = False
pooling = False
residual_connections = False
global_avg_pool = True
experimet_name = f"cnn2d_{n_attention_heads}ATTENTION_{n_end_attention_heads}ENDATTENTION_embed{embed_dim}"
checkpoint_dir = f"../resources/output/SST2Processed2/cnn2d/checkpoint_{experimet_name}.hdf5"

model, emb_layer = get_model(
    len(word_index),
    embed_dim,
    n_attention_heads=n_attention_heads,
    n_end_attention_heads=n_end_attention_heads,
    emb_trainable=emb_trainable,
    with_pooling=pooling,
    causal=causal,
    residual_connections=residual_connections,
    global_avg_pool=global_avg_pool
)

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, 256)]        0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, 256, 768)     12423936    ['input_5[0][0]']                
                                                                                                  
 reshape_3 (Reshape)            (None, 256, 768, 1)  0           ['embedding_2[0][0]']            
                                                                                                  
 tf.reverse_2 (TFOpLambda)      (None, 256, 768, 1)  0           ['reshape_3[0][0]']              
                                                                                            

In [13]:
emb_wgts = emb_layer.get_weights()
emb_wgts[0][0] = embeddings_df.iloc[0].values
emb_wgts[0][1 - embeddings_df.shape[0] : ] = embeddings_df.values[1:]
emb_layer.set_weights(emb_wgts)

In [14]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
history = model.fit([x_train, mask_train],
                    y_train,
                    epochs=50,
                    batch_size=64,
                    validation_data=([x_val, mask_val], y_val),
                    callbacks=[
                        keras.callbacks.ModelCheckpoint(
                            checkpoint_dir,
                            save_best_only=True,
                            save_weights_only=False,
                            verbose=1
                        ),
                        keras.callbacks.EarlyStopping(
                            monitor='accuracy',
                            min_delta=.001,
                            patience=5
                        )
                    ],
                    verbose=1)

Epoch 1/50
Epoch 1: val_loss improved from inf to 0.68512, saving model to ../resources/output/SST2Processed2/cnn2d\checkpoint_cnn2d_0ATTENTION_1ENDATTENTION_DROPOUT_embed768.hdf5
Epoch 2/50
Epoch 2: val_loss improved from 0.68512 to 0.59402, saving model to ../resources/output/SST2Processed2/cnn2d\checkpoint_cnn2d_0ATTENTION_1ENDATTENTION_DROPOUT_embed768.hdf5
Epoch 3/50
Epoch 3: val_loss improved from 0.59402 to 0.54601, saving model to ../resources/output/SST2Processed2/cnn2d\checkpoint_cnn2d_0ATTENTION_1ENDATTENTION_DROPOUT_embed768.hdf5
Epoch 4/50
Epoch 4: val_loss did not improve from 0.54601
Epoch 5/50
Epoch 5: val_loss did not improve from 0.54601
Epoch 6/50
Epoch 6: val_loss did not improve from 0.54601
Epoch 7/50
Epoch 7: val_loss did not improve from 0.54601
Epoch 8/50
Epoch 8: val_loss did not improve from 0.54601
Epoch 9/50
Epoch 9: val_loss did not improve from 0.54601
Epoch 10/50
Epoch 10: val_loss did not improve from 0.54601
Epoch 11/50
Epoch 11: val_loss did not impro

In [17]:
model.load_weights(checkpoint_dir)

In [18]:
test_dataframe = pd.read_csv("../resources/datasets/StanfordSentimentTreebank/split/SST2Processed2-test.csv", index_col=0)
ids_test, x_test, mask_test, y_test = get_dataset(test_dataframe, word_index)
len(ids_test), x_test.shape, mask_test.shape, y_test.shape

(2210, (2210, 256), (2210, 256), (2210,))

In [19]:
preds = model.predict([x_test, mask_test])
preds.shape



(2210, 1)

In [20]:
preds_df = pd.DataFrame(preds, index=ids_test, columns=["predictions"])
preds_df

Unnamed: 0,predictions
3,0.054791
4,0.829216
5,0.252992
6,0.876140
7,0.997403
...,...
11621,0.519608
11623,0.999923
11626,0.983892
11628,0.013041


In [21]:
dataset_with_preds_df = pd.concat([test_dataframe, preds_df], axis=1)
dataset_with_preds_df

Unnamed: 0,text,phrase_id,class,predictions
3,<start> effective but too tepid biopic,13995,0.513890,0.054791
4,<start> if you sometimes like to go to the mov...,14123,0.736110,0.829216
5,<start> emerges as something rare an issue mov...,13999,0.861110,0.252992
6,<start> the film provides some great insight i...,14498,0.597220,0.876140
7,<start> offers that rare combination of entert...,14351,0.833330,0.997403
...,...,...,...,...
11621,<start> an imaginative comedythriller,13851,0.777780,0.519608
11623,<start> a rare beautiful film,18182,0.916670,0.999923
11626,<start> an hilarious romantic comedy,23211,0.888890,0.983892
11628,<start> never sinks into exploitation,26177,0.625000,0.013041


In [22]:
hits, total = 0, 0
for _, row in dataset_with_preds_df.iterrows() :
    total += 1
    pred  = row["predictions"] >= .5
    label = row["class"]       >= .5
    if pred == label : 
        hits += 1
accuracy = hits / total
accuracy

0.7651583710407239

In [23]:
hits, total = 0, 0
for _, row in dataset_with_preds_df.iterrows() :
    total += 1
    pred  = row["predictions"]
    label = row["class"]
    diff = label - pred
    hits += abs(diff)
avg_dev = hits / total
avg_dev

0.2129111063542746

In [24]:
true_positives, true_negatives, false_positives, false_negatives = 0, 0, 0, 0
for _, row in dataset_with_preds_df.iterrows() :
    pred  = row["predictions"] >= .5
    label = row["class"]       >= .5
    if pred == label :
        if pred :
            true_positives  += 1
        else :
            true_negatives  += 1
    else :
        if pred :
            false_positives += 1
        else :
            false_negatives += 1
print(true_positives, true_negatives, false_positives, false_negatives)

798 893 206 313


In [25]:
precision = true_positives / (true_positives + false_positives)
precision

0.7948207171314741

In [26]:
recall = true_positives / (true_positives + false_negatives)
recall

0.7182718271827183

In [28]:
f1 = (2 * precision * recall) / (precision + recall)
f1

0.7546099290780143

In [29]:
eval_dict = {
    'model'     : checkpoint_dir,
    'accuracy'  : accuracy,
    'precision' : precision,
    'recall'    : recall,
    'f1-score'  : f1
}

In [30]:
with open(f"{eval_dict['model']}-eval-dict.json", "w") as f :
    f.write(json.dumps(eval_dict))