In [1]:
import tensorflow as tf
from tensorflow import keras

import numpy as np

In [2]:
def get_attention(inputs, mask, n_heads, causal) :
    """Apply zero, one or several self-attention heads to `inputs`.

    Every head is a fresh `keras.layers.Attention(causal=causal)` called with
    `inputs` as both query and value and `mask` as both query and value mask.

    Args:
        inputs: tensor fed to each head as query and value.
        mask: mask passed to each head for both query and value.
        n_heads: number of heads to build.
        causal: forwarded to the `Attention` constructor.

    Returns:
        * `n_heads == 1`: the output of the single head.
        * `n_heads > 1`: the element-wise sum of all heads, batch-normalised.
        * otherwise (zero or negative): `inputs` unchanged.
    """
    def _head() :
        # One independent attention layer (own weights/name) per call.
        layer = keras.layers.Attention(causal=causal)
        return layer([inputs, inputs], mask=[mask, mask])

    if n_heads == 1 :
        return _head()

    if n_heads > 1 :
        heads = [_head() for _ in range(n_heads)]
        summed = keras.layers.Add()(heads)
        return keras.layers.BatchNormalization()(summed)

    # No attention requested: pass the input straight through.
    return inputs

def pooling_on_mask(inputs_mask:tf.Tensor) :
    """Downsample a boolean mask with the same pooling used on the features.

    The mask is cast to int8, given a trailing channel axis, max-pooled with
    `pool_size=2` and `padding='same'`, flattened back and cast to bool, so a
    pooled position is True when any position in its window was True.

    Args:
        inputs_mask: mask whose second dimension is the sequence length.

    Returns:
        The pooled mask as a boolean tensor.
    """
    as_int = tf.cast(inputs_mask, tf.int8)

    # MaxPooling1D expects (batch, steps, channels): add a single channel.
    steps = as_int.get_shape().as_list()[1]
    with_channel = keras.layers.Reshape((steps, 1))(as_int)

    pooled = keras.layers.MaxPooling1D(pool_size=2, padding='same')(with_channel)

    # Drop the channel axis again, using the post-pooling length.
    pooled_steps = pooled.get_shape().as_list()[1]
    flat = keras.layers.Reshape((pooled_steps,))(pooled)

    return tf.cast(flat, tf.bool)

def get_model(vocab_size, 
              embed_dim, 
              max_len=256, 
              n_attention_heads=6, 
              n_end_attention_heads=0, 
              n_classes=1,
              emb_trainable=True, 
              causal=False, 
              with_pooling=False,
              residual_connections=False,
              global_avg_pool=True,
              decoding_2d=False,
              encoding_2d=False,
              lstm_in_middle=False,
              lstm_on_end=False,
              return_sequences_end=True) :
    """Build the embedding -> Conv1D -> attention -> Conv1D classifier.

    The model takes two inputs, the token ids and a boolean mask, both of
    shape `(max_len,)`. `model.summary()` is printed as a side effect.

    Args:
        vocab_size: number of rows of the embedding matrix.
        embed_dim: embedding width.
        max_len: fixed sequence length of both inputs.
        n_attention_heads: heads of the first attention block
            (see `get_attention`; values <= 0 skip it).
        n_end_attention_heads: heads of the second attention block. When it is
            <= 0 the sequence is always max-pooled after the second Conv1D.
        n_classes: output units; 1 uses a sigmoid, anything else a softmax.
        emb_trainable: `trainable` flag of the embedding layer.
        causal: forwarded to every attention layer.
        with_pooling: also max-pool when end attention heads are present.
        residual_connections: concatenate the pre-end-attention features with
            the final features before the classifier head.
        global_avg_pool: reduce the sequence with global average pooling;
            otherwise flatten and apply three Dense layers (1024/128/16).
        decoding_2d: after the end LSTM (only if `lstm_on_end` and
            `return_sequences_end`), apply a Reshape/Conv2D/Reshape/Dropout stack.
        encoding_2d: apply the same kind of Conv2D stack after the first Conv1D.
        lstm_in_middle: insert an LSTM (128 units) before the end attention.
        lstm_on_end: insert an LSTM (128 units) after the end attention.
        return_sequences_end: `return_sequences` of the end LSTM.

    Returns:
        `(model, embedding_layer)`: the uncompiled Keras model and its embedding
        layer, so callers can overwrite the embedding weights.
    """
    inputs      = tf.keras.Input(shape=(max_len,), dtype='int32')
    inputs_mask = tf.keras.Input(shape=(max_len,), dtype='bool')
    mask = inputs_mask

    embedding_layer = keras.layers.Embedding(vocab_size, embed_dim, input_length=max_len, mask_zero=True, trainable=emb_trainable)
    last = embedding_layer(inputs)

    enc_units = 128
    last = keras.layers.Conv1D(filters=enc_units, kernel_size=3, activation='relu', padding='same')(last)

    if encoding_2d :
        # Treat (steps, features) as a one-channel image, convolve, then fold
        # the Conv2D filters back into the feature axis.
        last = keras.layers.Reshape((max_len, enc_units, 1))(last)

        enc_filters = 32
        last = keras.layers.Conv2D(filters=enc_filters, kernel_size=3, strides=(1, 1), activation='relu', padding='same')(last)

        last = keras.layers.Reshape((max_len, enc_filters * enc_units))(last)
        last = keras.layers.Dropout(0.05)(last)

    last = get_attention(last, mask, n_attention_heads, causal)

    last = keras.layers.Conv1D(filters=64, kernel_size=3, activation='relu', padding='same')(last)
    last = keras.layers.Conv1D(filters=32, kernel_size=3, activation='relu', padding='same')(last)

    if n_end_attention_heads <= 0 or with_pooling:
        last = keras.layers.MaxPooling1D(pool_size=2, padding='same')(last)
        # The sequence axis just shrank, so every later consumer of the mask
        # (end attention AND the LSTMs) needs the mask pooled the same way.
        # Previously only the end-attention case was handled, leaving the LSTMs
        # with a max_len-long mask for a pooled sequence.
        if n_end_attention_heads >= 1 or lstm_in_middle or lstm_on_end :
            mask = pooling_on_mask(mask)

    last = keras.layers.Dropout(0.05)(last)

    last = keras.layers.Conv1D(filters=16, kernel_size=3, activation='relu', padding='same')(last)
    last = keras.layers.Conv1D(filters=8,  kernel_size=3, activation='relu', padding='same')(last)

    if lstm_in_middle :
        last = keras.layers.LSTM(units=128, return_sequences=True)(last, mask=mask)

    # Kept for the optional residual concatenation in the classifier head.
    pre_attention = last
    last = get_attention(last, mask, n_end_attention_heads, causal)

    if lstm_on_end :
        last = keras.layers.LSTM(units=128, return_sequences=return_sequences_end)(last, mask=mask)

        if return_sequences_end and decoding_2d :
            # Read the sequence length from the tensor instead of assuming
            # max_len: it is halved whenever the pooling branch above ran.
            seq_len, features = last.get_shape().as_list()[1:3]
            last = keras.layers.Reshape((seq_len, features, 1))(last)

            dec_filters = 32
            last = keras.layers.Conv2D(filters=dec_filters, kernel_size=3, strides=(1, 1), activation='relu', padding='same')(last)

            last = keras.layers.Reshape((seq_len, dec_filters * features))(last)
            last = keras.layers.Dropout(0.05)(last)

    # `last` is still a sequence unless the end LSTM returned only its final state.
    if not lstm_on_end or return_sequences_end :
        if global_avg_pool :
            last = keras.layers.GlobalAveragePooling1D()(last)
            if residual_connections :
                pre_attention = keras.layers.GlobalAveragePooling1D()(pre_attention)
                last = keras.layers.Concatenate()([last, pre_attention])
        else :
            last = keras.layers.Flatten()(last)
            if residual_connections :
                pre_attention = keras.layers.Flatten()(pre_attention)
                last = keras.layers.Concatenate()([last, pre_attention])
            last = keras.layers.Dense(units=1024, activation='relu')(last)
            last = keras.layers.Dense(units=128, activation='relu')(last)
            last = keras.layers.Dense(units=16, activation='relu')(last)

    last = keras.layers.Dropout(0.05)(last)

    outputs = keras.layers.Dense(n_classes, activation='sigmoid' if n_classes == 1 else 'softmax')(last)

    model = keras.models.Model(inputs=[inputs, inputs_mask], outputs=outputs)

    model.summary()

    return model, embedding_layer

In [3]:
import pandas as pd
import json

def get_word_index(sentences) :
    """Build a token -> id vocabulary from whitespace-tokenised sentences.

    Ids 0 and 1 are always reserved for "<pad>" and "<unk>"; every other token
    gets the next free id in order of first appearance. A literal "<pad>" or
    "<unk>" occurring in the corpus maps to its reserved id instead of taking a
    new one (which previously left id 0/1 without a token and created a hole in
    the index).

    Args:
        sentences: iterable of strings; each is split with `str.split()`.

    Returns:
        `(word_index, decode_review)` where `word_index` is the dict described
        above and `decode_review(ids)` joins the tokens of an id sequence with
        spaces, rendering unknown ids as '?'.
    """
    word_index = {"<pad>" : 0, "<unk>" : 1}
    for txt in sentences :
        for token in txt.split() :
            if token not in word_index :
                word_index[token] = len(word_index)

    # Built once here rather than on every decode call.
    reverse_word_index = {idx : word for word, idx in word_index.items()}

    def decode_review(text):
        """Turn a sequence of ids back into a space-joined string of tokens."""
        return ' '.join([reverse_word_index.get(i, '?') for i in text])
    
    return word_index, decode_review

def get_dataset(dataframe, word_index, max_len=256) :
    """Encode a dataframe with "text" and "class" columns into model arrays.

    Args:
        dataframe: frame whose "text" column holds whitespace-tokenisable
            strings and whose "class" column holds numeric labels.
        word_index: token -> id mapping; must contain "<unk>" if any token of
            the texts is missing from it.
        max_len: every sequence is right-padded with 0 or truncated to this length.

    Returns:
        `(index, x, mask, y)`:
        * index: the dataframe index as a list,
        * x: int array of token ids, one row per text,
        * mask: boolean array, True where `x` is non-zero,
        * y: int array with 1 where the label is >= 0.5, else 0.
    """
    def _encode(token) :
        # Tokens absent from the vocabulary share the "<unk>" id.
        if token in word_index :
            return word_index[token]
        return word_index["<unk>"]

    def _fit(ids) :
        # Right-pad with the padding id 0, or cut the tail off.
        missing = max_len - len(ids)
        return ids + [0] * missing if missing > 0 else ids[ : max_len]

    index_series, x_series, y_series = dataframe.index, dataframe["text"], dataframe["class"]

    index = list(index_series)

    rows = [_fit([_encode(token) for token in txt.split()]) for txt in list(x_series)]
    labels = [int(value >= .5) for value in list(y_series)]

    x = np.array(rows, dtype=int)
    y = np.array(labels, dtype=int)

    return index, x, x != 0, y

In [4]:
# Load the training split (first CSV column is used as the index), build the
# vocabulary from its "text" column and encode it into ids, masks and labels.
train_dataframe = pd.read_csv("../resources/datasets/StanfordSentimentTreebank/split/SST2Processed2-train.csv", index_col=0)
word_index, decode_review = get_word_index(train_dataframe["text"])
ids_train, x_train, mask_train, y_train = get_dataset(train_dataframe, word_index)
# Sanity check: number of samples and the shapes of the encoded arrays.
len(ids_train), x_train.shape, mask_train.shape, y_train.shape

(8544, (8544, 256), (8544, 256), (8544,))

In [5]:
# Inspect one encoded training sample: its padded id sequence and the matching mask.
x_train[1], mask_train[1]

(array([ 2,  3, 32, 33, 34, 35,  3, 36, 35,  3, 37, 38,  5, 39, 40, 14, 18,
        41, 35, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 35, 53, 54, 55,
        56,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0, 

In [6]:
# Binary label of the same training sample.
y_train[1]

1

In [7]:
# Round-trip check: decode the id sequence back to tokens (padding shows up as "<pad>").
decode_review(x_train[1])

"<start> the gorgeously elaborate continuation of the lord of the rings trilogy is so huge that a column of words can not adequately describe co writerdirector peter jackson's expanded vision of jrr tolkien's middle earth <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad

In [8]:
# Load the dev split and encode it with the vocabulary built from the training
# split; tokens missing from that vocabulary are mapped to "<unk>" by get_dataset.
val_dataframe = pd.read_csv("../resources/datasets/StanfordSentimentTreebank/split/SST2Processed2-dev.csv", index_col=0)
ids_val, x_val, mask_val, y_val = get_dataset(val_dataframe, word_index)
# Sanity check: number of samples and the shapes of the encoded arrays.
len(ids_val), x_val.shape, mask_val.shape, y_val.shape

(1101, (1101, 256), (1101, 256), (1101,))

In [9]:
# Vocabulary size (including "<pad>" and "<unk>"); passed to get_model as `vocab_size` below.
len(word_index)

16177

In [10]:
# Load the test split and encode it with the training vocabulary.
test_dataframe = pd.read_csv("../resources/datasets/StanfordSentimentTreebank/split/SST2Processed2-test.csv", index_col=0)
ids_test, x_test, mask_test, y_test = get_dataset(test_dataframe, word_index)
# Sanity check: number of samples and the shapes of the encoded arrays.
len(ids_test), x_test.shape, mask_test.shape, y_test.shape

(2210, (2210, 256), (2210, 256), (2210,))

In [13]:
# Experiment grid consumed by the training loop. Each entry provides:
#   * "experiment_name"   - used to build the checkpoint file name,
#   * "load_weights_from" - optional path of weights to load before training (None = skip),
#   * "args"              - keyword arguments forwarded verbatim to get_model().
experiments = [
    {
        # Plain string: the name has no placeholders, so no f-string is needed.
        "experiment_name" : "cnn1d_STARTATTENTION0_ENDATTENTION0_embed768",
        "load_weights_from" : None,
        "args" : {
            "vocab_size" : len(word_index),
            "embed_dim" : 768,
            "n_attention_heads" : 0,
            "n_end_attention_heads" : 0,
            "emb_trainable" : True
        }
    }
]

In [14]:
# Train, checkpoint and evaluate one model per experiment, writing the test
# metrics next to the checkpoint as JSON.
for experiment in experiments :
    # NOTE: despite its name, `checkpoint_dir` is the path of the checkpoint *file*.
    checkpoint_dir = f"../resources/output/SST2Processed2/cnn1d_retest/checkpoint_{experiment['experiment_name']}.hdf5"
    embeddings_df = pd.read_csv(f"../resources/embeddings/StanfordSentimentTreebank/SST2Processed2-train_dim{experiment['args']['embed_dim']}.csv", index_col=0)
    
    model, emb_layer = get_model(**experiment['args'])

    # Initialise the embedding matrix from the CSV: its first row goes to id 0,
    # the remaining rows fill the *last* len(embeddings_df) - 1 rows of the
    # matrix; any rows in between keep their random initialisation.
    # NOTE(review): presumably the CSV has one row for "<pad>" plus one per
    # corpus token and none for "<unk>" - confirm against the embedding export.
    emb_wgts = emb_layer.get_weights()
    emb_wgts[0][0] = embeddings_df.iloc[0].values
    emb_wgts[0][1 - embeddings_df.shape[0] : ] = embeddings_df.values[1:]
    emb_layer.set_weights(emb_wgts)

    if experiment['load_weights_from'] is not None :
        model.load_weights(experiment['load_weights_from'])
    emb_layer.trainable = experiment['args']['emb_trainable']

    model.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])
    # NOTE(review): EarlyStopping watches the *training* accuracy while
    # ModelCheckpoint uses its default monitor - confirm this is intended.
    history = model.fit([x_train, mask_train],
                        y_train,
                        epochs=50,
                        batch_size=64,
                        validation_data=([x_val, mask_val], y_val),
                        callbacks=[
                            keras.callbacks.ModelCheckpoint(
                                checkpoint_dir,
                                save_best_only=True,
                                save_weights_only=False,
                                verbose=1
                            ),
                            keras.callbacks.EarlyStopping(
                                monitor='accuracy',
                                min_delta=.001,
                                patience=5
                            )
                        ],
                        verbose=1)

    # Evaluate the best checkpoint, not the weights of the last epoch.
    model.load_weights(checkpoint_dir)

    preds = model.predict([x_test, mask_test])
    preds_df = pd.DataFrame(preds, index=ids_test, columns=["predictions"])
    dataset_with_preds_df = pd.concat([test_dataframe, preds_df], axis=1)

    # Vectorised metrics (one pass over numpy arrays instead of three iterrows loops).
    scores = dataset_with_preds_df["predictions"].to_numpy()
    labels = dataset_with_preds_df["class"].to_numpy()
    pred_positive  = scores >= .5
    label_positive = labels >= .5

    total = len(dataset_with_preds_df)
    hits = int((pred_positive == label_positive).sum())
    accuracy = hits / total

    # Mean absolute deviation between the raw score and the raw label.
    avg_dev = float(np.abs(labels - scores).mean())

    true_positives  = int(( pred_positive &  label_positive).sum())
    true_negatives  = int((~pred_positive & ~label_positive).sum())
    false_positives = int(( pred_positive & ~label_positive).sum())
    false_negatives = int((~pred_positive &  label_positive).sum())
    print(true_positives, true_negatives, false_positives, false_negatives)

    # Guard the ratios: a model that never predicts the positive class (or a
    # split without positives) would otherwise raise ZeroDivisionError here.
    predicted_positives = true_positives + false_positives
    actual_positives    = true_positives + false_negatives
    precision = true_positives / predicted_positives if predicted_positives else 0.0
    recall = true_positives / actual_positives if actual_positives else 0.0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0.0

    eval_dict = {
        'model'     : checkpoint_dir,
        'accuracy'  : accuracy,
        'precision' : precision,
        'recall'    : recall,
        'f1-score'  : f1
    }

    print(eval_dict)

    with open(f"{eval_dict['model']}-eval-dict.json", "w") as f :
        f.write(json.dumps(eval_dict))

    del model 

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 256)]        0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, 256, 768)     12423936    ['input_3[0][0]']                
                                                                                                  
 conv1d_5 (Conv1D)              (None, 256, 128)     295040      ['embedding_1[0][0]']            
                                                                                                  
 conv1d_6 (Conv1D)              (None, 256, 64)      24640       ['conv1d_5[0][0]']               
                                                                                            

KeyboardInterrupt: 

In [15]:
# Load a previously saved full model (architecture + weights) from disk and print
# its layer summary. Note the path points at the TweetsProcessed2 output folder,
# not at the SST2 checkpoints written by the training loop above.
model = keras.models.load_model("../resources/output/TweetsProcessed2/cnn1d/checkpoint_cnn1d_STARTATTENTION0_ENDATTENTION0_embed768/fold1.hdf5")
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 256)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 256, 768)     9319680     ['input_1[0][0]']                
                                                                                                  
 conv1d (Conv1D)                (None, 256, 128)     295040      ['embedding[0][0]']              
                                                                                                  
 conv1d_1 (Conv1D)              (None, 256, 64)      24640       ['conv1d[0][0]']                 
                                                                                              