In [1]:
import tensorflow as tf
from tensorflow import keras

import numpy as np

In [2]:
def get_attention(inputs, mask, n_heads, causal) :
    """Run dot-product self-attention over `inputs` with `n_heads` parallel heads.

    Args:
        inputs: tensor used as both query and value of every attention layer.
        mask: mask applied to both the query and the value.
        n_heads: 1 builds a single attention layer; a value greater than 1
            builds that many independent attention layers whose outputs are
            summed and batch-normalised; any other value disables attention.
        causal: forwarded to `keras.layers.Attention(causal=...)`.

    Returns:
        The attention output, or `inputs` unchanged when attention is disabled.
    """
    def _one_head() :
        # Query and value are the same tensor, hence the mask is repeated too.
        return keras.layers.Attention(causal=causal)(
            [inputs, inputs],
            mask=[mask, mask],
        )

    if n_heads == 1 :
        return _one_head()

    if n_heads > 1 :
        heads = [_one_head() for _ in range(n_heads)]
        summed = keras.layers.Add()(heads)
        return keras.layers.BatchNormalization()(summed)

    # Neither a single head nor several: hand the inputs back untouched.
    return inputs

def pooling_on_mask(inputs_mask:tf.Tensor) :
    """Shrink a boolean mask along its second axis with a max-pool of size 2.

    The mask is cast to int8, given a trailing axis of size 1, max-pooled
    (`pool_size=2`, `padding='same'`), flattened back and cast to bool.
    Assumes `inputs_mask` has a statically known second dimension, since the
    reshape targets are read from the static shape.
    """
    as_int = tf.cast(inputs_mask, tf.int8)

    # Add the trailing axis that MaxPooling1D is given here.
    n_steps = as_int.get_shape().as_list()[1]
    with_channel = keras.layers.Reshape((n_steps, 1))(as_int)

    pooled = keras.layers.MaxPooling1D(pool_size=2, padding='same')(with_channel)

    # Drop the trailing axis again, using the pooled length.
    n_pooled_steps = pooled.get_shape().as_list()[1]
    flattened = keras.layers.Reshape((n_pooled_steps,))(pooled)

    return tf.cast(flattened, tf.bool)

def get_model(vocab_size,
              embed_dim,
              max_len=256,
              n_classes=1,
              with_attention=True,
              return_sequences_on_end=False,
              emb_trainable=True) :
    """Build the Embedding -> Conv1D -> LSTM -> (Attention) -> LSTM classifier.

    The model takes two inputs of length `max_len`: the int32 token ids and a
    boolean mask, which is passed explicitly to both LSTMs and to the
    attention layer.

    Args:
        vocab_size: number of rows of the embedding matrix.
        embed_dim: width of each embedding vector.
        max_len: length of the token-id and mask inputs.
        n_classes: size of the output layer; 1 selects a sigmoid output,
            anything else a softmax.
        with_attention: insert a non-causal self-attention layer between the
            two LSTMs.
        return_sequences_on_end: make the second LSTM return the whole
            sequence and average it over time instead of using its last state.
        emb_trainable: `trainable` flag given to the embedding layer.

    Returns:
        `(model, embedding_layer)`; the model is not compiled and its summary
        is printed as a side effect.

    NOTE(review): the average pooling is called without an explicit mask, so
    padded steps may contribute to the mean -- confirm this is intended.
    """
    token_ids    = tf.keras.Input(shape=(max_len,), dtype='int32')
    padding_mask = tf.keras.Input(shape=(max_len,), dtype='bool')

    embedding_layer = keras.layers.Embedding(
        vocab_size,
        embed_dim,
        input_length=max_len,
        mask_zero=True,
        trainable=emb_trainable,
    )
    hidden = embedding_layer(token_ids)

    hidden = keras.layers.Conv1D(filters=128, kernel_size=3, activation='relu', padding='same')(hidden)
    hidden = keras.layers.LSTM(units=64, return_sequences=True)([hidden], mask=padding_mask)

    if with_attention :
        # Self-attention: the LSTM output is both query and value.
        hidden = keras.layers.Attention(causal=False)(
            [hidden, hidden],
            mask=[padding_mask, padding_mask],
        )

    hidden = keras.layers.LSTM(units=64, return_sequences=return_sequences_on_end)([hidden], mask=padding_mask)

    if return_sequences_on_end :
        # Collapse the per-step outputs into one vector per example.
        hidden = keras.layers.GlobalAveragePooling1D()(hidden)

    hidden = keras.layers.Dropout(0.05)(hidden)

    output_activation = 'sigmoid' if n_classes==1 else 'softmax'
    outputs = keras.layers.Dense(n_classes, activation=output_activation)(hidden)

    model = keras.models.Model(inputs=[token_ids, padding_mask], outputs=outputs)
    model.summary()

    return model, embedding_layer

In [3]:
import pandas as pd
import json

def get_word_index(sentences) :
    """Build a token -> id vocabulary from whitespace-tokenised sentences.

    Ids 0 and 1 are always reserved for "<pad>" and "<unk>"; every other
    token receives the next free id in order of first appearance. A literal
    "<pad>" or "<unk>" occurring in the text keeps its reserved id instead of
    being re-numbered (which would leave id 0 or 1 unassigned).

    Args:
        sentences: iterable of strings; each is split with `str.split()`.

    Returns:
        `(word_index, decode_review)` where `word_index` maps token -> id and
        `decode_review(ids)` joins the tokens of an id sequence with spaces,
        writing '?' for ids that are not in the vocabulary.
    """
    # A dict keeps insertion order, so it doubles as an ordered set of tokens
    # with the reserved entries pinned at the front.
    ordered_tokens = dict.fromkeys(["<pad>", "<unk>"])
    for txt in sentences :
        for token in txt.split() :
            ordered_tokens.setdefault(token, None)

    word_index = {word : i for i, word in enumerate(ordered_tokens)}

    # Built once here rather than on every decode_review call.
    reverse_word_index = {i : word for word, i in word_index.items()}

    def decode_review(text):
        return ' '.join([reverse_word_index.get(i, '?') for i in text])

    return word_index, decode_review

def get_dataset(dataframe, word_index, n_classes, max_len=256) :
    """Turn a labelled text frame into padded id sequences, a mask and targets.

    Args:
        dataframe: frame with a "text" column (whitespace-tokenised strings)
            and a "class" column (labels).
        word_index: token -> id mapping; tokens missing from it are encoded
            with `word_index["<unk>"]`.
        n_classes: 1 produces binary targets (label >= 0.5 -> 1); a larger
            value produces one-hot rows, using each label as a column index.
        max_len: every sequence is truncated or right-padded with 0 to this
            length.

    Returns:
        `(index, x, mask, y)`: the frame index as a list, the integer id
        matrix of shape (n_rows, max_len), the boolean matrix `x != 0`, and
        the targets (shape (n_rows,) when binary, (n_rows, n_classes)
        otherwise).

    Raises:
        ValueError: if `n_classes` is smaller than 1.
    """
    # Fail before doing any work, and say why.
    if n_classes < 1 :
        raise ValueError(f"n_classes must be >= 1, got {n_classes!r}")

    index_series, x_series, y_series = dataframe.index, dataframe["text"], dataframe["class"]
    index = list(index_series)

    x_seq = []
    for txt in list(x_series) :
        seq = [
            word_index[tkn] if tkn in word_index else word_index["<unk>"]
            for tkn in txt.split()
        ][ : max_len]
        seq += [0] * (max_len - len(seq))
        x_seq.append(seq)

    if x_seq :
        x = np.array(x_seq, dtype=int)
    else :
        # np.array([]) would be 1-D; keep the (rows, max_len) layout for empty input.
        x = np.zeros((0, max_len), dtype=int)
    mask = x != 0

    if n_classes == 1 :
        y = np.array([1 if lb >= .5 else 0 for lb in list(y_series)], dtype=int)
    else :
        y = np.zeros((y_series.shape[0], n_classes), dtype=int)
        for i, label in enumerate(list(y_series)) :
            y[i][label] = 1

    return index, x, mask, y

In [4]:
def _experiment(name, **model_args) :
    """Return one experiment entry; `model_args` extend the shared get_model arguments."""
    return {
        "experiment_name" : name,
        "load_weights_from" : None,
        # vocab_size is left as None here; the training cell fills it in from the data.
        "args" : {"vocab_size" : None, "embed_dim" : 768, "n_classes" : 3, **model_args},
    }


# Configurations trained by the cross-validation cell: with attention,
# without attention, and with attention plus averaged sequence outputs.
experiments = [
    _experiment("lstm_WITH_ATTENTION_embed768", with_attention=True, emb_trainable=True),
    _experiment("lstm_NO_ATTENTION_embed768", with_attention=False, emb_trainable=True),
    _experiment("lstm_WITH_ATTENTION_RETSEQ_embed768", with_attention=True, return_sequences_on_end=True, emb_trainable=True),
]

In [7]:
import os
from statistics import mean

def _safe_ratio(numerator, denominator) :
    """Return numerator / denominator as a float, or 0.0 when the denominator is zero."""
    return float(numerator) / denominator if denominator else 0.0


def _binary_fold_metrics(scored_df) :
    """Compute 0.5-thresholded metrics for one fold of a single-output model.

    `scored_df` needs the gold label in "class" and the model score in
    "predictions". Every returned number is a plain Python int/float so the
    dict can go straight through json.dumps.
    """
    predicted = (scored_df["predictions"] >= .5).to_numpy()
    expected  = (scored_df["class"]       >= .5).to_numpy()

    true_positives  = int(np.sum( predicted &  expected))
    true_negatives  = int(np.sum(~predicted & ~expected))
    false_positives = int(np.sum( predicted & ~expected))
    false_negatives = int(np.sum(~predicted &  expected))

    # Ratios fall back to 0.0 instead of raising when a fold has no
    # positive predictions / positive labels.
    precision = _safe_ratio(true_positives, true_positives + false_positives)
    recall    = _safe_ratio(true_positives, true_positives + false_negatives)
    f1        = _safe_ratio(2 * precision * recall, precision + recall)

    return {
        'accuracy'  : _safe_ratio(true_positives + true_negatives, len(scored_df)),
        'precision' : precision,
        'recall'    : recall,
        'f1-score'  : f1,
        'detail'    : {
            "true_positives"  : true_positives,
            "true_negatives"  : true_negatives,
            "false_positives" : false_positives,
            "false_negatives" : false_negatives
        }
    }


def _multiclass_accuracy(scored_df, pred_cols) :
    """Fraction of rows whose highest-scoring column in `pred_cols` equals "class"."""
    predicted = scored_df[pred_cols].to_numpy().argmax(axis=1)
    expected  = scored_df["class"].to_numpy()
    return _safe_ratio(int(np.sum(predicted == expected)), len(scored_df))


def _summarise_experiment(experiment_name, evals) :
    """Average the per-fold metric dicts in `evals` (confusion counts are summed)."""
    final_metrics = {
        "experiment" : experiment_name,
        'accuracy'   : mean([fold_metrics["accuracy"] for fold_metrics in evals])
    }
    for key in ('precision', 'recall', 'f1-score') :
        if key in evals[0] :
            final_metrics[key] = mean([fold_metrics[key] for fold_metrics in evals])
    if 'detail' in evals[0] :
        final_metrics['detail'] = {
            count : sum([fold_metrics["detail"][count] for fold_metrics in evals])
            for count in ("true_positives", "true_negatives", "false_positives", "false_negatives")
        }
    return final_metrics


n_folds = 10
dataset_folder = "TwitterAirlines"
dataset_name   = "TweetsProcessed2"
for experiment in experiments :
    checkpoint_dir  = f"../resources/output/{dataset_name}/lstm/checkpoint_{experiment['experiment_name']}"
    # Predictions and evaluation files are written here as well as checkpoints.
    os.makedirs(checkpoint_dir, exist_ok=True)
    n_classes = experiment["args"]["n_classes"]

    # Load every fold and its embedding table up front (fold files are numbered from 1).
    folds_dfs, embeddings_dfs = [], []
    for i_fold in range(n_folds) :
        fold_df      = pd.read_csv(f"../resources/datasets/{dataset_folder}/folds/{dataset_name}_Fold{i_fold + 1}.csv", index_col=0)
        embedding_df = pd.read_csv(f"../resources/embeddings/{dataset_folder}/{dataset_name}_Fold{i_fold + 1}_dim{experiment['args']['embed_dim']}.csv", index_col=0)

        folds_dfs.append(fold_df)
        embeddings_dfs.append(embedding_df)

    evals = []
    for i_fold in range(n_folds) :
        print("=" * 50)
        checkpoint_path = f"{checkpoint_dir}/fold{i_fold + 1}.hdf5"
        print(checkpoint_path)
        # A fold with an existing checkpoint is neither retrained nor re-evaluated.
        if os.path.exists(checkpoint_path) : continue

        embeddings_df = embeddings_dfs[i_fold]

        # Fold i_fold is held out for validation; the rest is the training set.
        train_dataframe = pd.concat(folds_dfs[ : i_fold] + folds_dfs[i_fold + 1 : ])
        valid_dataframe = folds_dfs[i_fold]

        word_index, decode_review = get_word_index(train_dataframe["text"])

        # Resolve vocab_size on a per-fold copy: the vocabulary differs between
        # folds, so the value must not be cached in the shared experiment config.
        model_args = dict(experiment["args"])
        if model_args["vocab_size"] is None :
            model_args["vocab_size"] = len(word_index)

        ids_train, x_train, mask_train, y_train = get_dataset(train_dataframe, word_index, n_classes=n_classes)
        print(len(ids_train), x_train.shape, mask_train.shape, y_train.shape)
        print(decode_review(x_train[1]), y_train[1])

        ids_valid, x_valid, mask_valid, y_valid = get_dataset(valid_dataframe, word_index, n_classes=n_classes)
        print(len(ids_valid), x_valid.shape, mask_valid.shape, y_valid.shape)
        print(decode_review(x_valid[1]), y_valid[1])

        model, emb_layer = get_model(**model_args)

        # Seed the embedding matrix from the pre-computed table.
        # NOTE(review): rows are copied by position, not by token (the first
        # table row goes to id 0, the remaining rows to the tail of the matrix).
        # This only lines up if the CSV is ordered like word_index -- confirm.
        emb_wgts = emb_layer.get_weights()
        print(embeddings_df.shape[0], emb_wgts[0].shape, len(word_index))
        emb_wgts[0][0] = embeddings_df.iloc[0].values
        emb_wgts[0][1 - embeddings_df.shape[0] : ] = embeddings_df.values[1:]
        emb_layer.set_weights(emb_wgts)

        if not experiment['load_weights_from'] is None :
            model.load_weights(experiment['load_weights_from'])
        emb_layer.trainable = experiment['args']['emb_trainable']

        model.compile(optimizer='adam',
                    loss='binary_crossentropy' if n_classes == 1 else 'categorical_crossentropy',
                    metrics=['accuracy'])
        history = model.fit([x_train, mask_train],
                            y_train,
                            epochs=6,
                            batch_size=64,
                            validation_data=([x_valid, mask_valid], y_valid),
                            callbacks=[
                                keras.callbacks.ModelCheckpoint(
                                    checkpoint_path,
                                    save_best_only=True,
                                    save_weights_only=False,
                                    verbose=1
                                )
                            ],
                            verbose=1)

        # Evaluate the best checkpoint rather than the weights of the last epoch.
        model.load_weights(checkpoint_path)

        preds = model.predict([x_valid, mask_valid])

        if n_classes == 1 :
            preds_df = pd.DataFrame(preds, index=ids_valid, columns=["predictions"])
            dataset_with_preds_df = pd.concat([valid_dataframe, preds_df], axis=1)

            fold_metrics = _binary_fold_metrics(dataset_with_preds_df)
            detail = fold_metrics['detail']
            print(detail["true_positives"], detail["true_negatives"], detail["false_positives"], detail["false_negatives"])

            eval_dict = {'model' : checkpoint_path, **fold_metrics}
        else :
            # One score column per class instead of a hard-coded three.
            pred_cols = [f"preds_{i_class}" for i_class in range(n_classes)]
            preds_df = pd.DataFrame(preds, index=ids_valid, columns=pred_cols)
            dataset_with_preds_df = pd.concat([valid_dataframe, preds_df], axis=1)
            # Same 1-based fold number as the checkpoint file.
            dataset_with_preds_df.to_csv(f"{checkpoint_dir}/fold{i_fold + 1}-predictions.csv")

            eval_dict = {
                'model'     : checkpoint_path,
                'accuracy'  : _multiclass_accuracy(dataset_with_preds_df, pred_cols)
            }

        print(eval_dict)

        with open(f"{eval_dict['model']}-eval-dict.json", "w") as f :
            f.write(json.dumps(eval_dict, indent=4))

        evals.append(eval_dict)

        del model

    # Only folds evaluated in this run contribute; skipped folds are not included.
    if len(evals) == 0 : continue
    final_metrics = _summarise_experiment(experiment["experiment_name"], evals)
    with open(f"{checkpoint_dir}/final-eval-dict.json", "w") as f :
        f.write(json.dumps(final_metrics, indent=4))

../resources/output/TweetsProcessed2/lstm/checkpoint_lstm_WITH_ATTENTION_embed768/fold1.hdf5
../resources/output/TweetsProcessed2/lstm/checkpoint_lstm_WITH_ATTENTION_embed768/fold2.hdf5
../resources/output/TweetsProcessed2/lstm/checkpoint_lstm_WITH_ATTENTION_embed768/fold3.hdf5
../resources/output/TweetsProcessed2/lstm/checkpoint_lstm_WITH_ATTENTION_embed768/fold4.hdf5
../resources/output/TweetsProcessed2/lstm/checkpoint_lstm_WITH_ATTENTION_embed768/fold5.hdf5
../resources/output/TweetsProcessed2/lstm/checkpoint_lstm_WITH_ATTENTION_embed768/fold6.hdf5
../resources/output/TweetsProcessed2/lstm/checkpoint_lstm_WITH_ATTENTION_embed768/fold7.hdf5
../resources/output/TweetsProcessed2/lstm/checkpoint_lstm_WITH_ATTENTION_embed768/fold8.hdf5
../resources/output/TweetsProcessed2/lstm/checkpoint_lstm_WITH_ATTENTION_embed768/fold9.hdf5
../resources/output/TweetsProcessed2/lstm/checkpoint_lstm_WITH_ATTENTION_embed768/fold10.hdf5
../resources/output/TweetsProcessed2/lstm/checkpoint_lstm_NO_ATTENTIO

KeyboardInterrupt: 

In [None]:
# Inspect the embedding table most recently read by the fold-loading loop
# (the variable is a leftover of that loop, not defined in this cell).
embedding_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,759,760,761,762,763,764,765,766,767,768
<start>,-0.251972,-0.335857,0.743956,0.287609,-0.320067,-0.336197,0.651577,-0.417630,0.072657,0.669512,...,0.118542,0.013129,-0.103702,0.214072,0.223253,0.165746,-0.625366,-0.433057,-0.647071,-0.493110
to,-0.095761,-0.244435,0.169767,-0.254720,0.520574,0.380848,-0.152164,0.410601,0.975136,0.334161,...,-0.455171,-0.563830,0.425149,-0.744738,0.315320,-1.070223,-0.815350,0.106142,0.148487,-0.391389
the,-0.040541,-1.178647,0.772364,0.711954,0.177214,-0.639962,-0.159563,0.261875,0.473067,-0.223687,...,-0.444203,-0.343703,-0.317150,-0.642309,1.033737,0.581608,0.067017,-0.575224,0.700265,-0.863741
i,-0.406263,0.581536,0.150963,0.089914,-0.085831,-0.187691,-0.108174,0.237551,0.576532,0.273172,...,-0.107218,-0.156319,-0.019238,-0.439631,-0.072913,0.322750,-0.363102,0.098256,-0.759535,-0.244696
a,-0.284253,-0.756383,0.784185,0.245781,-0.217883,-0.411314,0.006313,-0.081967,-0.082813,-1.140420,...,-0.822037,0.241155,-0.176131,-0.117470,0.494665,0.277910,-0.839016,-0.212997,-0.053063,-1.011198
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1147,0.066911,0.108697,0.117314,-0.026495,0.015199,0.079509,-0.211216,0.084738,-0.009791,0.068621,...,0.129855,-0.015207,-0.077254,-0.055104,-0.080412,0.157144,0.227705,-0.226615,-0.020310,0.101783
automated'',0.034742,0.024435,-0.035676,-0.049706,0.029638,0.022952,0.011129,-0.154946,-0.134221,0.064848,...,-0.025236,-0.079394,0.159307,0.042786,-0.130191,0.089049,-0.016407,-0.054820,0.087518,0.114606
930,0.033926,0.170894,0.087645,-0.031428,-0.038994,0.038913,-0.184922,0.008812,-0.069440,0.067128,...,0.128920,-0.038195,-0.017345,-0.083919,-0.037820,0.150396,0.248054,-0.204338,-0.037215,0.093046
screenings,0.195423,0.028148,0.039086,-0.015086,-0.050388,0.027777,-0.067449,0.064195,-0.061795,-0.068845,...,-0.050059,0.038295,0.016049,0.132378,0.034546,0.030001,0.030840,-0.048373,0.060404,0.080170


In [None]:
# Vocabulary size of the word_index currently in the kernel
# (get_word_index always includes the "<pad>" and "<unk>" entries).
len(word_index)

12136

In [None]:
# List the tokens of the current vocabulary.
word_index.keys()

In [None]:
# Row labels of the embedding table (its first CSV column, read as the index).
emb_index = list(embedding_df.index)

In [None]:
# Number of rows in the embedding table, to compare with len(word_index).
len(emb_index)

12134

In [None]:
# Vocabulary tokens that have no row in the embedding table.
# Membership is tested against a set so each lookup does not rescan the whole list.
_emb_tokens = set(emb_index)
outliers = [word for word in word_index.keys() if word not in _emb_tokens]

In [None]:
# Show the vocabulary tokens found to be missing from the embedding table.
outliers

[]

In [None]:
# Reverse check: embedding-table rows whose token is absent from the vocabulary.
word_index_keys = list(word_index.keys())
# Test membership on the dict itself (hash lookup) instead of scanning the key list.
outliers = [word for word in emb_index if word not in word_index]

In [None]:
# Manually rebuild the data of a single fold for inspection, repeating the
# setup steps of the training loop. Relies on folds_dfs, embeddings_dfs and
# experiment still being in the kernel from that loop.
i_fold = 3  # 0-based position in folds_dfs; fold files are numbered i_fold + 1

embeddings_df = embeddings_dfs[i_fold]

# Hold out the selected fold and train on the concatenation of all others.
train_dataframe = pd.concat(folds_dfs[ : i_fold] + folds_dfs[i_fold + 1 : ])
valid_dataframe = folds_dfs[i_fold]

# The vocabulary is built from the training folds only.
word_index, decode_review = get_word_index(train_dataframe["text"])
# NOTE(review): this writes into the shared experiment config and only when the
# value is still None, so a size set earlier is kept even if this fold differs.
if experiment["args"]["vocab_size"] is None :
    experiment["args"]["vocab_size"] = len(word_index)

ids_train, x_train, mask_train, y_train = get_dataset(train_dataframe, word_index, n_classes=experiment["args"]["n_classes"])
print(len(ids_train), x_train.shape, mask_train.shape, y_train.shape)
# Decode the second example as a sanity check of the encoding.
print(decode_review(x_train[1]), y_train[1])

ids_valid, x_valid, mask_valid, y_valid = get_dataset(valid_dataframe, word_index, n_classes=experiment["args"]["n_classes"])
print(len(ids_valid), x_valid.shape, mask_valid.shape, y_valid.shape)
print(decode_review(x_valid[1]), y_valid[1])

13176 (13176, 256) (13176, 256) (13176, 3)
<start> iad to jfk still has n't boarded what's today's excuse and how am i gon na get to work <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad

In [None]:
# Inspect the embedding table selected for the fold chosen above.
embeddings_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,759,760,761,762,763,764,765,766,767,768
<start>,-0.282846,-0.485796,-0.038414,0.352181,-0.564837,0.169060,0.218938,0.147674,0.519879,0.071983,...,0.788335,0.355169,-0.353737,-0.654546,-0.739606,0.774340,0.977387,-0.108453,-0.701812,1.476708
to,-0.415130,-1.033926,0.068353,0.440023,-0.405840,0.913767,-0.697176,-0.593707,0.068347,-0.544617,...,0.463278,-0.190466,-0.006372,-1.845080,-1.871068,0.187292,1.300340,-0.927931,-0.190056,1.390106
the,0.642369,-0.869593,0.740655,0.698861,-0.100020,-0.076253,-0.532491,0.089165,0.047654,0.244985,...,0.431776,-0.542156,-0.986821,-1.289415,-0.163144,1.409486,0.956747,-0.454120,-0.163223,0.987065
i,0.158433,-0.469385,-0.073635,-0.132904,-0.902013,0.510985,-0.734310,0.219299,0.671551,-0.015053,...,0.668103,0.064029,-0.218738,-1.440547,-1.043749,0.685961,1.527858,-0.738634,-1.085329,1.415565
a,-0.111537,-0.689202,0.877903,-0.269690,0.183240,0.115914,-0.101982,0.018584,-0.157449,-0.158345,...,1.076107,-0.535732,-0.057149,-1.205376,-1.086794,1.163292,1.485147,-1.402447,0.400378,0.712581
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
renewed,0.074851,-0.038765,-0.051553,0.026132,0.056374,-0.123528,-0.042322,0.163471,-0.216049,-0.074544,...,0.050555,0.096236,-0.121695,-0.147618,-0.042536,0.047722,0.096364,0.018134,0.059003,-0.029817
ua761,0.030084,-0.010749,-0.053900,-0.006302,0.080238,0.077685,0.000377,0.058809,0.109591,0.121032,...,-0.053510,-0.164922,0.125757,0.041468,-0.036388,0.021465,0.004651,-0.082069,0.001113,0.006983
moves,0.022142,-0.021479,0.128811,-0.018625,0.078715,-0.000661,-0.105004,0.058264,-0.063435,-0.002328,...,0.061755,0.070977,-0.020302,-0.048003,0.011098,0.005438,0.081405,-0.020868,-0.023329,0.039025
clockwork,-0.112291,-0.067761,-0.064172,0.096315,0.094721,-0.001640,0.099411,-0.083651,0.160123,-0.050896,...,-0.037912,-0.173050,-0.009925,-0.069573,-0.144182,0.300522,0.064133,-0.018481,-0.056886,0.087694


In [None]:
# Vocabulary size for the fold chosen above, to compare with the embedding table.
len(word_index)

12129