In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import math
from tensorflow import keras
from keras import Sequential
from keras.utils import Sequence
from keras.layers import Conv1D, MaxPooling1D, GlobalAveragePooling1D, BatchNormalization, Activation, Dropout, Flatten, Dense, Layer, Embedding
from keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D
from keras.optimizers import Adam
from attention import Attention

In [2]:
def get_embeddings_wgts(embeddings_df):
    """Return the values of ``embeddings_df`` with one all-zero row prepended.

    Row 0 of the result is zeros and row ``i + 1`` is row ``i`` of the frame,
    so index 0 never refers to a real word (the batching code in this notebook
    pads sequences with 0 and shifts token ids by one).

    Args:
        embeddings_df: DataFrame whose rows are embedding vectors.

    Returns:
        Array of shape ``(n_rows + 1, n_columns)`` with the frame's dtype.
    """
    weights = embeddings_df.to_numpy()
    zero_row = np.zeros((1, weights.shape[1]), dtype=weights.dtype)
    return np.concatenate([zero_row, weights], axis=0)

In [3]:
import keras.backend as K
# Add attention layer to the deep learning network
class attention(Layer):
    """Attention pooling: collapses axis 1 of the input into one weighted sum.

    A score is computed for every position along axis 1, the scores are
    soft-maxed, and the input is summed over axis 1 using those weights.
    assumes the input is (batch, steps, features) — TODO confirm; the bias is
    created with ``input_shape[1]`` rows, so that axis needs a known size.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        # One scoring weight per entry of the last axis ...
        self.W = self.add_weight(
            name='attention_weight',
            shape=(input_shape[-1], 1),
            initializer='random_normal',
            trainable=True,
        )
        # ... and one bias per position along axis 1.
        self.b = self.add_weight(
            name='attention_bias',
            shape=(input_shape[1], 1),
            initializer='zeros',
            trainable=True,
        )
        super().build(input_shape)

    def call(self, x):
        # tanh alignment scores with the trailing size-1 axis dropped
        scores = K.squeeze(K.tanh(K.dot(x, self.W) + self.b), axis=-1)
        # normalised weights, re-expanded so they broadcast against x
        weights = K.expand_dims(K.softmax(scores), axis=-1)
        # context vector: weighted sum over axis 1
        return K.sum(x * weights, axis=1)

In [4]:
# Dataset location and cross-validation settings
dataset_folder = '../resources/datasets/TwitterAirlines'
dataset_name   = 'TweetsProcessed2'
k = 10          # number of fold files to read
fold = 0        # index of the fold held out for testing
n_classes = 3

# Read every fold file (file names are numbered from 1)
folds_dfs = [
    pd.read_csv(f'{dataset_folder}/folds/{dataset_name}_Fold{fold_number}.csv')
    for fold_number in range(1, k + 1)
]

# Hold out the selected fold and train on all the others
test_dataframe  = folds_dfs[fold]
train_dataframe = pd.concat(folds_dfs[:fold] + folds_dfs[fold + 1:], axis=0)

# Embeddings stored for the held-out fold, plus the zero row at index 0
embeddings_path = f'{dataset_folder}/embeddings/{dataset_name}_Fold{fold + 1}.csv'
embeddings_df = pd.read_csv(embeddings_path, index_col=0)
embeddings_wgts = get_embeddings_wgts(embeddings_df)
embeddings_wgts.shape

(11622, 300)

In [5]:
import io
def load_vectors(fname, vocab_dicts):
    """Read from a text vector file the vectors of the words in ``vocab_dicts``.

    The first line of the file must contain two integers, which are printed
    (presumably the number of vectors and their dimensionality — verify against
    the file format). Every following line is a token followed by
    space-separated float components.

    Args:
        fname: path of the vector file.
        vocab_dicts: container of the wanted words; only membership (``in``)
            is used, so a dict, set or list all work.

    Returns:
        dict mapping each wanted word found in the file to its list of floats.
    """
    data = {}
    # The context manager closes the handle even if parsing raises.
    with open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        n, d = map(int, fin.readline().split())
        print(n, d)
        for line in fin:
            tokens = line.rstrip().split(' ')
            # Skip words outside the vocabulary before paying for float parsing.
            if tokens[0] in vocab_dicts:
                data[tokens[0]] = [float(tkn) for tkn in tokens[1:]]
    return data

In [6]:
# Vocabulary word (index label of embeddings_df) -> its vector as a plain list
orig_embed_dict = {word: list(vector) for word, vector in embeddings_df.iterrows()}
# Pretrained vectors, restricted to the words of our own vocabulary
pretrained_embedings_dict = load_vectors(
    "../resources/pretrained-embeddings/wiki-news-300d-1M.vec",
    orig_embed_dict,
)

999994 300


In [7]:
# Vocabulary size vs. number of vocabulary words that were found in the pretrained file
len(orig_embed_dict), len(pretrained_embedings_dict)

(11621, 9736)

In [8]:
# Start from the original vectors and let a pretrained vector replace the
# original one wherever the pretrained file had the word.
# dict.update is used instead of a top-level `for k, v in ...` loop: that loop
# rebound the global `k` (the number of folds set in the data-loading cell) to
# a word string, so re-running any cell that reads `k` afterwards would break.
final_embeddings = orig_embed_dict.copy()
final_embeddings.update(pretrained_embedings_dict)

In [9]:
# Dict keys (words) become columns, so transpose to get one row per word
final_embeddings_df = pd.DataFrame(final_embeddings).T

In [10]:
# Weight matrix with an all-zero row prepended at index 0 (see get_embeddings_wgts)
final_embeddings_wgt = get_embeddings_wgts(final_embeddings_df)

In [38]:
# 2-D CNN over inputs shaped (batch, tokens, embedding_dim, 1).
# Experiments currently switched off: an Embedding input layer, Dropout after
# each convolutional stage and before the output layer, and the custom
# attention() layer between the convolutions and the classifier head.
model = Sequential([
    # convolutional stage 1, followed by max pooling
    Conv2D(filters=128, kernel_size=3, activation='relu', padding='same'),
    Conv2D(filters=64, kernel_size=3, activation='relu', padding='same'),
    MaxPooling2D(pool_size=2, padding='same'),
    # convolutional stage 2, collapsed by global average pooling
    Conv2D(filters=32, kernel_size=3, activation='relu', padding='same'),
    Conv2D(filters=16, kernel_size=3, activation='relu', padding='same'),
    GlobalAveragePooling2D(),
    # classifier head
    Flatten(),
    Dense(units=128, activation='relu'),
    Dense(units=64, activation='relu'),
    Dense(units=32, activation='relu'),
    Dense(units=n_classes, activation='softmax' if n_classes > 1 else 'sigmoid'),
])

# Build with an unspecified number of tokens per sentence, then compile
model.build((None, None, final_embeddings_wgt.shape[1], 1))
model.compile(
    loss='categorical_crossentropy' if n_classes > 1 else 'binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_4 (Conv2D)           (None, None, 300, 128)    1280      
                                                                 
 conv2d_5 (Conv2D)           (None, None, 300, 64)     73792     
                                                                 
 max_pooling2d_1 (MaxPooling  (None, None, 150, 64)    0         
 2D)                                                             
                                                                 
 conv2d_6 (Conv2D)           (None, None, 150, 32)     18464     
                                                                 
 conv2d_7 (Conv2D)           (None, None, 150, 16)     4624      
                                                                 
 global_average_pooling2d_1   (None, 16)               0         
 (GlobalAveragePooling2D)                             

In [18]:
# Bag-of-embeddings classifier: token ids -> embeddings -> mean over tokens -> MLP.
# NOTE(review): only the *shape* of final_embeddings_wgt is used, so the
# Embedding layer starts from random weights — confirm whether the pretrained
# values were meant to be loaded. Padding id 0 is also averaged in
# (no mask_zero) — confirm that this is intended.
model = Sequential()
model.add(Embedding(final_embeddings_wgt.shape[0], final_embeddings_wgt.shape[1]))
model.add(GlobalAveragePooling1D())
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
# Output width and activation follow n_classes instead of a hard-coded 3, so the
# head always matches the label vectors produced for the configured class count.
model.add(Dense(n_classes, activation='softmax' if n_classes > 1 else 'sigmoid'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 300)         3486600   
                                                                 
 global_average_pooling1d_1   (None, 300)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_4 (Dense)             (None, 128)               38528     
                                                                 
 dense_5 (Dense)             (None, 64)                8256      
                                                                 
 dense_6 (Dense)             (None, 32)                2080      
                                                                 
 dense_7 (Dense)             (None, 16)                528       
                                                      

In [21]:
# The loss follows n_classes (one-hot labels -> categorical, single 0/1 label ->
# binary) instead of being hard-coded, so it stays valid if n_classes changes.
# NOTE(review): learning_rate=.1 is 100x Adam's default of 0.001 — confirm it
# is intended; such a large step size commonly prevents convergence.
model.compile(
    loss='categorical_crossentropy' if n_classes > 1 else 'binary_crossentropy',
    optimizer=Adam(learning_rate=.1),
    metrics=['accuracy']
)

In [13]:
def levenshtein(source:str, target:str, subst_cost:int=4) -> int :
    """Weighted edit distance between ``source`` and ``target``.

    Insertions and deletions cost 1 each; replacing one character by a different
    one costs ``subst_cost``. With the default of 4 a substitution is never
    cheaper than a deletion plus an insertion (cost 2), so the result equals
    the insert/delete-only distance. Pass ``subst_cost=1`` for the classic
    Levenshtein distance.

    Args:
        source: first string.
        target: second string.
        subst_cost: cost of substituting two different characters.

    Returns:
        The minimal total cost as a plain Python ``int``.
    """
    m = len(target)

    # Only the previous row of the DP table is needed to fill the current one,
    # so two Python lists replace the full (n + 1) x (m + 1) NumPy matrix.
    previous = list(range(m + 1))           # row 0: build target from ""
    for i, src_char in enumerate(source, start=1) :
        current = [i] + [0] * m             # column 0: delete i characters
        for j in range(1, m + 1) :
            replace = previous[j - 1] + (0 if src_char == target[j - 1] else subst_cost)
            current[j] = min(previous[j] + 1,      # deletion
                             replace,              # match / substitution
                             current[j - 1] + 1)   # insertion
        previous = current
    return previous[m]
def distance(source:str, target:str) -> float :
    """Edit distance with a 0.5 bonus when one string contains the other.

    Returns ``levenshtein(source, target)``, reduced by 0.5 if either string is
    a substring of the other, so containment wins ties between candidates.
    """
    edit_cost = levenshtein(source, target)
    one_contains_other = source in target or target in source
    return edit_cost - .5 if one_contains_other else edit_cost

In [23]:
class DatasetSequence(Sequence) :
    """Batch generator turning the texts of a DataFrame into padded id matrices.

    Each text is split on whitespace and every token is mapped to
    ``vocabulary position + 1``; id 0 is reserved for padding, which is added
    on the left up to the longest sentence of the batch. Tokens that are not in
    the vocabulary are aliased to the closest known word according to
    ``distance``; the aliases are kept in ``new_words``.

    Args:
        dataframe: data holding the text and label columns.
        vocabulario: known words; the position in the list is the word index.
        embeddings: optional 2-D matrix indexed by token id. When given,
            batches contain the vectors, shaped (batch, tokens, dim, 1),
            instead of the ids.
        translation_dict: optional ``{unknown word: known word}`` aliases that
            are applied before the automatic matching. It is not modified.
        text_column: name of the column with the whitespace-tokenised text.
        label_column: name of the column with the label.
        n_classes: 1 yields a single 0/1 label (threshold .5); a larger value
            yields a one-hot vector of that length.
        batch_size: number of rows per batch.
        shuffle: whether the row order is reshuffled on every epoch end.
    """
    def __init__(self,
                 dataframe:pd.DataFrame,
                 vocabulario:list[str],
                 embeddings:np.ndarray=None,
                 translation_dict:dict[str,str]=None,
                 text_column:str='text',
                 label_column:str='class',
                 n_classes:int=1,
                 batch_size:int=16,
                 shuffle:bool=True) :
        # Let the base class initialise itself (Keras 3's PyDataset warns when
        # this call is missing; on older bases it is a no-op).
        super().__init__()
        self.dataframe = dataframe
        self.embeddings = embeddings
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.text_column = text_column
        self.label_column = label_column
        self.n_classes = n_classes
        self.vocabulario_original = vocabulario
        self.vocabulario_dict = {word : i for i, word in enumerate(self.vocabulario_original)}

        # Work on a copy: aliases found below are added to new_words and must
        # not leak into the dict owned by the caller.
        self.new_words = {} if translation_dict is None else dict(translation_dict)
        for new, translation in self.new_words.items() :
            self.vocabulario_dict[new] = self.vocabulario_dict[translation]

        # Distinct tokens of the corpus, in order of first appearance.
        vocabulario_real = dict.fromkeys(
            tkn for txt in self.dataframe[text_column] for tkn in txt.split()
        )

        for tkn in vocabulario_real :
            if tkn not in self.vocabulario_dict :
                # min() returns the first closest word, exactly what a stable
                # sort followed by [0] would pick, without sorting everything.
                closest = min(self.vocabulario_dict, key=lambda known : distance(tkn, known))
                print(tkn, "to", closest)
                self.new_words[tkn] = closest
                self.vocabulario_dict[tkn] = self.vocabulario_dict[closest]
        print("No. new words:", len(self.new_words))

        self.on_epoch_end()
    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(self.dataframe.shape[0])
        if self.shuffle :
            np.random.shuffle(self.indexes)
    def __len__(self) :
        """Number of batches per epoch (the last one may be smaller)."""
        return math.ceil(self.dataframe.shape[0] / self.batch_size)
    def text_processing(self, text:str) :
        """Map a whitespace-tokenised text to its list of token ids (index + 1)."""
        return [self.vocabulario_dict[token] + 1 for token in text.split()]
    def __generate_data(self, index) :
        """Return ``(ids, label)`` for an int position or padded arrays for a slice.

        The slice step is ignored and ``stop`` is clipped to the number of rows.
        Raises ``IndexError`` for any other index type.
        """
        if isinstance(index, (int, np.integer)) :
            row = self.dataframe.iloc[self.indexes[index]]
            sequence = self.text_processing(row[self.text_column])
            if self.n_classes == 1 :
                label = [1 if row[self.label_column] >= .5 else 0]
            else :
                label = [(1 if i == row[self.label_column] else 0) for i in range(self.n_classes)]
            return np.array(sequence, dtype=int), np.array(label, dtype=int)
        elif isinstance(index, slice) :
            n_rows = self.dataframe.shape[0]
            start = 0 if index.start is None else index.start
            stop = n_rows if index.stop is None or index.stop > n_rows else index.stop

            X, Y = [], []
            for i in range(start, stop) :
                x, y = self.__generate_data(i)
                X.append(x)
                Y.append(y)

            max_len = max((len(x) for x in X), default=0)
            np_X = np.zeros((len(X), max_len), dtype=int)
            for i, x in enumerate(X) :
                # Left padding. An explicit start column is used because
                # `-len(x):` selects the whole row when a sentence is empty.
                np_X[i, max_len - len(x) : ] = x
            np_Y = np.array(Y, dtype=int)
            return np_X, np_Y
        else :
            raise IndexError()
    def __getitem__(self, index) :
        """Return batch number ``index`` as ``(batch_x, batch_y)``."""
        batch_x, batch_y = self.__generate_data(slice(index * self.batch_size, (index + 1) * self.batch_size))
        if self.embeddings is not None :
            # One fancy-indexing lookup replaces the per-token Python loop:
            # (batch, tokens) ids -> (batch, tokens, dim) vectors, plus a
            # trailing channel axis.
            batch_x = np.asarray(self.embeddings[batch_x], dtype=float)[..., np.newaxis]
        return batch_x, batch_y

In [24]:
# Training batches of 8 rows. No `embeddings` matrix is passed, so each batch
# holds integer token ids (not vectors), paired with one-hot labels.
dataset = DatasetSequence(train_dataframe, final_embeddings_df.index.to_list(), batch_size=8, n_classes=n_classes)

No. new words: 0


In [41]:
# Sanity check: print the shapes of the inputs and labels of every batch
for batch_idx in range(len(dataset)) :
    batch_inputs, batch_labels = dataset[batch_idx]
    print(batch_inputs.shape, batch_labels.shape)

(16, 27, 300, 1) (16, 3)
(16, 29, 300, 1) (16, 3)
(16, 25, 300, 1) (16, 3)
(16, 28, 300, 1) (16, 3)
(16, 27, 300, 1) (16, 3)
(16, 26, 300, 1) (16, 3)
(16, 27, 300, 1) (16, 3)
(16, 27, 300, 1) (16, 3)
(16, 27, 300, 1) (16, 3)
(16, 29, 300, 1) (16, 3)
(16, 26, 300, 1) (16, 3)
(16, 25, 300, 1) (16, 3)
(16, 28, 300, 1) (16, 3)
(16, 24, 300, 1) (16, 3)
(16, 30, 300, 1) (16, 3)
(16, 24, 300, 1) (16, 3)
(16, 27, 300, 1) (16, 3)
(16, 29, 300, 1) (16, 3)
(16, 28, 300, 1) (16, 3)
(16, 25, 300, 1) (16, 3)
(16, 25, 300, 1) (16, 3)
(16, 28, 300, 1) (16, 3)
(16, 27, 300, 1) (16, 3)
(16, 29, 300, 1) (16, 3)
(16, 28, 300, 1) (16, 3)
(16, 30, 300, 1) (16, 3)
(16, 27, 300, 1) (16, 3)
(16, 27, 300, 1) (16, 3)
(16, 26, 300, 1) (16, 3)
(16, 28, 300, 1) (16, 3)
(16, 26, 300, 1) (16, 3)
(16, 27, 300, 1) (16, 3)
(16, 28, 300, 1) (16, 3)
(16, 29, 300, 1) (16, 3)
(16, 26, 300, 1) (16, 3)
(16, 27, 300, 1) (16, 3)
(16, 27, 300, 1) (16, 3)
(16, 24, 300, 1) (16, 3)
(16, 31, 300, 1) (16, 3)
(16, 25, 300, 1) (16, 3)


In [25]:
# Train on the batches served by `dataset` for up to 100 epochs
model.fit(dataset, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100

KeyboardInterrupt: 