In [None]:
# !pip install pandas 
# !pip install numpy
# !pip install matplotlib
# !pip install tqdm
# !pip install seaborn
# !pip install tensorflow
# !pip install keras
# !pip install scikit-learn
# !pip install scikeras

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import urllib.request
import zipfile
import os
from tqdm import tqdm
import copy
import seaborn as sns
import sklearn

In [None]:
# Create documents dataframe: one row per treebank file, each cell holding that
# file's token table (columns 'word' and 'pos').
# os.listdir() returns names in arbitrary order, so sort them: the train/val/test
# split further down is positional and must see the same document order on every run.
dp_docs = sorted(file for file in os.listdir('dependency_treebank/') if file.endswith('.dp'))
dataframes = []

for file in tqdm(dp_docs):
    with open('dependency_treebank/' + file, 'r') as f:
        lines = f.readlines()
    # Split every line on tabs into (word, pos, head). Lines with fewer fields
    # (e.g. blank separator lines, which split into a single '\n') are padded
    # with missing values by the DataFrame constructor.
    data = [line.split('\t') for line in lines]
    doc_df = pd.DataFrame(data, columns=['word', 'pos', 'head'])
    # drop the last column ('head'); only words and POS tags are used
    doc_df = doc_df.iloc[:, :-1]
    dataframes.append(doc_df)

df = pd.DataFrame({'Dataframes': dataframes})
print(df.head(5))
print(df['Dataframes'][0][0:5])
print(df.shape)

In [None]:
# Split the dataframes into train, validation and test sets
# The split is positional over documents: the first 100 go to train, the next 50
# to validation and the following (up to) 50 to test. Each slice keeps the row
# labels of `df`, so val starts at label 100 and test at label 150.
train = df['Dataframes'][0:100]
val = df['Dataframes'][100:150]
test = df['Dataframes'][150:200]

# Number of documents in each split
print(train.shape)
print(val.shape)
print(test.shape)

In [None]:
# Text preprocessing
def to_lower_case(df, docs):
    """Lower-case the 'word' column of every document in a split, in place.

    Args:
        df: Series of per-document DataFrames, looked up by the integer labels
            docs, docs + 1, ..., docs + len(df) - 1.
        docs: label of the first document in the split.

    Returns:
        None; each document's DataFrame is modified directly.
    """
    doc_id = docs
    stop = docs + len(df)
    while doc_id < stop:
        document = df[doc_id]
        document['word'] = document['word'].str.lower()
        doc_id += 1

# The second argument is the label of the first document in each split
# (train starts at 0, val at 100, test at 150).
to_lower_case(train, 0)
to_lower_case(val, 100)
to_lower_case(test, 150)
# Show the first five rows of the first training document after lower-casing
print(train[0][0:5])

In [None]:
# Text preprocessing: from each doc remove newlines and empty lines
def remove_newlines(df, docs):
    """Drop the rows whose 'word' is a bare newline or an empty string.

    Args:
        df: Series of per-document DataFrames, looked up by the integer labels
            docs, docs + 1, ..., docs + len(df) - 1.
        docs: label of the first document in the split.

    Returns:
        None; every entry of `df` is replaced by its filtered DataFrame
        (the surviving rows keep their original row labels).
    """
    for doc_id in range(docs, docs + len(df)):
        document = df[doc_id]
        is_blank = document['word'].isin(['\n', ''])
        df[doc_id] = document[~is_blank]

# Row count of the first training document before filtering
print(len(train[0]))
# Offsets are the label of the first document in each split (0, 100, 150)
remove_newlines(train, 0)
remove_newlines(val, 100)
remove_newlines(test, 150)
# First training document and the first document of each split after filtering
print(train[0])
print(len(train[0]))
print(len(val[100]))
print(len(test[150]))

In [None]:
# Create new dataframe that contains the single sentences
def create_sentences(df, docs):
    """Cut the word stream of a split into sentences at '.', '!' and '?'.

    Args:
        df: Series of per-document DataFrames with a 'word' column, looked up
            by the integer labels docs, ..., docs + len(df) - 1.
        docs: label of the first document in the split.

    Returns:
        List of sentences, each a list of words ending with its terminator.

    Note:
        The buffer of pending words is not reset between documents, so words
        after a document's last terminator are prepended to the next document's
        first sentence; words left over after the final document are not returned.
    """
    terminators = ('.', '!', '?')
    sentences = []
    pending = []
    for doc_id in range(docs, docs + len(df)):
        for word in df[doc_id]['word']:
            pending.append(word)
            if word in terminators:
                sentences.append(pending)
                pending = []
    return sentences

# Create sentences for train, val and test
# (offsets are the label of the first document in each split)
train_sentences = create_sentences(train, 0)
val_sentences = create_sentences(val, 100)
test_sentences = create_sentences(test, 150)
# Number of sentences per split
print(len(train_sentences))
print(len(val_sentences))
print(len(test_sentences))

In [None]:
def create_tag_sentences(df, docs):
    """Cut the POS-tag stream of a split into sentences at the tags '.', '!' and '?'.

    Args:
        df: Series of per-document DataFrames with a 'pos' column, looked up
            by the integer labels docs, ..., docs + len(df) - 1.
        docs: label of the first document in the split.

    Returns:
        List of tag sentences, each a list of tags ending with its terminator tag.

    Note:
        Splitting is decided on the 'pos' column alone, independently of the
        'word' column. The pending buffer is not reset between documents, and
        tags left over after the final document are not returned.
    """
    end_tags = ('.', '!', '?')
    tag_sentences, pending = [], []
    for doc_id in range(docs, docs + len(df)):
        for tag in df[doc_id]['pos']:
            pending.append(tag)
            if tag in end_tags:
                tag_sentences.append(pending)
                pending = []
    return tag_sentences

# Create tag sentences for train, val and test
# (offsets are the label of the first document in each split)
train_tag_sentences = create_tag_sentences(train, 0)
val_tag_sentences = create_tag_sentences(val, 100)
test_tag_sentences = create_tag_sentences(test, 150)
# Number of tag sentences per split; compare with the word-sentence counts
print(len(train_tag_sentences))
print(len(val_tag_sentences))
print(len(test_tag_sentences))


In [None]:
# Length of the first training sentence and of the first training tag sentence
print(len(train_sentences[0]))
print(len(train_tag_sentences[0]))

In [None]:
# Encode train sentences and tags
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# The vocabulary is fitted on the training sentences only; the same tokenizer
# then encodes all three splits, with '<UNK>' configured as the out-of-vocabulary token.
sentence_tokenizer = Tokenizer(oov_token='<UNK>')
sentence_tokenizer.fit_on_texts(train_sentences)
encoded_train_sentences = sentence_tokenizer.texts_to_sequences(train_sentences)
encoded_val_sentences = sentence_tokenizer.texts_to_sequences(val_sentences)
encoded_test_sentences = sentence_tokenizer.texts_to_sequences(test_sentences)

# Report the OOV token, its index and the vocabulary size, then one encoded sample per split
print(f'OOV token: {sentence_tokenizer.oov_token}')
print(f'OOV index: {sentence_tokenizer.word_index[sentence_tokenizer.oov_token]}')
print(f'Vocabulary size: {len(sentence_tokenizer.word_index)}')
print(encoded_train_sentences[0])
print(encoded_val_sentences[0])
print(encoded_test_sentences[0])

In [None]:
# Print one encoded training sentence (index i) and decode it back to text
i = 2
print(encoded_train_sentences[i])
print(sentence_tokenizer.sequences_to_texts(encoded_train_sentences[i:i+1]))

In [None]:
# Separate tokenizer for the POS tags, fitted on the training tag sentences only
# and then used to encode the tags of all three splits.
tag_tokenizer = Tokenizer(oov_token='<UNK>')
tag_tokenizer.fit_on_texts(train_tag_sentences)
encoded_train_tags = tag_tokenizer.texts_to_sequences(train_tag_sentences)
encoded_val_tags = tag_tokenizer.texts_to_sequences(val_tag_sentences)
encoded_test_tags = tag_tokenizer.texts_to_sequences(test_tag_sentences)

# Size of the tag vocabulary (includes the '<UNK>' entry) and one encoded sample
print(f'Tag vocabulary size: {len(tag_tokenizer.word_index)}')
print(encoded_train_tags[0])

In [None]:
# Check the length of the longest training sentence and plot the length
# distribution; this informs the choice of MAX_SEQ_LENGTH used for padding.
lengths = [len(sentence) for sentence in encoded_train_sentences]
print(max(lengths))
sns.boxplot(lengths)
plt.show()

In [None]:
# Pad each sequence to MAX_SEQ_LENGTH using Keras' pad_sequences() function.
# Sentences longer than MAX_SEQ_LENGTH are truncated.
# Sentences shorter than MAX_SEQ_LENGTH are padded with zeroes.

# Truncation and padding can either be 'pre' or 'post'.
# For padding we are using the 'pre' padding type, that is, zeroes are added on the left side.
# For truncation we are using 'post', that is, a sentence is truncated from the right side.
# Words and tags use identical settings so each padded tag row stays aligned
# position-by-position with its padded sentence row.

MAX_SEQ_LENGTH = 100
train_padded = pad_sequences(encoded_train_sentences, maxlen=MAX_SEQ_LENGTH, padding='pre', truncating='post')
train_tag_padded = pad_sequences(encoded_train_tags, maxlen=MAX_SEQ_LENGTH, padding='pre', truncating='post')
val_padded = pad_sequences(encoded_val_sentences, maxlen=MAX_SEQ_LENGTH, padding='pre', truncating='post')
val_tag_padded = pad_sequences(encoded_val_tags, maxlen=MAX_SEQ_LENGTH, padding='pre', truncating='post')
test_padded = pad_sequences(encoded_test_sentences, maxlen=MAX_SEQ_LENGTH, padding='pre', truncating='post')
test_tag_padded = pad_sequences(encoded_test_tags, maxlen=MAX_SEQ_LENGTH, padding='pre', truncating='post')

# First padded sentence / tag row and their lengths
print(train_padded[0])
print(train_tag_padded[0])
print(len(train_padded[0]))
print(len(train_tag_padded[0]))

In [None]:
import gensim
import gensim.downloader as gloader

def load_embedding_model(embedding_dimension: int = 50) -> gensim.models.keyedvectors.KeyedVectors:
    """Load the 'glove-wiki-gigaword-<dim>' vectors through gensim's downloader.

    Args:
        embedding_dimension: vector size used to build the model name.

    Returns:
        Whatever gloader.load() returns for that model name.

    Raises:
        ValueError: re-raised from gloader.load() after printing a hint
            listing the supported dimensions.
    """
    model_name = f"glove-wiki-gigaword-{embedding_dimension}"

    try:
        return gloader.load(model_name)
    except ValueError as e:
        print("Invalid embedding model name! Check the embedding dimension:")
        print("Glove: 50, 100, 200, 300")
        raise e

In [None]:
# Glove -> 50, 100, 200, 300
# The dimension chosen here must match `embedding_dim` used to build the embedding matrix.
embedding_model = load_embedding_model(embedding_dimension=50)

In [None]:
def create_embedding_matrix(embedding_model, word2id, embedding_dim, seed=None):
    """Build the (len(word2id) + 1, embedding_dim) matrix used to initialise the Embedding layer.

    Row i holds the vector of the word whose id is i. Row 0 is left as zeros
    (no word in `word2id` is expected to have id 0 - it is the padding id).

    Args:
        embedding_model: mapping-like object; embedding_model[word] returns the
            vector of `word` and raises KeyError when the word is unknown.
        word2id: dict mapping each word to its integer id.
        embedding_dim: length of every embedding vector.
        seed: optional integer seed for the random vectors. When None (default)
            NumPy's global random state is used, as before.

    Returns:
        np.ndarray with the embedding matrix.
    """
    # np.random and RandomState both expose .rand(), so the unseeded path keeps
    # drawing from the global generator exactly as the previous version did.
    rng = np.random if seed is None else np.random.RandomState(seed)

    vocab_size = len(word2id) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    oov_embedding = rng.rand(embedding_dim)
    count_not_in_glove = 0
    for word, i in tqdm(word2id.items()):
        try:
            embedding_matrix[i, :] = embedding_model[word]
        except KeyError:
            if word == '<UNK>':
                # give <UNK> a static embedding (not counted as a missing word)
                embedding_matrix[i, :] = oov_embedding
                continue
            # Words missing from the embedding model get a random vector that is
            # not already a row of the matrix. `vector in matrix` would be wrong
            # here: on NumPy arrays it is True as soon as ANY single element
            # matches, so whole rows are compared instead.
            embedding_vector = rng.rand(embedding_dim)
            while (embedding_matrix == embedding_vector).all(axis=1).any():
                embedding_vector = rng.rand(embedding_dim)
            embedding_matrix[i, :] = embedding_vector
            count_not_in_glove += 1
    print(f'Number of words that have no glove embedding: {count_not_in_glove}')
    return embedding_matrix

# Must equal the dimension of the loaded embedding model (50 in the load cell above)
embedding_dim = 50
embedding_train_matrix = create_embedding_matrix(embedding_model, sentence_tokenizer.word_index, embedding_dim)
print(embedding_train_matrix.shape)

In [None]:
# For tags use one-hot encoding
from keras.utils import to_categorical

# Pin the one-hot width to the full tag vocabulary (+1 for the padding index 0).
# Without num_classes, to_categorical infers the width from the largest id that
# happens to be present in this array, which can differ between splits.
Y = to_categorical(train_tag_padded, num_classes=len(tag_tokenizer.word_index) + 1)
print(Y.shape)
print(Y[0])

In [None]:
# Shapes of the padded tag ids and padded word ids for the training split
print(train_tag_padded.shape)
print(train_padded.shape)

In [None]:
# Shape of the embedding matrix: (vocabulary size + 1, embedding_dim)
print("Embeddings shape: {}".format(embedding_train_matrix.shape))

In [None]:
# Split the data into train, val and test sets
X_train = train_padded
Y_train = Y
X_val = val_padded
# Pin the one-hot width to the full tag vocabulary (+1 for padding index 0), the
# same way it is done for the test set: a split that lacks the highest tag id
# would otherwise get a narrower one-hot encoding than the model's output layer.
Y_val = to_categorical(val_tag_padded, num_classes=len(tag_tokenizer.word_index)+1)
X_test = test_padded
# for test also consider the missing tags
Y_test = to_categorical(test_tag_padded, num_classes=len(tag_tokenizer.word_index)+1)
print(X_train.shape)
print(Y_train.shape)
print(X_val.shape)
print(Y_val.shape)
print(X_test.shape)
print(Y_test.shape)

In [None]:
def check_OOV_terms(X, oov_index=1):
    """Count and print how many token ids in X equal the out-of-vocabulary id.

    Args:
        X: iterable of sentences, each an iterable of integer token ids.
        oov_index: id that marks an out-of-vocabulary token. Defaults to 1,
            the value this function previously had hard-coded.

    Returns:
        None; the count is printed.
    """
    count_OOV = sum(1 for sentence in X for word in sentence if word == oov_index)
    print(f'Number of OOV terms: {count_OOV}')

# OOV counts for train, validation and test (in that order)
check_OOV_terms(X_train)
check_OOV_terms(X_val)
check_OOV_terms(X_test)

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Bidirectional, TimeDistributed, Dropout
from keras.optimizers import Adam
import pickle

# Number of output classes = width of the one-hot tag encoding (last axis of Y_train)
num_tags = Y_train.shape[2]

In [None]:
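# Baseline model: bidirectional LSTM + FC

# HYPERPARAMETER TUNING (disabled): the grid search below is wrapped in a string
# literal so it does not run. The next cell loads the model and hyperparameters
# it saved ('best_baseline_model.h5', 'best_baseline_hyperparameters.pickle'),
# so those files must already exist on disk.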
'''
def create_baseline_model(lstm_units, dropout_rate, recurrent_dropout_rate, learning_rate):
    model = Sequential()
    model.add(Embedding(input_dim=len(sentence_tokenizer.word_index)+1, output_dim=embedding_dim, input_length=MAX_SEQ_LENGTH, weights=[embedding_train_matrix], trainable=True, mask_zero=True))
    model.add(Bidirectional(LSTM(units=lstm_units, return_sequences=True, dropout=dropout_rate, recurrent_dropout=recurrent_dropout_rate)))
    model.add(TimeDistributed(Dense(num_tags, activation='softmax')))
    model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=learning_rate), metrics=['accuracy'])
    return model

hyperparameters = { 'lstm_units': [128, 256], 
                    'dropout_rate': [0.1, 0.2, 0.3],
                    'recurrent_dropout_rate': [0.1, 0.2, 0.3],
                    'learning_rate': [0.001, 0.01],
                    'batch_size': [32, 64, 128]
                  }
# for each hyperparameter combination, train the model for 3 epochs and save the model with the best validation accuracy
from sklearn.model_selection import ParameterGrid

# create a list of all possible combinations of hyperparameters
param_grid = ParameterGrid(hyperparameters)
print(f'Number of hyperparameter combinations: {len(param_grid)}')

# train the model for each hyperparameter combination
from sklearn.metrics import accuracy_score

best_baseline_val_accuracy = 0
best_baseline_model = None
best_baseline_hyperparameters = None

for params in tqdm(param_grid):
    model = create_baseline_model(params['lstm_units'], params['dropout_rate'], params['recurrent_dropout_rate'], params['learning_rate'])
    model.fit(X_train, Y_train, epochs=1, batch_size=params['batch_size'], verbose=0)
    val_loss, val_accuracy = model.evaluate(X_val, Y_val, verbose=0)
    if val_accuracy > best_baseline_val_accuracy:
        best_baseline_val_accuracy = val_accuracy
        best_baseline_model = model
        best_baseline_hyperparameters = params

print(f'Best validation accuracy: {best_baseline_val_accuracy}')
print(f'Best hyperparameters: {best_baseline_hyperparameters}') 

# save the hyperparameters and the model
import pickle
with open('best_baseline_hyperparameters.pickle', 'wb') as f:
    pickle.dump(best_baseline_hyperparameters, f)
best_baseline_model.save('best_baseline_model.h5')
'''

In [None]:
# load the model
baseline = keras.models.load_model('best_baseline_model.h5')
# Open the pickle through a context manager so the file handle is closed.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('best_baseline_hyperparameters.pickle', 'rb') as f:
    best_baseline_hyperparameters = pickle.load(f)
baseline.summary()

In [None]:
# Train the loaded baseline for 10 epochs with the stored batch size, evaluating
# on the validation split after every epoch; the history feeds the plot below.
history_baseline = baseline.fit(X_train, Y_train, validation_data=(X_val, Y_val), batch_size=best_baseline_hyperparameters["batch_size"], epochs=10, verbose=1)

In [None]:
# plot results: baseline training vs. validation accuracy per epoch
plt.plot(history_baseline.history['accuracy'])
plt.plot(history_baseline.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epochs')
# legend order follows the order of the two plot() calls above
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

In [None]:
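# Model 1: two stacked bidirectional LSTMs + FC

# HYPERPARAMETER TUNING (disabled): the search below is wrapped in a string
# literal so it does not run. It reuses `param_grid` from the baseline search,
# and the next cell loads the files it saved ('best_model1_model.h5',
# 'best_model1_hyperparameters.pickle').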
'''
def create_model1(lstm_units, dropout_rate, recurrent_dropout_rate, learning_rate):
    model = Sequential()
    model.add(Embedding(input_dim=len(sentence_tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=MAX_SEQ_LENGTH, weights=[embedding_train_matrix], trainable=True, mask_zero=True))
    model.add(Bidirectional(LSTM(units=lstm_units, return_sequences=True, dropout=dropout_rate, recurrent_dropout=recurrent_dropout_rate)))
    model.add(Bidirectional(LSTM(units=int(lstm_units/2), return_sequences=True, dropout=dropout_rate, recurrent_dropout=recurrent_dropout_rate)))
    model.add(TimeDistributed(Dense(num_tags, activation='softmax')))
    model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=learning_rate), metrics=['accuracy'])
    return model

# use same hyperparameters as baseline model
best_model1_val_accuracy = 0
best_model1_model = None
best_model1_hyperparameters = None

for params in tqdm(param_grid):
    model = create_model1(params['lstm_units'], params['dropout_rate'], params['recurrent_dropout_rate'], params['learning_rate'])
    model.fit(X_train, Y_train, epochs=1, batch_size=params['batch_size'], verbose=0)
    val_loss, val_accuracy = model.evaluate(X_val, Y_val, verbose=0)
    if val_accuracy > best_model1_val_accuracy:
        best_model1_val_accuracy = val_accuracy
        best_model1_model = model
        best_model1_hyperparameters = params

print(f'Best validation accuracy: {best_model1_val_accuracy}')
print(f'Best hyperparameters: {best_model1_hyperparameters}')

# save the hyperparameters and the model
with open('best_model1_hyperparameters.pickle', 'wb') as f:
    pickle.dump(best_model1_hyperparameters, f)
best_model1_model.save('best_model1_model.h5')
'''

In [None]:
# load the model
model1 = keras.models.load_model('best_model1_model.h5')
# Open the pickle through a context manager so the file handle is closed.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('best_model1_hyperparameters.pickle', 'rb') as f:
    best_model1_hyperparameters = pickle.load(f)
model1.summary()

In [None]:
# Train the loaded Model 1 for 10 epochs with the stored batch size, evaluating
# on the validation split after every epoch; the history feeds the plot below.
history_model1 = model1.fit(X_train, Y_train, validation_data=(X_val, Y_val), batch_size=best_model1_hyperparameters["batch_size"], epochs=10, verbose=1)

In [None]:
# plot results: Model 1 training vs. validation accuracy per epoch
plt.plot(history_model1.history['accuracy'])
plt.plot(history_model1.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epochs')
# legend order follows the order of the two plot() calls above
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

In [None]:
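# Model 2: bidirectional LSTM + FC + FC

# HYPERPARAMETER TUNING (disabled): the search below is wrapped in a string
# literal so it does not run. It reuses `param_grid` from the baseline search,
# and the next cell loads the files it saved ('best_model2_model.h5',
# 'best_model2_hyperparameters.pickle').
# NOTE(review): the intermediate Dense layer below uses a softmax activation;
# a hidden layer would normally use e.g. ReLU - confirm this is intended.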
'''
def create_model2(lstm_units, dropout_rate, recurrent_dropout_rate, learning_rate):
    model = Sequential()
    model.add(Embedding(input_dim=len(sentence_tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=MAX_SEQ_LENGTH, weights=[embedding_train_matrix], trainable=True, mask_zero=True))
    model.add(Bidirectional(LSTM(units=lstm_units, return_sequences=True, dropout=dropout_rate, recurrent_dropout=recurrent_dropout_rate)))
    model.add(TimeDistributed(Dense(num_tags*2, activation="softmax")))
    model.add(TimeDistributed(Dense(num_tags, activation="softmax")))
    model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=learning_rate), metrics=['accuracy'])
    return model

# use same hyperparameters as baseline model
best_model2_val_accuracy = 0
best_model2_model = None
best_model2_hyperparameters = None

for params in tqdm(param_grid):
    model = create_model2(params['lstm_units'], params['dropout_rate'], params['recurrent_dropout_rate'], params['learning_rate'])
    model.fit(X_train, Y_train, epochs=1, batch_size=params['batch_size'], verbose=0)
    val_loss, val_accuracy = model.evaluate(X_val, Y_val, verbose=0)
    if val_accuracy > best_model2_val_accuracy:
        best_model2_val_accuracy = val_accuracy
        best_model2_model = model
        best_model2_hyperparameters = params
    
print(f'Best validation accuracy: {best_model2_val_accuracy}')
print(f'Best hyperparameters: {best_model2_hyperparameters}')

# save the hyperparameters and the model
with open('best_model2_hyperparameters.pickle', 'wb') as f:
    pickle.dump(best_model2_hyperparameters, f)

best_model2_model.save('best_model2_model.h5')
'''

In [None]:
# load the model
model2 = keras.models.load_model('best_model2_model.h5')
# Open the pickle through a context manager so the file handle is closed.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('best_model2_hyperparameters.pickle', 'rb') as f:
    best_model2_hyperparameters = pickle.load(f)
model2.summary()

In [None]:
# Train the loaded Model 2 for 10 epochs with the stored batch size, evaluating
# on the validation split after every epoch; the history feeds the plot below.
history_model2 = model2.fit(X_train, Y_train, validation_data=(X_val, Y_val), batch_size=best_model2_hyperparameters["batch_size"], epochs=10, verbose=1)

In [None]:
# plot results: Model 2 training vs. validation accuracy per epoch
plt.plot(history_model2.history['accuracy'])
plt.plot(history_model2.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epochs')
# legend order follows the order of the two plot() calls above
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

# METRICS

In [None]:
# Tag -> index mapping taken from the tag tokenizer, and its inverse (index -> tag)
tag2idx = tag_tokenizer.word_index
idx2tag = {i: w for w, i in tag2idx.items()}
print(idx2tag)

In [None]:
# Inspect the one-hot test labels: overall shape and the first test sentence
print(Y_test.shape)
print(Y_test[0])

In [None]:
# print all tags
print(tag2idx)
# Collect the indices of the tags to leave out of the F1 computation:
# punctuation marks, brackets and the <UNK> tag. Candidates that never occur
# as a tag in tag2idx are skipped.
# NOTE(review): if the tag set spells quotes/brackets differently (e.g. Penn
# Treebank style ``, '', -lrb-, -rrb-), this list will not match them - check
# against the tag2idx printed above.
punctuation_tags = [',', '.', '?', '!', '"', "'", '-', ':', ';' , '<UNK>', '(', ')', '[', ']', '{', '}', '<', '>']
punctuation_tags_idx = []
for tag in punctuation_tags:
    if tag not in tag2idx:
        continue
    print(f'{tag}: {tag2idx[tag]}')
    punctuation_tags_idx.append(tag2idx[tag])

In [None]:
# Metrics: Macro F1 score over all tokens, do not consider punctuation
from sklearn.metrics import f1_score

def macro_f1(y_true, y_pred, ignore_idx=None):
    """Macro-averaged F1 over tokens, leaving out padding and punctuation.

    Args:
        y_true: one-hot encoded gold tags; the last axis is the class axis.
        y_pred: per-class scores predicted by the model, same layout as y_true.
        ignore_idx: iterable of tag indices to exclude, in addition to the
            padding index 0. Defaults to the notebook-level
            `punctuation_tags_idx`, which is what this function always used.

    Returns:
        The value of f1_score(..., average='macro') on the kept positions.

    Note:
        Positions are selected by their GOLD tag only; the prediction at a kept
        position is used as-is, whatever class it is.
    """
    if ignore_idx is None:
        ignore_idx = punctuation_tags_idx

    # Collapse one-hot / score vectors to class ids and flatten to one token stream
    true_ids = np.argmax(y_true, axis=-1).ravel()
    pred_ids = np.argmax(y_pred, axis=-1).ravel()

    # Keep tokens whose gold tag is neither padding (0) nor an ignored tag
    keep = (true_ids != 0) & ~np.isin(true_ids, list(ignore_idx))
    return f1_score(true_ids[keep], pred_ids[keep], average='macro')
            

In [None]:
# Evaluate the baseline model: accuracy on each split (index 1 of evaluate()'s
# result, index 0 being the loss) and macro F1 on the test set.
print("Baseline model")
print("Train accuracy: {}".format(baseline.evaluate(X_train, Y_train)[1]))
print("Validation accuracy: {}".format(baseline.evaluate(X_val, Y_val)[1]))
print("Test accuracy: {}".format(baseline.evaluate(X_test, Y_test)[1]))
print("Macro F1 score: {}".format(macro_f1(Y_test, baseline.predict(X_test))))

In [None]:
# Evaluate Model 1: accuracy on each split (index 1 of evaluate()'s result,
# index 0 being the loss) and macro F1 on the test set.
print("Model 1")
print("Train accuracy: {}".format(model1.evaluate(X_train, Y_train)[1]))
print("Validation accuracy: {}".format(model1.evaluate(X_val, Y_val)[1]))
print("Test accuracy: {}".format(model1.evaluate(X_test, Y_test)[1]))
print("Macro F1 score: {}".format(macro_f1(Y_test, model1.predict(X_test))))

In [None]:
# Evaluate Model 2: accuracy on each split (index 1 of evaluate()'s result,
# index 0 being the loss) and macro F1 on the test set.
print("Model 2")
print("Train accuracy: {}".format(model2.evaluate(X_train, Y_train)[1]))
print("Validation accuracy: {}".format(model2.evaluate(X_val, Y_val)[1]))
print("Test accuracy: {}".format(model2.evaluate(X_test, Y_test)[1]))
print("Macro F1 score: {}".format(macro_f1(Y_test, model2.predict(X_test))))