# Orazi Filippo, Rossolini Andrea

In [None]:
# Download location of the corpus archive (a zip file hosted in the nltk_data repository).
url = 'https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip'

In [None]:
#! git clone https://github.com/keras-team/keras-contrib
#% cd keras-contrib/


In [None]:
# system packages
import os
import shutil
import sys

# data and numerical management packages
import pandas as pd
import numpy as np

# useful during debugging (progress bars)
from tqdm import tqdm

# Keras packages
from keras import Sequential 
from keras.layers import Embedding, SimpleRNN, TimeDistributed, Dense, Bidirectional, Masking, LSTM, GRU
from keras import backend as K
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# utils
from urllib import request
import zipfile
import gensim
import gensim.downloader as gloader
import scipy.sparse
import tensorflow as tf
import matplotlib.pyplot as plt

In [None]:
! pip install tf2crf

# Assignment 3 : Sequence labelling with RNNs
In this assignment we will ask you to perform POS tagging.

You are asked to follow these steps:
*   Download the corpus and split it into training and test sets, structuring a dataframe.
*   Embed the words using GloVe embeddings
*   Create a baseline model, using a simple neural architecture
*   Experiment doing small modifications to the model
*   Evaluate your best model
*   Analyze the errors of your model

**Corpora**:
Ignore the numeric value in the third column; use only the words/symbols and their labels.
https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip 

**Splits**: documents 1-100 are the train set, 101-150 validation set, 151-199 test set.

**Baseline**: two layers architecture: a Bidirectional LSTM and a Dense/Fully-Connected layer on top.

**Modifications**: experiment using a GRU instead of the LSTM, adding an additional LSTM layer, and using a CRF in addition to the LSTM. Each of these changes must be made on its own (don't mix these modifications).

**Training and Experiments**: all the experiments must involve only the training and validation sets.

**Evaluation**: in the end, only the best model of your choice must be evaluated on the test set. The main metric must be F1-Macro computed over the various parts of speech (without considering punctuation classes).

**Error Analysis** (optional) : analyze the errors done by your model, try to understand which may be the causes and think about how to improve it.

**Report**: You are asked to deliver a small report of about 4-5 lines in the .txt file that sums up your findings.

# Download the corpora and split it in training and test sets, structuring a dataframe.



In [None]:
# Config
print("Current work directory: {}".format(os.getcwd()))

# Everything related to the corpus is stored in a "Datasets" folder inside the
# current working directory.
dataset_folder = os.path.join(os.getcwd(), "Datasets")

# exist_ok=True creates the folder only when it is missing, without a separate
# (race-prone) os.path.exists check.
os.makedirs(dataset_folder, exist_ok=True)

# Local copy of the archive. It is named after what it actually contains: the
# file fetched from `url` is a zip of the dependency treebank, not a tarball.
dataset_path = os.path.join(dataset_folder, "dependency_treebank.zip")

print(dataset_path)

def download_dataset(download_path, url):
    """
    Downloads the file at `url` to `download_path` unless that file already exists.

    The data is first written to a temporary "<download_path>.part" file and only
    moved to `download_path` once the transfer has finished, so an interrupted
    download never leaves a truncated file that later runs would mistake for a
    complete one.

    :param download_path: destination path of the downloaded file (str)
    :param url: location to download from (str)
    """
    if os.path.exists(download_path):
        return

    print("Downloading dataset...")
    partial_path = download_path + ".part"
    try:
        request.urlretrieve(url, partial_path)
        os.replace(partial_path, download_path)
    finally:
        # After a successful move the partial file is gone; this only cleans up
        # the leftovers of a failed or interrupted transfer.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print("Download complete!")

def extract_dataset(download_path, extract_path):
    """
    Unpacks a zip archive into a target folder.

    :param download_path: path of the zip archive to read (str)
    :param extract_path: folder the archive members are extracted into (str)
    """
    print("Extracting dataset... (it may take a while...)")
    with zipfile.ZipFile(download_path, mode='r') as archive:
        archive.extractall(path=extract_path)
    print("Extraction completed!")

# Download
# Fetches the archive from `url` into `dataset_path`; download_dataset does
# nothing when that file is already present.
download_dataset(dataset_path, url)

# Extraction
# Unpacks the downloaded archive into `dataset_folder`.
extract_dataset(dataset_path, dataset_folder)

In [None]:
# Show the folder the archive was extracted into.
print(dataset_folder)

##Dividing documents into sentences

In [None]:
'''
Dividing the dataset into training, validation and test sets
'''
path = dataset_folder + "/dependency_treebank/"

glob_vocabulary = set()
glob_tags = set()
bohs = set()

dataframe = {
    "sentences" : [],
    "tags" : [],
    "split" : []
} # one list per DataFrame column; the i-th entries describe the i-th sentence

# os.scandir returns entries in arbitrary order, so the documents are sorted by
# file name to make the document numbering (and hence the split) deterministic.
documents = sorted(
    (entry for entry in os.scandir(path) if entry.is_file()),
    key=lambda entry: entry.name,
)

# Documents are numbered from 1:
# 1-100 -> train, 101-150 -> validation, 151 onwards -> test.
for count, document in enumerate(documents, start=1):
  if count <= 100:
    split = "train"
  elif count <= 150:
    split = "val"
  else:
    split = "test"

  with open(document.path, 'r') as file:
    sentence_words = []
    sentence_tags = []
    for line in file:
      line = line.rstrip()
      if not line:
        # blank separator line
        continue
      # each line is "token<TAB>tag<TAB>number"; the third column is ignored
      token, tag, _ = line.split("\t")
      if token == '.':
        # a full stop closes the current sentence; the '.' itself is not stored
        if sentence_words:
          dataframe["sentences"].append(sentence_words)
          dataframe["tags"].append(sentence_tags)
          dataframe["split"].append(split)
        sentence_words = []
        sentence_tags = []
        continue
      glob_vocabulary.add(token)
      glob_tags.add(tag)

      sentence_words.append(token)
      sentence_tags.append(tag)

  # Keep whatever follows the last full stop, but never store an empty
  # sentence (it would become an all-padding sample).
  if sentence_words:
    dataframe["sentences"].append(sentence_words)
    dataframe["tags"].append(sentence_tags)
    dataframe["split"].append(split)

df = pd.DataFrame(dataframe)
print(df.shape)
print(df["sentences"][0])

##Dividing dataset into Train, Validation and Test sets 

In [None]:
def _split_column(split_name, column):
    """Returns a copy of `column` restricted to the rows labelled `split_name`."""
    return df.loc[df["split"] == split_name, column].copy()


# X_* hold the token lists, Y_* the matching tag lists of each split.
X_train = _split_column("train", "sentences")
Y_train = _split_column("train", "tags")

X_val = _split_column("val", "sentences")
Y_val = _split_column("val", "tags")

X_test = _split_column("test", "sentences")
Y_test = _split_column("test", "tags")


#Building Vocabulary

In [None]:
def build_vocabulary(sentences):
    """
    Fits a Keras Tokenizer (no character filtering, no lower-casing) on the given
    sequences and derives the vocabulary maps from it.

    :param sentences: sequences the tokenizer is fitted on (passed to Tokenizer.fit_on_texts)
    :return:
      - vocabulary map: vocabulary index -> word (dict)
      - inverse vocabulary map: word -> vocabulary index (dict)
      - word listing: list of the words in the vocabulary
      - the fitted Tokenizer
    """
    tokenizer = Tokenizer(filters="", lower=False)
    tokenizer.fit_on_texts(sentences)

    word_to_idx = dict(tokenizer.word_index)
    idx_to_word = {index: word for word, index in word_to_idx.items()}
    word_listing = list(word_to_idx)
    return idx_to_word, word_to_idx, word_listing, tokenizer

# Length of the longest training sentence; it is the padding/truncation length
# used for every split. The builtin int() replaces the np.int alias, which has
# been removed from NumPy.
MAX_SEQ_LENGTH = int(np.max([len(a) for a in X_train]))
# Size of the word vectors stored in the X_*_embedded arrays.
EMBEDDING_SIZE  = 50  


# Embed the words using GloVe embeddings


##Loading the embedding model

In [None]:
def load_embedding_model(model_type, embedding_dimension=50):
    """
    Fetches a pre-trained word embedding model through the gensim downloader.

    :param model_type: kind of model to load, 'word2vec' or 'glove'
                       (surrounding whitespace and letter case are ignored)
    :param embedding_dimension: size of the embedding space; only used to pick
                                the GloVe variant

    :return
        - the object returned by gensim.downloader.load for the selected model

    :raises AttributeError: if model_type is neither 'word2vec' nor 'glove'
    :raises ValueError: re-raised from the downloader when the model name is invalid
    """
    kind = model_type.strip().lower()

    if kind not in ('word2vec', 'glove'):
        raise AttributeError("Unsupported embedding model type! Available ones: word2vec, glove")

    # Name of the model as known to the gensim downloader
    if kind == 'word2vec':
        model_name = "word2vec-google-news-300"
    else:
        model_name = "glove-wiki-gigaword-{}".format(embedding_dimension)

    try:
        return gloader.load(model_name)
    except ValueError:
        print("Invalid embedding model name! Check the embedding dimension:")
        print("Word2Vec: 300")
        print("Glove: 50, 100, 200, 300")
        raise


# Modify these variables as you wish!
# Glove -> 50, 100, 200, 300
# Word2Vec -> 300
embedding_model_type = "glove"
embedding_dimension = 50

# Load the pre-trained vectors selected above (load_embedding_model goes
# through the gensim downloader).
embedding_model = load_embedding_model(embedding_model_type, embedding_dimension)

## handling OOV

In [None]:
def co_occurrence_count(df, idx_to_word, word_to_idx, window_size=4):
    """
    Counts, for every vocabulary word, how often each other word appears within
    `window_size` positions of it (on either side) inside the same sentence.

    :param df: iterable of tokenised sentences (each one an iterable of words)
    :param idx_to_word: vocabulary map (index -> word) (dict); not used by the computation
    :param word_to_idx: vocabulary map (word -> index) (dict)
    :param window_size: maximum distance between two words that still counts as a co-occurrence

    :return
      - scipy.sparse.lil_matrix of size (|V|+1) x (|V|+1) (|V| = vocabulary size),
        indexed by the values of word_to_idx
    """
    size = len(word_to_idx) + 1
    counts = scipy.sparse.lil_matrix((size, size))
    radius = int(window_size)

    for sentence in df:
        # words missing from the vocabulary are dropped before the window is
        # applied: this is what allows the handling of OOV terms
        ids = [word_to_idx[token] for token in sentence if token in word_to_idx]
        length = len(ids)
        for position, centre in enumerate(ids):
            for offset in range(1, radius + 1):
                left = position - offset
                right = position + offset
                if left >= 0:
                    counts[centre, ids[left]] += 1
                if right < length:
                    counts[centre, ids[right]] += 1
    return counts

In [None]:
def check_OOV_terms(embedding_model, word_listing):
    """
    Collects the out-of-vocabulary terms, i.e. the words of the dataset
    vocabulary that the embedding model does not contain.

    :param embedding_model: pre-trained word embedding model; only membership
                            tests (`term in embedding_model`) are performed on it
    :param word_listing: dataset specific vocabulary (iterable of words)

    :return
        - list of OOV terms, in the iteration order of word_listing
    """
    return [term for term in word_listing if term not in embedding_model]

oov_terms = check_OOV_terms(embedding_model, glob_vocabulary)

# The share is scaled by 100 so that the printed number really is a percentage;
# an empty vocabulary is reported as 0% instead of dividing by zero.
oov_percentage = 100.0 * len(oov_terms) / len(glob_vocabulary) if glob_vocabulary else 0.0
print("Total OOV terms: {0} ({1:.2f}%)".format(len(oov_terms), oov_percentage))

In [None]:
def build_embedding_matrix(embedding_model, embedding_dimension, word_to_idx, idx_to_word, oov_terms, co_occurrence_count_matrix):
    """
    Extends a pre-trained word embedding model with vectors for the OOV terms of a dataset.

    Every OOV term receives the mean vector of the non-OOV words it co-occurs
    with; an OOV term without such neighbours receives a random vector.

    :param embedding_model: pre-trained word embedding model (gensim wrapper); it is
                            modified in place through its `add` method
    :param embedding_dimension: size of the word vectors, used for the random fallback
    :param word_to_idx: vocabulary map (word -> index) (dict)
    :param idx_to_word: vocabulary map (index -> word) (dict) -- added by us
    :param oov_terms: list of OOV terms (list)
    :param co_occurrence_count_matrix: the co-occurrence count matrix of the given dataset,
                                       indexed by the values of word_to_idx

    :return
        - the embedding_model passed in, now also holding a vector for every OOV term
          (note: the model itself is returned, not a |V| x d matrix)
    """
    # Set membership is O(1); the OOV list would be scanned for every neighbour.
    oov_lookup = set(oov_terms)
    oov_embedded = {} # --> oov : vector assigned to it

    for oov_term in oov_terms:
        oov_index = word_to_idx[oov_term]
        # column indices of the non-zero counts = words co-occurring with the OOV term
        adjacent_indices = co_occurrence_count_matrix.getrow(oov_index).nonzero()[1]
        adjacent_terms = [
            idx_to_word[index]
            for index in adjacent_indices
            if idx_to_word[index] not in oov_lookup
        ]
        if adjacent_terms:
            # mean of the neighbours' vectors
            vector = np.mean([embedding_model[term] for term in adjacent_terms], axis=0)
        else:
            # no usable neighbour --> assign a random vector
            vector = np.random.rand(embedding_dimension)
        oov_embedded[oov_term] = vector

    # Words that have a vector, either pre-trained or freshly computed.
    tokens = [
        token for token in word_to_idx
        if token in embedding_model or token in oov_embedded
    ]
    vectors = [
        np.array(embedding_model[token]) if token in embedding_model
        else np.array(oov_embedded[token])
        for token in tokens
    ]
    embedding_model.add(tokens, vectors)
    return embedding_model

#Preparing the sets to the usage

##Train set:
- X
  - Building vocabulary
  - Encoding sequences
  - Padding
  - Building embedded matrix
- Y
  - One hot encoding

In [None]:

#creating dictionary and embedding Train
# Word and tag vocabularies are fitted on the training split.
X_train_idx_to_word, X_train_word_to_idx, X_train_word_listing, X_train_tokenizer = build_vocabulary(X_train)
Y_train_idx_to_word, Y_train_word_to_idx, Y_train_word_listing, Y_train_tokenizer = build_vocabulary(Y_train)

# Words/tags -> vocabulary indices
X_train_encoded = X_train_tokenizer.texts_to_sequences(X_train)
Y_train_encoded = Y_train_tokenizer.texts_to_sequences(Y_train)

# Every sequence is padded/truncated at the end to MAX_SEQ_LENGTH.
X_train_padded = pad_sequences(X_train_encoded, maxlen=MAX_SEQ_LENGTH, padding="post", truncating="post")
Y_train_padded = pad_sequences(Y_train_encoded, maxlen=MAX_SEQ_LENGTH, padding="post", truncating="post")

# Training words without a pre-trained vector get one built from their neighbours.
X_train_oov = check_OOV_terms(embedding_model, X_train_word_listing)
X_train_co_oc = co_occurrence_count(X_train, X_train_idx_to_word, X_train_word_to_idx, window_size=4)

# NOTE: despite its name, embedding_matrix is the (extended) embedding model
# returned by build_embedding_matrix; it is indexed by word.
embedding_matrix = build_embedding_matrix(embedding_model, embedding_dimension, X_train_word_to_idx, X_train_idx_to_word, X_train_oov, X_train_co_oc)

# Replace each non-zero index by its word vector; positions holding 0 (padding)
# keep an all-zero vector.
X_train_embedded = np.zeros((*X_train_padded.shape, EMBEDDING_SIZE))
for idx_line, line in enumerate(X_train_padded):
  for idx_word, vocab_idx in enumerate(line):
    if vocab_idx != 0:
      X_train_embedded[idx_line, idx_word, :] = embedding_matrix[X_train_idx_to_word[vocab_idx]]

# One class per training tag plus one for the padding value 0. The width is
# derived from the tag vocabulary instead of being hard-coded to 46, so it
# always matches the number of tags actually present in the training split.
Y_train_ohe = to_categorical(Y_train_padded, num_classes=len(Y_train_tokenizer.word_index) + 1)

##Validation set:
- X
  - Building vocabulary
  - Encoding sequences
  - Padding
  - Building embedded matrix
- Y
  - One hot encoding

In [None]:
#creating dictionary and embedding val
# The validation split gets its own word vocabulary; tags reuse the training one.
X_val_idx_to_word, X_val_word_to_idx, X_val_word_listing, X_val_tokenizer = build_vocabulary(X_val)

X_val_encoded = X_val_tokenizer.texts_to_sequences(X_val)
# NOTE(review): tags are encoded with the tokenizer fitted on the training
# tags; how a tag unseen during training is handled depends on the Tokenizer
# -- TODO confirm it cannot shift tags relative to their words.
Y_val_encoded = Y_train_tokenizer.texts_to_sequences(Y_val)

# Every sequence is padded/truncated at the end to MAX_SEQ_LENGTH.
X_val_padded = pad_sequences(X_val_encoded, maxlen=MAX_SEQ_LENGTH, padding="post", truncating="post")
Y_val_padded = pad_sequences(Y_val_encoded, maxlen=MAX_SEQ_LENGTH, padding="post", truncating="post")

# embedding_matrix is the embedding model already extended with the previous
# split's OOV terms; only words still missing from it are treated as OOV here.
X_val_oov = check_OOV_terms(embedding_matrix, X_val_word_listing)
X_val_co_oc = co_occurrence_count(X_val, X_val_idx_to_word, X_val_word_to_idx, window_size=4)

embedding_matrix = build_embedding_matrix(embedding_matrix, embedding_dimension, X_val_word_to_idx, X_val_idx_to_word, X_val_oov, X_val_co_oc)

# Replace each non-zero index by its word vector; positions holding 0 (padding)
# keep an all-zero vector.
X_val_embedded = np.zeros((*X_val_padded.shape, EMBEDDING_SIZE))
for idx_line, line in enumerate(X_val_padded):
  for idx_word, vocab_idx in enumerate(line):
    if vocab_idx != 0:
      X_val_embedded[idx_line, idx_word, :] = embedding_matrix[X_val_idx_to_word[vocab_idx]]

# Same one-hot width as the training labels: one class per training tag plus
# the padding value 0 (derived instead of hard-coded to 46).
Y_val_ohe = to_categorical(Y_val_padded, num_classes=len(Y_train_tokenizer.word_index) + 1)

##Test set:
- X
  - Building vocabulary
  - Encoding sequences
  - Padding
  - Building embedded matrix
- Y
  - One hot encoding

In [None]:
#creating dictionary and embedding test
# The test split gets its own word vocabulary; tags reuse the training one.
X_test_idx_to_word, X_test_word_to_idx, X_test_word_listing, X_test_tokenizer = build_vocabulary(X_test)

X_test_encoded = X_test_tokenizer.texts_to_sequences(X_test)
# NOTE(review): tags are encoded with the tokenizer fitted on the training
# tags; how a tag unseen during training is handled depends on the Tokenizer
# -- TODO confirm it cannot shift tags relative to their words.
Y_test_encoded = Y_train_tokenizer.texts_to_sequences(Y_test)

# Every sequence is padded/truncated at the end to MAX_SEQ_LENGTH.
X_test_padded = pad_sequences(X_test_encoded, maxlen=MAX_SEQ_LENGTH, padding="post", truncating="post")
Y_test_padded = pad_sequences(Y_test_encoded, maxlen=MAX_SEQ_LENGTH, padding="post", truncating="post")

# embedding_matrix is the embedding model already extended with the previous
# splits' OOV terms; only words still missing from it are treated as OOV here.
X_test_oov = check_OOV_terms(embedding_matrix, X_test_word_listing)
X_test_co_oc = co_occurrence_count(X_test, X_test_idx_to_word, X_test_word_to_idx, window_size=4)

embedding_matrix = build_embedding_matrix(embedding_matrix, embedding_dimension, X_test_word_to_idx, X_test_idx_to_word, X_test_oov, X_test_co_oc)

# Replace each non-zero index by its word vector; positions holding 0 (padding)
# keep an all-zero vector.
X_test_embedded = np.zeros((*X_test_padded.shape, EMBEDDING_SIZE))
for idx_line, line in enumerate(X_test_padded):
  for idx_word, vocab_idx in enumerate(line):
    if vocab_idx != 0:
      X_test_embedded[idx_line, idx_word, :] = embedding_matrix[X_test_idx_to_word[vocab_idx]]

# Same one-hot width as the training labels: one class per training tag plus
# the padding value 0 (derived instead of hard-coded to 46).
Y_test_ohe = to_categorical(Y_test_padded, num_classes=len(Y_train_tokenizer.word_index) + 1)

##Setting global variables

In [None]:
BATCH_SIZE = 20
EPOCHS = 20
# +1 accounts for the value 0, which is used for padding.
VOCABULARY_SIZE = len(X_train_tokenizer.word_index) + 1
NUM_CLASSES = len(Y_train_word_listing) + 1
# NOTE(review): early stopping watches the training loss; 'val_loss' may be the
# intended quantity since validation data is passed to fit -- confirm.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)
# One-hot labels use NUM_CLASSES so that they always have the same width as the
# models' output layers (previously hard-coded to 46, and the training labels
# were assigned to a misspelled name).
Y_test_ohe = to_categorical(Y_test_padded, num_classes=NUM_CLASSES)
Y_train_ohe = to_categorical(Y_train_padded, num_classes=NUM_CLASSES)
Y_val_ohe = to_categorical(Y_val_padded, num_classes=NUM_CLASSES)

#Building and training models

TODO: two of the models look identical to me; we need to work out whether one of them should be removed (probably yes).

##Bidirectional LSTM with fully connected layer 

###Building model


In [None]:
#BiLSTM + FC
# Baseline: masking -> bidirectional LSTM -> per-timestep dense classifier.
BiLSTMFC = Sequential([
    # inputs are already word vectors; timesteps whose features are all 0 are masked
    Masking(mask_value=0, input_shape=(MAX_SEQ_LENGTH , 50)),
    # 64 LSTM units per direction, one output per timestep (return_sequences=True)
    Bidirectional(LSTM(64, dropout = 0.2,  return_sequences=True)),
    # the same softmax classifier is applied to every timestep
    TimeDistributed(Dense(NUM_CLASSES, activation='softmax')),
])

BiLSTMFC.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

# layer-by-layer overview of the model
BiLSTMFC.summary()




### Training model 

In [None]:
# Train the BiLSTM + FC model; the validation split is evaluated after each epoch.
historyBiLSTMFC = BiLSTMFC.fit(
    X_train_embedded,
    Y_train_ohe,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_val_embedded, Y_val_ohe),
    callbacks=[early_stop],
)


###Graphical view of the model 

In [None]:
# Accuracy and loss curves (training and validation) of the BiLSTM + FC model,
# drawn in the same order as the legend entries below.
for metric in ('acc', 'loss', 'val_acc', 'val_loss'):
    plt.plot(historyBiLSTMFC.history[metric])
plt.axis([0, 25, 0, 1])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['accuracy', 'loss','val_accuracy', 'val_loss'], loc='best')

##Bidirectional GRU with fully connected layer

### Building model 

In [None]:
#GRU + FC
# create architecture
BiGRUFC = Sequential()

# inputs are already word vectors; timesteps whose features are all 0 are masked
BiGRUFC.add(Masking(mask_value=0,input_shape=(MAX_SEQ_LENGTH , 50)))

# bidirectional GRU with 64 units per direction
# return_sequences=True — one output per timestep instead of only the last one
BiGRUFC.add(Bidirectional(GRU(64, dropout = 0.2, recurrent_dropout = 0.2, return_sequences=True)))

# add time distributed (output at each sequence) layer
# softmax (not sigmoid): each token has exactly one tag, so the outputs must
# form a probability distribution over the classes, as categorical
# cross-entropy expects and as the other models in this notebook do
BiGRUFC.add(TimeDistributed(Dense(NUM_CLASSES, activation='softmax')))

#compile model
BiGRUFC.compile(loss      =  'categorical_crossentropy',
                  optimizer =  'adam',
                  metrics   =  ['acc'])

# check summary of the model
BiGRUFC.summary()

### Training model 

In [None]:
# Train the BiGRU + FC model; the validation split is evaluated after each epoch.
historyBiGRUFC = BiGRUFC.fit(
    X_train_embedded,
    Y_train_ohe,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_val_embedded, Y_val_ohe),
    callbacks=[early_stop],
)

###Graphical view of the model 

In [None]:
# Accuracy and loss curves (training and validation) of the BiGRU + FC model,
# drawn in the same order as the legend entries below.
for metric in ('acc', 'loss', 'val_acc', 'val_loss'):
    plt.plot(historyBiGRUFC.history[metric])
plt.axis([0, 19, 0, 1])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['accuracy', 'loss','val_accuracy', 'val_loss'], loc='best')

## Double bidirectional LSTM with fully connected layer 

### Building model

In [None]:
#BiLSTM + BiLSTM + FC
# Two stacked bidirectional LSTM layers followed by a per-timestep dense classifier.
BiBiLSTMFC = Sequential([
    # inputs are already word vectors; timesteps whose features are all 0 are masked
    Masking(mask_value=0, input_shape=(MAX_SEQ_LENGTH , 50)),
    # first recurrent layer: 64 units per direction, one output per timestep
    Bidirectional(LSTM(64, recurrent_dropout = 0.2, return_sequences=True)),
    # second recurrent layer, with dropout on its inputs as well
    Bidirectional(LSTM(64, dropout = 0.2, recurrent_dropout = 0.2, return_sequences=True)),
    # the same softmax classifier is applied to every timestep
    TimeDistributed(Dense(NUM_CLASSES, activation='softmax')),
])

BiBiLSTMFC.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

# layer-by-layer overview of the model
BiBiLSTMFC.summary()



###Training model

In [None]:
# Train the double BiLSTM + FC model; the validation split is evaluated after each epoch.
historyBiBiLSTMFC = BiBiLSTMFC.fit(
    X_train_embedded,
    Y_train_ohe,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_val_embedded, Y_val_ohe),
    callbacks=[early_stop],
)


### Graphical view of the model

In [None]:
# Accuracy and loss curves (training and validation) of the double BiLSTM + FC
# model, drawn in the same order as the legend entries below.
for metric in ('acc', 'loss', 'val_acc', 'val_loss'):
    plt.plot(historyBiBiLSTMFC.history[metric])
plt.axis([0, 19, 0, 1])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['accuracy', 'loss','val_accuracy', 'val_loss'], loc='best')

##Bidirectional LSTM with fully connected layers and CRF


### Building model
https://machinelearningmastery.com/keras-functional-api-deep-learning \\
This link gives an excellent explanation of why the model is built differently from the usual way.

In [None]:
# BiLSTM + Dense + CRF model, built with the Keras functional API.
# NOTE(review): tensorflow_addons is imported but `tfa` is not referenced in
# this cell -- presumably a requirement of tf2crf; confirm.
import tensorflow_addons as tfa
from tf2crf import CRF, ModelWithCRFLoss
from keras.layers import Input 
from tensorflow.keras.models import Model

# Input: one sentence of MAX_SEQ_LENGTH word vectors of size 50.
inputs = Input(shape=(MAX_SEQ_LENGTH , 50), dtype='float32')
# NOTE(review): mask_value is an all-zero array here, while the other models
# pass the scalar 0 -- presumably equivalent; TODO confirm.
output = Masking(mask_value=np.zeros((MAX_SEQ_LENGTH , 50), dtype="float32"))(inputs)
output = Bidirectional(LSTM(64, return_sequences = True, dropout=0.4))(output)
# No activation: the raw scores are passed on to the CRF layer.
# NOTE(review): this layer has NUM_CLASSES+1 units, one more than the output
# layers of the other models; the reason is not visible here -- TODO confirm.
output = Dense(NUM_CLASSES+1, activation=None)(output)
crf = CRF(dtype='float32')
output = crf(output)
BiLSTMFCCRF = Model(inputs, output)

print(BiLSTMFCCRF.summary(line_length=150))

# Wrap the model; from here on BiLSTMFCCRF refers to the wrapper.
BiLSTMFCCRF = ModelWithCRFLoss(BiLSTMFCCRF)

# No loss is passed to compile -- presumably ModelWithCRFLoss provides the CRF
# loss itself; see the tf2crf documentation.
BiLSTMFCCRF.compile(optimizer =  'adam',
                 metrics   =  ['acc'])
print(X_train_embedded.shape)



### Training model


In [None]:
# Train the CRF model on the padded (not one-hot encoded) tag indices.
historyBiLSTMFCCRF = BiLSTMFCCRF.fit(
    X_train_embedded,
    Y_train_padded,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_val_embedded, Y_val_padded),
)


### Graphical view of the model

In [None]:
# Validation and training CRF loss per epoch. The title and y-axis label now
# say "loss": they previously read "accuracy" although only losses are drawn.
plt.plot(historyBiLSTMFCCRF.history['val_crf_loss_val'])  
plt.plot(historyBiLSTMFCCRF.history['crf_loss'])  
plt.axis([0,20-1,0,50]) 
plt.title('model loss')  
plt.ylabel('loss')  
plt.xlabel('epoch')  
plt.legend(['val_crf_loss_val', 'loss'], loc='best') 

In [None]:
# Validation and training accuracy of the CRF model, drawn in the same order
# as the legend entries below.
for metric in ('val_val_accuracy', 'accuracy'):
    plt.plot(historyBiLSTMFCCRF.history[metric])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['val_accuracy', 'accuracy'], loc='best')

# Conclusion 
## Evaluation of the chosen model
For this section we chose the double bidirectional LSTM + FC model (BiBiLSTMFC), because it is the model that performs best with the given parameters.

In the following cells we are about to calculate the f1.

In [None]:
# Run the chosen model (double BiLSTM + FC) on the embedded test sentences.
y_predict = BiBiLSTMFC.predict(X_test_embedded)

In [None]:
def depad(true, predicted):
  """
  Drops the padded positions from a pair of aligned label sequences.

  :param true: gold label indices, where 0 marks padding
  :param predicted: predicted label indices, aligned with `true`

  :return
      - (true labels, predicted labels) as two lists restricted to the
        positions whose true label is not 0
  """
  kept = [(t, p) for t, p in zip(true, predicted) if t != 0]
  depad_true = [t for t, _ in kept]
  depad_pred = [p for _, p in kept]
  return (depad_true, depad_pred)

def depunctuate(true, predicted):
  """
  Drops the positions whose true label is a punctuation/symbol class.

  The excluded tags are "$", ",", ".", ":" and "#" plus the quote and bracket
  tags; tags that do not occur in the training tag vocabulary
  (Y_train_word_to_idx) are skipped instead of raising a KeyError.

  :param true: gold label indices
  :param predicted: predicted label indices, aligned with `true`

  :return
      - (true labels, predicted labels) as two lists restricted to the
        positions whose true label is not a punctuation class
  """
  punctuation_tags = ("$", ",", ".", ":", "#", "``", "''", "(", ")", "-LRB-", "-RRB-")
  # indices of the punctuation tags that are actually known
  symbols = {
      Y_train_word_to_idx[tag]
      for tag in punctuation_tags
      if tag in Y_train_word_to_idx
  }
  depunctuate_pred = []
  depunctuate_true = []
  for t, p in zip(true, predicted):
    if t not in symbols:
      depunctuate_pred.append(p)
      depunctuate_true.append(t)
  return (depunctuate_true, depunctuate_pred)


In [None]:
# Predicted class of every token = index of the highest score on the last axis.
y_pred = np.argmax(y_predict, axis=-1)

from sklearn.metrics import f1_score
print("-"*40)

# Macro-F1 is the mean of the per-class F1 scores, so it has to be computed
# once over the tokens of the whole test set. Averaging one macro-F1 per
# sentence (as done before) yields a different number.
all_true = []
all_predicted = []
for (true, predicted) in zip(Y_test_padded, y_pred):
  true, predicted = depad(true, predicted)
  true, predicted = depunctuate(true, predicted)
  all_true.extend(true)
  all_predicted.extend(predicted)

if all_true:
  # Only classes that occur among the kept gold labels are averaged, so that a
  # wrongly predicted padding/punctuation class does not count as an extra class.
  evaluated_labels = sorted({int(label) for label in all_true})
  print(f1_score(all_true, all_predicted, labels=evaluated_labels, average='macro'))
else:
  print("No tokens left to evaluate after removing padding and punctuation.")