In [None]:
# Install the notebook's dependencies into the runtime (shell commands via IPython's `!`).
# NOTE(review): transformers is installed unpinned while tensorflow is pinned to 2.11;
# consider pinning transformers to the version this notebook was actually run with.
!pip install transformers
!pip install tensorflow==2.11

In [2]:
from transformers import AutoTokenizer
import pandas as pd
from sklearn import preprocessing
from transformers import TFAutoModelForTokenClassification
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from transformers import logging as hf_logging
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
import warnings
import os
'''Mute warnings from TensorFlow'''
# Only show error-level messages from the Hugging Face transformers logger.
hf_logging.set_verbosity_error()
# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
# NOTE(review): this is set after `import tensorflow` in the import list above; the variable
# generally needs to be in the environment before TensorFlow is imported -- confirm it has an effect.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
# Seed TensorFlow's global random generator.
tf.random.set_seed(1234)

In [3]:
# Show which TensorFlow version is actually loaded in this kernel.
print(tf.__version__)

2.11.0


In [4]:
def label_encoding(labels, data):
    '''Fit a label encoder on all tags and encode the tags of every sentence.

    labels: flat collection of tag names, used only to fit the encoder.
    data: table whose 'semtag' entry yields one sequence of tag names per sentence.

    Returns a pair (encoded_labels, label_n): encoded_labels holds one encoded
    sequence per sentence, in the order of data['semtag']; label_n is the list
    of tag names known to the encoder.'''
    encoder = preprocessing.LabelEncoder().fit(labels)
    encoded_labels = [encoder.transform(sentence_tags) for sentence_tags in data['semtag']]
    return encoded_labels, list(encoder.classes_)

In [5]:
def align_text_with_label_first(word_pos, labels):
    '''Align word-level labels with sub-word tokens, labelling the FIRST sub-token of each word.

    word_pos: the tokenizer's word ids for one sentence; one entry per token, holding the
        index of the word the token belongs to, or None for a special token.
    labels: the word-level labels of that sentence, indexable by word index.

    Returns a list with exactly one entry per token: the word's label on the first
    sub-token of every word, and -100 (the value ignored by the loss and by the
    evaluation code) on special tokens and on all later sub-tokens of a word.

    Special tokens are recognised by their None word id rather than by position, so
    the result stays correct when they are not exactly the first and last token.'''
    adj_labels = []
    previous_word = None
    for word_id in word_pos:
        if word_id is None or word_id == previous_word:
            # Special token, or a continuation piece of the word seen just before.
            adj_labels.append(-100)
        else:
            adj_labels.append(labels[word_id])
        previous_word = word_id
    return adj_labels

In [6]:
def align_text_with_label_last(word_pos, labels):
    '''Align word-level labels with sub-word tokens, labelling the LAST sub-token of each word.

    This is the alternative to align_text_with_label_first; tokenize_and_align
    uses the first-sub-token variant.

    word_pos: the tokenizer's word ids for one sentence; one entry per token, holding the
        index of the word the token belongs to, or None for a special token.
    labels: the word-level labels of that sentence, indexable by word index.

    Returns a list with exactly one entry per token: the word's label on the last
    sub-token of every word, and -100 (the value ignored by the loss and by the
    evaluation code) on special tokens and on all earlier sub-tokens of a word.

    Special tokens are recognised by their None word id rather than by position, so
    the result stays correct when they are not exactly the first and last token.'''
    adj_labels = []
    final_index = len(word_pos) - 1
    for index, word_id in enumerate(word_pos):
        # The token after the final one is treated like a special token (None).
        following_word = word_pos[index + 1] if index < final_index else None
        if word_id is None or word_id == following_word:
            # Special token, or the same word continues in the next token.
            adj_labels.append(-100)
        else:
            adj_labels.append(labels[word_id])
    return adj_labels

In [7]:
def tokenize_and_align(lm, text, enc_labels):
    '''Tokenize pre-split sentences with the tokenizer of `lm` and attach aligned labels.

    lm: name of the pretrained model whose tokenizer is loaded.
    text: list of sentences, each already split into words.
    enc_labels: one sequence of encoded word-level labels per sentence.

    Labels are aligned with align_text_with_label_first. Returns the tokenizer's
    batch encoding with an extra 'labels' entry; the remaining entries are
    whatever the tokenizer produces (input_ids and attention_mask, and for some
    tokenizers also token_type_ids).'''
    tokenizer = AutoTokenizer.from_pretrained(lm, add_prefix_space=True)
    encoded = tokenizer(text, truncation=True, is_split_into_words=True)

    encoded['labels'] = [
        align_text_with_label_first(encoded.word_ids(sentence_idx), sentence_labels)
        for sentence_idx, sentence_labels in enumerate(enc_labels)
    ]
    return encoded

In [8]:
def train_model(lm, lr, bs, ep, sl, label_names, train_tok, dev_tok):
    '''Fine-tune a pretrained language model for token classification and return it.

    lm: name of the pretrained model to load.
    lr: learning rate for the Adam optimizer.
    bs: batch size.
    ep: maximum number of epochs (training may stop earlier, see below).
    sl: sequence length every example is padded/truncated to.
    label_names: list of tag names; its length sets the number of output classes.
    train_tok, dev_tok: outputs of tokenize_and_align for the train and dev split.

    Training stops early when the validation loss has not improved for 3 epochs,
    and the weights of the best epoch are restored. Returns the fitted model.'''
    print("Training model: {}\nWith parameters:\nLearn rate: {}, Batch size: {}\nEpochs: {}, Sequence length: {}"
          .format(lm, lr, bs, ep, sl))

    X_train, Y_train = padding_convertTF(train_tok, sl)
    X_dev, Y_dev = padding_convertTF(dev_tok, sl)

    # Weight 0 on every -100 position so padding, special tokens and unlabeled
    # sub-tokens do not count towards the weighted accuracy.
    train_sample_weights = sample_weights(Y_train)
    dev_sample_weights = sample_weights(Y_dev)

    model = TFAutoModelForTokenClassification.from_pretrained(lm, num_labels=len(label_names))
    early_stopper = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True,
                                                     mode="min")
    optim = Adam(learning_rate=lr)
    # The model outputs logits; positions labelled -100 are excluded from the loss.
    loss = SparseCategoricalCrossentropy(from_logits=True, ignore_class=-100)
    model.compile(optimizer=optim, loss=loss, weighted_metrics=['accuracy'])
    # validation_data is given as the (x, y, sample_weight) tuple that Keras documents.
    model.fit(X_train, Y_train, sample_weight=train_sample_weights,
              validation_data=(X_dev, Y_dev, dev_sample_weights),
              epochs=ep, batch_size=bs, callbacks=[early_stopper])
    return model

In [9]:
def evaluate_model(model, lm, lr, bs, ep, sl, label_names, test_tok):
    '''Evaluate a trained model on the test split and return a classification report.

    model: the trained token-classification model.
    lm, lr, bs, ep: only echoed in the printed header.
    sl: sequence length the test examples are padded/truncated to.
    label_names: list mapping an encoded label to its tag name.
    test_tok: output of tokenize_and_align for the test split.

    Positions whose gold label is -100 are left out. Returns the text of
    sklearn's classification_report over all remaining tokens.'''
    print("Testing model: {}\nWith parameters:\nLearn rate: {}, Batch size: {}\nEpochs: {}, Sequence length: {}"
          .format(lm, lr, bs, ep, sl))

    X_test, Y_test = padding_convertTF(test_tok, sl)
    predictions = np.argmax(model.predict(X_test)['logits'], axis=-1)

    # (gold, predicted) tag-name pairs for every scored token, in sentence order.
    scored_pairs = [
        (label_names[gold_id], label_names[pred_id])
        for gold_row, pred_row in zip(Y_test, predictions)
        for gold_id, pred_id in zip(gold_row, pred_row)
        if gold_id != -100
    ]
    Y_gold = [gold_tag for gold_tag, _ in scored_pairs]
    Y_pred = [pred_tag for _, pred_tag in scored_pairs]

    return classification_report(Y_gold, Y_pred, digits=4, zero_division=True)

In [10]:
def sample_weights(Y_labels):
    '''Build per-token sample weights from the padded label sequences.

    Y_labels: iterable of label sequences (one per sentence).

    Returns a numpy array with the same layout as the input, holding 0 where
    the label is -100 and 1 everywhere else; it is passed to model.fit() as
    sample_weight.'''
    return np.array([
        [0 if label == -100 else 1 for label in sentence_labels]
        for sentence_labels in Y_labels
    ])

In [11]:
def evaluate_custom(model, sl, label_names, test_tok):
    '''Collect gold and predicted tag names sentence by sentence.

    model: the trained token-classification model.
    sl: sequence length the test examples are padded/truncated to.
    label_names: list mapping an encoded label to its tag name.
    test_tok: output of tokenize_and_align for the test split.

    Returns (Y_gold, Y_pred): two nested lists with one inner list per
    sentence, containing the tag names of every position whose gold label is
    not -100.'''
    X_test, Y_test = padding_convertTF(test_tok, sl)
    predictions = np.argmax(model.predict(X_test)['logits'], axis=-1)

    Y_gold, Y_pred = [], []
    for gold_row, pred_row in zip(Y_test, predictions):
        # Keep only the positions that carry a real gold label.
        kept = [(gold_id, pred_id) for gold_id, pred_id in zip(gold_row, pred_row) if gold_id != -100]
        Y_gold.append([label_names[gold_id] for gold_id, _ in kept])
        Y_pred.append([label_names[pred_id] for _, pred_id in kept])
    return Y_gold, Y_pred

In [12]:
def padding_convertTF(tok_output, sl):
    '''Pad the tokenizer output to a fixed length and convert the model inputs to tensors.

    tok_output: output of tokenize_and_align; must contain 'input_ids',
        'attention_mask' and 'labels', and may contain 'token_type_ids'.
        Any other entries are ignored.
    sl: sequence length every sequence is padded (at the end) or truncated to.

    Returns (features, labels_pad): features is a dict of tensors with the keys
    'input_ids', 'token_type_ids' (only when present in tok_output) and
    'attention_mask', padded with 0; labels_pad is the padded label array,
    filled with -100 at the padded positions.

    Raises ValueError when a required entry is missing.'''
    required_keys = ('input_ids', 'attention_mask', 'labels')
    missing_keys = [key for key in required_keys if key not in tok_output]
    if missing_keys:
        # Fail here with a clear message instead of handing None back to the caller.
        raise ValueError('tok_output is missing required entries: {}'.format(', '.join(missing_keys)))

    # NOTE(review): pad_sequences is used with its default truncation side; per the Keras
    # documentation that removes tokens from the start of over-long sequences -- confirm
    # no sentence exceeds sl, or that this is intended.
    features = {}
    for key in ('input_ids', 'token_type_ids', 'attention_mask'):
        if key in tok_output:
            features[key] = tf.constant(pad_sequences(tok_output[key], padding='post', maxlen=sl))

    labels_pad = pad_sequences(tok_output['labels'], padding='post', value=-100, maxlen=sl)
    return features, labels_pad

In [13]:
'''Reading in data'''
pmb_data = pd.read_csv('sem-pmb_4_0_0-gold.csv') # Change to "sem-pmb_4_0_0-all-gold.csv" to use with only multilingual data from PMB
# Collapse the rows of each 'sent_file' into one row whose columns hold lists
# (presumably one CSV row per token, so one resulting row per sentence -- confirm).
data = pmb_data.groupby('sent_file').agg({'token': list, 'lemma': list, 'from': list, 'to': list, 'semtag': list}).reset_index()
# Flat list of every tag occurrence; used to fit the label encoder.
labels = [val for sublist in data['semtag'] for val in sublist]

'''Encode labels'''
labels_enc, label_names = label_encoding(labels, data)
data['enc_semtag'] = labels_enc

'''Splitting data into train (80%), dev (10%) and test (10%)'''
# First hold out 20% of the sentences, then split that part in half into dev and test.
# Both splits use the same fixed random_state.
df_train, df_test = train_test_split(data, test_size=0.20, random_state=1234)
df_dev, df_test = train_test_split(df_test, test_size=0.50, random_state=1234)

In [14]:
'''Check total labels in train set'''
# Flatten the per-sentence encoded tags and count them.
labels_train = [val for sublist in df_train['enc_semtag'] for val in sublist]
len(labels_train)

92933

In [15]:
'''Check total labels in dev set'''
# Flatten the per-sentence encoded tags and count them.
labels_dev = [val for sublist in df_dev['enc_semtag'] for val in sublist]
len(labels_dev)

11535

In [16]:
'''Check if total labels in test set are equal to amount predicted by model in classification report'''
# Flatten the per-sentence encoded tags and count them; compare this number with the
# total support in the classification reports further down.
labels_test = [val for sublist in df_test['enc_semtag'] for val in sublist]
len(labels_test)

11526

# BERT (bert-base-uncased)

In [None]:
'''Tokenizing train'''
x_train = df_train['token'].tolist()
y_train = df_train['enc_semtag'].tolist()

# Join each sentence's tokens with spaces and split on whitespace again, so every
# sentence is a list of words for the tokenizer.
# NOTE(review): a token containing whitespace would be split into several words and
# no longer line up with its label -- presumably PMB tokens never contain spaces
# (multi-word tokens look like 'at~any~rate'); confirm.
x_train_adj = []
for train_text in x_train:
    tt = ' '.join(train_text)
    tt = tt.split()
    x_train_adj.append(tt)
# Sub-word tokenize and attach labels aligned to the first sub-token of each word.
train_tok = tokenize_and_align('bert-base-uncased', x_train_adj, y_train)

'''Tokenizing dev'''
x_dev = df_dev['token'].tolist()
y_dev = df_dev['enc_semtag'].tolist()

# Same join/split normalisation as for the train split.
x_dev_adj = []
for dev_text in x_dev:
    td = ' '.join(dev_text)
    td = td.split()
    x_dev_adj.append(td)
dev_tok = tokenize_and_align('bert-base-uncased', x_dev_adj, y_dev)

'''Tokenizing test'''
x_test = df_test['token'].tolist()
y_test = df_test['enc_semtag'].tolist()

# Same join/split normalisation as for the train split.
x_test_adj = []
for test_text in x_test:
    ts = ' '.join(test_text)
    ts = ts.split()
    x_test_adj.append(ts)
test_tok = tokenize_and_align('bert-base-uncased', x_test_adj, y_test)

'''Training and evaluating BERT-base (bert-base-uncased)'''
# Positional hyper-parameters: learning rate 5e-5, batch size 8, at most 20 epochs,
# sequence length 80. The evaluation call repeats them; apart from the sequence
# length they are only echoed in its printed header.
BERT_base = train_model('bert-base-uncased', 5e-5, 8, 20, 80, label_names, train_tok, dev_tok)
print(evaluate_model(BERT_base, 'bert-base-uncased', 5e-5, 8, 20, 80, label_names, test_tok))

Training model: bert-base-uncased
With parameters:
Learn rate: 5e-05, Batch size: 8
Epochs: 20, Sequence length: 80
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Testing model: bert-base-uncased
With parameters:
Learn rate: 5e-05, Batch size: 8
Epochs: 20, Sequence length: 80
              precision    recall  f1-score   support

         ALT     0.9655    0.9655    0.9655        29
         AND     0.9796    0.9057    0.9412        53
         APX     1.0000    0.9231    0.9600        13
         ART     0.2353    0.8000    0.3636         5
         BOT     1.0000    1.0000    1.0000         1
         BUT     1.0000    1.0000    1.0000         7
         CLO     1.0000    1.0000    1.0000        19
         COL     1.0000    1.0000    1.0000        12
         CON     0.9875    0.9966    0.9920       871
         COO     0.8889    1.0000    0.9412        16
         CTC     0.6667    1.0000    0.8000         2
         DEF     0.9900    0.9900    0.9900       797
         DE

In [None]:
# Per-sentence gold and predicted tags for the test set, using the same sequence
# length (80) as the BERT training/evaluation call above.
Y_gold1, Y_pred1 = evaluate_custom(BERT_base, 80, label_names, test_tok)



In [None]:
'''Sentences which BERT predicted wrong from the test set'''
# Print the 1-based sentence number, the gold tags and the predicted tags of every
# test sentence where at least one tag differs.
num1 = 0
for num1, (x1, y1) in enumerate(zip(Y_gold1, Y_pred1), start=1):
  if x1 == y1:
    continue
  print(num1, x1, y1)

23 ['QUE', 'PST', 'DEF', 'ART', 'EXS', 'QUE'] ['QUE', 'PST', 'DEF', 'CON', 'EXS', 'QUE']
36 ['DEF', 'CON', 'ENS', 'REL', 'DEF', 'CLO', 'NIL'] ['DEF', 'CON', 'EFS', 'REL', 'DEF', 'CLO', 'NIL']
38 ['DEF', 'CON', 'NOW', 'IST', 'NIL'] ['DEF', 'CON', 'NOW', 'EXS', 'NIL']
42 ['DEF', 'PER', 'EPS', 'DEF', 'NTH', 'NIL'] ['DEF', 'PER', 'EPS', 'DEF', 'ART', 'NIL']
50 ['PRO', 'NOW', 'EXG', 'REL', 'HAS', 'ROL', 'NIL'] ['PRO', 'NOW', 'EXG', 'REL', 'HAS', 'CON', 'NIL']
58 ['EMP', 'PRO', 'PST', 'EXS', 'NIL'] ['AND', 'PRO', 'PST', 'EXS', 'NIL']
79 ['DEF', 'PER', 'EPS', 'DEF', 'PER', 'NIL'] ['DEF', 'PER', 'EPS', 'DEF', 'LIT', 'NIL']
86 ['SUB', 'PRO', 'NOW', 'IST', 'NIL', 'SUB', 'DEF', 'PER', 'NOW', 'IST', 'NIL'] ['SUB', 'DST', 'NOW', 'IST', 'NIL', 'SUB', 'DEF', 'PER', 'NOW', 'IST', 'NIL']
103 ['DEF', 'PER', 'PST', 'EXS', 'REL', 'DEF', 'DOM', 'EQU', 'MOY', 'YOC', 'NIL'] ['DEF', 'PER', 'PST', 'EXS', 'REL', 'DEF', 'DOM', 'REL', 'MOY', 'YOC', 'NIL']
116 ['DEF', 'CON', 'EPS', 'ALT', 'NIL'] ['DEF', 'CON', 'EP

In [None]:
# Tokens of test sentence 50 (0-based index 49), one of the mismatches printed above.
x_test[49:50]

[['I', 'am', 'singing', 'with', 'my', 'children', '.']]

# RoBERTa (roberta-base)

In [None]:
'''Tokenizing train'''
x_train = df_train['token'].tolist()
y_train = df_train['enc_semtag'].tolist()

# Join each sentence's tokens with spaces and split on whitespace again, so every
# sentence is a list of words for the tokenizer.
# NOTE(review): a token containing whitespace would be split into several words and
# no longer line up with its label -- presumably PMB tokens never contain spaces
# (multi-word tokens look like 'at~any~rate'); confirm.
x_train_adj = []
for train_text in x_train:
    tt = ' '.join(train_text)
    tt = tt.split()
    x_train_adj.append(tt)
# Sub-word tokenize and attach labels aligned to the first sub-token of each word.
train_tok = tokenize_and_align('roberta-base', x_train_adj, y_train)

'''Tokenizing dev'''
x_dev = df_dev['token'].tolist()
y_dev = df_dev['enc_semtag'].tolist()

# Same join/split normalisation as for the train split.
x_dev_adj = []
for dev_text in x_dev:
    td = ' '.join(dev_text)
    td = td.split()
    x_dev_adj.append(td)
dev_tok = tokenize_and_align('roberta-base', x_dev_adj, y_dev)

'''Tokenizing test'''
x_test = df_test['token'].tolist()
y_test = df_test['enc_semtag'].tolist()

# Same join/split normalisation as for the train split.
x_test_adj = []
for test_text in x_test:
    ts = ' '.join(test_text)
    ts = ts.split()
    x_test_adj.append(ts)
test_tok = tokenize_and_align('roberta-base', x_test_adj, y_test)

'''Training and evaluating ROBERTA (roberta-base)'''
# Positional hyper-parameters: learning rate 5e-5, batch size 32, at most 20 epochs,
# sequence length 80.
ROBERTA_base = train_model('roberta-base', 5e-5, 32, 20, 80, label_names, train_tok, dev_tok)
# The model name is only echoed in the printed header; it must name the same
# checkpoint as the training call so the report is attributed to the right model.
print(evaluate_model(ROBERTA_base, 'roberta-base', 5e-5, 32, 20, 80, label_names, test_tok))

Training model: roberta-base
With parameters:
Learn rate: 5e-05, Batch size: 32
Epochs: 20, Sequence length: 80
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Testing model: robert-base
With parameters:
Learn rate: 5e-05, Batch size: 32
Epochs: 20, Sequence length: 80
              precision    recall  f1-score   support

         ALT     1.0000    0.9655    0.9825        29
         AND     0.9792    0.8868    0.9307        53
         APX     1.0000    1.0000    1.0000        13
         ART     0.5714    0.8000    0.6667         5
         BOT     1.0000    1.0000    1.0000         1
         BUT     0.8750    1.0000    0.9333         7
         CLO     1.0000    1.0000    1.0000        19
         COL     1.0000    1.0000    1.0000        12
         CON     0.9931    0.9885    0.9908       871
         COO     0.8750    0.8750    0.8750        16
         CTC     1.0000    1.0000    1.0000         2
         DEF     0.9962    0.9962    0.99

In [None]:
# Per-sentence gold and predicted tags for the test set, using the same sequence
# length (80) as the RoBERTa training/evaluation call above.
Y_gold2, Y_pred2 = evaluate_custom(ROBERTA_base, 80, label_names, test_tok)



In [None]:
'''Sentences which RoBERTa predicted wrong from the test set'''
# Print the 1-based sentence number, the gold tags and the predicted tags of every
# test sentence where at least one tag differs.
num2 = 0
for num2, (x2, y2) in enumerate(zip(Y_gold2, Y_pred2), start=1):
  if x2 == y2:
    continue
  print(num2, x2, y2)

36 ['DEF', 'CON', 'ENS', 'REL', 'DEF', 'CLO', 'NIL'] ['DEF', 'CON', 'EFS', 'REL', 'DEF', 'CLO', 'NIL']
39 ['PRO', 'ENS', 'REL', 'DEF', 'CON', 'NIL'] ['PRO', 'ENS', 'REL', 'DEF', 'ROL', 'NIL']
42 ['DEF', 'PER', 'EPS', 'DEF', 'NTH', 'NIL'] ['DEF', 'PER', 'EPS', 'DEF', 'CON', 'NIL']
50 ['PRO', 'NOW', 'EXG', 'REL', 'HAS', 'ROL', 'NIL'] ['PRO', 'NOW', 'EXG', 'REL', 'HAS', 'CON', 'NIL']
86 ['SUB', 'PRO', 'NOW', 'IST', 'NIL', 'SUB', 'DEF', 'PER', 'NOW', 'IST', 'NIL'] ['SUB', 'DST', 'NOW', 'IST', 'NIL', 'SUB', 'DEF', 'PER', 'NOW', 'IST', 'NIL']
103 ['DEF', 'PER', 'PST', 'EXS', 'REL', 'DEF', 'DOM', 'EQU', 'MOY', 'YOC', 'NIL'] ['DEF', 'PER', 'PST', 'EXS', 'REL', 'DEF', 'DOM', 'REL', 'MOY', 'DOM', 'NIL']
116 ['DEF', 'CON', 'EPS', 'ALT', 'NIL'] ['DEF', 'CON', 'EPS', 'IST', 'NIL']
161 ['DIS', 'CON', 'ENS', 'NIL', 'QUC', 'DIS', 'QUC', 'CON', 'NIL'] ['DEF', 'PER', 'ENS', 'NIL', 'QUC', 'DIS', 'QUC', 'CON', 'NIL']
173 ['REL', 'DEF', 'YOC', 'NIL', 'DEF', 'CON', 'PST', 'EXS', 'REL', 'DEF', 'GEO', 'NIL'] 

In [None]:
# Tokens of test sentence 689 (0-based index 688) -- presumably one of the mismatches
# in the (truncated) printout above; confirm.
x_test[688:689]

[['If',
  'he',
  'is',
  'not',
  'rich',
  ',',
  'he',
  'is',
  'at~any~rate',
  'happy',
  '.']]

# BERT Multilingual (bert-base-multilingual-uncased)

In [18]:
# NOTE(review): the recorded report below shows larger supports than the BERT and
# RoBERTa runs (e.g. CON 1343 vs 871), which suggests this cell was run after loading
# a different CSV in the data cell -- confirm which data file belongs to this run.
'''Tokenizing train'''
x_train = df_train['token'].tolist()
y_train = df_train['enc_semtag'].tolist()

# Join each sentence's tokens with spaces and split on whitespace again, so every
# sentence is a list of words for the tokenizer.
# NOTE(review): a token containing whitespace would be split into several words and
# no longer line up with its label -- presumably PMB tokens never contain spaces
# (multi-word tokens look like 'at~any~rate'); confirm.
x_train_adj = []
for train_text in x_train:
    tt = ' '.join(train_text)
    tt = tt.split()
    x_train_adj.append(tt)
# Sub-word tokenize and attach labels aligned to the first sub-token of each word.
train_tok = tokenize_and_align('bert-base-multilingual-uncased', x_train_adj, y_train)

'''Tokenizing dev'''
x_dev = df_dev['token'].tolist()
y_dev = df_dev['enc_semtag'].tolist()

# Same join/split normalisation as for the train split.
x_dev_adj = []
for dev_text in x_dev:
    td = ' '.join(dev_text)
    td = td.split()
    x_dev_adj.append(td)
dev_tok = tokenize_and_align('bert-base-multilingual-uncased', x_dev_adj, y_dev)

'''Tokenizing test'''
x_test = df_test['token'].tolist()
y_test = df_test['enc_semtag'].tolist()

# Same join/split normalisation as for the train split.
x_test_adj = []
for test_text in x_test:
    ts = ' '.join(test_text)
    ts = ts.split()
    x_test_adj.append(ts)
test_tok = tokenize_and_align('bert-base-multilingual-uncased', x_test_adj, y_test)

'''Training and evaluating BERT-multilingual (bert-base-multilingual-uncased)'''
# Positional hyper-parameters: learning rate 5e-5, batch size 16, at most 20 epochs,
# sequence length 70 (the BERT and RoBERTa runs above use 80).
BERT_multilingual = train_model('bert-base-multilingual-uncased', 5e-5, 16, 20, 70, label_names, train_tok, dev_tok)
print(evaluate_model(BERT_multilingual, 'bert-base-multilingual-uncased', 5e-5, 16, 20, 70, label_names, test_tok))

Training model: bert-base-multilingual-uncased
With parameters:
Learn rate: 5e-05, Batch size: 16
Epochs: 20, Sequence length: 70
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Testing model: bert-base-multilingual-uncased
With parameters:
Learn rate: 5e-05, Batch size: 16
Epochs: 20, Sequence length: 70
              precision    recall  f1-score   support

         ALT     0.9767    1.0000    0.9882        42
         AND     0.9412    0.8649    0.9014        74
         APX     0.9167    1.0000    0.9565        11
         ART     0.7333    0.7857    0.7586        14
         BOT     1.0000    1.0000    1.0000         2
         BUT     0.7778    1.0000    0.8750         7
         CLO     0.9310    1.0000    0.9643        27
         COL     0.9655    1.0000    0.9825        28
         CON     0.9814    0.9814    0.9814      1343
         COO     0.6667    1.0000    0.8000        12
         CTC     1.0000    0.5000    0.6667         2
         DEF     0.9886   

In [19]:
# Per-sentence gold and predicted tags for the test set, using the same sequence
# length (70) as the multilingual training/evaluation call above.
Y_gold3, Y_pred3 = evaluate_custom(BERT_multilingual, 70, label_names, test_tok)



In [23]:
'''Sentences which BERT multilingual predicted wrong from the test set'''
# Print the 1-based sentence number, the gold tags and the predicted tags of every
# test sentence where at least one tag differs.
num3 = 0
for num3, (x3, y3) in enumerate(zip(Y_gold3, Y_pred3), start=1):
  if x3 == y3:
    continue
  print(num3, x3, y3)

7 ['PRO', 'PST', 'EXS', 'NIL'] ['PRO', 'PST', 'IST', 'NIL']
8 ['DEF', 'CON', 'NOW', 'NOT', 'IST', 'NIL'] ['DEF', 'CON', 'NOW', 'IST', 'IST', 'NIL']
15 ['QUE', 'PST', 'DEF', 'YOC', 'CON', 'EXS', 'DIS', 'NIL', 'QUE'] ['QUE', 'PST', 'DEF', 'YOC', 'HAP', 'EXS', 'DIS', 'NIL', 'QUE']
25 ['PRO', 'NOW', 'IST', 'EMP'] ['HAS', 'NOW', 'IST', 'EMP']
31 ['DEF', 'CON', 'NOW', 'DEG', 'NIL', 'BUT', 'DEF', 'CON', 'NOW', 'NOT', 'INT', 'IST', 'NIL'] ['DEF', 'CON', 'NOW', 'IST', 'NIL', 'BUT', 'DEF', 'CON', 'NOW', 'NOT', 'INT', 'IST', 'NIL']
40 ['DEF', 'CON', 'EPS', 'DEF', 'CON', 'NIL'] ['DEF', 'CON', 'ENS', 'DEF', 'CON', 'NIL']
41 ['HAS', 'CON', 'PST', 'INT', 'IST', 'NIL'] ['HAS', 'CON', 'PST', 'INT', 'DEG', 'NIL']
57 ['PRX', 'CON', 'NOW', 'EXG', 'NIL'] ['PRX', 'POS', 'NOW', 'EXG', 'NIL']
59 ['PRO', 'EPS', 'QUV', 'CON', 'REL', 'DEF', 'CON', 'NIL'] ['PRO', 'EPS', 'DIS', 'CON', 'REL', 'DEF', 'CON', 'NIL']
63 ['PRO', 'ENS', 'NIL', 'NOT', 'NIL'] ['PRO', 'ENS', 'PRO', 'NOT', 'NIL']
66 ['DEF', 'PER', 'ENS', 'DE

In [32]:
# Tokens of test sentence 418 (0-based index 417) -- presumably one of the mismatches
# in the (truncated) printout above; confirm.
x_test[417:418]

[['Plots', 'gingen', 'de', 'lichten', 'uit', '.']]

# Example tokenize output

In [None]:
'''
Example of how the BERT model tokenizes the sentence, and the label alignment process.

Example sentence:   'ø', 'Taninna', 'is', 'paling', '.'
Encoded labels:      11,    48,      44,     22,    42
Encoded Text:      '[CLS]', 'ø', 'tan', '##inn', '##a', 'is', 'pali', '##ng', '.', '[SEP]'
Label alignment:    -100,    11,   48,   -100,   -100,   44,    22,    -100,  42,   -100
'''

In [None]:
# Re-tokenize the test split with the bert-base-uncased tokenizer to inspect how
# sentences are split into sub-words and how the labels are aligned.
test_text = df_test['token'].tolist()
test_label = df_test['enc_semtag'].tolist()
# Original (string) tags; not used in the cells shown below.
test_label_ori = df_test['semtag'].tolist()

# Same join/split normalisation as in the training cells.
test_text_adj = []
for test_t in test_text:
    t = ' '.join(test_t)
    t = t.split()
    test_text_adj.append(t)

example_tok = tokenize_and_align('bert-base-uncased', test_text_adj, test_label)

In [None]:
# Print (1-based number, word ids, labels) for every test sentence whose token count
# differs from its label count plus two.
num = 0
for n, label in enumerate(test_label):
  num = n + 1
  sentence_word_ids = example_tok.word_ids(n)
  if len(sentence_word_ids) == len(label) + 2:
    continue
  print(num, sentence_word_ids, label)

In [None]:
# Show the words and encoded labels of test sentence 53 (0-based index 52).
for n1, l1 in zip(test_text[52:53], test_label[52:53]):
  print(n1)
  print(l1)

# Print the sub-word tokens of every test sentence with a 1-based sentence number.
# NOTE(review): this prints the whole test set, not just the example sentence.
num10 = 0
for n, label in enumerate(test_label):
  num10 += 1
  print(num10, example_tok.tokens(n))
  
# Aligned labels (with -100 on ignored positions) of the same example sentence.
example_tok['labels'][52:53]