In [None]:
import os
from os.path import join
import pandas as pd
import numpy as np
import tensorflow as tf

import torch

from transformers import RobertaTokenizer, AutoModelForSequenceClassification, AutoConfig
from transformers import RobertaForMaskedLM, RobertaModel
from transformers import TFRobertaForSequenceClassification, TFRobertaModel
from transformers import pipeline
from transformers import InputExample, InputFeatures

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

### Functions

In [None]:
def _encode_and_pad(sentence, tokenizer, max_length, pad_token, pad_token_segment_id):
    """Encode one sentence and right-pad ids, mask and segment ids to ``max_length``."""
    encoded = tokenizer.encode_plus(sentence,
                                    add_special_tokens=True,
                                    return_token_type_ids=True,
                                    truncation=True,
                                    max_length=max_length)
    ids = list(encoded["input_ids"])
    segments = list(encoded["token_type_ids"])

    # A non-positive count multiplies to an empty list, so an encoding that
    # already fills max_length is left unpadded.
    n_pad = max_length - len(ids)

    mask = [1] * len(ids) + [0] * n_pad          # 1 = real token, 0 = padding
    ids = ids + [pad_token] * n_pad
    segments = segments + [pad_token_segment_id] * n_pad
    return ids, mask, segments


def convert_to_input(sentences, tokenizer, max_length=256, pad_token=0, pad_token_segment_id=0):
    """Tokenize sentences and right-pad every encoding to a fixed length.

    Each sentence is encoded with ``tokenizer.encode_plus`` (special tokens
    added, truncated to ``max_length``) and padded on the right so every row
    has ``max_length`` entries.

    The padding settings are explicit keyword parameters instead of notebook
    globals, so the function works no matter which cells have been executed.
    The defaults (256 / 0 / 0) mirror the values the notebook configures; the
    function does not read module-level variables, so pass other values
    explicitly to override them.

    NOTE(review): 0 is kept as the default pad id for backward compatibility;
    confirm it matches ``tokenizer.pad_token_id`` for the checkpoint in use.

    Args:
        sentences: Iterable of strings to encode.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``"input_ids"`` and ``"token_type_ids"``.
        max_length: Truncation limit and padded row length.
        pad_token: Id appended to ``input_ids`` as padding.
        pad_token_segment_id: Id appended to ``token_type_ids`` as padding.

    Returns:
        A list ``[input_ids, attention_masks, token_type_ids]`` of numpy
        arrays, one row per sentence.
    """
    input_ids = []
    attention_masks = []
    token_type_ids = []

    for sentence in sentences:
        ids, mask, segments = _encode_and_pad(
            sentence, tokenizer, max_length, pad_token, pad_token_segment_id
        )
        input_ids.append(ids)
        attention_masks.append(mask)
        token_type_ids.append(segments)

    return [np.asarray(input_ids),
            np.asarray(attention_masks),
            np.asarray(token_type_ids)]

def example_to_features(input_ids, attention_masks, token_type_ids, y):
    """Bundle one example's inputs into a ``(features, label)`` pair.

    The features are returned as a dict keyed by ``"input_ids"``,
    ``"attention_mask"`` and ``"token_type_ids"``; ``y`` is passed through
    unchanged as the second element of the tuple.
    """
    features = dict(
        input_ids=input_ids,
        attention_mask=attention_masks,
        token_type_ids=token_type_ids,
    )
    return features, y

def predict(tokenizer, model, sentences, max_length=256):
    """Return the predicted class index for each sentence.

    The sentences are tokenized as one padded, truncated batch of TensorFlow
    tensors, passed through ``model``, and the first element of the model
    output is softmax-normalised before taking the arg-max per row.

    Args:
        tokenizer: Callable tokenizer accepting ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors``.
        model: Callable model; element ``[0]`` of its output is used as the
            per-class scores.
        sentences: List of strings to classify.
        max_length: Truncation limit passed to the tokenizer. Defaults to 256,
            the value that was previously hard-coded.

    Returns:
        numpy array with one class index per sentence.
    """
    encoded = tokenizer(sentences, max_length=max_length, padding=True,
                        truncation=True, return_tensors='tf')
    scores = model(encoded)[0]
    probabilities = tf.nn.softmax(scores, axis=-1)
    return tf.argmax(probabilities, axis=1).numpy()

def predict_alt(tokenizer, model, sentences, max_length=256):
    """Return predicted class indices together with the class probabilities.

    Same pipeline as ``predict`` (batch tokenization, forward pass, softmax
    over the first element of the model output, arg-max per row), but the
    softmax output is returned as well.

    Args:
        tokenizer: Callable tokenizer accepting ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors``.
        model: Callable model; element ``[0]`` of its output is used as the
            per-class scores.
        sentences: List of strings to classify.
        max_length: Truncation limit passed to the tokenizer. Defaults to 256,
            the value that was previously hard-coded.

    Returns:
        Tuple ``(label, tf_predictions)``: a numpy array of class indices and
        the TensorFlow tensor of softmax probabilities they were taken from.
    """
    encoded = tokenizer(sentences, max_length=max_length, padding=True,
                        truncation=True, return_tensors='tf')
    scores = model(encoded)[0]
    tf_predictions = tf.nn.softmax(scores, axis=-1)
    label = tf.argmax(tf_predictions, axis=1).numpy()
    return label, tf_predictions

### Train sentence classification

In [None]:
# Locations of the pretrained checkpoint, the fine-tuned output and the data folder.
MODEL_PATH = join('..', 'data', 'models', 'original')      # folder containing the RobBERT model
MODEL_FINAL_PATH = join('..', 'data', 'models', 'model1')  # folder where the fine-tuned model is saved
OUTPUT_DATA_PATH = join('..', 'data', 'output', '')        # trailing '' makes the path end with a separator

# Tokenizer and two-label classification model, both loaded from the same checkpoint.
tokenizer_nl = RobertaTokenizer.from_pretrained(MODEL_PATH)
model_nl = TFRobertaForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=2,
)

### Read data

In [None]:
# Import dataframe with labelled sentences
df = pd.read_excel(OUTPUT_DATA_PATH + 'df_results_mvb_big_labelled.xlsx', index_col=0)

# Keep the Dutch rows only and renumber them from 0.
df_nl = df[df['dc:language'] == 'nl'].reset_index()

# Labelled subset. The explicit .copy() makes dft an independent frame, so the
# column writes below (and in later cells) change dft itself instead of a slice
# of df_nl, which is what triggers pandas' SettingWithCopyWarning.
dft = df_nl[df_nl['label'].notna()].copy()

# necessary step because otherwise the tokenizer produces incorrect tokens during training
# (vectorised literal replacement instead of a row-by-row Python loop)
dft['dnb_nlp:sentence'] = dft['dnb_nlp:sentence'].str.replace("\n", " ", regex=False)

### Train model

In [None]:
# Inputs: the formatted sentences with non-ASCII characters dropped and
# newlines turned into spaces (both clean-ups done in a single pass).
X = np.array([
    text.encode("ascii", "ignore").decode().replace('\n', ' ')
    for text in dft['dnb_nlp:formatted']
])
# Targets: the manual labels.
y = np.array(dft['label'])

# Fixed random_state keeps the 80/20 split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=13,
)

print("Train dataset shape: {0}, \nTest dataset shape: {1}".format(X_train.shape, X_test.shape))

# Padding settings; they must be defined before convert_to_input is called.
max_length = 256
pad_token = 0
pad_token_segment_id = 0

X_train_input = convert_to_input(X_train, tokenizer_nl)
X_test_input = convert_to_input(X_test, tokenizer_nl)

# NOTE(review): the training set is repeated 5 times here and fit() is also
# called with epochs=5, so every Keras epoch iterates all 5 repetitions —
# confirm this is intended.
train_ds = (
    tf.data.Dataset.from_tensor_slices((*X_train_input, y_train))
    .map(example_to_features)
    .shuffle(100)
    .batch(32)
    .repeat(5)
)
test_ds = (
    tf.data.Dataset.from_tensor_slices((*X_test_input, y_test))
    .map(example_to_features)
    .batch(64)
)

In [None]:
# Adam with gradient-norm clipping.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5,
                                     epsilon=1e-08,
                                     clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)  # can be used when there are two or more label classes
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model_nl.compile(optimizer=optimizer, loss=loss, metrics=[metric])
model_nl.fit(train_ds, epochs=5, validation_data=test_ds)  # train the model for a fixed number of epochs

# Save the fine-tuned model together with its tokenizer files, so that
# MODEL_FINAL_PATH is a self-contained checkpoint that can be loaded on its own.
model_nl.save_pretrained(save_directory=MODEL_FINAL_PATH)
tokenizer_nl.save_pretrained(MODEL_FINAL_PATH)

### Test performance Dutch model

In [None]:
# Per-language registries for the fine-tuned models and their tokenizers.
model = {}
tokenizer = {}

model['nl'] = TFRobertaForSequenceClassification.from_pretrained(MODEL_FINAL_PATH, num_labels=2)
# The original referenced MODEL_NL, which is never defined (NameError).
# Load the tokenizer from MODEL_PATH, the same checkpoint tokenizer_nl was
# loaded from for training.
tokenizer['nl'] = RobertaTokenizer.from_pretrained(MODEL_PATH)

In [None]:
# Whole labelled set
# NOTE(review): this includes the sentences the model was trained on, so the
# scores below are not a held-out estimate — confirm this is intended.

# dft was filtered out of df_nl; take an explicit copy so adding and filling
# the 'prediction' column writes to dft itself rather than to a possible view.
dft = dft.copy()
dft['prediction'] = -1  # sentence is not processed

for row in dft.index:
    labels = predict(tokenizer['nl'], model['nl'], [dft.at[row, 'dnb_nlp:formatted']])
    # predict returns one label per input sentence; take the single label as a
    # plain int instead of storing a 1-element array in a scalar cell.
    p = int(labels[0])
    print("+" if p == 1 else "-", end='')
    dft.at[row, 'prediction'] = p

print(confusion_matrix(dft['label'], dft['prediction']))
print(accuracy_score(dft['label'], dft['prediction']))
print(precision_score(dft['label'], dft['prediction'], average='weighted'))
print(recall_score(dft['label'], dft['prediction'], average='weighted'))
print(f1_score(dft['label'], dft['prediction'], average='weighted'))

### Apply model on complete dataset

In [None]:
# Complete Dutch dataset (labelled and unlabelled rows)
# NOTE(review): the model is applied to 'dnb_nlp:sentence' as stored in df_nl,
# while training used a cleaned 'dnb_nlp:formatted' column — confirm this is
# the intended input column and that no newline clean-up is needed here.
df_nl['prediction'] = -1  # sentence is not processed

for row in df_nl.index:
    labels = predict(tokenizer['nl'], model['nl'], [df_nl.at[row, 'dnb_nlp:sentence']])
    # predict returns one label per input sentence; take the single label as a
    # plain int instead of storing a 1-element array in a scalar cell.
    p = int(labels[0])
    print("+" if p == 1 else "-", end='')
    df_nl.at[row, 'prediction'] = p

df_nl.to_excel(OUTPUT_DATA_PATH + 'df_results_mvb_big_prediction.xlsx')