In [1]:
import dataiku
import dataikuapi
from dataiku import pandasutils as pdu
from dataiku import Dataset

import pandas as pd
import numpy as np
import tensorflow as tf

import torch

from transformers import RobertaTokenizer, AutoModelForSequenceClassification, AutoConfig
from transformers import RobertaForMaskedLM, RobertaModel
from transformers import TFRobertaForSequenceClassification, TFRobertaModel
from transformers import pipeline
from transformers import InputExample, InputFeatures

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score



In [2]:
def convert_to_input(sentences, tokenizer, max_length=None, pad_token=None,
                     pad_token_segment_id=None):
    """Encode sentences into fixed-length id, mask and segment arrays.

    Each sentence is encoded with ``tokenizer.encode_plus`` (special tokens
    added, truncated to ``max_length``) and then right-padded to exactly
    ``max_length`` entries.

    Args:
        sentences: Iterable of texts to encode.
        tokenizer: Object exposing ``encode_plus`` whose result contains
            ``"input_ids"`` and ``"token_type_ids"`` lists.
        max_length: Target length of every encoded row. When omitted, a
            module-level ``max_length`` is used if one exists (the previous
            behaviour of this function), otherwise 256, the length used by
            the prediction helpers in this notebook.
        pad_token: Id appended to ``input_ids`` as padding. When omitted, a
            module-level ``pad_token`` is used if one exists, otherwise
            ``tokenizer.pad_token_id``, otherwise 0.
        pad_token_segment_id: Id appended to ``token_type_ids`` as padding.
            When omitted, a module-level ``pad_token_segment_id`` is used if
            one exists, otherwise 0.

    Returns:
        A list ``[input_ids, attention_masks, token_type_ids]`` of NumPy
        arrays with one row per sentence. The attention mask is 1 for real
        tokens and 0 for padding.
    """
    # Settings used to be read from notebook globals that no cell defines,
    # which raised NameError on a fresh kernel. Resolution order is now:
    # explicit argument > same-named global (legacy) > default.
    legacy_settings = globals()
    if max_length is None:
        max_length = legacy_settings.get("max_length", 256)
    if pad_token is None:
        pad_token = legacy_settings.get("pad_token")
    if pad_token is None:
        pad_token = getattr(tokenizer, "pad_token_id", None)
    if pad_token is None:
        pad_token = 0
    if pad_token_segment_id is None:
        pad_token_segment_id = legacy_settings.get("pad_token_segment_id", 0)

    input_ids, attention_masks, token_type_ids = [], [], []

    for sentence in sentences:
        encoded = tokenizer.encode_plus(sentence,
                                        add_special_tokens=True,
                                        return_token_type_ids=True,
                                        truncation=True,
                                        max_length=max_length)
        ids = encoded["input_ids"]
        segments = encoded["token_type_ids"]

        # A non-positive padding length yields an empty pad list.
        padding_length = max_length - len(ids)

        input_ids.append(ids + [pad_token] * padding_length)
        attention_masks.append([1] * len(ids) + [0] * padding_length)
        token_type_ids.append(segments + [pad_token_segment_id] * padding_length)

    return [np.asarray(input_ids),
            np.asarray(attention_masks),
            np.asarray(token_type_ids)]

def example_to_features(input_ids, attention_masks, token_type_ids, y):
    """Pair the three model inputs, keyed by name, with their target.

    Returns:
        A ``(features, y)`` tuple where ``features`` is a dict with the keys
        ``input_ids``, ``attention_mask`` and ``token_type_ids``.
    """
    features = dict(
        input_ids=input_ids,
        attention_mask=attention_masks,
        token_type_ids=token_type_ids,
    )
    return features, y

def predict(tokenizer, model, sentences, max_length=256):
    """Return the predicted class index for each sentence.

    Thin wrapper around ``predict_alt`` that drops the probabilities.

    Args:
        tokenizer: Callable tokenizer; invoked with ``return_tensors='tf'``.
        model: Callable model whose output's first element holds the logits.
        sentences: List of texts to classify in one forward pass.
        max_length: Truncation length passed to the tokenizer (default 256).

    Returns:
        NumPy array of class indices, one per sentence.
    """
    label, _ = predict_alt(tokenizer, model, sentences, max_length=max_length)
    return label

def predict_alt(tokenizer, model, sentences, max_length=256):
    """Return predicted class indices together with class probabilities.

    Args:
        tokenizer: Callable tokenizer; invoked with ``return_tensors='tf'``.
        model: Callable model whose output's first element holds the logits.
        sentences: List of texts to classify in one forward pass.
        max_length: Truncation length passed to the tokenizer (default 256).

    Returns:
        A ``(label, probabilities)`` tuple: ``label`` is a NumPy array with
        the argmax class per sentence, ``probabilities`` is the softmax of
        the logits over the last axis.
    """
    # padding=True pads only up to the longest sentence in this batch.
    tf_batch = tokenizer(sentences, max_length=max_length, padding=True,
                         truncation=True, return_tensors='tf')
    tf_outputs = model(tf_batch)
    tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
    label = tf.argmax(tf_predictions, axis=1).numpy()
    return label, tf_predictions

In [3]:
# Local directories of the pretrained base checkpoints.
# MODEL_NL is the path handed to RobertaTokenizer.from_pretrained for the
# Dutch ('nl') tokenizer in this notebook.
MODEL_NL = '/data/library/python/robbert-v2-dutch-base'
# MODEL_EN is not referenced by any cell visible in this notebook --
# presumably the English counterpart; TODO confirm it is still needed.
MODEL_EN = '/data/library/python/roberta-base'

In [4]:
# Read the labelled paragraph dataset and index it by its 'col_0' column.
_labelled_dataset = Dataset('df_results_mvb_big_para_complete_labelled')
df = _labelled_dataset.get_dataframe().set_index('col_0')

# Split per language; reset_index() turns 'col_0' back into a regular column
# and gives each subset a fresh 0..n-1 index.
_language = df['dc:language']
df_nl = df.loc[_language == 'nl'].reset_index()
df_en = df.loc[_language == 'en'].reset_index()
# Previously experimented with keeping only rows that have a label:
#   dft = df_nl[df_nl.label.notna()]

In [5]:
# Per-language registries, keyed by language code.
model, tokenizer = {}, {}

# Managed-folder directory holding the fine-tuned classifier checkpoint.
model_dir = '/data/dataiku/managed_folders/solvency2/TV_TEXTMINING/Lc1UMaK3'

# Fine-tuned 3-class classifier, plus the tokenizer of the base Dutch model.
model['nl'] = TFRobertaForSequenceClassification.from_pretrained(
    model_dir,
    num_labels=3,
)
tokenizer['nl'] = RobertaTokenizer.from_pretrained(MODEL_NL)

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at /data/dataiku/managed_folders/solvency2/TV_TEXTMINING/Lc1UMaK3.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [6]:
# Whole labelled set: classify every Dutch sentence and store the class index.
#
# Sentences are sent to the model in batches instead of one forward pass per
# row. predict() returns one label per sentence, so a whole slice of the
# 'prediction' column is filled at once with plain integers (the per-row
# version stored a one-element array into a scalar cell).
BATCH_SIZE = 32  # sentences per forward pass; lower it if memory is tight

df_nl['prediction'] = -1  # -1 marks a sentence that has not been processed

# df_nl was built with reset_index(), so its index labels are unique and
# slicing them selects exactly one row per label.
row_labels = df_nl.index
for start in range(0, len(row_labels), BATCH_SIZE):
    batch_rows = row_labels[start:start + BATCH_SIZE]
    sentences = df_nl.loc[batch_rows, 'dnb_nlp:formatted'].tolist()
    labels = predict(tokenizer['nl'], model['nl'], sentences)
    # Progress trace: '+' for class 1, '-' for any other class.
    print("".join("+" if label == 1 else "-" for label in labels), end='')
    df_nl.loc[batch_rows, 'prediction'] = labels

---+-++---+++++--++++++--+++--+++-+--++++-----+-++++++--+++-+---++-+++++++--+-+-+-+--++--++++++-+-+------++-++++++-+--+--------+-++++-+++-++-+-++-++++++-++-------+++++---+---+---++-----+-+---+-++---+--++++++---++-++++++--+--++---++-+-+-++-+-+-++-++---++--+----+++----+-++++----+++-+---+-+--++++++-++----++--------+++-+-+------+----+++-++----+++-+----++--+---+-+++++-+++++++--+++++-+--+----+++++-+-++--+--++++-+--+++--++---+-----+++++-++-++++++++++++++-+-+-+--+++++++++++++-+++-+-++++++----++-+++++-++-----++-++++-+-++++++--------+---+++++------++-++--+-+---+++-+++----+-+-+-+----------+-+-------++--++---++-+-+------++++++---++++++++++++++-++++++++++++-+-+-++----+--+++++-+++----+--+-+++--++++----+---++-----+-----+----++---+++-++----+++-++++-+---------++----+----+-+--+--++-+--+---++--+++--+---+----+-+------+++++----+--++++-++++-+++++-------++++++++++-+++-++---------+-+-++++++++++++-++-++-------++-++-------------------++--+++-++--++-----++--+++++--++-+----++-++-------++-+-++-++--++--+-++++++---+

In [7]:
# Create (or recreate) the managed output dataset and write the predictions.
client = dataiku.api_client()
project = client.get_project("CLIMATEANALYSIS")
name = 'results_all_labelled_complete_specificity'

# Managed dataset stored on the 'filesystem_managed_solvency2' connection.
d = dataikuapi.dss.dataset.DSSManagedDatasetCreationHelper(project, name)
d = d.with_store_into('filesystem_managed_solvency2')

# create() takes an `overwrite` flag, not a dataset name. The previous call
# passed `name` positionally, which only acted as overwrite=True because a
# non-empty string is truthy. Spell the intent out explicitly.
d.create(overwrite=True)

# NOTE(review): Dataset(name) resolves against the notebook's current project;
# confirm that this is "CLIMATEANALYSIS", where the dataset was just created.
output = Dataset(name)
output.write_with_schema(df_nl, dropAndCreate=True)

22243 rows successfully written (1KCOo1Ioez)
