In [1]:
import dataiku
import dataikuapi
from dataiku import pandasutils as pdu
from dataiku import Dataset

import pandas as pd
import numpy as np
import tensorflow as tf

import torch

from transformers import RobertaTokenizer, AutoModelForSequenceClassification, AutoConfig
from transformers import RobertaForMaskedLM, RobertaModel
from transformers import TFRobertaForSequenceClassification, TFRobertaModel
from transformers import pipeline
from transformers import InputExample, InputFeatures

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score



### Functions

In [2]:
def convert_to_input(sentences,tokenizer):
    
    input_ids = []
    attention_masks = []
    token_type_ids = []
  
    for x in sentences:
        inputs = tokenizer.encode_plus(x, 
                                          add_special_tokens=True, 
                                          return_token_type_ids=True,
                                          truncation=True, 
                                          max_length=max_length)
        i, t = inputs["input_ids"], inputs["token_type_ids"]
        m = [1] * len(i)

        padding_length = max_length - len(i)

        i = i + ([pad_token] * padding_length)
        m = m + ([0] * padding_length)
        t = t + ([pad_token_segment_id] * padding_length)

        input_ids.append(i)
        attention_masks.append(m)
        token_type_ids.append(t)
  
    return [np.asarray(input_ids), 
            np.asarray(attention_masks), 
            np.asarray(token_type_ids)]

def example_to_features(input_ids, attention_masks, token_type_ids, y):
    return {"input_ids": input_ids,
            "attention_mask": attention_masks,
            "token_type_ids": token_type_ids},y

def predict(tokenizer, model, sentences):
    tf_batch = tokenizer(sentences, max_length=256, padding=True, truncation=True, return_tensors='tf')
    tf_outputs = model(tf_batch)
    tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
    label = tf.argmax(tf_predictions, axis=1)
    label = label.numpy()
    return label

def predict_alt(tokenizer, model, sentences):
    tf_batch = tokenizer(sentences, max_length=256, padding=True, truncation=True, return_tensors='tf')
    tf_outputs = model(tf_batch)
    tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
    label = tf.argmax(tf_predictions, axis=1)
    label = label.numpy()
    return label, tf_predictions

In [3]:
MODEL_NL = '/data/library/python/robbert-v2-dutch-base'
MODEL_EN = '/data/library/python/roberta-base'

### Train sentence classification

In [4]:
model_nl = TFRobertaForSequenceClassification.from_pretrained(MODEL_NL,num_labels=2)
tokenizer_nl = RobertaTokenizer.from_pretrained(MODEL_NL)

# model_en = TFRobertaForSequenceClassification.from_pretrained(MODEL_EN,num_labels=4)
# tokenizer_en = RobertaTokenizer.from_pretrained(MODEL_EN)

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at /data/library/python/robbert-v2-dutch-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Read data

In [5]:
df = Dataset('df_results_mvb_big_labelled').get_dataframe().set_index('col_0')
df_nl = df[df['dc:language']=='nl'].reset_index()
df_en = df[df['dc:language']=='en'].reset_index()
dft = df_nl[df_nl.label.isna()==False]

In [6]:
# necessary step because otherwise the tokenizer produces incorrect token during training
for row in dft.index:
    dft.loc[row, 'dnb_nlp:formatted'] = dft.loc[row, 'dnb_nlp:formatted'].replace("\n", " ")
# for row in df_en.index:
#     df_en.loc[row, 'dnb_nlp:sentence'] = df_en.loc[row, 'dnb_nlp:sentence'].replace("\n", " ")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


### Train Dutch model

In [7]:
# X = (np.array(dft['dnb_nlp:formatted']))
# y = (np.array(dft['label']))
# X = np.array([test.encode("ascii", "ignore").decode() for test in list(X)])
# X = np.array([test.replace('\n',' ') for test in list(X)])

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

# print("Train dataset shape: {0}, \nTest dataset shape: {1}".format(X_train.shape, X_test.shape))

# pad_token = 0
# pad_token_segment_id = 0
# max_length = 256

# X_test_input = convert_to_input(X_test,tokenizer_nl)
# X_train_input = convert_to_input(X_train,tokenizer_nl)

# train_ds = tf.data.Dataset.from_tensor_slices((X_train_input[0],X_train_input[1],X_train_input[2],y_train)).map(example_to_features).shuffle(100).batch(32).repeat(5)
# test_ds = tf.data.Dataset.from_tensor_slices((X_test_input[0],X_test_input[1],X_test_input[2],y_test)).map(example_to_features).batch(64)

In [8]:
# optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, 
#                                      epsilon=1e-08, 
#                                      clipnorm=1.0)
# loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) #can be used when there are two or more label classes
# metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

# model_nl.compile(optimizer=optimizer, loss=loss, metrics=[metric])
# model_nl.fit(train_ds, epochs=5, validation_data=test_ds) #train model for finxed number of epochs

In [9]:
# #Lc1UMaK3 is code of model_nl_methods
# model_dir = '/data/dataiku/managed_folders/solvency2/TV_TEXTMINING/Lc1UMaK3'
# model_nl.save_pretrained(save_directory=model_dir)

# # folder = dataiku.Folder('c9VWnxpk')  
# # model_nl.save_pretrained(save_directory=folder.get_path())

### Test performance Dutch model

In [10]:
model = dict()
tokenizer = dict()

model_dir = '/data/dataiku/managed_folders/solvency2/TV_TEXTMINING/Lc1UMaK3'
model['nl'] = TFRobertaForSequenceClassification.from_pretrained(model_dir,num_labels=2)
tokenizer['nl'] = RobertaTokenizer.from_pretrained(MODEL_NL)

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at /data/dataiku/managed_folders/solvency2/TV_TEXTMINING/Lc1UMaK3.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [11]:
#Whole labelled set
dft['prediction'] = -1  # sentence is not processed

for row in dft.index:
    p = predict(tokenizer['nl'],model['nl'], [dft.loc[row, 'dnb_nlp:formatted']])
    print("+" if p==1 else "-", end='')
    dft.loc[row, 'prediction'] = p

#Sentences wrong prediction
#print(dft[(dft['label']!=dft['prediction'])]['dnb_nlp:sentence'].values)

print(confusion_matrix(dft['label'], dft['prediction']))
print(accuracy_score(dft['label'], dft['prediction']))
print(precision_score(dft['label'], dft['prediction'],average='weighted'))
print(recall_score(dft['label'], dft['prediction'],average='weighted'))
print(f1_score(dft['label'], dft['prediction'],average='weighted'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


-----++-----------+-----+----+-------------+-+++--+++++++++++++--++++++++-++++++++----------+--+++++----++---+----++--+-+-+----+-+-----++----------++---+---+++---+++++-++++++++++++-+--+++-+++-+++--------------------------------------------------------------------------------------------------------------+-+++++++++-++++-+++++++++++++-++++-+-+++---+++++++++++++++-+--+++++++++-++++++-+++++----++----------+----+-++++++-+++-++-++++-+-+--++++-++++-+-++++++-++++-++----++++--------------+++++++-++-++-----++++++++----+--------+-+---+---+++++++-+++-++++++++++---------+---+-------+++++---++++-------------------------+++++-++++-------------+-+-+++++++++----------------+-+-------+--+---+----+-+---+--+++----++-----++++-------++-----+--+++------+--+++++--------------------+-++++-+-+++++---+----------------++------------++---+--+++---++++-------------+++++++-----------------+++++++-++-++++++--++-+++--++++-+--+---------+----++++-+-+-++++++---+----++-----+-+-++++++++-++--+----++--+++--------+++--------

In [18]:
#Training set
X = (np.array(dft['dnb_nlp:formatted']))
y = (np.array(dft['label']))
X = np.array([test.encode("ascii", "ignore").decode() for test in list(X)])
X = np.array([test.replace('\n',' ') for test in list(X)])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

y_predict = []
for i in range(0,len(X_train)):
    p,prob = predict_alt(tokenizer['nl'],model['nl'], [X_train[i]])
    #print("+" if p==1 else "-", end='')
    if p != y_train[i]:
        print(p)
        print(y_train[i])
        print(prob)
        print(X_train[i])
    y_predict = y_predict + list(p)

print(confusion_matrix(y_train,y_predict))
print(accuracy_score(y_train, y_predict))
print(precision_score(y_train, y_predict,average='weighted'))
print(recall_score(y_train, y_predict,average='weighted'))
print(f1_score(y_train, y_predict,average='weighted'))

[0]
1.0
tf.Tensor([[0.6965838 0.3034161]], shape=(1, 2), dtype=float32)
Bij wijze van  stresstest  heeft PME daarom ook een jaar ( 2035 ) met extreem weer laten doorrekenen .
[1]
0.0
tf.Tensor([[0.37110218 0.62889785]], shape=(1, 2), dtype=float32)
VISIE PMT De inzet van PMT is het bieden van een betaalbaar en duurzaam pensioen voor alle deelnemers met maatwerk binnen de collectieve regeling .
[[874   1]
 [  1 792]]
0.9988009592326139
0.9988009592326139
0.9988009592326139
0.9988009592326139


In [17]:
#Test set
X = (np.array(dft['dnb_nlp:formatted']))
y = (np.array(dft['label']))
X = np.array([test.encode("ascii", "ignore").decode() for test in list(X)])
X = np.array([test.replace('\n',' ') for test in list(X)])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

y_predict = []
for i in range(0,len(X_test)):
    p,prob = predict_alt(tokenizer['nl'],model['nl'], [X_test[i]])
    #print("+" if p==1 else "-", end='')
    if p != y_test[i]:
        print(p)
        print(y_test[i])
        print(prob)
        print(X_test[i])
    y_predict = y_predict + list(p)

print(confusion_matrix(y_test,y_predict))
print(accuracy_score(y_test, y_predict))
print(precision_score(y_test, y_predict,average='weighted'))
print(recall_score(y_test, y_predict,average='weighted'))
print(f1_score(y_test, y_predict,average='weighted'))

[1]
0.0
tf.Tensor([[0.00126092 0.99873906]], shape=(1, 2), dtype=float32)
van klimaatbestendige infrastructuur .
[1]
0.0
tf.Tensor([[0.00847464 0.99152535]], shape=(1, 2), dtype=float32)
PME in 2018 geen investeringen binnen dit thema kunnen doen .
[1]
0.0
tf.Tensor([[0.2655244 0.7344757]], shape=(1, 2), dtype=float32)
Een goed pensioen is er ook voor je partner Impact Investing is Samen met de nieuwe adviseur en de nieuwe uitvoerder van het vermogensbeheer het doen van gerichte willen we verder stappen zetten op dit gebied waarbij we ons in 2021 en 2022 zullen beleggingen welke richten op :
[0]
1.0
tf.Tensor([[9.998259e-01 1.741027e-04]], shape=(1, 2), dtype=float32)
5.2.5 Door transities betere aansluiting met ons maatschappelijk verantwoord beleggingsbeleid
[0]
1.0
tf.Tensor([[0.99065405 0.00934594]], shape=(1, 2), dtype=float32)
Uitsluiting wordt onder andere toegepast lijkt het erop dat we de komende jaren , vooral in de VS , ten aanzien van controversile wapens , tabak , thermisc

### Apply model on complete dataset

In [14]:
# #Whole labelled set
# df_nl['prediction'] = -1  # sentence is not processed

# for row in df_nl.index:
#     p = predict(tokenizer['nl'],model['nl'], [df_nl.loc[row, 'dnb_nlp:sentence']])
#     print("+" if p==1 else "-", end='')
#     df_nl.loc[row, 'prediction'] = p

In [15]:
# client = dataiku.api_client()
# project = client.get_project("CLIMATEANALYSIS")
# name = 'results_labelled_methods_incl_prediction'
# d = dataikuapi.dss.dataset.DSSManagedDatasetCreationHelper(project,name)
# d = d.with_store_into('filesystem_managed_solvency2')
# d.create(name)
# output = Dataset(name)
# output.write_with_schema(df_nl, dropAndCreate=True)