## Redefine the target `y`
The target `y` is now the cluster assignment (the `cluster` column).

In [None]:
import pandas as pd
import tensorflow as tf
# Read the training CSV; the file uses ';' as its field separator.
TRAIN_CSV_PATH = '../csv/TRAIN_DS_Clusters.csv'
df_model = pd.read_csv(TRAIN_CSV_PATH, sep=";")

# Last expression of the cell: show the first rows as the cell output.
df_model.head()

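### Create a one-hot-encoded (OHE) DataFrame for the targets

Alternatively, *convert `y` from a pandas Series to a single-column pandas DataFrame* (see the commented-out line in the next cell).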

In [None]:
 
# Model inputs: every entry of the "text" column, coerced to a Python string.
X = list(df_model["text"].astype(str))

# Targets: one indicator column per distinct value of the "cluster" column.
# Alternative kept for reference (the label column as a one-column DataFrame):
#y = df_model['cluster'].to_frame()
y = pd.get_dummies(df_model['cluster'])

In [None]:
# Make train test val 

from sklearn.model_selection import train_test_split

# Fixed seed so that Restart & Run All reproduces the same partition.
SPLIT_SEED = 42

# Step 1: hold out 10% of all samples as the test set (used only for the final evaluation).
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.1, random_state=SPLIT_SEED
)

# Step 2: carve the validation set out of the REMAINING samples only.
# Splitting the full X / y a second time would put test samples back into the
# training and validation sets and make the test score meaningless.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.15, random_state=SPLIT_SEED
)

print ('X_train',len(X_train))
print ('X_test',len(X_test))
print ('X_val',len(X_val))


In [None]:
# Configure the first GPU with a capped memory budget, then list the available devices.

print (tf.__version__)
# Ref: https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  # Create a single virtual device on the first GPU with memory_limit=4096.
  # The TensorFlow GPU guide linked above gives this value in MB (about 4 GB here);
  # the previous "1GB" wording came from the guide's example and did not match the code.
  try:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4096)]) # memory cap for this GPU — tune here
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Virtual devices must be set before GPUs have been initialized;
    # the error is only printed, so the notebook keeps running with the default configuration.
    print(e)

# Print every device TensorFlow can see.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

In [None]:
from transformers import AutoTokenizer
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
# Hugging Face checkpoint shared by the tokenizer and the classifier.
MODEL_NAME = "pdelobelle/robbert-v2-dutch-base"

# The cell used to build a RobertaTokenizer and immediately overwrite it with the
# AutoTokenizer; only the AutoTokenizer was ever used, so it is now loaded once.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# y is one-hot encoded, so its column count is the number of classes.
# (len(set(y)) gave the same number, but only because iterating a DataFrame
# yields its column labels.)
model = TFRobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=y.shape[1])

In [None]:
def _encode_split(texts, labels):
    """Tokenize `texts` and pair the result with `labels` in a tf.data.Dataset.

    Returns a tuple (encodings, dataset): the raw tokenizer output and a dataset
    whose elements are (feature dict, label row) pairs.
    """
    encodings = tokenizer(texts, max_length=128, truncation=True, padding=True)
    dataset = tf.data.Dataset.from_tensor_slices((dict(encodings), labels))
    return encodings, dataset


# Identical tokenizer settings for all three splits.
train_encodings, train_dataset = _encode_split(X_train, y_train)
val_encodings, val_dataset = _encode_split(X_val, y_val)
test_encodings, test_dataset = _encode_split(X_test, y_test)

In [None]:
# Create AdamW optimizer

from official.nlp import optimization 
# The learning-rate schedule (warmup + decay) is defined in optimizer steps, i.e. in
# BATCHES, and must span exactly the training that model.fit() performs.
# These two values therefore have to match the BATCH_SIZE / EPOCHS used in the
# training cell (2 and 6). The previous version counted unbatched samples and
# assumed 5 epochs, so the schedule did not line up with the actual run.
BATCH_SIZE = 2
EPOCHS = 6

epochs = EPOCHS
# Number of batches per epoch, taken from the batched dataset so partial batches count.
steps_per_epoch = tf.data.experimental.cardinality(train_dataset.batch(BATCH_SIZE)).numpy()
num_train_steps = int(steps_per_epoch * epochs)
# Warm up over the first 10% of all training steps.
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 5e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

### Set LOSS & METRICS
If the targets are **integer class ids** rather than a one-hot-encoded DataFrame, we have to use

**SparseCategoricalCrossentropy** instead of *CategoricalCrossentropy*
and

**SparseCategoricalAccuracy** instead of *CategoricalAccuracy*

In [None]:
# Disabled alternative, kept inside a bare string literal so that it is never executed:
# the compile step for integer (non one-hot) targets, using the sparse loss and metric.
# NOTE(review): the snippet passes from_logits=False — confirm that this matches what the
# model actually outputs before re-enabling it.
"""# Compile the model

loss= tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=False,
    reduction="auto",
    name="sparse_categorical_crossentropy",
) 
metrics = tf.keras.metrics.SparseCategoricalAccuracy(name="sparse_categorical_accuracy", dtype=None)

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

model.summary()"""

### But we are using one-hot-encoded (OHE) target columns
We switched to OHE targets because the loss turned into NaN with the sparse loss function above.

In [None]:
# Compile the model

# TFRobertaForSequenceClassification returns raw logits (its classification head has
# no softmax), so the loss must apply the softmax itself: from_logits=True.
# With from_logits=False the logits are treated as probabilities, which yields
# meaningless or NaN loss values.
loss = tf.keras.losses.CategoricalCrossentropy(
    from_logits=True,
    label_smoothing=0.0,
    axis=-1,
    reduction="auto",
    name="categorical_crossentropy",
)
# Argmax-based accuracy: gives the same result for logits and for probabilities.
metrics = tf.keras.metrics.CategoricalAccuracy(name="categorical_accuracy", dtype=None)

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

model.summary()

In [None]:
# Run the model
 
# Execute tf.function bodies eagerly instead of as compiled graphs.
tf.config.run_functions_eagerly(True)

BATCH_SIZE = 2
EPOCHS =6

# Batch both splits up front, then train with validation after every epoch.
train_batches = train_dataset.batch(BATCH_SIZE)
val_batches = val_dataset.batch(BATCH_SIZE)
history = model.fit(train_batches, epochs=EPOCHS, validation_data=val_batches)


In [None]:
# Keras treats every element of a tf.data.Dataset as one batch, and test_dataset
# holds single unbatched samples, so it has to be batched before evaluation
# (same batch size as used for training/validation).
loss, accuracy = model.evaluate(test_dataset.batch(BATCH_SIZE))

print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')

In [None]:

import matplotlib.pyplot as plt

# Per-epoch training curves recorded by model.fit().
history_dict = history.history
print(history_dict.keys())

acc = history_dict['categorical_accuracy']
val_acc = history_dict['val_categorical_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']

# Epochs are numbered from 1 on the x-axis.
epochs = range(1, len(acc) + 1)
fig, (ax_loss, ax_acc) = plt.subplots(2, 1, figsize=(10, 6))

# Top panel: loss ('r' = solid red line, 'b' = solid blue line).
ax_loss.plot(epochs, loss, 'r', label='Training loss')
ax_loss.plot(epochs, val_loss, 'b', label='Validation loss')
ax_loss.set_title('Training and validation loss')
ax_loss.set_ylabel('Loss')
ax_loss.legend()

# Bottom panel: accuracy; the x-axis label is only shown here.
ax_acc.plot(epochs, acc, 'r', label='Training acc')
ax_acc.plot(epochs, val_acc, 'b', label='Validation acc')
ax_acc.set_title('Training and validation accuracy')
ax_acc.set_xlabel('Epochs')
ax_acc.set_ylabel('Accuracy')
ax_acc.legend(loc='lower right')

# tight_layout only arranges axes that already exist, so it has to run AFTER the
# panels are drawn (it was previously called on the still-empty figure).
fig.tight_layout()

In [None]:
# Base name of the export; any '/' is replaced by '_' before it is used in the path.
dataset_name = 'kpmg_model_h_robberta_dutch_softmax'
safe_name = dataset_name.replace('/', '_')
saved_model_path = './{}_bert'.format(safe_name)

# Write the model to disk without the optimizer state.
model.save(saved_model_path, include_optimizer=False)

In [None]:
# Load the export at saved_model_path back through the SavedModel loader.
# NOTE(review): reloaded_model is not referenced by the cells visible here, which keep using `model`.
reloaded_model = tf.saved_model.load(saved_model_path)

In [None]:
import re

import numpy as np

# Token limit used when the training/validation/test texts were encoded. Inference
# must truncate the same way so the model sees inputs of the length it was trained on.
MAX_LENGTH = 128

with open ('../processed_data/NL/NL_312-2019-012471.txt') as f:
    text=f.read() 
# NOTE(review): clean_text is not defined in the cells visible here — it has to come
# from an earlier cell or import.
text = clean_text(text)

# Not used by the cells visible here; kept in case a later cell reads it.
filename = 'NLP_model.sav' 

# Encode the single document as a batch of one, with the training-time truncation.
encodings = tokenizer([text], max_length=MAX_LENGTH, truncation=True, padding=True)
ds = tf.data.Dataset.from_tensor_slices(dict(encodings))

print (ds)
ds=ds.batch(1, drop_remainder=True)
print(ds)
predictions = model.predict(ds)

# Position in the model output -> cluster name (column order of the one-hot targets).
mapping = {i: name for i, name in enumerate(y.columns)}

# predictions[0] holds the scores for the one document; the highest score wins.
print(mapping[np.argmax(predictions[0])])

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
# Default size (width, height) for figures created after this line.
mpl.rcParams['figure.figsize'] = (12, 10)
# Colors of matplotlib's current property cycle; not referenced by the cells visible here.
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

def plot_loss(history):
    """Plot training and validation loss per epoch on a logarithmic y-axis.

    `history` is the object returned by model.fit(); its `epoch` list and the
    'loss' / 'val_loss' entries of its `history` dict are read.
    """
    curves = (
        ('loss', dict(color='red', label='Train Loss')),
        ('val_loss', dict(color='green', label='Val Loss', linestyle="--")),
    )
    # A log scale keeps a wide range of loss values readable.
    for key, line_kwargs in curves:
        plt.semilogy(history.epoch, history.history[key], **line_kwargs)

    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()


plot_loss(history)

In [None]:
!pip install seaborn
from sklearn.metrics import roc_curve,confusion_matrix,auc
import seaborn as sns

def _cm_annotations(cm, cm_sum, cm_perc):
    """Build the text shown in each heatmap cell.

    Diagonal cells show "percent / count / row total", non-empty off-diagonal cells
    show "percent / count", and empty off-diagonal cells stay blank.
    """
    nrows, ncols = cm.shape
    rows = []
    for i in range(nrows):
        row = []
        for j in range(ncols):
            count = cm[i, j]
            percent = cm_perc[i, j]
            if i == j:
                # cm_sum has shape (n, 1); index the scalar explicitly instead of
                # handing a 1-element array to '%d'.
                row.append('%.1f%%\n%d/%d' % (percent, count, cm_sum[i, 0]))
            elif count == 0:
                row.append('')
            else:
                row.append('%.1f%%\n%d' % (percent, count))
        rows.append(row)
    # Object dtype: a fixed-width string array could silently truncate long labels.
    return np.array(rows, dtype=object)


def plot_cm(y_true, y_pred, title):
    """Draw an annotated confusion-matrix heatmap.

    Parameters
    ----------
    y_true : 1-D array-like of ground-truth class labels.
    y_pred : 1-D array-like of predicted class labels (same label space as y_true).
    title : title shown above the heatmap.

    Rows ("Actual") and columns ("Predicted") are the distinct labels found in
    y_true; percentages are relative to the row (actual class) total.

    Returns None; the figure is drawn with matplotlib/seaborn.
    """
    labels = np.unique(y_true)
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    cm_sum = np.sum(cm, axis=1, keepdims=True)
    cm_perc = cm / cm_sum.astype(float) * 100
    annot = _cm_annotations(cm, cm_sum, cm_perc)

    cm_frame = pd.DataFrame(cm, index=labels, columns=labels)
    cm_frame.index.name = 'Actual'
    cm_frame.columns.name = 'Predicted'

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.set_title(title)
    sns.heatmap(cm_frame, cmap= "YlGnBu", annot=annot, fmt='', ax=ax)
 
# Batch into a separate name instead of rebinding val_dataset: overwriting it made the
# cell non-idempotent (a re-run, or re-running the training cell afterwards, would
# batch an already batched dataset).
val_predict_batches = val_dataset.batch(1, drop_remainder=True)
y_predict=model.predict(val_predict_batches, verbose=1)

In [None]:
# model.predict() returns the per-class scores either directly or wrapped in an
# output container (dict-like with a "logits" entry, or a tuple whose first item is
# the logits), depending on the transformers version — unwrap all three shapes.
if isinstance(y_predict, dict):
    logits = y_predict["logits"]
elif isinstance(y_predict, (tuple, list)):
    logits = y_predict[0]
else:
    logits = y_predict
logits = np.asarray(logits)

# Scores of the first validation sample, as a quick sanity check.
print (logits[0])

# This is a multi-class problem: the predicted class is the arg-max over the class
# scores. Thresholding at 0.5 only makes sense for a single sigmoid output, and
# confusion_matrix needs 1-D class labels rather than one-hot rows.
class_names = np.asarray(y_val.columns)
y_pred_labels = class_names[logits.argmax(axis=-1)]
y_true_labels = class_names[y_val.to_numpy().argmax(axis=1)]

plot_cm(y_true_labels, y_pred_labels, 'Performance-Confusion Matrix')
