# Initialization

In [1]:
!pip install transformers[sentencepiece]

Collecting transformers[sentencepiece]
[?25l  Downloading https://files.pythonhosted.org/packages/fd/1a/41c644c963249fd7f3836d926afa1e3f1cc234a1c40d80c5f03ad8f6f1b2/transformers-4.8.2-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 5.3MB/s 
Collecting huggingface-hub==0.0.12
  Downloading https://files.pythonhosted.org/packages/2f/ee/97e253668fda9b17e968b3f97b2f8e53aa0127e8807d24a547687423fe0b/huggingface_hub-0.0.12-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 38.4MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)

In [2]:
import os, tqdm
import time

import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.metrics import classification_report
from transformers import XLMRobertaTokenizer, AdamWeightDecay
from transformers import TFXLMRobertaForSequenceClassification
from transformers.modeling_tf_outputs import TFTokenClassifierOutput

In [3]:
# Notebook-wide configuration. The trailing `# @param` markers are Colab form
# annotations and must stay on the same line as the value they describe.

# Sequence length, batch sizes and number of training epochs.
SEQ_LENGTH       = 128 # @param {type: "integer"}
TRAIN_BATCH_SIZE = 64  # @param {type: "integer"}
VALID_BATCH_SIZE = 512 # @param {type: "integer"}
EPOCHS           = 4   # @param {type: "integer"}

# Optimizer hyper-parameters. A falsy L2_WEIGHT_DECAY selects plain Adam in
# `create_model`; otherwise AdamWeightDecay is used.
LEARNING_RATE   = 2e-5 # @param {type: "number"}
L2_WEIGHT_DECAY = 0.01 # @param {type: "number"}
LR_DECAY        = 0.95 # @param {type: "number"}

# Hardware / precision toggles.
USE_TPU  = True # @param {type: "boolean"}
USE_FP16 = True  # @param {type: "boolean"}

# Only keep the TPU request if the runtime actually exposes a TPU address.
USE_TPU = USE_TPU and ('COLAB_TPU_ADDR' in os.environ)

In [4]:
# Fix TensorFlow's global random seed for this run.
tf.random.set_seed(42)

# Mixed precision: bfloat16 when running on TPU, float16 otherwise.
if USE_FP16:
    policy_name = 'mixed_bfloat16' if USE_TPU else 'mixed_float16'
    tf.keras.mixed_precision.set_global_policy(policy_name)

# Connect to the Colab TPU and build the distribution strategy used later
# when the model is created.
if USE_TPU:
    tpu_address  = 'grpc://' + os.environ['COLAB_TPU_ADDR']
    resolver     = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=tpu_address)
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    tpu_strategy = tf.distribute.TPUStrategy(resolver)

INFO:absl:Entering into master device scope: /job:worker/replica:0/task:0/device:CPU:0


INFO:tensorflow:Initializing the TPU system: grpc://10.81.78.186:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.81.78.186:8470


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


In [5]:
# Load the pretrained tokenizer belonging to the 'xlm-roberta-base' checkpoint.
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=9096718.0, style=ProgressStyle(descript…




# Prepare data

In [6]:
# Mount Google Drive; the data and output paths used below live under
# /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# Per-language CSV locations. 'train' is the training split; 'dev' points at
# the blind test file that predictions are produced for.
data_files = {'Deutsch': {
        'train': '/content/drive/MyDrive/Colab Notebooks/train.csv',

        'dev':  '/content/drive/MyDrive/Colab Notebooks/test_blind.csv',

        # Labelled dev file (disabled here; it is read directly in the
        # evaluation section instead):
        #'true': '/content/drive/MyDrive/Colab Notebooks/dev.csv'
    } }

# Maps the string labels found in the CSV files to integer class ids.
label_map = {
    'literally': 0,
    'figuratively':     1,
    'both':     2,
    'undecidable': 3
}

In [8]:
def _load_data(filename, is_train=0):
    """Read a ';'-separated CSV file and convert it into model inputs.

    Args:
        filename: Path to a UTF-8 encoded CSV file (separator ``;``) that has
            at least a ``text`` and a ``label`` column.
        is_train: Truthy for the training split. Trailing rows are then
            dropped so that the number of samples is an exact multiple of
            ``TRAIN_BATCH_SIZE``.

    Returns:
        A pair ``(X, y)``. ``X`` is the tokenizer output whose ``input_ids``
        (int32) and ``attention_mask`` (float32) entries have been converted
        to NumPy arrays; ``y`` is an int32 array of class ids obtained through
        ``label_map``.
    """
    frame = pd.read_csv(filename, sep=";", encoding="utf-8")

    if is_train:
        # Keep only full batches. The previous boundary computation
        # subtracted one row too many, leaving (k * batch_size - 1) samples
        # and therefore still a ragged final batch.
        n_keep = len(frame) - len(frame) % TRAIN_BATCH_SIZE
        # With fewer rows than one batch, keep everything rather than
        # producing an empty training set.
        if n_keep:
            frame = frame.iloc[:n_keep]

    # Pad/truncate every text to the configured sequence length.
    X = tokenizer.batch_encode_plus(tqdm.tqdm(frame["text"]),
                                    padding        = 'max_length',
                                    truncation     = True,
                                    max_length     = SEQ_LENGTH)
    y = frame["label"].map(label_map).to_numpy(dtype=np.int32)

    X['input_ids']      = np.asarray(X['input_ids'], dtype=np.int32)
    X['attention_mask'] = np.asarray(X['attention_mask'], dtype=np.float32)

    return X, y

# Tokenize every configured language and collect the resulting arrays under
# keys of the form 'X_<lang>_<split>_ids', 'X_<lang>_<split>_mask' and
# 'y_<lang>_<split>'. X_train / y_train / X_dev / y_dev of the last language
# stay available as plain variables for the cells below.
data_dict = {}

for lang, files in data_files.items():
    X_train, y_train = _load_data(files['train'], 1)
    X_dev,   y_dev   = _load_data(files['dev'], 0)

    for split, features, labels in (('train', X_train, y_train),
                                    ('dev',   X_dev,   y_dev)):
        data_dict[f'X_{lang}_{split}_ids']  = features['input_ids']
        data_dict[f'X_{lang}_{split}_mask'] = features['attention_mask']
        data_dict[f'y_{lang}_{split}']      = labels

100%|██████████| 6847/6847 [00:02<00:00, 2444.67it/s]
100%|██████████| 1511/1511 [00:00<00:00, 2669.15it/s]


In [9]:
# `fit` is fed the features as an (input_ids, attention_mask) tuple.
# The guard keeps this cell re-runnable: once X_train has been converted,
# indexing the tuple with string keys would raise a TypeError.
if not isinstance(X_train, tuple):
    X_train = (X_train["input_ids"], X_train["attention_mask"])

In [10]:
# Uniform per-sample weight of 1.0 for every training label.
sample_weights_train = np.ones(y_train.shape, dtype=np.float32)

# Prepare the model

In [11]:
class TFXLMRobertaForHopeSpeechDetection(TFXLMRobertaForSequenceClassification):
    """XLM-RoBERTa sequence classifier with two additions.

    * Under a mixed-precision policy the outputs of ``call`` are cast back to
      the policy's variable dtype.
    * ``compile`` accepts an optional ``lr_decay`` factor. When given, the
      classifier head and every encoder layer get their own optimizer, with
      the learning rate shrinking geometrically towards the lower layers, and
      ``train_step`` applies each optimizer to its own weight group.
    """

    def call(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        labels=None,
        training=False,
        **kwargs,
    ):
        """Run the parent forward pass and undo mixed-precision down-casting.

        All arguments are forwarded to the parent ``call``. The three optional
        output switches are only forwarded when they are not ``None``.

        Returns:
            The parent's output. If the active dtype policy name contains
            ``'mixed'``, every tensor in it is cast to the policy's variable
            dtype; with ``return_dict`` the result is re-wrapped in a
            ``TFTokenClassifierOutput``, otherwise it is a tuple.
        """
        _kwargs = {'labels': labels, 'training': training, **kwargs}
        if output_attentions is not None:
            _kwargs['output_attentions'] = output_attentions
        if output_hidden_states is not None:
            _kwargs['output_hidden_states'] = output_hidden_states
        if return_dict is not None:
            _kwargs['return_dict'] = return_dict

        outputs = super().call(input_ids,
                               attention_mask,
                               token_type_ids,
                               position_ids,
                               head_mask,
                               inputs_embeds,
                               **_kwargs)

        if 'mixed' in self.dtype_policy.name.lower():
            return_dict = return_dict if return_dict is not None else self.config.return_dict
            dtype = self.dtype_policy.variable_dtype

            def _cast(value):
                # hidden_states / attentions may be tuples of tensors (see the
                # transformers output documentation). Casting element-wise
                # keeps that structure, whereas tf.cast on the tuple itself
                # would pack it into a single tensor.
                if value is None:
                    return None
                return tf.nest.map_structure(lambda t: tf.cast(t, dtype), value)

            if return_dict:
                # NOTE(review): the parent is a sequence classifier; the
                # token-classifier output class is kept for backward
                # compatibility since only its field names are relied upon.
                outputs = TFTokenClassifierOutput(
                    loss          = _cast(outputs.loss),
                    logits        = _cast(outputs.logits),
                    hidden_states = _cast(outputs.hidden_states),
                    attentions    = _cast(outputs.attentions),
                )
            else:
                outputs = tuple(_cast(o) for o in outputs)

        return outputs


    def compile(self, optimizer='Adam', lr_decay=None, loss=None, metrics=None,
                      loss_weights=None, weighted_metrics=None, run_eagerly=None,
                      **kwargs):
        """Compile the model, optionally with layer-wise learning-rate decay.

        Args:
            optimizer: Keras optimizer instance or identifier.
            lr_decay: Optional decay factor. When truthy, the given optimizer
                is used for the classifier head and one additional optimizer
                is created per encoder layer with learning rate
                ``base_lr * lr_decay ** (n_layers - i)`` for the i-th layer
                (1-based), so the top layer keeps the base rate.
            loss, metrics, loss_weights, weighted_metrics, run_eagerly,
            **kwargs: Passed through to the parent ``compile``.

        Side effects:
            Freezes ``self.roberta.embeddings`` and stores the per-optimizer
            weight groups (or ``False``) in ``self._opt_weights``.
        """
        # We use unaltered embeddings, because the training set only contains a
        # small subset of all possible tokens, leaving most embeddings untrained
        # anyway.
        self.roberta.embeddings.trainable = False

        opt_weights = False

        if lr_decay:
            # Resolve identifiers such as the default 'Adam' to an instance
            # first; serialize() cannot handle a plain string.
            optimizer     = tf.keras.optimizers.get(optimizer)
            optim_dict    = tf.keras.optimizers.serialize(optimizer)
            learning_rate = optim_dict['config']['learning_rate']

            # Group 0: the classifier head, trained by the original optimizer.
            optimizer   = [optimizer]
            opt_weights = [self.classifier.trainable_weights]

            n_layers = len(self.roberta.encoder.layer)

            with self.distribute_strategy.scope():
                for i, layer in enumerate(self.roberta.encoder.layer, 1):
                    lr = learning_rate * (lr_decay ** (n_layers - i))

                    if optim_dict['class_name'] == 'AdamWeightDecay':
                        # The exclusion list is passed explicitly on top of
                        # the serialized config.
                        optimizer.append(AdamWeightDecay(**{
                            **optim_dict['config'],
                            'exclude_from_weight_decay': ("LayerNorm", "layer_norm", "bias"),
                            'learning_rate': lr
                        }))
                    else:
                        optimizer.append(tf.keras.optimizers.get({
                            'class_name': optim_dict['class_name'],
                            'config': {**optim_dict['config'], 'learning_rate': lr}
                        }))

                    opt_weights.append(layer.trainable_weights)

        self._opt_weights = opt_weights

        super().compile(optimizer = optimizer,
                        loss = loss,
                        metrics = metrics,
                        loss_weights = loss_weights,
                        weighted_metrics = weighted_metrics,
                        run_eagerly = run_eagerly,
                        **kwargs)


    def train_step(self, data):
        """Single training step.

        Falls back to the parent implementation unless ``compile`` was called
        with ``lr_decay``. Otherwise the loss is computed once and each
        optimizer minimizes it with respect to its own weight group. Only the
        weights listed in ``self._opt_weights`` are watched by the tape.

        Returns:
            Dict mapping metric names to their current results.
        """
        if not self._opt_weights:
            return super().train_step(data)

        x, y, sample_weight = tf.keras.utils.unpack_x_y_sample_weight(data)

        all_weights = [w for ws in self._opt_weights for w in ws]

        # Persistent, because gradients are requested once per optimizer.
        with tf.GradientTape(persistent=True, watch_accessed_variables=False) as tape:
            tape.watch(all_weights)

            y_pred = self(x, training=True)
            loss   = self.compiled_loss(y, y_pred, sample_weight=sample_weight,
                                      regularization_losses=self.losses)

        for optim, current_weights in zip(self.optimizer, self._opt_weights):
            optim.minimize(loss, current_weights, tape=tape)

        # Release the resources held by the persistent tape.
        del tape

        self.compiled_metrics.update_state(y, y_pred, sample_weight=sample_weight)

        return {m.name: m.result() for m in self.metrics}

In [18]:
def create_model():
    """Load the pretrained classifier and compile it for training.

    The optimizer is ``AdamWeightDecay`` when ``L2_WEIGHT_DECAY`` is truthy and
    plain Adam otherwise, both with ``LEARNING_RATE``. The model is compiled
    with sparse categorical cross-entropy on logits and an accuracy metric
    named ``'acc'``.

    Returns:
        The compiled ``TFXLMRobertaForHopeSpeechDetection`` instance.
    """
    if L2_WEIGHT_DECAY:
        optimizer = AdamWeightDecay(learning_rate     = LEARNING_RATE,
                                    weight_decay_rate = L2_WEIGHT_DECAY,
                                    exclude_from_weight_decay = ('LayerNorm', 'layer_norm', 'bias'))
    else:
        optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

    loss    = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metrics = tf.keras.metrics.SparseCategoricalAccuracy(name='acc')

    # Size the classification head from label_map so the two cannot drift
    # apart; weights are converted from the PyTorch checkpoint.
    roberta = TFXLMRobertaForHopeSpeechDetection.from_pretrained('xlm-roberta-base',
                                                                 num_labels  = len(label_map),
                                                                 from_pt     = True)

    # NOTE(review): LR_DECAY is defined in the config cell but is not passed
    # as `lr_decay` here, so layer-wise decay stays disabled — confirm intent.
    roberta.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    return roberta


# Build the model directly, or inside the TPU strategy scope when a TPU is
# in use.
if not USE_TPU:
    roberta = create_model()
else:
    with tpu_strategy.scope():
        roberta = create_model()

All PyTorch model weights were used when initializing TFXLMRobertaForHopeSpeechDetection.

Some weights or buffers of the TF 2.0 model TFXLMRobertaForHopeSpeechDetection were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
def make_class_weights():
    """Build the per-class loss weights from the global ``y_train`` labels.

    Classes 0 and 1 are weighted by ``(1 / class_count) * (n_samples / 2)``,
    while classes 2 and 3 receive a fixed weight of 0.01.

    Returns:
        Dict mapping class id (0-3) to its weight.
    """
    counts    = np.bincount(y_train)
    n_samples = len(y_train)

    # Inverse-frequency weights for the first two classes.
    weights = {cls: (1 / counts[cls]) * (n_samples / 2.0) for cls in (0, 1)}
    # Fixed, very small weights for the remaining two classes.
    weights.update({2: 0.01, 3: 0.01})

    return weights

In [20]:
# Class weights later passed to `fit` as `class_weight`.
class_weights = make_class_weights()

In [None]:
# Inspect the computed weights.
class_weights

{0: 2.9285714285714284, 1: 0.6056076419600211, 2: 0.01, 3: 0.01}

In [None]:
# Show the per-class counts and the label array. Only the last bare
# expression of a cell is rendered, so both are displayed explicitly.
display(np.bincount(y_train), y_train)

array([0, 0, 1, ..., 1, 1, 1], dtype=int32)

In [21]:
# Wall-clock start of the training run (`time` is imported in the import cell).
start = time.time()

# Train the model

In [22]:
# Train on the (input_ids, attention_mask) tuple with class-weighted loss.
# NOTE(review): sample_weights_train is created as all ones, so the effective
# weighting presumably comes from class_weight alone — confirm it is needed.
roberta.fit(x = X_train,
            y = y_train,
            sample_weight   = sample_weights_train,
            batch_size      = TRAIN_BATCH_SIZE,
            epochs          = EPOCHS,
            class_weight=class_weights
            )

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


Epoch 1/4


INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 128) dtype=int32>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 128) dtype=float32>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None,) dtype=int32>, <tf.Tensor 'cond_8/Identity_3:0' shape=(None, 1) dtype=float32>]


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported


Cause: while/else statement not yet supported


Cause: while/else statement not yet supported


INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 128) dtype=int32>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 128) dtype=float32>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None,) dtype=int32>, <tf.Tensor 'cond_8/Identity_3:0' shape=(None, 1) dtype=float32>]










Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x7f03d44c3e50>

In [23]:
# Wall-clock end of the training run.
end = time.time()


In [24]:
# Elapsed time in seconds between the `start` and `end` cells.
print(end-start)

162.33856463432312


In [None]:
# Persist the fine-tuned model to Google Drive.
roberta.save_pretrained('/content/drive/MyDrive/Colab Notebooks/saved_model_final1/')

# Evaluation

In [None]:
# Predicted class id per dev sample: argmax over the logits.
# NOTE(review): the model is called once on the whole of X_dev, and
# VALID_BATCH_SIZE from the config cell is not used here — consider batched
# prediction if memory becomes an issue.
y_pred_deutsch = roberta(X_dev).logits.numpy().argmax(-1)

In [None]:
# Gold labels come from the labelled dev file; the first CSV column is used
# as the index, and string labels are mapped to ids through label_map.
# NOTE(review): assumes dev.csv rows line up one-to-one with the file that
# X_dev was built from — confirm.
true = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dev.csv', sep=";", encoding="utf-8", index_col=0)
y_true = true["label"].map(label_map).to_numpy(dtype=np.int32)

In [None]:
# Sanity check: show the first row of the gold-label frame.
print(true.iloc[0])

src_id                                       T951103.116.45
idiom                                       an Boden liegen
label                                                  both
text      Minute glich Obradovic erstmals aus ( 91:91 ) ...
Name: 0, dtype: object


In [None]:
# Per-class precision / recall / F1 on the dev set.
# `labels` is given explicitly so the report always has one row per entry of
# label_map, even if a class is absent from both y_true and y_pred (otherwise
# the target_names length would not match and sklearn raises).
# `zero_division=0` keeps the 0.0 scores for never-predicted classes without
# emitting a warning.
print(classification_report(
    y_true = y_true,
    y_pred = y_pred_deutsch,
    labels = list(label_map.values()),
    target_names = list(label_map.keys()),
    digits = 4,
    zero_division = 0
  )
)

              precision    recall  f1-score   support

   literally     0.6523    0.9167    0.7622       264
figuratively     0.9776    0.8995    0.9369      1214
        both     0.0000    0.0000    0.0000         2
 undecidable     0.0000    0.0000    0.0000         8

    accuracy                         0.8965      1488
   macro avg     0.4075    0.4540    0.4248      1488
weighted avg     0.9133    0.8965    0.8996      1488



  _warn_prf(average, modifier, msg_start, len(result))


# Create submission files

In [None]:
# Inverse of label_map (class id -> label string), derived from it so the two
# mappings cannot get out of sync.
label_map_reverse = {class_id: name for name, class_id in label_map.items()}

In [None]:
def map_back(y_pred):
  """Translate predicted class ids into label strings.

  Args:
    y_pred: Iterable of class ids that are keys of ``label_map_reverse``.

  Returns:
    List with the label string for each id, in input order.
  """
  return [label_map_reverse[class_id] for class_id in y_pred]

In [None]:
# Label strings for the dev predictions, in prediction order.
pred_string = map_back(y_pred_deutsch) 

In [None]:
# Re-read the blind file (first column as index) and replace its label column
# with the predicted label strings.
dev = pd.read_csv(data_files["Deutsch"]["dev"], sep=";", encoding="utf-8", index_col=0)
dev["label"] = pred_string

In [None]:
# Write one tab-separated line per row: src_id, idiom, predicted label, text.
# The context manager closes the file even if a row fails to serialize, and
# the explicit encoding matches the UTF-8 input files regardless of locale.
with open("/content/drive/MyDrive/Colab Notebooks/predictions_train_only.tsv", "w", encoding="utf-8") as file:
    for _, row in dev.iterrows():
        file.write("\t".join((row['src_id'], row['idiom'], row['label'], row['text'])) + "\n")