In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# The first run asks for an authorization code interactively.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


#### Imports and TPU setting

In [None]:
! pip install --upgrade kaggle -q
! pip install transformers -q
! pip install emoji -qq
! pip install googletrans -qq

In [None]:
import os
import re
import time
import numpy as np
import pandas as pd
import transformers
from tqdm import tqdm
import tensorflow as tf
from google.colab import files
import tensorflow_datasets as tfds
from transformers import BertTokenizer
from tensorflow.keras.models import Model
from keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from transformers import TFAutoModel, AutoTokenizer

from setup import set_TPU
from text_models import XLMRobertaInputs

import matplotlib.pyplot as plt
%matplotlib inline

# Only show TensorFlow log records at ERROR level and above.
tf.get_logger().setLevel('ERROR')

#### Load the data

In [None]:
files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle competitions download -c contradictory-my-dear-watson

Saving kaggle.json to kaggle.json
Downloading test.csv.zip to /content
  0% 0.00/536k [00:00<?, ?B/s]
100% 536k/536k [00:00<00:00, 36.4MB/s]
Downloading train.csv.zip to /content
  0% 0.00/1.23M [00:00<?, ?B/s]
100% 1.23M/1.23M [00:00<00:00, 83.8MB/s]
Downloading sample_submission.csv to /content
  0% 0.00/66.0k [00:00<?, ?B/s]
100% 66.0k/66.0k [00:00<00:00, 58.1MB/s]


In [None]:
!unzip '/content/train.csv.zip'
!unzip '/content/test.csv.zip'

Archive:  /content/train.csv.zip
  inflating: train.csv               
Archive:  /content/test.csv.zip
  inflating: test.csv                


In [None]:
# Load the extracted competition files: `df` is the labelled training data,
# `test` is the unlabelled test set.
df = pd.read_csv('/content/train.csv')
test = pd.read_csv('/content/test.csv')

In [None]:
# Preview the first rows of the training data.
df.head()

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
0,5130fd2cb5,and these comments were considered in formulat...,The rules developed in the interim were put to...,en,English,0
1,5b72532a0b,These are issues that we wrestle with in pract...,Practice groups are not permitted to work on t...,en,English,2
2,3931fbe82a,Des petites choses comme celles-là font une di...,J'essayais d'accomplir quelque chose.,fr,French,0
3,5622f0c60b,you know they can't really defend themselves l...,They can't defend themselves because of their ...,en,English,0
4,86aaa48b45,ในการเล่นบทบาทสมมุติก็เช่นกัน โอกาสที่จะได้แสด...,เด็กสามารถเห็นได้ว่าชาติพันธุ์แตกต่างกันอย่างไร,th,Thai,1


In [None]:
# Preview the first rows of the test data.
test.head()

Unnamed: 0,id,premise,hypothesis,lang_abv,language
0,c6d58c3f69,بکس، کیسی، راہیل، یسعیاہ، کیلی، کیلی، اور کولم...,"کیسی کے لئے کوئی یادگار نہیں ہوگا, کولمین ہائی...",ur,Urdu
1,cefcc82292,هذا هو ما تم نصحنا به.,عندما يتم إخبارهم بما يجب عليهم فعله ، فشلت ال...,ar,Arabic
2,e98005252c,et cela est en grande partie dû au fait que le...,Les mères se droguent.,fr,French
3,58518c10ba,与城市及其他公民及社区组织代表就IMA的艺术发展进行对话&amp,IMA与其他组织合作，因为它们都依靠共享资金。,zh,Chinese
4,c32b0d16df,Она все еще была там.,"Мы думали, что она ушла, однако, она осталась.",ru,Russian


In [None]:
# Number of training rows per language, as a two-column table.
lang_dist_df = (
    df.language.value_counts()
    .rename_axis('Language')
    .reset_index(name='Count')
)

# Share of each language in percent, rounded to two decimals.
count_total = lang_dist_df['Count'].sum()
lang_dist_df['Count (%)'] = lang_dist_df['Count'].apply(lambda n: round(n*100/count_total, 2))
lang_dist_df

Unnamed: 0,Language,Count,Count (%)
0,English,6870,56.68
1,Chinese,411,3.39
2,Arabic,401,3.31
3,French,390,3.22
4,Swahili,385,3.18
5,Urdu,381,3.14
6,Vietnamese,379,3.13
7,Russian,376,3.1
8,Hindi,374,3.09
9,Greek,372,3.07


In [None]:
# Character length of every premise, stored as a new column on `df`.
df['text_length'] = df.premise.apply(len)

# Mean premise length per language, shortest first, rendered with in-cell bars.
lang_dist_df = (
    df.groupby(['language'])['text_length']
    .mean()
    .to_frame()
    .sort_values('text_length')
)
lang_dist_df.style.bar(subset=['text_length'], align='mid', color='#d65f5f')

Unnamed: 0_level_0,text_length
language,Unnamed: 1_level_1
Chinese,33.379562
Arabic,90.184539
Urdu,98.706037
Thai,98.754717
Turkish,102.908832
Swahili,104.420779
Hindi,105.94385
Russian,110.143617
English,111.195633
Bulgarian,112.30117


#### Preprocess

In [None]:
# Sequential 90/10 split: the first 90% of rows train, the remainder validates.
# The rows are taken in file order; nothing is shuffled here.
slice_df = int(len(df) * 0.9)
train, validation = df.iloc[:slice_df], df.iloc[slice_df:]

In [None]:
# Report the (rows, columns) shape of both splits as a sanity check.
print(f"train shape: {train.shape} \nvalidation shape: {validation.shape}")

train shape: (10908, 7) 
validation shape: (1212, 7)


#### Modelling

###### Build model inputs

In [None]:
# Obtain the distribution strategy from the project-local `setup.set_TPU` helper.
# NOTE(review): presumably this connects to the Colab TPU and returns a
# tf.distribute strategy — confirm in setup.py.
strategy = set_TPU()

INFO:absl:Entering into master device scope: /job:worker/replica:0/task:0/device:CPU:0


Running on TPU: grpc://10.122.119.170:8470




REPLICAS: 8


In [None]:
# Configuration
MODEL = 'jplu/tf-xlm-roberta-large'  # checkpoint name passed to TFAutoModel.from_pretrained
EPOCHS = 10  # number of epochs passed to model.fit
MAX_LEN = 96  # sequence length used for tokenisation and for the model's input layer

# Global batch size: 16 examples per replica times the number of replicas in sync.
BATCH_SIZE = 16 * strategy.num_replicas_in_sync

In [None]:
xlmroberta_inputs_train = XLMRobertaInputs(train[['premise','hypothesis']].values.tolist(), train.label.values, max_length=MAX_LEN, batch_size=BATCH_SIZE)
%time train_inputs = xlmroberta_inputs_train.process_examples(train=True)

xlmroberta_inputs_val = XLMRobertaInputs(validation[['premise','hypothesis']].values.tolist(), validation.label.values, max_length=MAX_LEN, batch_size=BATCH_SIZE)
%time validation_inputs = xlmroberta_inputs_val.process_examples(train=False)

CPU times: user 2.38 s, sys: 61.4 ms, total: 2.44 s
Wall time: 2.45 s
CPU times: user 249 ms, sys: 3.11 ms, total: 252 ms
Wall time: 251 ms


In [None]:
def build_model(transformer, max_len, dropout_rate=0.2, hidden_units=32, num_classes=3, learning_rate=2e-5):
    """Build and compile a classifier on top of a pretrained transformer.

    The network feeds token ids through `transformer`, takes the hidden state
    at the first sequence position, and passes it through dropout, a ReLU
    dense layer and a softmax output layer.

    Args:
        transformer: callable Keras transformer model; element 0 of its output
            is indexed as (batch, position, features).
        max_len: fixed length of the `input_ids` input.
        dropout_rate: dropout applied to the first-position vector.
        hidden_units: width of the intermediate dense layer.
        num_classes: number of output classes.
        learning_rate: learning rate of the Adam optimizer.

    Returns:
        The compiled Keras model (sparse categorical cross-entropy loss,
        accuracy metric).
    """
    # Reference the layers through `tf.keras.layers` so the function does not
    # depend on a later cell having imported Input/Dropout/Dense.
    layers = tf.keras.layers

    input_ids = layers.Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
    sequence_output = transformer(input_ids)[0]
    # Hidden state at the first sequence position is used as the pair representation.
    cls_token = sequence_output[:, 0, :]
    cls_token = layers.Dropout(dropout_rate)(cls_token)
    cls_token = layers.Dense(hidden_units, activation='relu')(cls_token)
    out = layers.Dense(num_classes, activation='softmax')(cls_token)

    model = Model(inputs=input_ids, outputs=out)
    model.compile(
        # `learning_rate` replaces the deprecated `lr` keyword.
        Adam(learning_rate=learning_rate),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    return model

In [None]:
def scheduler(epoch, lr):
  """Scale the incoming learning rate: x1.1 for epochs 0-2, then x exp(-0.1) per epoch.

  NOTE(review): a later cell defines another function that is also named
  `scheduler`; whichever of the two cells ran last is the one in effect.
  """
  in_warmup = epoch < 3
  factor = 1.1 if in_warmup else tf.math.exp(-0.1)
  return lr * factor

In [None]:
def scheduler(epoch, lr, lr_start=0.00001, lr_max=0.00003, lr_min=0.000001, lr_rampup_epochs=3, lr_sustain_epochs=0, lr_exp_decay=.6):
  """Learning rate for `epoch`: linear ramp-up, optional plateau, exponential decay.

  The incoming `lr` is accepted for compatibility with the Keras
  LearningRateScheduler callback signature but is not used; the returned
  value depends only on `epoch` and the keyword settings.

  Phases:
    * epoch < lr_rampup_epochs: linear from lr_start towards lr_max.
    * next lr_sustain_epochs epochs: constant lr_max.
    * afterwards: decays from lr_max towards lr_min by a factor of
      lr_exp_decay per epoch.
  """
  if epoch < lr_rampup_epochs:
    slope = (lr_max - lr_start) / lr_rampup_epochs
    return slope * epoch + lr_start

  decay_start = lr_rampup_epochs + lr_sustain_epochs
  if epoch < decay_start:
    return lr_max

  epochs_decayed = epoch - lr_rampup_epochs - lr_sustain_epochs
  return (lr_max - lr_min) * lr_exp_decay**epochs_decayed + lr_min

In [None]:
from tensorflow.keras.layers import Dense, Input,Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model

# Create the pretrained transformer and the compiled classifier inside the
# distribution strategy's scope: variables created outside the scope are not
# placed on the strategy's replicas, so training would not be distributed
# even though BATCH_SIZE is sized for all replicas.
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model = build_model(transformer_layer, max_len=MAX_LEN)
model.summary()

- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Model: "functional_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 96)]              0         
_________________________________________________________________
tf_roberta_model_4 (TFRobert ((None, 96, 1024), (None, 559890432 
_________________________________________________________________
tf_op_layer_strided_slice_6  [(None, 1024)]            0         
_________________________________________________________________
dropout_376 (Dropout)        (None, 1024)              0         
_________________________________________________________________
dense_12 (Dense)             (None, 32)                32800     
_________________________________________________________________
dense_13 (Dense)             (None, 3)                 99        
Total params: 559,923,331
Trainable params: 559,923,331
Non-trainable params: 0
_______________________________________

###### Training

In [None]:
# Number of whole batches that fit into the training split; used as steps per epoch.
n_steps = len(train) // BATCH_SIZE
# Ask `scheduler` for the learning rate at the start of every epoch.
callback = tf.keras.callbacks.LearningRateScheduler(scheduler)

train_history = model.fit(
    train_inputs,
    validation_data=validation_inputs,
    steps_per_epoch=n_steps,
    epochs=EPOCHS,
    callbacks=[callback],
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Manual check: encode one (premise, hypothesis) pair with the training
# tokenizer and print the model's three class probabilities.
text = [['its a nice day', 'the weather outside seems to be good']]
token = xlmroberta_inputs_train.tokenizer.batch_encode_plus(
    text,
    max_length=MAX_LEN,    # same fixed length as the model's input layer
    padding='max_length',  # replaces the deprecated pad_to_max_length=True
    truncation=True,       # cut pairs that are longer than max_length
)
xlmroberta_inputs_pred = np.array(token['input_ids'])

model.predict(xlmroberta_inputs_pred) # Entailment

array([[9.9785525e-01, 2.1113728e-03, 3.3253353e-05]], dtype=float32)

In [None]:
# Manual check: encode one (premise, hypothesis) pair with the training
# tokenizer and print the model's three class probabilities.
text = [['its a bed day', 'the weather outside seems to be good']]
token = xlmroberta_inputs_train.tokenizer.batch_encode_plus(
    text,
    max_length=MAX_LEN,    # same fixed length as the model's input layer
    padding='max_length',  # replaces the deprecated pad_to_max_length=True
    truncation=True,       # cut pairs that are longer than max_length
)
xlmroberta_inputs_pred = np.array(token['input_ids'])

model.predict(xlmroberta_inputs_pred) # Neutral

array([[0.00105119, 0.8581041 , 0.1408447 ]], dtype=float32)

###### Build model

In [None]:
# These two classes are not imported anywhere else in the notebook; without
# this import the cell raises NameError on a fresh kernel.
from transformers import BertConfig, TFBertForSequenceClassification

LR = 2e-5

# NOTE: this rebinds EPOCHS (10 in the configuration cell) and, below, `model`.
EPOCHS = 2

# NOTE(review): the training data shown in this notebook has three labels
# (0, 1, 2), yet the classification head is sized for 46 classes. The value is
# kept as found — confirm whether it should be 3.
NUM_LABELS = 46

# Model, optimizer, loss and metric are created inside the distribution
# strategy's scope so that their variables live on the strategy's replicas.
with strategy.scope():
    model = TFBertForSequenceClassification.from_pretrained(
        'bert-base-multilingual-uncased',
        config=BertConfig.from_pretrained('bert-base-multilingual-uncased', num_labels=NUM_LABELS)
    )

    optimizer = tf.keras.optimizers.Adam(learning_rate=LR, epsilon=1e-08)

    # Labels are integer class ids (not one-hot vectors), so the sparse
    # variants of categorical cross-entropy and accuracy are used. The model
    # outputs raw logits, hence from_logits=True.
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Stop as soon as validation loss fails to improve for one epoch and restore
# the best weights seen so far.
my_callbacks = [
                tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=1, verbose=0, mode='min', baseline=None, restore_best_weights=True)
]

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=999358484.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing TFBertForSequenceClassification: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier', 'dropout_37']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
