In [19]:
# Mount Google Drive at /content/drive so later cells can read/write under it.
# This call is interactive: it asks for an authorization code.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


#### Imports and TPU setup

In [1]:
 # Install/upgrade the Kaggle CLI and install Hugging Face transformers; -q keeps pip output short.
 # NOTE(review): no versions are pinned, so a fresh runtime may resolve different releases — consider pinning.
 ! pip install --upgrade kaggle -q
 ! pip install transformers -q

[K     |████████████████████████████████| 778kB 3.4MB/s 
[K     |████████████████████████████████| 890kB 10.3MB/s 
[K     |████████████████████████████████| 3.0MB 19.3MB/s 
[K     |████████████████████████████████| 1.1MB 24.2MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [2]:
import os
import re
import time
import numpy as np
import pandas as pd
import transformers
from tqdm import tqdm
import tensorflow as tf
from google.colab import files
import tensorflow_datasets as tfds
from transformers import BertTokenizer
from utils.text_models import BertInputs
from tensorflow.keras.models import Model
from keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from transformers import TFBertForSequenceClassification

import matplotlib.pyplot as plt
%matplotlib inline

# Only let TensorFlow log messages at ERROR level through.
tf.get_logger().setLevel('ERROR')

Using TensorFlow backend.


In [3]:
# Try to attach to a TPU; a ValueError raised while resolving it is treated as "no TPU available".
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print(f"Running on TPU: {tpu.master()}")
except ValueError:
    tpu = None

if not tpu:
    # No TPU resolver: fall back to whatever strategy TensorFlow currently has active.
    strategy = tf.distribute.get_strategy()
else:
    # TPU resolver available: connect to the cluster, initialise the TPU system,
    # then build the distribution strategy on top of it.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

# Number of replicas the chosen strategy keeps in sync (used later to scale the batch size).
print(f"REPLICAS: {strategy.num_replicas_in_sync}")

INFO:absl:Entering into master device scope: /job:worker/replica:0/task:0/device:CPU:0


Running on TPU  grpc://10.55.8.130:8470
REPLICAS:  8


#### Load the data

In [4]:
files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle competitions download -c 'jigsaw-multilingual-toxic-comment-classification' -p 'dataset' -q

Saving kaggle.json to kaggle.json


In [5]:
!unzip '/content/dataset/jigsaw-toxic-comment-train.csv.zip'
!unzip '/content/dataset/test.csv.zip'
!unzip '/content/dataset/validation.csv.zip'

Archive:  /content/dataset/jigsaw-toxic-comment-train.csv.zip
  inflating: jigsaw-toxic-comment-train.csv  
Archive:  /content/dataset/test.csv.zip
  inflating: test.csv                
Archive:  /content/dataset/validation.csv.zip
  inflating: validation.csv          


In [6]:
# Read the three extracted CSV files, in train / validation / test order.
train, validation, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/jigsaw-toxic-comment-train.csv',
        '/content/validation.csv',
        '/content/test.csv',
    )
)

#### Preprocess

In [7]:
# Remove from train every column that validation does not have, announcing each one.
train_only_cols = [col for col in train.columns if col not in validation.columns]
for col in train_only_cols:
  print(f"Dropping {col}")
# A single in-place drop of all collected columns.
train.drop(train_only_cols, axis=1, inplace=True)

In [8]:
def clean_text(text):
  """Normalise a comment string for tokenisation.

  Steps, in order:
    1. remove every character that is neither a word character nor whitespace,
    2. replace newlines with a single space,
    3. lower-case the result.

  Args:
    text: the raw comment as a ``str``.

  Returns:
    The cleaned, lower-cased string.
  """
  # `flags=` must be passed by keyword: the 4th positional parameter of re.sub is
  # `count`, so passing re.UNICODE positionally capped the number of replacements
  # at 32 instead of setting a flag.
  text = re.sub(r'[^\w\s]', '', text, flags=re.UNICODE)
  text = text.replace('\n', ' ')
  return text.lower()
  
%time train['clean_text'] = train.comment_text.apply(lambda x: clean_text(x))
%time validation['clean_text'] = validation.comment_text.apply(lambda x: clean_text(x))
%time test['clean_text'] = test.content.apply(lambda x: clean_text(x))
train.head()

CPU times: user 2.11 s, sys: 38.7 ms, total: 2.15 s
Wall time: 2.15 s
CPU times: user 92.2 ms, sys: 0 ns, total: 92.2 ms
Wall time: 92.1 ms
CPU times: user 770 ms, sys: 21.7 ms, total: 792 ms
Wall time: 794 ms


Unnamed: 0,id,comment_text,toxic,clean_text
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,explanation why the edits made under my userna...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,daww he matches this background colour im seem...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,hey man im really not trying to edit war its j...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,more i cant make any real suggestions on impr...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,you sir are my hero any chance you remember wh...


In [9]:
# Shapes of the three frames, then the toxic(1) / non-toxic(0) count ratio in validation.
print(f"train shape: {train.shape} \nvalidation shape: {validation.shape} \ntest shape: {test.shape}")
print(f"ratio between lables in validation: {validation.toxic.value_counts()[1] / validation.toxic.value_counts()[0]}")
print("-"*4)
# Average number of single-space-separated pieces per cleaned training comment.
words_per_comment = train.clean_text.str.split(" ").str.len()
mean_word_len = words_per_comment.mean()
print(f"Dataset with shape of {train.shape[0]} samples. \nMean number of words is: {mean_word_len}. \nDistribution of lables is: \n{train.toxic.value_counts()}")

train shape: (223549, 4) 
validation shape: (8000, 5) 
test shape: (63812, 4)
ratio between lables in validation: 0.18168389955686853
----
Dataset with shape of 223549 samples. 
Mean number of words is: 70.65667482296946. 
Distribution of lables is: 
0    202165
1     21384
Name: toxic, dtype: int64


#### Model inputs

In [16]:
# Configuration
EPOCHS = 3
BATCH_SIZE = 16 * 4 * strategy.num_replicas_in_sync
MAX_LEN = 200

In [24]:
# Settings shared by the train and validation input builders.
shared_bert_kwargs = dict(
    max_length=MAX_LEN,
    batch_size=BATCH_SIZE,
    bert_model_name='bert-base-multilingual-uncased',
)

# Training inputs (`lables` is the keyword spelling that BertInputs is called with).
bert_inputs_train = BertInputs(
    texts=train.clean_text.astype(str),
    lables=train.toxic,
    **shared_bert_kwargs,
)
train_inputs = bert_inputs_train.process_examples(train=True)

# Validation inputs, processed with train=False.
bert_inputs_validation = BertInputs(
    texts=validation.clean_text.astype(str),
    lables=validation.toxic,
    **shared_bert_kwargs,
)
validation_inputs = bert_inputs_validation.process_examples(train=False)

223549it [05:43, 650.71it/s]
8000it [00:12, 633.85it/s]


#### Build model

In [None]:
# Learning rates commonly recommended for Adam when fine-tuning BERT: 5e-5, 3e-5, 2e-5
LR = 2e-5

# Keep training short (2 epochs) for illustration; more epochs may help as long as the model does not overfit.
EPOCHS = 2

# Everything that creates variables (the model, the optimizer, the metric) and the
# compile call are placed inside the distribution strategy's scope so that training
# uses the strategy selected earlier (the TPU strategy when a TPU was found).
# Created outside the scope, these objects are not tied to that strategy.
with strategy.scope():
    # Pretrained multilingual BERT with a sequence-classification head.
    model = TFBertForSequenceClassification.from_pretrained('bert-base-multilingual-uncased')

    optimizer = tf.keras.optimizers.Adam(learning_rate=LR, epsilon=1e-08)

    # Labels are integer class ids rather than one-hot vectors, so use the sparse
    # categorical cross-entropy loss and accuracy; the model outputs logits.
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Stop when val_loss has not improved for `patience` epochs and restore the best weights.
# NOTE(review): with patience=2 and EPOCHS = 2 this callback cannot trigger; revisit if EPOCHS is raised.
my_callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=0, mode='min', baseline=None, restore_best_weights=True)
]

- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Print the layer / parameter-count overview of the compiled classifier.
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  167356416 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 167,357,954
Trainable params: 167,357,954
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Fine-tune on the training inputs for EPOCHS epochs, validating on validation_inputs
# and applying my_callbacks; the object returned by fit is kept in bert_history.
bert_history = model.fit(train_inputs, 
                         epochs=EPOCHS, 
                         validation_data=validation_inputs,
                         callbacks=my_callbacks)

Epoch 1/2
Epoch 2/2


In [None]:
# Directory on the mounted Drive that receives both the model and the tokenizer files.
MODEL_DIR = '/content/drive/My Drive/projects/Jigsaw Multilingual Toxic Comment Classification/bert_model'
os.makedirs(MODEL_DIR, exist_ok=True)

# No earlier cell defines `tokenizer`, so on a fresh kernel (Restart & Run All) saving it
# would raise NameError. Load it here, with the same arguments the notebook uses when it
# loads the tokenizer later.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', do_lower_case=True)

# Save the fine-tuned weights/config, then the tokenizer files, side by side.
model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)

('/content/drive/My Drive/projects/Jigsaw Multilingual Toxic Comment Classification/bert_model/vocab.txt',
 '/content/drive/My Drive/projects/Jigsaw Multilingual Toxic Comment Classification/bert_model/special_tokens_map.json',
 '/content/drive/My Drive/projects/Jigsaw Multilingual Toxic Comment Classification/bert_model/added_tokens.json')

In [29]:
# Reload the fine-tuned classifier from the checkpoint directory on the mounted Drive.
new_model = TFBertForSequenceClassification.from_pretrained('/content/drive/My Drive/projects/Jigsaw Multilingual Toxic Comment Classification/bert_model')
# The tokenizer is loaded by its hub checkpoint name (lower-casing enabled), not from the saved directory.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', do_lower_case=True)


If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [30]:
# Print the layer / parameter-count overview of the reloaded classifier.
new_model.summary()

Model: "tf_bert_for_sequence_classification_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  167356416 
_________________________________________________________________
dropout_75 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 167,357,954
Trainable params: 167,357,954
Non-trainable params: 0
_________________________________________________________________


In [32]:
# Hyper-parameters for the second training stage; EPOCHS is re-bound to 10 here.
LR = 2e-5
EPOCHS=10

# Fresh Adam optimizer for the reloaded model.
optimizer = tf.keras.optimizers.Adam(learning_rate=LR, epsilon=1e-08)

# Labels are not one-hot vectors, so use sparse categorical cross-entropy and accuracy.
# NOTE(review): this compile runs outside strategy.scope() — confirm whether TPU execution is intended here.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
new_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Evaluate the reloaded model on the validation inputs; being the last expression, the result is displayed.
new_model.evaluate(validation_inputs)



[0.6951270699501038, 0.8500000238418579]

In [33]:
# Split the validation frame by position: rows [0, 6000) and rows [6000, end).
validation_texts = validation.clean_text.astype(str)
validation_labels = validation.toxic

# First 6000 rows, processed with train=True.
bert_inputs_validation_6000 = BertInputs(
    texts=validation_texts.iloc[:6000],
    lables=validation_labels.iloc[:6000],
    max_length=MAX_LEN,
    batch_size=BATCH_SIZE,
    bert_model_name='bert-base-multilingual-uncased',
)
validation_inputs_6000 = bert_inputs_validation_6000.process_examples(train=True)

# Remaining rows, processed with train=False.
bert_inputs_validation_2000 = BertInputs(
    texts=validation_texts.iloc[6000:],
    lables=validation_labels.iloc[6000:],
    max_length=MAX_LEN,
    batch_size=BATCH_SIZE,
    bert_model_name='bert-base-multilingual-uncased',
)
validation_inputs_2000 = bert_inputs_validation_2000.process_examples(train=False)

6000it [00:08, 668.47it/s]
2000it [00:03, 650.15it/s]


In [34]:
# Continue training the reloaded model on validation_inputs_6000 for EPOCHS epochs,
# using validation_inputs_2000 as the validation data.
# The return value is not assigned, so the cell displays its repr.
new_model.fit(validation_inputs_6000, 
              epochs=EPOCHS,
              validation_data=validation_inputs_2000)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f6039f67be0>