## Installing Libraries

In [1]:
!pip install nltk
!pip install transformers --quiet

import re
import nltk
import string
import os, gc
import pandas as pd
import tensorflow as tf
from transformers import TFAutoModel
from transformers import AutoTokenizer
nltk.download('stopwords')

You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.[0m[33m
You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

D0502 13:07:41.803092410   16828 config.cc:119]                        gRPC EXPERIMENT tcp_frame_size_tuning               OFF (default:OFF)
D0502 13:07:41.803135650   16828 config.cc:119]                        gRPC EXPERIMENT tcp_rcv_lowat                       OFF (default:OFF)
D0502 13:07:41.803139326   16828 config.cc:119]                        gRPC EXPERIMENT peer_state_based_framing            OFF (default:OFF)
D0502 13:07:41.803142000   16828 config.cc:119]                        gRPC EXPERIMENT flow_control_fixes                  ON  (default:ON)
D0502 13:07:41.803144387   16828 config.cc:119]                        gRPC EXPERIMENT memory_pressure_controller          OFF (default:OFF)
D0502 13:07:41.803146802   16828 config.cc:119]                        gRPC EXPERIMENT unconstrained_max_quota_buffer_size OFF (default:OFF)
D0502 13:07:41.803150178   16828 config.cc:119]                        gRPC EXPERIMENT new_hpack_huffman_decoder           ON  (default:ON)
D0502 13:07:41.

True

## Setting data paths

In [2]:
main_data_dir_path = "../input/jigsaw-multilingual-toxic-comment-classification/"
toxic_comment_train_csv_path = main_data_dir_path + "jigsaw-toxic-comment-train.csv"
unintended_bias_train_csv_path = main_data_dir_path + "jigsaw-unintended-bias-train.csv"
validation_csv_path = main_data_dir_path + "validation.csv"
test_csv_path = main_data_dir_path + "test.csv"
submission_csv_path = main_data_dir_path + "sample_submission.csv"

## TPU Configurations

In [3]:
#################### TPU Configurations ####################
# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

AUTO = tf.data.experimental.AUTOTUNE
# Configuration
EPOCHS = 2
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
MAX_LEN = 192
MODEL = 'xlm-roberta-large'
NUM_SAMPLES = 150000
RANDOM_STATE = 42
LEARNING_RATE = 1e-5 ######################### MAIN CHANGE ############################
WEIGHT_DECAY = 1e-6

Running on TPU  
INFO:tensorflow:Deallocate tpu buffers before initializing tpu system.
INFO:tensorflow:Initializing the TPU system: local
INFO:tensorflow:Finished initializing TPU system.
INFO:tensorflow:Found TPU system:
INFO:tensorflow:*** Num TPU Cores: 8
INFO:tensorflow:*** Num TPU Workers: 1
INFO:tensorflow:*** Num TPU Cores Per Worker: 8
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:0, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:1, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:2, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:3, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/

## Reading & Balancing the data by Target column

In [4]:
## Reading csv files 
train1 = pd.read_csv(toxic_comment_train_csv_path)
train2 = pd.read_csv(unintended_bias_train_csv_path)
valid = pd.read_csv(validation_csv_path)
test = pd.read_csv(test_csv_path)
sub = pd.read_csv(submission_csv_path)

## Converting floating points to integers ##
train2.toxic = train2['toxic'].round().astype(int)

##### BALANCING THE DATA ##### 
# : Taking all the data from toxic_comment_train_file & all data corresponding to unintended bias train file
# & sampling 150k observations randomly from non-toxic observation population.

# Combine train1 with a subset of train2
train = pd.concat([
    train1[['comment_text', 'toxic']],
    train2[['comment_text', 'toxic']].query('toxic==1'),
    train2[['comment_text', 'toxic']].query('toxic==0').sample(n=NUM_SAMPLES, random_state=RANDOM_STATE)
])

## Dropping missing observations with respect to comment-text column 
train = train.dropna(subset=['comment_text'])

In [5]:
def encode(texts, tokenizer, max_len):
    """
    Function takes a list of texts, tokenizer (object)
    initialized from HuggingFace library, max_len (defines
    of how long the sentence lengths should be).
    """       
    tokens = tokenizer(texts, max_length=max_len, 
                    truncation=True, padding='max_length',
                    add_special_tokens=True, return_tensors='np')
    
    return tokens

## Encoding comment_text

In [6]:
## Intializing the tokenizer ##
tokenizer = AutoTokenizer.from_pretrained(MODEL)

train_inputs = encode(train['comment_text'].values.tolist(), 
                      tokenizer, max_len=MAX_LEN)
validation_inputs = encode(valid['comment_text'].values.tolist(),
                          tokenizer, max_len=MAX_LEN)
test_inputs = encode(test['content'].values.tolist(),
                    tokenizer, max_len=MAX_LEN)

## Preparing data using tf.data.Data API

In [7]:
def map_fn(input_ids, attention_mask, labels=None):
    if labels is not None:
        return {"input_ids": input_ids, "attention_mask": attention_mask}, labels
    else:
        return {"input_ids": input_ids, "attention_mask": attention_mask}

In [8]:
train_dataset = tf.data.Dataset.from_tensor_slices((train_inputs["input_ids"],
                                                    train_inputs["attention_mask"],
                                                   train['toxic']))
train_dataset = train_dataset.map(map_fn)
train_dataset = train_dataset.repeat().shuffle(2048).batch(BATCH_SIZE).prefetch(AUTO)

validation_dataset = tf.data.Dataset.from_tensor_slices((validation_inputs['input_ids'],
                                                         validation_inputs['attention_mask'],
                                                        valid['toxic']))
validation_dataset = validation_dataset.map(map_fn)
validation_dataset = validation_dataset.batch(BATCH_SIZE).prefetch(AUTO)

test_dataset = tf.data.Dataset.from_tensor_slices((test_inputs['input_ids'],
                                                  test_inputs['attention_mask']))
test_dataset = test_dataset.map(map_fn)
test_dataset = test_dataset.batch(BATCH_SIZE)

## Building the model

In [9]:
def build_model(transformer_layer, max_len):
    """
    Creating the model input layers, output layers,
    model definition and compilation.
        
    Returns: model object after compiling. 
    """
    input_ids = tf.keras.layers.Input(shape=(max_len,), 
                                      dtype=tf.int32, 
                                      name="input_ids")
    attention_mask = tf.keras.layers.Input(shape=(max_len,), 
                                       dtype=tf.int32, 
                                       name="attention_mask")
    embeddings = transformer_layer(input_ids, 
                                 attention_mask=attention_mask)[1]
    x = tf.keras.layers.Dense(1024, activation='relu')(embeddings)
    y = tf.keras.layers.Dense(1, activation='sigmoid',name='outputs')(x)
    model = tf.keras.models.Model(inputs=[input_ids, attention_mask],
                             outputs=y)
    
    optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE,
                                         weight_decay=WEIGHT_DECAY)
    loss = tf.keras.losses.BinaryCrossentropy()
    AUC = tf.keras.metrics.AUC()
    
    model.compile(loss=loss, metrics=[AUC], optimizer=optimizer)    
    return model

## Loading model on TPUs

In [10]:
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model = build_model(transformer_layer,
                        max_len=MAX_LEN)

All model checkpoint layers were used when initializing TFXLMRobertaModel.

All the layers of TFXLMRobertaModel were initialized from the model checkpoint at xlm-roberta-large.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaModel for predictions without further training.


## Training the model on Only English data for 2 epochs

In [11]:
train_steps_per_epoch = train_inputs['input_ids'].shape[0] // BATCH_SIZE
train_history = model.fit(train_dataset,
                         steps_per_epoch=train_steps_per_epoch,
                         validation_data=validation_dataset,
                         epochs=2) 

Epoch 1/2


2023-05-02 13:12:35.144684: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add_790/ReadVariableOp.
2023-05-02 13:12:37.609046: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add_790/ReadVariableOp.




2023-05-02 13:37:41.315517: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add/ReadVariableOp.
2023-05-02 13:37:41.843120: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add/ReadVariableOp.


Epoch 2/2


## Training the model on Validation data for 2 epochs further to fine-tune on it

In [12]:
validation_steps_per_epoch = validation_inputs['input_ids'].shape[0] // BATCH_SIZE
validation_history = model.fit(validation_dataset.repeat(),
                              steps_per_epoch=validation_steps_per_epoch,
                              epochs=2)

Epoch 1/2
Epoch 2/2


## Submit to Competition

In [13]:
sub['toxic'] = model.predict(test_dataset, verbose=1)
sub.to_csv('submission.csv', index=False)

2023-05-02 14:04:07.130406: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.
2023-05-02 14:04:07.613220: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.




- Public Leaderboard Score: 0.9355 and Private Leaderboard Score: 0.9343