## Installing Libraries

In [3]:
!pip install nltk
!pip install transformers --quiet

import re
import nltk
import string
import os, gc
import pandas as pd
import tensorflow as tf
from transformers import TFAutoModel
from transformers import AutoTokenizer
nltk.download('stopwords')

You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.[0m[33m
You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Setting data paths

In [4]:
main_data_dir_path = "../input/jigsaw-multilingual-toxic-comment-classification/"
toxic_comment_train_csv_path = main_data_dir_path + "jigsaw-toxic-comment-train.csv"
unintended_bias_train_csv_path = main_data_dir_path + "jigsaw-unintended-bias-train.csv"
validation_csv_path = main_data_dir_path + "validation.csv"
test_csv_path = main_data_dir_path + "test.csv"
submission_csv_path = main_data_dir_path + "sample_submission.csv"

## TPU Configurations

In [5]:
#################### TPU Configurations ####################
# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

AUTO = tf.data.experimental.AUTOTUNE
# Configuration
EPOCHS = 2
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
MAX_LEN = 192
MODEL = 'bert-base-multilingual-uncased'
NUM_SAMPLES = 150000
RANDOM_STATE = 42
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 1e-6

Running on TPU  
INFO:tensorflow:Deallocate tpu buffers before initializing tpu system.
INFO:tensorflow:Initializing the TPU system: local
INFO:tensorflow:Finished initializing TPU system.
INFO:tensorflow:Found TPU system:
INFO:tensorflow:*** Num TPU Cores: 8
INFO:tensorflow:*** Num TPU Workers: 1
INFO:tensorflow:*** Num TPU Cores Per Worker: 8
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:0, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:1, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:2, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:3, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/

## Reading & Balancing the data by Target column

In [6]:
## Reading csv files 
train1 = pd.read_csv(toxic_comment_train_csv_path)
train2 = pd.read_csv(unintended_bias_train_csv_path)
valid = pd.read_csv(validation_csv_path)
test = pd.read_csv(test_csv_path)
sub = pd.read_csv(submission_csv_path)

## Converting floating points to integers ##
train2.toxic = train2['toxic'].round().astype(int)

##### BALANCING THE DATA ##### 
# : Taking all the data from toxic_comment_train_file & all data corresponding to unintended bias train file
# & sampling 150k observations randomly from non-toxic observation population.

# Combine train1 with a subset of train2
train = pd.concat([
    train1[['comment_text', 'toxic']],
    train2[['comment_text', 'toxic']].query('toxic==1'),
    train2[['comment_text', 'toxic']].query('toxic==0').sample(n=NUM_SAMPLES, random_state=RANDOM_STATE)
])

## Dropping missing observations with respect to comment-text column 
train = train.dropna(subset=['comment_text'])

## Preprocessing the data

In [7]:
## Removing URL's
def remove_urls(text):
    """
    Function takes a sentence and removes the URLs from the provided sentence.
    """
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

# define a regular expression pattern to match non-alphanumeric characters
pattern = r"[^a-zA-Z0-9 ]"

# define a function to remove non-alphanumeric characters from a string using the pattern
def remove_non_alphanumeric(text):
    return re.sub(pattern, "", text)

PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text: str, punctuations: str):
    """
    Function takes a sentence & a punctuations string to remove punctuations from a sentence.
    """
    return text.translate(str.maketrans("", "", punctuations))

## Removing multi-characters
def remove_multiplechars(text):
    text = re.sub(r'(.)\1{3,}',r'\1', text)
    return text

def contraction_to_expansion(text: str, contractions_dict: dict):
    """
    Function takes a sentence (text) and a dictionary to map words like ain't to am not etc.
    and returns the final sentence.
    """
    if type(text) is str:
        for key in contractions_dict:
            value = contractions_dict[key]
            text = text.replace(key, value)
        return text
    else:
        return text
    
contractions = {"ain't": "am not","aren't": "are not","can't": "cannot","can't've": "cannot have",
                "'cause": "because","could've": "could have", "couldn't": "could not",
                "couldn't've": "could not have","didn't": "did not","doesn't": "does not","don't": "do not",
                "hadn't": "had not", "hadn't've": "had not have","hasn't": "has not","haven't": "have not",
                "he'd": "he would","he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
                "he's": "he is","how'd": "how did","how'd'y": "how do you","how'll": "how will","how's": "how is",
                "I'd": "I would", "I'd've": "I would have","I'll": "I will","I'll've": "I will have","I'm": "I am",
                "I've": "I have","isn't": "is not","it'd": "it had", "it'd've": "it would have","it'll": "it will",
                "it'll've": "it will have","it's": "it is","let's": "let us","ma'am": "madam","mayn't": "may not",
                "might've": "might have","mightn't": "might not","mightn't've": "might not have","must've": "must have",
                "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", 
                "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not", 
                "oughtn't've": "ought not have", "shan't": "shall not","sha'n't": "shall not",
                "shan't've": "shall not have","she'd": "she would","she'd've": "she would have", "she'll": "she will",
                "she'll've": "she will have","she's": "she is","should've": "should have","shouldn't": "should not",
                "shouldn't've": "should not have", "so've": "so have","so's": "so is","that'd": "that would",
                "that'd've": "that would have","that's": "that is","there'd": "there had",
                "there'd've": "there would have","there's": "there is","they'd": "they would",
                "they'd've": "they would have","they'll": "they will","they'll've": "they will have",
                "they're": "they are","they've": "they have","to've": "to have","wasn't": "was not","we'd": "we had",
                "we'd've": "we would have","we'll": "we will","we'll've": "we will have","we're": "we are",
                "we've": "we have","weren't": "were not","what'll": "what will", "what'll've": "what will have",
                "what're": "what are","what's": "what is","what've": "what have","when's": "when is",
                "when've": "when have", "where'd": "where did","where's": "where is","where've": "where have",
                "who'll": "who will","who'll've": "who will have","who's": "who is", "who've": "who have",
                "why's": "why is","why've": "why have","will've": "will have","won't": "will not",
                "won't've": "will not have", "would've": "would have","wouldn't": "would not",
                "wouldn't've": "would not have","y'all": "you all","y'alls": "you alls","y'all'd": "you all would",
                "y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                "you'd": "you had", "you'd've": "you would have","you'll": "you you will",
                "you'll've": "you you will have","you're": "you are","you've": "you have"}

#################################################### CLEANING THE DATA ####################################################
def clean_text(text):
    text = text.replace("\n", "") ## Remove the next line character ##
    text = remove_urls(text) ## Remove the URLs ##
    text = re.compile(r'＼\(.+?\)/').sub("", text) ## Remove emoticons ##
    text = remove_non_alphanumeric(text) ## Remove Unicode characters ##
    text = remove_punctuation(text, PUNCT_TO_REMOVE) ## Removing Punctuations ##
    text = remove_multiplechars(text) ## Removing multiple characters in a word ##
    text = contraction_to_expansion(text, contractions)
    return text

stop_words = nltk.corpus.stopwords.words('english')

In [8]:
###### CLEANING THE DATA ######

train['comment_text'] = train['comment_text'].apply(lambda x: clean_text(x))
valid['comment_text'] = valid['comment_text'].apply(lambda x: clean_text(x))
test['content'] = test['content'].apply(lambda x: clean_text(x))

In [9]:
def encode(texts, tokenizer, max_len):
    """
    Function takes a list of texts, tokenizer (object)
    initialized from HuggingFace library, max_len (defines
    of how long the sentence lengths should be).
    """       
    tokens = tokenizer(texts, max_length=max_len, 
                    truncation=True, padding='max_length',
                    add_special_tokens=True, return_tensors='np')
    
    return tokens

## Encoding comment_text

In [10]:
## Intializing the tokenizer ##
tokenizer = AutoTokenizer.from_pretrained(MODEL)

train_inputs = encode(train['comment_text'].values.tolist(), 
                      tokenizer, max_len=MAX_LEN)
validation_inputs = encode(valid['comment_text'].values.tolist(),
                          tokenizer, max_len=MAX_LEN)
test_inputs = encode(test['content'].values.tolist(),
                    tokenizer, max_len=MAX_LEN)

## Preparing data using tf.data.Data API

In [11]:
def map_fn(input_ids, attention_mask, labels=None):
    if labels is not None:
        return {"input_ids": input_ids, "attention_mask": attention_mask}, labels
    else:
        return {"input_ids": input_ids, "attention_mask": attention_mask}

In [12]:
train_dataset = tf.data.Dataset.from_tensor_slices((train_inputs["input_ids"],
                                                    train_inputs["attention_mask"],
                                                   train['toxic']))
train_dataset = train_dataset.map(map_fn)
train_dataset = train_dataset.repeat().shuffle(2048).batch(BATCH_SIZE).prefetch(AUTO)

validation_dataset = tf.data.Dataset.from_tensor_slices((validation_inputs['input_ids'],
                                                         validation_inputs['attention_mask'],
                                                        valid['toxic']))
validation_dataset = validation_dataset.map(map_fn)
validation_dataset = validation_dataset.batch(BATCH_SIZE).prefetch(AUTO)

test_dataset = tf.data.Dataset.from_tensor_slices((test_inputs['input_ids'],
                                                  test_inputs['attention_mask']))
test_dataset = test_dataset.map(map_fn)
test_dataset = test_dataset.batch(BATCH_SIZE)

## Building the model

In [13]:
def build_model(transformer_layer, max_len):
    """
    Creating the model input layers, output layers,
    model definition and compilation.
        
    Returns: model object after compiling. 
    """
    input_ids = tf.keras.layers.Input(shape=(max_len,), 
                                      dtype=tf.int32, 
                                      name="input_ids")
    attention_mask = tf.keras.layers.Input(shape=(max_len,), 
                                       dtype=tf.int32, 
                                       name="attention_mask")
    embeddings = transformer_layer(input_ids, 
                                 attention_mask=attention_mask)[1]
    x = tf.keras.layers.Dense(1024, activation='relu')(embeddings)
    y = tf.keras.layers.Dense(1, activation='sigmoid',name='outputs')(x)
    model = tf.keras.models.Model(inputs=[input_ids, attention_mask],
                             outputs=y)
    
    optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE,
                                         weight_decay=WEIGHT_DECAY)
    loss = tf.keras.losses.BinaryCrossentropy()
    AUC = tf.keras.metrics.AUC()
    
    model.compile(loss=loss, metrics=[AUC], optimizer=optimizer)    
    return model

## Loading model on TPUs

In [14]:
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model = build_model(transformer_layer,
                        max_len=MAX_LEN)

Some layers from the model checkpoint at bert-base-multilingual-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


## Training the model on Only English data for 2 epochs

In [15]:
train_steps_per_epoch = train_inputs['input_ids'].shape[0] // BATCH_SIZE
train_history = model.fit(train_dataset,
                         steps_per_epoch=train_steps_per_epoch,
                         validation_data=validation_dataset,
                         epochs=2) 

Epoch 1/2


2023-05-02 11:17:05.548615: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add_406/ReadVariableOp.
2023-05-02 11:17:06.548563: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add_406/ReadVariableOp.




2023-05-02 11:26:31.256303: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add/ReadVariableOp.
2023-05-02 11:26:31.539232: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add/ReadVariableOp.


Epoch 2/2


## Training the model on Validation data for 2 epochs further to fine-tune on it

In [16]:
validation_steps_per_epoch = validation_inputs['input_ids'].shape[0] // BATCH_SIZE
validation_history = model.fit(validation_dataset.repeat(),
                              steps_per_epoch=validation_steps_per_epoch,
                              epochs=2)

Epoch 1/2
Epoch 2/2


## Submit to Competition

In [17]:
sub['toxic'] = model.predict(test_dataset, verbose=1)
sub.to_csv('submission.csv', index=False)

2023-05-02 11:37:18.431298: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.
2023-05-02 11:37:18.686189: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.




- Public Leaderboard Score: 0.8259 and Private Leaderboard Score: 0.8239