In [1]:
import tensorflow as tf
import os

In [None]:
# Build a TPU cluster resolver from the Colab-provided address.
# os.environ['COLAB_TPU_ADDR'] raises KeyError when the variable is not set.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])

# Connect to the cluster, initialise the TPU system, then list the TPU devices that are visible.
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
print("All devices: ", tf.config.list_logical_devices('TPU'))

In [None]:
# Install transformers pinned to version 4.2.1.
!pip install transformers==4.2.1

In [None]:
# Install tf-models-official; the `official.nlp.optimization` module imported later comes from it.
# NOTE(review): this install is not version-pinned — consider pinning for reproducibility.
pip install tf-models-official

In [5]:
import numpy as np
import pandas as pd
import string
from nltk.corpus import stopwords
import re
import os
from collections import Counter
from ast import literal_eval

In [None]:
# Train Data
# All columns are read as strings; the "spans" column is then parsed from its
# textual form into a Python object with ast.literal_eval.
df_train = pd.read_csv('tsd_train.csv',dtype=str, encoding='utf-8', index_col=False)
df_train["spans"] = df_train.spans.apply(literal_eval)
df_train.head()

In [None]:
# Trial Data *Replace the file by the test file for prediction*
# Loaded exactly like the train file: string columns, "spans" parsed with ast.literal_eval.
df_trial = pd.read_csv('tsd_trial.csv',dtype=str, encoding='utf-8', index_col=False)
df_trial["spans"] = df_trial.spans.apply(literal_eval)
df_trial.head()

In [41]:
# Train data
# `lines` and `lines_original` start out as the same "text" column; later cells
# reassign them to differently-processed copies.
lines = df_train["text"]
lines_original = df_train["text"]
tags = df_train["spans"]

#Trial Data
lines_trial = df_trial["text"]
lines_original_trial = df_trial["text"]

In [42]:
# Splits a flat array of character offsets into one list per contiguous run
def tag_index(indices):
    """Group a flat sequence of character offsets into contiguous runs.

    Two neighbouring offsets belong to the same run when the second is exactly
    one greater than the first.

    :param indices: sequence of offsets (anything accepted by ``int``), in the
        order they should be grouped
    :return: list of runs, each a list of ints; ``[]`` for empty input

    >>> tag_index([0, 1, 2, 4, 5])
    [[0, 1, 2], [4, 5]]
    >>> tag_index([1, 2, 5])
    [[1, 2], [5]]
    >>> tag_index([7])
    [[7]]
    """
    indices_sep = []
    current = []
    for raw in indices:
        value = int(raw)
        # A gap closes the current run and starts a new one.
        if current and value != current[-1] + 1:
            indices_sep.append(current)
            current = []
        current.append(value)
    # Flush the last run; this also covers single-offset input.
    if current:
        indices_sep.append(current)
    return indices_sep

In [43]:
# One entry per row: that row's span offsets grouped by tag_index.
tag_indices = tags.apply(tag_index)

In [45]:
# Viewing toxic spans after separation
# Prints the text of row `index`, the number of separated spans, and the
# substring covered by each span (first offset through last offset, inclusive).
index = 6
u_temp =tag_indices[index]
print(lines[index])
print("No. of toxic words: ",len(u_temp))
t_temp = (lines[index])
for y_temp in u_temp:
  print(t_temp[y_temp[0]:y_temp[-1]+1])

Please people, stop using these silly, stupid emoticons.
No. of toxic words:  2
stupid
emoticons


In [46]:
# Removes leading/trailing punctuation. Used later
def remove_punctuation(text):
    """Strip a short run of punctuation from the start and end of ``text``.

    Up to two characters are removed at the start and up to three at the end,
    drawn from the set ``" ( . , * ! ) [ ] ?``. Punctuation inside the string is
    left untouched.

    :param text: the string to clean
    :return: the cleaned string (may be empty when ``text`` is only punctuation)
    """
    # Raw string: the pattern previously relied on the invalid escape "\[" in a
    # normal string literal. The character class matches the same characters.
    return re.sub(r'^["(.,*!)\[\]?]{1,2}|["(.,*!)\[\]?]{1,3}$', '', text)

# Extracts the toxic words of one line from its grouped offset runs
def toxic_word(line, tag_indice):
    """Return the lowercased, punctuation-stripped words covered by each run.

    :param line: the text the offsets refer to
    :param tag_indice: list of offset runs; each run's first and last entries
        delimit an inclusive character range in ``line``
    :return: flat list of words, in run order
    """
    toxic_words = []
    for index_range in tag_indice:
        span_text = line[index_range[0]:index_range[-1] + 1]
        # A run may cover several whitespace-separated words.
        toxic_words.extend(
            remove_punctuation(token.lower()) for token in span_text.split()
        )
    return toxic_words

In [None]:
# Testing toxic word separation on a single row
index = 0
toxic_word(lines[index],tag_indices[index]) # lines contains the string. tag_indices contains independent toxic word positions.

In [48]:
# Toxic words for every training row, in row order.
toxic_words = [
    toxic_word(lines[row], tag_indices[row])
    for row in range(len(lines))
]

In [None]:
# Show the extracted toxic words for the first ten rows.
toxic_words[:10]

<h5> Finding toxic words is done. Now we need to locate them in the lines to turn this into a token classification task.</h5>

In [50]:
def remove_whitespace(text):
    """Collapse every whitespace run to a single space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()

def remove_ascii(text):
    """Drop every character that cannot be encoded as ASCII.

    Despite the name, ASCII characters are the ones that are kept.
    """
    ascii_bytes = text.encode('ascii', 'ignore')
    return ascii_bytes.decode("utf-8")

In [None]:
# Model input text: lowercase, collapse whitespace, then drop non-ASCII characters.
# The *_original series are only lowercased.
# Train data
lines = lines.apply(lambda text: remove_ascii(remove_whitespace(text.lower())))
print(lines[2602])
print("**********************************************")
lines_original = lines_original.apply(str.lower)
print(lines_original[2602])

print("\n#################################################################################################################\n")

#trial data
lines_trial = lines_trial.apply(lambda text: remove_ascii(remove_whitespace(text.lower())))
print(lines_trial[260])
print("**********************************************")
lines_original_trial = lines_original_trial.apply(str.lower)
print(lines_original_trial[260])

In [None]:
# Whitespace-tokenise every line into a list of words
# Train Data
lines_split = lines.apply(str.split)
first_train_row = lines_split[0]
print(len(first_train_row))
print(first_train_row)

print("\n#################################################################################################################\n")

# Trial Data
lines_split_trial = lines_trial.apply(str.split)
sample_trial_row = lines_split_trial[260]
print(len(sample_trial_row))
print(sample_trial_row)

In [53]:
# Placeholder for words that became empty once punctuation was stripped.
def data_leak(word):
    """Return "p" for an empty string, otherwise return ``word`` unchanged."""
    return "p" if word == '' else word

In [54]:
# Cleans the separated words of one line
def split_filter(text):
    """Strip edge punctuation from every word and fill emptied words.

    :param text: sequence of words for one line
    :return: numpy array with one cleaned word per input word
    """
    # Words made purely of punctuation end up empty; data_leak replaces them
    # with a placeholder so positions stay aligned with the input.
    cleaned = pd.Series(text).apply(remove_punctuation).apply(data_leak)

    if len(cleaned) != len(text):
        print("Length mismatch")
    return np.asarray(cleaned)

In [None]:
# Train data
# Clean every tokenised line with split_filter, then print one row before and
# after cleaning for a visual comparison.
lines_split_no_punct = lines_split.apply(lambda l: split_filter(l))
print(lines_split[0])
print(lines_split_no_punct[0])

print("\n#################################################################################################################\n")

#Trial data
lines_split_no_punct_trial = lines_split_trial.apply(lambda l: split_filter(l))
print(lines_split_trial[260])
print(lines_split_no_punct_trial[260])

<h5>Text pre-processing is done. We now create labels using the toxic words found on cleaned text</h5>

In [56]:
def create_label(toxic_word, word_array):
    """Build a 0/1 label per word: 1 where the word is one of the toxic words.

    Every occurrence of a toxic word is labelled, not only the occurrence the
    span originally pointed at.

    :param toxic_word: iterable of toxic words for this line (duplicates allowed)
    :param word_array: the line's words, as a numpy array or a plain sequence
    :return: float numpy array of zeros and ones, same length as ``word_array``
    """
    # Coerce to an object array so that `==` is always element-wise; with a
    # plain list the comparison is a single False and nothing gets labelled.
    word_array = np.asarray(word_array, dtype=object)
    label = np.zeros(len(word_array))
    for word in set(toxic_word):
        label[word_array == word] = 1
    return label

In [57]:
# One 0/1 label array per training line, aligned with its cleaned words.
train_tags = [
    create_label(toxic_words[row], lines_split_no_punct[row])
    for row in range(len(lines_split_no_punct))
]

In [None]:
# Inspect one row: cleaned words, their 0/1 labels, and the toxic words they came from.
index = 2
print(lines_split_no_punct[index])
print(train_tags[index])
print(toxic_words[index])

In [None]:
# Converting from np arrays to plain lists for the tokenizer
# Train Data
train_texts = [list(words) for words in lines_split_no_punct]
# len() instead of np.shape(): rows differ in length, and building a NumPy
# array from a ragged nested list raises ValueError on recent NumPy versions.
print("Length of Train data: ", len(train_texts))

# Trial Data
trial_texts = [list(words) for words in lines_split_no_punct_trial]
print("Length of Trial data: ", len(trial_texts))

In [None]:
# Word count per training line (used to pick the sequence-length hyper parameter).
u = lambda text: len(text.split(" "))
sentence_lengths = [len(words) for words in train_texts]
# The 50 longest lines, then the number of lines.
print(sorted(sentence_lengths)[-50:])
print(len(sentence_lengths))

<h4>Token classification task</h4>

In [61]:
from transformers import TFMPNetModel, MPNetTokenizerFast, XLNetTokenizerFast, TFXLNetModel, AlbertTokenizerFast, TFMT5EncoderModel, TFAlbertModel, TFT5EncoderModel, T5TokenizerFast, TFT5Model, RobertaTokenizerFast, TFRobertaModel, AutoTokenizer, TFXLMRobertaModel, TFBertModel, BertTokenizerFast, TFElectraModel, ElectraTokenizerFast
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import ModelCheckpoint
from official import nlp
import official.nlp.optimization

from sklearn.metrics import classification_report

In [None]:
# Use tokenizer as required. Remove add_prefix_space for tokenizers other than roberta
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base", add_prefix_space=True)

# Fixed sequence length shared by both sets and by the model input (250,).
MAX_SEQ_LEN = 250

# Train set: truncate and pad inside the tokenizer, exactly like the trial set.
# Padding to the longest row and slicing to 250 afterwards cuts the closing
# special token off long rows and can yield fewer than 250 columns.
train_encodings = tokenizer(train_texts, max_length=MAX_SEQ_LEN, is_split_into_words=True, padding="max_length", truncation=True)

# Trial set (max length is set because some tokenizers returned less than 250)
trial_encodings = tokenizer(trial_texts, max_length=MAX_SEQ_LEN, is_split_into_words=True, padding="max_length", truncation=True)

In [None]:
# Shapes of the tokenised id matrices for train and trial.
print(np.shape(train_encodings.input_ids))
print(np.shape(trial_encodings.input_ids))

In [None]:
# Aligns word-level labels with tokenizer output and builds first-sub-token masks.
def encode_tags(tags, encodings):
    """Expand word-level labels to token level.

    For each sequence ``i`` the token-to-word mapping is read from
    ``encodings[i].word_ids``:

    * tokens with no word (``None``) get label -100 and mask 0;
    * the first token of a word gets that word's label and mask 1;
    * further tokens of the same word get label -100 and mask 0 (the internal
      ``label_all_tokens`` switch is off).

    Progress is printed every 1000 sequences.

    :param tags: indexable collection of per-word label sequences
    :param encodings: indexable collection whose items expose ``word_ids``
    :return: ``(encoded_labels, masks)``, two lists of per-token lists
    """
    label_all_tokens = False
    encoded_labels, masks = [], []

    for i in range(len(tags)):
        if i % 1000 == 0:
            print(str(i) + "...")

        label = tags[i]
        label_ids, mask_ids = [], []
        previous_word_idx = None

        for word_idx in encodings[i].word_ids:
            if word_idx is None:
                # Special / padding token: ignored by the loss.
                entry = (-100, 0)
            elif word_idx != previous_word_idx:
                # First sub-token of a new word carries the word's label.
                entry = (label[word_idx], 1)
            elif label_all_tokens:
                entry = (label[word_idx], label[word_idx])
            else:
                # Continuation sub-token: ignored.
                entry = (-100, 0)

            label_ids.append(entry[0])
            mask_ids.append(entry[1])
            previous_word_idx = word_idx

        encoded_labels.append(label_ids)
        masks.append(mask_ids)

    return (encoded_labels, masks)

# Token-level labels and first-sub-token masks for the training set.
train_labels, train_masks = encode_tags(train_tags, train_encodings)

In [None]:
# Sanity check: a row's label array and word list should have the same length.
index = 3
print(len(train_tags[index]))
print(len(train_texts[index]))

In [None]:
# Returns masks for trial/test data as per tokenizer
def get_masks(texts, encodings, label_all_tokens=False):
    """Build a per-token 0/1 mask marking which tokens represent a word.

    For each sequence ``i`` the token-to-word mapping is read from
    ``encodings[i].word_ids``. Tokens with no word (``None``) get 0, the first
    token of each word gets 1, and further tokens of the same word get 1 only
    when ``label_all_tokens`` is true. Progress is printed every 100 sequences.

    :param texts: the tokenised texts; only its length is used
    :param encodings: indexable collection whose items expose ``word_ids``
    :param label_all_tokens: also mark continuation sub-tokens (default False)
    :return: list of per-token mask lists
    """
    masks = []

    for i in range(0, len(texts)):
        if(i%100 == 0):
          print(i,"...")

        previous_word_idx = None
        mask_ids = []
        for word_idx in encodings[i].word_ids:
            if word_idx is None:
                # Special / padding token.
                mask_ids.append(0)
            elif word_idx != previous_word_idx:
                # First sub-token of a word.
                mask_ids.append(1)
            else:
                # Continuation sub-token. There are no labels at prediction
                # time, so the flag decides the mask directly (this branch
                # previously read an undefined `label`).
                mask_ids.append(1 if label_all_tokens else 0)

            previous_word_idx = word_idx

        masks.append(mask_ids)

    return (masks)
# First-sub-token masks for the trial set.
trial_masks = get_masks(trial_texts, trial_encodings)

In [67]:
# Length checks: print the index of any row whose token ids and labels/masks
# differ in length. No output means every row is aligned.
for i in range(0,len(train_encodings.input_ids)):
    if(len(train_encodings.input_ids[i]) != len(train_labels[i])):
        print(i)
        
for i in range(0,len(trial_encodings.input_ids)):
    if(len(trial_encodings.input_ids[i]) != len(trial_masks[i])):
        print(i)

In [69]:
# Train Data
# Keep at most the first 250 token columns; 250 matches the model input
# shape (250,) used when the model is built.
truncated_train = np.asarray(train_encodings.input_ids)[:,:250]
truncated_train_labels = np.asarray(train_labels)[:,:250]
truncated_train_masks = np.asarray(train_masks)[:,:250]

# Trial Data
truncated_trial = np.asarray(trial_encodings.input_ids)[:,:250]
truncated_trial_masks = np.asarray(trial_masks)[:,:250]

In [None]:
# Attention masks cut to the same 250 columns.
# NOTE(review): within this notebook they are only passed to EvaluationMetric,
# which stores them but does not feed them to the model — confirm intended.
attention_masks_train = np.asarray(train_encodings.attention_mask)[:,:250]
attention_masks_trial = np.asarray(trial_encodings.attention_mask)[:,:250]
print(np.shape(attention_masks_train))
print(np.shape(attention_masks_trial))

In [None]:
# Train Data
# Show one row's words and toxic words next to its first 25 token labels and masks.
index = 0
print(train_texts[index])
print(toxic_words[index])
print(truncated_train_labels[index,:25])
print(truncated_train_masks[index,:25])

In [None]:
# Trial data
# Show one row's words next to its first 60 mask values.
index = 0
print(trial_texts[index])
# print(toxic_words[index])
# print(truncated_train_labels[index,:40])
print(truncated_trial_masks[index,:60])

In [None]:
# Train Data
# Shapes of the truncated id, label and mask matrices.
print(np.shape(truncated_train))
print(np.shape(truncated_train_labels))
print(np.shape(truncated_train_masks))

# Trial Data
print(np.shape(truncated_trial))
# print(np.shape(truncated_train_labels))
print(np.shape(truncated_trial_masks))

<h5> Model </h5>

In [None]:
# Distribution strategy bound to the TPU cluster resolved at the top of the notebook.
strategy = tf.distribute.TPUStrategy(resolver)

In [89]:
# Works for Bert, Electra, Roberta, XLM-Roberta and XLNet encoders
def toxic_span(input_shape):
    """Build the per-token toxicity classifier.

    Pipeline: int32 token ids -> first layer of the pretrained 'roberta-base'
    model (first element of its output) -> batch normalisation -> dropout(0.1)
    -> a sigmoid Dense(1) applied to every time step.

    :param input_shape: shape of one input sample, e.g. ``(250,)``
    :return: an uncompiled ``keras.Model`` named 'toxic_span'
    """
    token_ids = keras.Input(shape=input_shape, dtype='int32')

    # Swap the pretrained class / checkpoint here to try another encoder.
    encoder = TFRobertaModel.from_pretrained('roberta-base')
    # presumably layers[0] is the transformer main layer and [0] its per-token
    # hidden states — TODO confirm for other encoders
    hidden = encoder.layers[0](token_ids)[0]
    hidden = keras.layers.BatchNormalization()(hidden)
    hidden = keras.layers.Dropout(0.1)(hidden)

    per_token_head = keras.layers.Dense(1, activation="sigmoid")
    scores = keras.layers.TimeDistributed(per_token_head)(hidden)

    return keras.Model(inputs=token_ids, outputs=scores, name='toxic_span')

In [90]:
from tensorflow.keras import backend as K
def custom_loss(y_true, y_pred):
    """Binary cross-entropy in which positions labelled -100 are zeroed out.

    Labels and predictions are both multiplied by a 0/1 mask (0 where the
    label equals -100) before the unreduced binary cross-entropy is computed,
    and the result is scaled by 10.

    :param y_true: labels, with -100 marking positions to ignore
    :param y_pred: predicted probabilities
    :return: unreduced (``Reduction.NONE``) loss tensor, scaled by 10
    """
    bce = tf.keras.losses.BinaryCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
    # 1.0 for real labels, 0.0 for the -100 sentinel.
    mask = tf.cast(tf.math.not_equal(y_true, -100), dtype=tf.float32)
    y_true_mask = tf.math.multiply(mask, tf.cast(y_true, dtype=tf.float32))
    y_pred_mask = tf.math.multiply(mask, y_pred)
    # Only the masked loss is used; the unmasked loss over raw -100 labels was
    # computed and discarded, so it has been removed.
    loss_masked = bce(y_true_mask, y_pred_mask) * 10
    return loss_masked

In [None]:
# Training schedule: epochs, batch size and the step counts derived from them
epochs = 4
batch_size = 16

train_data_size = len(truncated_train)
steps_per_epoch = train_data_size // batch_size
num_train_steps = epochs * steps_per_epoch
# Warm up over the first 10% of all training steps.
warmup_steps = int(epochs * train_data_size * 0.1 / batch_size)

# Optimizer with a learning-rate schedule, starting from 5e-5
optimizer = nlp.optimization.create_optimizer(
    5e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=warmup_steps,
)

In [None]:
# Build and compile the model inside the TPU strategy scope.
with strategy.scope():
    model = toxic_span((250,))
    loss_fun = custom_loss
    model.compile(loss=loss_fun, optimizer=optimizer)

In [None]:
# model_ = toxic_span((250,), bert_layer)
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [94]:
# Number of training sequences.
len(truncated_train)

7939

<h5> Custom evaluation metric </h5>

In [95]:
def get_predicted_words(train_prediction, train_texts, truncated_train_masks):
    """Turn per-token probabilities into per-word labels and predicted toxic words.

    Predictions are rounded to 0/1; for each sequence only the positions whose
    mask equals 1 are kept, so the k-th kept value is matched with the k-th
    word of that sequence.

    :param train_prediction: model output with a trailing axis of size 1
        (it is squeezed on ``axis=-1``)
    :param train_texts: per-sequence word lists; rows may differ in length
    :param truncated_train_masks: 0/1 mask per token, one row per sequence
    :return: ``(predicted_labels, predicted_toxic_words)`` — a list of 1-D
        label arrays and a list of word lists
    """
    round_pred = np.round(train_prediction)
    masks = np.asarray(truncated_train_masks)
    # train_texts is indexed as-is: converting ragged word lists to a NumPy
    # array raises ValueError on recent NumPy versions.
    predicted_labels = []
    predicted_toxic_words = []
    for i in range(len(masks)):
        pred_label = np.squeeze(round_pred[i][masks[i, :] == 1], axis=-1)
        predicted_labels.append(pred_label)

        words = train_texts[i]
        predicted_toxic_words.append(
            [words[j] for j in range(len(pred_label)) if pred_label[j] == 1]
        )

    return (predicted_labels, predicted_toxic_words)

In [96]:
def get_char_positions(lines_original, predicted_toxic_words):
    """Map predicted toxic words back to character offsets in the original text.

    Every occurrence of every distinct predicted word is located in the
    matching line (literal substring match) and the offsets it covers are
    collected.

    :param lines_original: texts indexable by position ``0..len-1``
    :param predicted_toxic_words: per-line lists of predicted words
    :return: one sorted list of unique offsets per line, as plain Python ints
        (``[]`` when nothing was predicted)
    """
    char_positions = []
    for i in range(len(lines_original)):
        text = lines_original[i]
        offsets = set()
        for toxic_word in set(predicted_toxic_words[i]):
            for match in re.finditer(re.escape(toxic_word), text):
                # range() yields built-in ints, so str() of the result is a
                # plain "[1, 2, 3]" regardless of the NumPy version.
                offsets.update(range(match.start(), match.end()))
        char_positions.append(sorted(offsets))
    return char_positions

In [97]:
def f1(predictions, gold):
    """
    F1 (a.k.a. DICE), precision and recall over two lists of offsets.
    >>> assert f1([0, 1, 4, 5], [0, 1, 6])[0] == 0.5714285714285714
    :param predictions: a list of predicted offsets
    :param gold: a list of offsets serving as the ground truth
    :return: the list [f1, precision, recall], each between 0 and 1
    """
    # With no gold offsets the score is perfect only if nothing was predicted.
    if len(gold) == 0:
        if len(predictions) == 0:
            return [1, 1, 1]
        return [0, 0, 0]

    predicted_set = set(predictions)
    gold_set = set(gold)
    overlap = len(predicted_set.intersection(gold_set))

    score = 2 * overlap / (len(predicted_set) + len(gold_set))
    precision = 0 if len(predictions) == 0 else overlap / len(predicted_set)
    recall = overlap / len(gold_set)
    return [score, precision, recall]

In [102]:
class EvaluationMetric(keras.callbacks.Callback):
    """Keras callback that prints span-level F1, precision and recall after each epoch.

    At the end of every epoch the model predicts on the stored trial inputs,
    the predictions are converted to character offsets, and the offsets are
    scored against the gold spans with ``f1``.
    """

    def __init__(self, truncated_trial, trial_original, trial_texts, truncated_trial_masks, lines_original_trial, attention_masks):
        """
        :param truncated_trial: token-id matrix passed to ``model.predict``
        :param trial_original: gold offset lists, indexable by position
        :param trial_texts: per-sequence word lists
        :param truncated_trial_masks: per-token 0/1 masks for the trial set
        :param lines_original_trial: texts the offsets are searched in
        :param attention_masks: stored only; not used by this callback
        """
        super(EvaluationMetric, self).__init__()
        self.truncated_trial = truncated_trial
        self.trial_original = trial_original
        self.trial_texts = trial_texts
        self.truncated_trial_masks = truncated_trial_masks
        self.lines_original_trial = lines_original_trial
        self.attention_masks = attention_masks

    def on_epoch_begin(self, epoch, logs=None):
        print("\nTraining...")

    def on_epoch_end(self, epoch, logs=None):
        print("\nEvaluating...")
        trial_prediction = self.model.predict(self.truncated_trial)

        _, predicted_toxic_words = get_predicted_words(trial_prediction, self.trial_texts, self.truncated_trial_masks)

        final = get_char_positions(self.lines_original_trial, predicted_toxic_words)

        if len(final) == 0:
            # Nothing to average over; avoid dividing by zero.
            print("\nNo validation samples to score.")
            return

        # Score each sample once and reuse the [f1, precision, recall] triple.
        scores = [f1(final[i], self.trial_original[i]) for i in range(len(final))]
        sum_f1 = sum(score[0] for score in scores)
        precision = sum(score[1] for score in scores)
        recall = sum(score[2] for score in scores)

        print("\nF1 on val set: ",sum_f1/len(final))
        print("\nPrecision on val set: ",precision/len(final))
        print("\nRecall on val set: ",recall/len(final))

# Comment out the evaluation metric while predicting on the train set.
# The callback scores against df_trial["spans"], so it only makes sense when gold spans exist.
evaluation_metric = EvaluationMetric(truncated_trial, np.asarray(df_trial["spans"]), trial_texts, truncated_trial_masks, lines_original_trial, attention_masks_trial)

In [103]:
# Save the model weights at the end of every epoch, one file per epoch.
# `epoch=1` is not a ModelCheckpoint argument; `save_freq='epoch'` is the
# documented way to checkpoint once per epoch.
checkpoint = ModelCheckpoint(filepath='/content/roberta.{epoch:03d}.h5',
                             verbose=0,
                             save_weights_only=True,
                             save_freq='epoch')

In [None]:
# Roberta retrain used for visualisation
# NOTE(review): batch_size=16 and epochs=1 are literals here, while the learning-rate
# schedule above was built from `batch_size = 16` and `epochs = 4` — confirm the
# mismatch in epochs is intended.
history = model.fit(
    x = truncated_train,
    y = truncated_train_labels,
    batch_size=16,
    shuffle=True,
    callbacks = [evaluation_metric, checkpoint],
    epochs=1)

<h3>Train Over </h3>
<p> The next part creates the results file. Use the test file instead of the trial file when loading trial data to produce test set predictions.</p> 

In [118]:
# Predict on the trial inputs (these are the test results if the test file was loaded at the start).
trial_prediction = model.predict(truncated_trial)

In [None]:
# trial_prediction[3][:20]
# Shape of the raw prediction array.
np.shape(trial_prediction)

In [None]:
# Per-word 0/1 labels and the words predicted as toxic, for every trial row.
predicted_labels, predicted_toxic_words = get_predicted_words(trial_prediction, trial_texts, truncated_trial_masks)

In [None]:
# Inspect one trial row: its words, predicted labels and predicted toxic words.
index = 1
print(trial_texts[index])
print(predicted_labels[index])
print("Predicted: ",predicted_toxic_words[index])
# print("True: ",toxic_words[index])

In [122]:
# Character offsets of the predicted toxic words in the lowercased original trial text.
final = get_char_positions(lines_original_trial, predicted_toxic_words)

In [None]:
# Compare predicted and gold character offsets for one trial row.
index = 11
print(trial_texts[index])
print(lines_original_trial[index])
# print(predicted_labels[index])
print("Predicted: ",predicted_toxic_words[index])
# print("True: ",toxic_words[index])
print("Predicted: ", final[index])
print("True: ", df_trial["spans"][index])

<h5>Prediction File</h5>

In [116]:
# make sure that the ids match the ones of the scores
predictions = list(final)
# The predictions were computed on the trial data, so the ids must come from
# df_trial (they were previously taken from df_train).
ids = df_trial.index.to_list()

# write one "<id>\t<offset list>" line per row into "spans-pred.txt"
with open("spans-pred.txt", "w") as out:
    for uid, text_scores in zip(ids, predictions):
        # Plain ints keep the list formatted as "[1, 2, 3]" even when the
        # offsets are NumPy integer scalars.
        offsets = [int(offset) for offset in text_scores]
        out.write(f"{uid}\t{offsets}\n")

In [None]:
# Zip the prediction file(s) matching ./spans-pred.* into one archive.
# NOTE(review): the archive name says "mpnet" while this notebook loads roberta-base — confirm the name.
! zip -r mpnet_2_high_precision.zip ./spans-pred.*

<h4> Analysis </h4>

In [None]:
# Score every prediction once against the gold spans, then average each metric.
sample_scores = [f1(final[i], df_trial["spans"][i]) for i in range(len(final))]
sum_f1 = sum(score[0] for score in sample_scores)
precision = sum(score[1] for score in sample_scores)
recall = sum(score[2] for score in sample_scores)

print("\nF1 on val set: ",sum_f1/len(final))
print("\nPrecision on val set: ",precision/len(final))
print("\nRecall on val set: ",recall/len(final))