In [1]:
from skmultilearn.model_selection import IterativeStratification
from sklearn.metrics import precision_score, recall_score, f1_score, jaccard_score
from transformers import BertTokenizer, TFBertModel, BertConfig, TFDistilBertModel, DistilBertTokenizer, DistilBertConfig
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow import keras
import tensorflow as tf
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import string
from nltk.stem.snowball import EnglishStemmer
from nltk.corpus import stopwords
import re
import seaborn as sns
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

# Data

In [3]:
df_train = pd.read_csv(
    "../input/jigsaw-toxic-comment-classification-challenge/train.csv")

In [4]:
df_test = pd.read_csv(
    "../input/jigsaw-toxic-comment-classification-challenge/test.csv")

In [5]:
cols_target = ['toxic', 'obscene', 'insult',
               'threat', 'severe_toxic', 'identity_hate']

In [6]:
y_test = pd.read_csv(
    '../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv')

In [7]:
# merge test data and their labels
df_test = df_test.merge(y_test, on='id', how='left')

# remove data without label info (label =-1)
df_test2 = df_test[df_test.toxic != -1]

# Data preparation

In [8]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

In [9]:
# clean the comment_text in train_df
df_train['comment_text'] = df_train['comment_text'].apply(
    lambda com: clean_text(com))

# clean the comment_text in test_df
df_test2['comment_text'] = df_test2['comment_text'].map(
    lambda com: clean_text(com))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [10]:
# get the number of classes
num_classes = len(cols_target)

## Bert tokenizer

In [11]:
# get the tokenizer
pretrained_bert = "../input/huggingface-bert/bert-base-uncased"
bert_tokenizer = BertTokenizer.from_pretrained(pretrained_bert, do_lower_case=True)

#bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [12]:
# get max length
max_len = df_train.comment_text.apply(lambda x: len(x.split(' '))).max()


sentences = df_train.comment_text
labels = df_train[cols_target]
len(sentences), len(labels)

(159571, 159571)

In [None]:
# get the input_ids and attention masks for all sentences
input_ids = []
attention_masks = []

for sent in sentences[:100000]:

    # apply tokenizer on all sentences
    bert_inps = bert_tokenizer.encode_plus(sent, add_special_tokens=True,
                                           max_length=512, pad_to_max_length=True,
                                           return_attention_mask=True, truncation=True)

    input_ids.append(bert_inps['input_ids'])
    attention_masks.append(bert_inps['attention_mask'])

input_ids = np.asarray(input_ids)
attention_masks = np.array(attention_masks)
labels = np.array(labels[:100000])

In [14]:
# split data into train and validation
train_inp, val_inp, train_mask, val_mask, train_label, val_label = train_test_split(
    input_ids, attention_masks, labels)

# BERT model

In [31]:
def create_model(train=False):
    """ create a bert model for classification
    train : boolean: indicating wether to train the full model """
    inps = Input(shape=(512,), dtype='int64')
    masks = Input(shape=(512,), dtype='int64')
    bert_model = TFBertModel.from_pretrained(pretrained_bert)
    #bert_model = TFBertModel.from_pretrained('bert-base-uncased')
    bert_model.trainable = train
    bert_layer = bert_model(inps, attention_mask=masks)[0][:, 0, :]
    pred = Dense(num_classes, activation='sigmoid')(bert_layer)
    model = tf.keras.Model(inputs=[inps, masks], outputs=pred)
    return model

In [32]:
# Model parameters
loss = tf.keras.losses.BinaryCrossentropy()
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)

In [33]:
# create model
model = create_model(train=False)

# compile model
model.compile(loss=loss, optimizer=optimizer, metrics='accuracy')

# train model
model.fit([train_inp, train_mask], train_label, batch_size=16, epochs=2,
          validation_data=([val_inp, val_mask], val_label), workers=5)

"\n# create model\nmodel = create_model(train=False)\n\n# compile model\nmodel.compile(loss = loss, optimizer = optimizer, metrics='accuracy')\n\n# train model\nmodel.fit([train_inp,train_mask], train_label, batch_size = 16, epochs = 2,\n          validation_data = ([val_inp, val_mask], val_label), workers=5 )\n"

In [34]:
#model.save_weights('./weights.')

In [35]:
#tf.keras.models.save_model(model, './model.h5')

# Test

In [None]:
# get the input_ids and attention masks for all test sentences
sentences_test = df_test2.comment_text
input_ids_test = []
attention_masks_test = []

for sent in sentences_test:

    # apply tokenizer on all sentences
    bert_inps = bert_tokenizer.encode_plus(sent, add_special_tokens=True,
                                           max_length=512, pad_to_max_length=True,
                                           return_attention_mask=True, truncation=True)

    input_ids_test.append(bert_inps['input_ids'])
    attention_masks_test.append(bert_inps['attention_mask'])

input_ids_test = np.asarray(input_ids_test)
attention_masks_test = np.array(attention_masks_test)

In [38]:
# get predictions for test data
pred = model.predict([input_ids_test, attention_masks_test])

In [39]:
#pred.shape

In [40]:
#pred[0]

In [41]:
# compute model accuracy on test data
accuracy_score(df_test2[cols_target], pred)

0.9004032636218701

# Validation

In [42]:
validation_data = pd.read_csv(
    '../input/jigsaw-toxic-severity-rating/validation_data.csv')

In [43]:
validation_data['less_toxic'] = validation_data.less_toxic.apply(clean_text)

In [44]:
validation_data['more_toxic'] = validation_data.more_toxic.apply(clean_text)

In [None]:
# get the input_ids and attention masks for all less toxic sentences
sentences_val1 = validation_data.less_toxic
input_ids_val1 = []
attention_masks_val1 = []

for sent in sentences_val1:

    # apply tokenizer on all sentences
    bert_inps = bert_tokenizer.encode_plus(sent, add_special_tokens=True,
                                           max_length=512, pad_to_max_length=True,
                                           return_attention_mask=True, truncation=True)

    input_ids_val1.append(bert_inps['input_ids'])
    attention_masks_val1.append(bert_inps['attention_mask'])

input_ids_val1 = np.asarray(input_ids_val1)
attention_masks_val1 = np.array(attention_masks_val1)

In [None]:
# get the input_ids and attention masks for all more toxic sentences
sentences_val2 = validation_data.more_toxic
input_ids_val2 = []
attention_masks_val2 = []

for sent in sentences_val2:

    # apply tokenizer on all sentences
    bert_inps = bert_tokenizer.encode_plus(sent, add_special_tokens=True,
                                           max_length=512, pad_to_max_length=True,
                                           return_attention_mask=True, truncation=True)

    input_ids_val2.append(bert_inps['input_ids'])
    attention_masks_val2.append(bert_inps['attention_mask'])

input_ids_val2 = np.asarray(input_ids_val2)
attention_masks_val2 = np.array(attention_masks_val2)

In [50]:
# get prediction for less toxic sentences
pred_val1 = model.predict([input_ids_val1, attention_masks_val1])

In [52]:
# get prediction for more toxic sentences 
pred_val2 = model.predict([input_ids_val2, attention_masks_val2])

In [54]:
def score_function(a):
    """ define the score of toxicity """
    # a[0] = a[0]   # toxic
    # a[1] = a[1]*1.25 # obscene
    # a[2] = a[2]*1.25 # insult
    a[3] = a[3]*1.5  # threat
    a[4] = a[4]*2  # severe_toxic
    a[5] = a[5]*2  # identity hate
    return a.sum()

# apply score function on validation data
val1_pred_sum = np.apply_along_axis(score_function, axis=1, arr=pred_val1_2)
val2_pred_sum = np.apply_along_axis(score_function, axis=1, arr=pred_val2_2)

In [55]:
val1_pred_sum.shape

(30108,)

In [None]:
# compute the score for validation data
(val1_pred_sum < val2_pred_sum).sum()/validation_data.shape[0]

# Comments to score

In [57]:
comments_to_score = pd.read_csv(
    '../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

In [58]:
comments_to_score['text'] = comments_to_score.text.apply(clean_text)

In [None]:
# get the input_ids and attention masks for all comments to score
sentences_comments = comments_to_score.text
input_ids_comments = []
attention_masks_comments = []

for sent in sentences_comments:

    # apply tokenizer on all sentences
    bert_inps = bert_tokenizer.encode_plus(sent, add_special_tokens=True,
                                           max_length=512, pad_to_max_length=True,
                                           return_attention_mask=True, truncation=True)

    input_ids_comments.append(bert_inps['input_ids'])
    attention_masks_comments.append(bert_inps['attention_mask'])

input_ids_comments = np.asarray(input_ids_comments)
attention_masks_comments = np.array(attention_masks_comments)

In [62]:
# get predictions for comments to score
predictions = model.predict([input_ids_comments, attention_masks_comments])

In [65]:
# apply score function for comments to score
pred_sum = np.apply_along_axis(score_function,  axis=1, arr=predictions_2)

In [66]:
sample_submission = pd.read_csv(
    '../input/jigsaw-toxic-severity-rating/sample_submission.csv')

In [67]:
sample_submission['score'] = pred_sum

In [68]:
# save predictions
sample_submission.to_csv('./submission.csv', index=False)