In [1]:
from transformers import AutoTokenizer, AutoModel
import transformers
import torch
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
from sklearn.metrics import accuracy_score
import string
from nltk.stem.snowball import EnglishStemmer
from nltk.corpus import stopwords
import re
import seaborn as sns
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

# Data

In [3]:
df_train = pd.read_csv(
    "../input/jigsaw-toxic-comment-classification-challenge/train.csv")

In [4]:
df_test = pd.read_csv(
    "../input/jigsaw-toxic-comment-classification-challenge/test.csv")

In [5]:
cols_target = ['toxic', 'obscene', 'insult',
               'threat', 'severe_toxic', 'identity_hate']

In [6]:
y_test = pd.read_csv(
    '../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv')

In [7]:
# merge test data and their labels
df_test = df_test.merge(y_test, on='id', how='left')

# remove data without label info (label =-1)
df_test2 = df_test[df_test.toxic != -1]

# Data preparation

In [8]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

In [9]:
# clean the comment_text in train_df
df_train['comment_text'] = df_train['comment_text'].apply(
    lambda com: clean_text(com))

# clean the comment_text in test_df
df_test2['comment_text'] = df_test2['comment_text'].map(
    lambda com: clean_text(com))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [10]:
# get the number of classes
num_classes = len(cols_target)

In [12]:
# get max length
max_len=df_train.comment_text.apply(lambda x : len(x.split(' '))).max()


sentences=df_train.comment_text
labels=df_train[cols_target]
len(sentences),len(labels)

(159571, 159571)

# BERT model

Inspired by : 
https://www.kaggle.com/toru59er/0-765-lightgbm-bert-simple-baseline

In [16]:
pretrained_bert = "../input/huggingface-bert/bert-base-uncased"

class BertSequenceVectorizer:
    def __init__(self):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model_name = pretrained_bert
#         self.model_name = "../input/distil-roberta-base"
#         self.tokenizer = RobertaTokenizer.from_pretrained(self.model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

#         self.bert_model = transformers.RobertaModel.from_pretrained(self.model_name)
        self.bert_model = transformers.AutoModel.from_pretrained(
            self.model_name)
        self.bert_model = self.bert_model.to(self.device)
        self.max_len = 128
#         self.max_len = 256

    def vectorize(self, sentence: str) -> np.array:
        inp = self.tokenizer.encode(sentence)
        len_inp = len(inp)

        if len_inp >= self.max_len:
            inputs = inp[:self.max_len]
            masks = [1] * self.max_len
        else:
            inputs = inp + [0] * (self.max_len - len_inp)
            masks = [1] * len_inp + [0] * (self.max_len - len_inp)

        inputs_tensor = torch.tensor(
            [inputs], dtype=torch.long).to(self.device)
        masks_tensor = torch.tensor([masks], dtype=torch.long).to(self.device)

        seq_out = self.bert_model(inputs_tensor, masks_tensor)[0]
        pooled_out = self.bert_model(inputs_tensor, masks_tensor)[1]

        if torch.cuda.is_available():
            return seq_out[0][0].cpu().detach().numpy()
        else:
            return seq_out[0][0].detach().numpy()

In [17]:
# apply BERT tokenizer and model to the training sentences
BSV = BertSequenceVectorizer()
sentences_bert = sentences.apply(lambda x: BSV.vectorize(
    x) if x is not np.nan else np.array([0]*768))

Some weights of the model checkpoint at ../input/huggingface-bert/bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
sentences_bert[0].shape

(768,)

In [19]:
# reformat bert outputs
bert = pd.DataFrame(sentences_bert.tolist())
bert.columns = ['text_bertvec_'+str(col) for col in bert.columns]

In [20]:
# transform bert outputs to dataframe
text_bert_df = pd.DataFrame(bert)
text_bert_df.head()

Unnamed: 0,text_bertvec_0,text_bertvec_1,text_bertvec_2,text_bertvec_3,text_bertvec_4,text_bertvec_5,text_bertvec_6,text_bertvec_7,text_bertvec_8,text_bertvec_9,...,text_bertvec_758,text_bertvec_759,text_bertvec_760,text_bertvec_761,text_bertvec_762,text_bertvec_763,text_bertvec_764,text_bertvec_765,text_bertvec_766,text_bertvec_767
0,0.072238,0.19772,0.269506,0.125484,-0.334173,-0.393379,0.447892,0.416747,0.239246,-0.357066,...,0.118503,-0.029593,0.042839,-0.122591,0.634801,0.208534,-0.330013,-0.213664,0.21107,0.406809
1,-0.272002,-0.116618,0.580873,-0.078019,-0.243167,-0.240009,0.75134,0.328918,-0.053315,-0.054193,...,0.053319,-0.289297,0.270767,-0.054791,0.060203,0.031467,0.069743,-0.380638,0.459441,0.62873
2,0.327859,0.291476,0.169962,0.039502,-0.590333,-0.561693,0.235517,0.860082,0.034937,-0.60327,...,-0.229587,0.040969,0.306236,-0.406571,0.107528,0.168994,-0.100248,-0.01236,0.896013,0.56332
3,0.307734,0.460982,0.134422,-0.094811,-0.17803,-0.562496,-0.121295,0.606913,-0.099578,-0.123005,...,0.092666,-0.224054,-0.087925,-0.133209,0.42187,-0.250601,-0.548412,-0.425708,0.248791,0.55833
4,0.100213,0.025797,0.133326,0.007313,-0.114581,-0.370446,0.122238,0.668644,-0.126055,-0.137596,...,-0.227183,-0.453095,0.162811,-0.082508,0.189061,0.01902,-0.137315,-0.44791,0.291573,0.374789


In [21]:
text_bert_df.to_csv('./text_bert_df.csv')

In [25]:
# split data on train and validation
X_train,  X_val, y_train, y_val = train_test_split(
    text_bert_df, labels, random_state=12)

In [26]:
# apply random forest classifier on bert embeddings
rf = RandomForestClassifier(
    max_depth=50, min_samples_split=5, random_state=12, n_jobs=5)
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=50, min_samples_split=5, n_jobs=5,
                       random_state=12)

In [27]:
import pickle
# save the model 
filename = './model.pkl'
pickle.dump(rf, open(filename, 'wb'))

#rf = pickle.load(open('../input/model-rf/model.pkl', 'rb'))

In [29]:
# accuracy score of training data
pred_train = rf.predict(X_train)
accuracy_score(y_train,pred_train)

0.9856113905646819

In [30]:
# accuracy score of validation data
pred_val = rf.predict(X_val)
accuracy_score(y_val,pred_val)

0.9029152984232823

# Test

In [22]:
# test sentences
sentences_bert_test = df_test2.comment_text.apply(
    lambda x: BSV.vectorize(x) if x is not np.nan else np.array([0]*768))
bert_test = pd.DataFrame(sentences_bert_test.tolist())
bert_test.columns = ['text_bertvec_'+str(col) for col in bert_test.columns]
text_bert_df_test = pd.DataFrame(bert_test)

text_bert_df_test.to_csv('./text_bert_df_test.csv')

In [38]:
# get predictions for test data
pred = rf.predict(text_bert_df_test)

In [41]:
# accuracy score on test data
accuracy_score(df_test2[cols_target], pred)

0.9004032636218701

# Validation

In [42]:
validation_data = pd.read_csv(
    '../input/jigsaw-toxic-severity-rating/validation_data.csv')

In [43]:
validation_data['less_toxic'] = validation_data.less_toxic.apply(clean_text)

In [44]:
validation_data['more_toxic'] = validation_data.more_toxic.apply(clean_text)

In [47]:
BSV = BertSequenceVectorizer()

Some weights of the model checkpoint at ../input/huggingface-bert/bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [48]:
# bert embedding on less toxic sentences
sentences_bert_val1 = validation_data.less_toxic.apply(
    lambda x: BSV.vectorize(x) if x is not np.nan else np.array([0]*768))
bert_val1 = pd.DataFrame(sentences_bert_val1.tolist())
bert_val1.columns = ['text_bertvec_'+str(col) for col in bert_val1.columns]
text_bert_df_val1 = pd.DataFrame(bert_val1)

text_bert_df_val1.to_csv('./text_bert_df_val1.csv')

In [49]:
# bert embedding on more toxic sentences
sentences_bert_val2 = validation_data.more_toxic.apply(
    lambda x: BSV.vectorize(x) if x is not np.nan else np.array([0]*768))
bert_val2 = pd.DataFrame(sentences_bert_val2.tolist())
bert_val2.columns = ['text_bertvec_'+str(col) for col in bert_val2.columns]
text_bert_df_val2 = pd.DataFrame(bert_val2)

text_bert_df_val2.to_csv('./text_bert_df_val2.csv')

In [50]:
# predictions for less toxic data
pred_val1 = rf.predict_proba(text_bert_df_val1)

In [51]:
# reshape predictions for less toxic data
pred_val1_2 = np.array([x[:,1] for x in pred_val1]).T

In [52]:
# predictions for more toxic data
pred_val2 = rf.predict_proba(text_bert_df_val2)

In [53]:
# reshape predictions for more toxic data
pred_val2_2 = np.array([x[:,1] for x in pred_val2]).T

In [54]:
def score_function(a):
    """ define the score of toxicity """
    # a[0] = a[0]   # toxic
    # a[1] = a[1]*1.25 # obscene
    # a[2] = a[2]*1.25 # insult
    a[3] = a[3]*1.5  # threat
    a[4] = a[4]*2  # severe_toxic
    a[5] = a[5]*2  # identity hate
    return a.sum()


# apply score of toxicity on validation data
val1_pred_sum = np.apply_along_axis(score_function, axis=1, arr=pred_val1_2)
val2_pred_sum = np.apply_along_axis(score_function, axis=1, arr=pred_val2_2)

In [56]:
# compute score on validation data
(val1_pred_sum < val2_pred_sum).sum()/validation_data.shape[0]

0.6739736946990833

# Comments to score

In [57]:
comments_to_score = pd.read_csv(
    '../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

In [58]:
comments_to_score['text'] = comments_to_score.text.apply(clean_text)

In [60]:
# bert embedding of comments to score
sentences_bert_comm = comments_to_score.text.apply(
    lambda x: BSV.vectorize(x) if x is not np.nan else np.array([0]*768))
bert_comm = pd.DataFrame(sentences_bert_comm.tolist())
bert_comm.columns = ['text_bertvec_'+str(col) for col in bert_comm.columns]
text_bert_df_comm = pd.DataFrame(bert_comm)

text_bert_df_comm.to_csv('./text_bert_df_comm.csv')

In [62]:
# get predictions of comments to score
predictions = rf.predict_proba(text_bert_df_comm)

In [63]:
# reshape predictions
predictions_2 = np.array([x[:, 1] for x in predictions]).T

In [65]:
# apply score of toxicity
pred_sum = np.apply_along_axis(score_function,  axis=1, arr=predictions_2)

In [66]:
sample_submission = pd.read_csv(
    '../input/jigsaw-toxic-severity-rating/sample_submission.csv')

In [67]:
sample_submission['score'] = pred_sum

In [68]:
sample_submission.to_csv('./submission.csv', index=False)