In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from keras import layers, models

In [2]:
train = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')
test = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip')
sample_submission = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip')

In [3]:
maxlen = 69
num_words = 10000
embedding_dim = 100

In [4]:
texts = train['comment_text'].values.tolist()

columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
labels = train[columns].values.tolist()
labels = np.array(labels)

texts[0], labels[0]

("Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",
 array([0, 0, 0, 0, 0, 0]))

In [5]:
test_texts = test['comment_text'].values

In [6]:
import transformers
from transformers import AutoTokenizer

#tokenizer = transformers.BertTokenizer.from_pretrained('/kaggle/input/huggingface-bert/bert-base-cased')
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [7]:
sent = "this is a test"
token = tokenizer(sent)
token

{'input_ids': [101, 2023, 2003, 1037, 3231, 102], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [8]:
def encode(texts, tokenizer, maxlen):
    input_ids = []
    #token_type_ids = []
    attention_mask = []
    
    for text in texts:
        token = tokenizer(text, max_length = maxlen, truncation = True, padding = 'max_length', 
                         add_special_tokens = False)
        input_ids.append(token['input_ids'])
        #token_type_ids.append(token['token_type_ids'])
        attention_mask.append(token['attention_mask'])
    
    return np.array(input_ids), np.array(attention_mask)

In [9]:
# let's encode for both train and test texts
sequences = encode(texts, tokenizer, maxlen)
test_sequences = encode(test_texts, tokenizer, maxlen)

In [10]:
# use TF.Data to wrap dataset
batch_size = 20
buffer_size = 1000

dataset = (tf.data.Dataset.from_tensor_slices((sequences, labels)).shuffle(buffer_size).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE))

2023-01-07 21:23:09.888501: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-07 21:23:09.969611: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-07 21:23:09.970407: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-07 21:23:09.974396: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [11]:
from transformers import TFDistilBertModel

model = (TFDistilBertModel.from_pretrained(model_name))

Downloading:   0%|          | 0.00/347M [00:00<?, ?B/s]

2023-01-07 21:23:50.988393: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'activation_13', 'vocab_transform', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the 

In [12]:
def build_model(model):
    # instantiate keras layers
    input_ids = layers.Input(shape=(maxlen,), dtype=tf.int32, name='input_ids')
    attention_mask = layers.Input(shape = (maxlen,), dtype=tf.int32, name='attention_mask')
    
    sequence_output = model(input_ids, attention_mask)[0]
    clf_output = sequence_output[:, 0, :]
    
    # instantiate dropout & dense layers
    dropout = layers.Dropout(0.1)
    dense = layers.Dense(6, activation='sigmoid')
    
    # use them 
    clf_output = dropout(clf_output)
    out = dense(clf_output)
    
    model = keras.models.Model(inputs = [input_ids, attention_mask], outputs = out)
    return model

In [13]:
model = build_model(model)
optimizer = keras.optimizers.Adam(learning_rate=1e-4)

model.compile(optimizer, loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 69)]         0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 69)]         0                                            
__________________________________________________________________________________________________
tf_distil_bert_model (TFDistilB TFBaseModelOutput(la 66362880    input_ids[0][0]                  
                                                                 attention_mask[0][0]             
__________________________________________________________________________________________________
tf.__operators__.getitem (Slici (None, 768)          0           tf_distil_bert_model[0][0]   

In [14]:
history = model.fit(dataset, epochs = 2)

Epoch 1/2


2023-01-07 21:24:05.716861: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 2/2


In [15]:
def make_submission(predictions):
    submission = pd.DataFrame(predictions, columns = columns)
    submission = pd.concat([test['id'], submission], axis = 1)
    
    display(submission.head())
    return submission

In [16]:
predictions = model.predict(test_sequences)
submission = make_submission(predictions)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.993771,0.206823,0.954345,0.058308,0.868888,0.228648
1,0000247867823ef7,0.002045,1.6e-05,0.000499,1.3e-05,0.000307,1.6e-05
2,00013b17ad220c46,0.002051,1.6e-05,0.000505,1.2e-05,0.000306,1.6e-05
3,00017563c3f7919a,0.002038,1.7e-05,0.000506,1.2e-05,0.000307,1.6e-05
4,00017695ad8997eb,0.002079,1.6e-05,0.000509,1.2e-05,0.000307,1.6e-05


In [17]:
submission.to_csv('submission_bert.csv', index = False)

This gave a score of 0.96721 on the Kaggle leaderboard.