In [None]:
# Print the NVIDIA driver / GPU status table for the current runtime.
!nvidia-smi

Sun Jul 19 11:02:25 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.51.05    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   62C    P8    11W /  70W |      0MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import numpy as np
import pandas as pd
import datetime
import re
import os

%tensorflow_version 2.x
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input, Embedding, BatchNormalization

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.initializers import Constant

from tensorflow.keras.callbacks import ModelCheckpoint

from sklearn.metrics import roc_auc_score

# Toxic Comment Classification

## Download dataset from Kaggle

In [None]:
# Install the Kaggle command-line client (quiet mode); the download and
# submit cells call the `kaggle` executable it provides.
# NOTE(review): the version is not pinned — consider pinning it for reproducibility.
!pip install -q kaggle

In [None]:
# upload kaggle.json
# files.upload() returns a {filename: bytes} dict. The result is bound to a
# name on purpose: leaving the call as the last expression of the cell makes
# the notebook echo the file contents — i.e. the Kaggle API key — into the
# saved cell output.
from google.colab import files

uploaded = files.upload()
if "kaggle.json" not in uploaded:
    raise FileNotFoundError("Expected a file named 'kaggle.json' to be uploaded.")

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"beneblau","key":"<REDACTED>"}'}

In [None]:
# make kaggle directory, move in directory and change permission
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the competition archives (*.csv.zip) with the Kaggle CLI.
!kaggle competitions download -c jigsaw-toxic-comment-classification-challenge

Downloading test_labels.csv.zip to /content
  0% 0.00/1.46M [00:00<?, ?B/s]
100% 1.46M/1.46M [00:00<00:00, 49.0MB/s]
Downloading train.csv.zip to /content
 34% 9.00M/26.3M [00:00<00:00, 23.8MB/s]
100% 26.3M/26.3M [00:00<00:00, 59.4MB/s]
Downloading sample_submission.csv.zip to /content
  0% 0.00/1.39M [00:00<?, ?B/s]
100% 1.39M/1.39M [00:00<00:00, 197MB/s]
Downloading test.csv.zip to /content
 38% 9.00M/23.4M [00:00<00:01, 12.2MB/s]
100% 23.4M/23.4M [00:00<00:00, 25.4MB/s]


In [None]:
# unzip data
!unzip test.csv.zip
!unzip test_labels.csv.zip
!unzip train.csv.zip

Archive:  test.csv.zip
  inflating: test.csv                
Archive:  test_labels.csv.zip
  inflating: test_labels.csv         
Archive:  train.csv.zip
  inflating: train.csv               


## Data Exploration

In [None]:
# Read the three competition tables from the working directory.
train, test, test_labels = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "test_labels.csv")
)

In [None]:
# Preview the first rows of the training table.
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [None]:
# Count how often each value of the "toxic" label occurs in the test labels.
test_labels["toxic"].value_counts()

-1    89186
 0    57888
 1     6090
Name: toxic, dtype: int64

In [None]:
# Keep only the test rows that were scored: a label of -1 marks rows that
# were not used for scoring.
test_labels = test_labels[test_labels["toxic"] != -1]
test_id = list(test_labels["id"])
test_score = test.loc[test["id"].isin(test_id)]

# Join comments and labels on "id" so both are in the same row order.
test_combined = test_score.merge(test_labels, on="id")

# Split the joined table into model input (text) and targets (label columns).
x_test = test_combined.loc[:, "comment_text"].copy()
y_test = test_combined.iloc[:, 2:].copy()

In [None]:
# Training inputs are the comment strings; targets are every column from
# position 2 onwards (the columns after "id" and "comment_text").
x_train = train.loc[:, "comment_text"].copy()
y_train = train.iloc[:, 2:].copy()

## Text processing

In [None]:
# process text

# Matches anything that looks like an HTML/XML tag: "<" ... ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every ``<...>`` tag removed."""
    return TAG_RE.sub('', text)


def preprocess_text(sen):
    """Normalise one comment before tokenisation.

    Steps, in order:
      1. strip HTML tags,
      2. replace every non-letter character (punctuation, digits, newlines)
         with a space,
      3. drop single-letter tokens that sit between whitespace,
      4. collapse runs of whitespace into one space.

    Args:
        sen: the raw comment. ``None`` and ``NaN`` (how pandas represents a
            missing value) are treated as an empty comment; any other
            non-string value is converted with ``str()``.

    Returns:
        The cleaned string. Leading/trailing single spaces are not stripped.
    """
    if sen is None or (isinstance(sen, float) and sen != sen):
        # NaN is the only float that is not equal to itself.
        return ''
    if not isinstance(sen, str):
        sen = str(sen)

    # Removing html tags
    sentence = remove_tags(sen)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal.
    # The trailing whitespace is checked with a lookahead instead of being
    # consumed, so it can still act as the leading whitespace of the next
    # token. Consuming it made every other letter in a run survive
    # ("x a b y" -> "x b y").
    sentence = re.sub(r"\s+[a-zA-Z](?=\s)", '', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence


# Clean both corpora with the same routine so train and test text share one
# normalisation.
x_train = x_train.apply(preprocess_text)
x_test = x_test.apply(preprocess_text)

In [None]:
# GloVe embedding
!wget -qq http://nlp.stanford.edu/data/glove.6B.zip

# unzip
!unzip glove.6B.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [None]:
# first, build index mapping words in the embeddings set
# to their embedding vector

GLOVE_DIR = ''

print('Indexing word vectors.')

embeddings_index = {}
# The encoding is stated explicitly so parsing does not depend on the
# platform's default text encoding.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ... <vN>", whitespace separated.
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.


In [None]:
# Vocabulary cap handed to the tokenizer, and the fixed sequence length used
# when padding.
max_words = 2000
max_len = 128

# Build the word index from the training comments.
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(x_train)

# Encode the comments as integer sequences and run them through
# pad_sequences with maxlen=max_len.
# NOTE(review): x_train is rebound from text to the encoded array here, so
# this cell is not safe to re-run on its own — re-run from the cell that
# builds x_train first.
sequences = tok.texts_to_sequences(x_train)
x_train = sequence.pad_sequences(sequences,maxlen=max_len)

## Modelling

In [None]:
print('Preparing embedding matrix.')

EMBEDDING_DIM = 100

# One row per token index. Rows for words that have no GloVe vector keep
# their all-zero initial value.
num_words = min(max_words, len(tok.word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for token, idx in tok.word_index.items():
    if idx <= max_words:
        glove_vector = embeddings_index.get(token)
        if glove_vector is not None:
            embedding_matrix[idx] = glove_vector

# Embedding layer initialised from the GloVe matrix; trainable=False keeps
# the pre-trained vectors fixed during training.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=max_len,
    trainable=False,
)

Preparing embedding matrix.


In [None]:
# building a base model
# Input of length max_len — presumably the padded token-id sequences
# (TODO confirm against the tokenizer cell).
Inp = Input(name='inputs',shape=[max_len])
# Embed the input with the pre-built embedding layer, then feed the sequence
# to an LSTM with 64 units.
x = embedding_layer(Inp)
x = LSTM(64)(x)
# Dense head followed by batch normalisation and 30% dropout.
x = Dense(128,activation='relu')(x)
x = BatchNormalization()(x)
x = Dropout(0.3)(x)

# Six sigmoid outputs — presumably one per label column of y_train (TODO confirm).
out = Dense(6,activation='sigmoid', name='output')(x)

model = Model(inputs=Inp,outputs=out)

In [None]:
# Compile with binary cross-entropy loss, the Adam optimizer and accuracy as
# the reported metric.
model.compile(loss='binary_crossentropy',optimizer="adam",metrics=['accuracy'])

In [None]:
# callbacks

# checkpoint: write the model to the working directory, keeping only the
# version with the best monitored value (the training loss).
modelcheckpoint = ModelCheckpoint(os.getcwd(),monitor="loss",save_best_only=True)

# tensorboard: one timestamped log directory per run.
# TensorBoard and LearningRateScheduler are referenced through
# tf.keras.callbacks because neither name is imported by this notebook; the
# bare names raise NameError on a fresh kernel.
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# LRscheduler

def scheduler(epoch, lr):
    """Learning-rate schedule: constant for 10 epochs, then exponential decay.

    Args:
        epoch: zero-based epoch index passed in by the callback.
        lr: the current learning rate.

    Returns:
        ``lr`` unchanged while ``epoch < 10``; afterwards ``lr * exp(-0.1)``
        as a plain Python float (the callback expects a float, not a tensor).
    """
    if epoch < 10:
        return lr
    return float(lr * np.exp(-0.1))

lr_schedule = tf.keras.callbacks.LearningRateScheduler(scheduler)

In [None]:
# train the model
# NOTE(review): no validation data is passed to fit(), so the checkpoint
# callback (monitor="loss") tracks the training loss only.
model.fit(x_train,y_train,
          batch_size=64,
          epochs=5,
          callbacks=[modelcheckpoint,tensorboard_callback,lr_schedule])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f5637668f98>

In [None]:
# load saved model
# Reload from the working directory — the same path that was handed to
# ModelCheckpoint.
best_model = tf.keras.models.load_model(os.getcwd())

In [None]:
# y_pred processing
# Encode the scored test comments with the tokenizer `tok` and pad with the
# same max_len that was used for the training sequences.
test_sequences = tok.texts_to_sequences(x_test)
_test = sequence.pad_sequences(test_sequences,maxlen=max_len)

# Predictions from the reloaded model.
y_pred = best_model.predict(_test)

In [None]:
# compute score
# ROC AUC of the predictions against y_test, micro-averaged (average="micro").
score = roc_auc_score(y_test,y_pred,average="micro")
score

0.9659798126776568

## Kaggle submission

In [None]:
# process kaggle submission
# Clean the full test set with the same preprocess_text routine that was
# applied to x_train and x_test. Tokenising the raw comments would feed the
# model text in a different form than it was trained and evaluated on.
kaggle_text = test["comment_text"].map(preprocess_text)
kaggle_sequences = tok.texts_to_sequences(kaggle_text)
kaggle_test = sequence.pad_sequences(kaggle_sequences,maxlen=max_len)

kaggle_pred = best_model.predict(kaggle_test)

# One probability column per label, with the comment id as the first column.
submission = pd.DataFrame(kaggle_pred,columns=["toxic","severe_toxic","obscene","threat","insult","identity_hate"])
# Insert the ids positionally so row i of the predictions always gets row i
# of `test`, whatever index `test` carries.
submission.insert(0,"id",test["id"].to_numpy())

submission.to_csv("submission.csv",index=False)

In [None]:
# submit to kaggle
# Upload submission.csv to the competition with the message "GloVe model".
!kaggle competitions submit -c jigsaw-toxic-comment-classification-challenge -f submission.csv -m "GloVe model"

100% 13.5M/13.5M [00:03<00:00, 3.96MB/s]
Successfully submitted to Toxic Comment Classification Challenge

## Save model

In [None]:
# save model
# `saved_model.pb` on its own is not a usable model: a SavedModel also needs
# the `variables/` directory that holds the weights. Export the reloaded
# model to one self-contained HDF5 file and download that instead.
MODEL_EXPORT_PATH = "toxic_comment_glove_lstm.h5"
best_model.save(MODEL_EXPORT_PATH)
files.download(MODEL_EXPORT_PATH)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

TypeError: ignored

In [None]:
# save tokenizer
import json

# Serialise the tokenizer with its own to_json() and JSON-encode that value
# once more when writing it to tokenizer.json.
tok_json = tok.to_json()

with open('tokenizer.json', 'w') as outfile:
    outfile.write(json.dumps(tok_json))

In [None]:
# Download tokenizer.json from the Colab runtime to the local machine.
files.download("tokenizer.json")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>