In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/glove840b300dtxt/glove.840B.300d.txt
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation-processed-seqlen128.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train-processed-seqlen128.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test-processed-seqlen128.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train-processed-seqlen128.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv
/kaggle/input/jigsaw-multilingual-toxic-test-translated/jigsaw_miltilingual_test_translated.csv
/kaggle/

In [2]:
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU,SimpleRNN
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [3]:
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  grpc://10.0.0.2:8470
REPLICAS:  8


In [4]:
train = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv')
val = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation.csv')
test = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test.csv')

In [5]:
train = train.loc[:10000,:]
train.shape

(10001, 8)

In [6]:
train.tail()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
9996,1a7a4868968e2b9e,"Those two love to disagree, don't they? 206.17...",0,0,0,0,0,0
9997,1a7c3bec9a71415d,"""I have changed """"Lance Thomas"""" to """"Lance Th...",0,0,0,0,0,0
9998,1a7c9c14b0cf0fe0,states \n\nCourts: I have been putting all art...,0,0,0,0,0,0
9999,1a7d550fec6e9777,Will do buddy ). But what is this thing about ...,0,0,0,0,0,0
10000,1a7d7c88372e5668,Hi RedRose and apologies for delay. Here is a ...,0,0,0,0,0,0


In [7]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [8]:
train.drop(['severe_toxic','obscene','threat','insult','identity_hate'],axis=1, inplace = True)
train.shape

(10001, 3)

In [9]:
train_X, valid_X, train_y, valid_y = train_test_split(train.comment_text.values, train.toxic.values, stratify=train.toxic.values, random_state=123, 
                                                      train_size=0.8, shuffle=True)
train_X.shape

(8000,)

In [10]:

lenmax = train['comment_text'].apply(lambda x: len(str(x).split())).max()
lenmax

1403

In [11]:

tokenizer = Tokenizer(num_words=None, 
                     filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                     lower = False, split = ' ')
tokenizer.fit_on_texts(list(train_X) + list(valid_X))

seq_trainx = tokenizer.texts_to_sequences(train_X)
seq_validx = tokenizer.texts_to_sequences(valid_X)

#zero p
pad_trainx = sequence.pad_sequences(seq_trainx, maxlen=lenmax)
pad_validx = sequence.pad_sequences(seq_validx, maxlen=lenmax)

word_index = tokenizer.word_index

seq_trainx

[[25638,
  181,
  1125,
  12,
  12361,
  7431,
  16,
  57,
  157,
  41,
  101,
  184,
  12,
  4420,
  5,
  385,
  3695,
  117,
  18,
  5,
  10003,
  4,
  12,
  4420,
  10,
  16353,
  24,
  73,
  3,
  6001,
  3894,
  668,
  12,
  104,
  16,
  459,
  392,
  134,
  10,
  1,
  32,
  25,
  13,
  10,
  1,
  25639,
  2140,
  157,
  9,
  2580,
  599,
  25640,
  14],
 [674, 8, 5, 5474, 24, 3356],
 [14,
  75,
  281,
  33,
  7432,
  189,
  5,
  3508,
  9,
  781,
  179,
  31,
  329,
  1029,
  1829,
  2,
  1,
  743,
  3,
  25641,
  4,
  10004,
  16354,
  389,
  35,
  8,
  168,
  31,
  1,
  25642,
  25643,
  160,
  1375,
  1010,
  71,
  12,
  37,
  8464,
  54,
  6002,
  90,
  531,
  161,
  4160,
  140,
  72,
  34,
  1667,
  2214,
  1147,
  4,
  25644,
  7,
  250,
  15,
  16355,
  8465,
  14],
 [483,
  5,
  16356,
  677,
  6,
  66,
  506,
  33,
  121,
  408,
  6,
  506,
  33,
  7,
  1,
  2470,
  2581,
  10005,
  3,
  441,
  25645],
 [7433, 118, 2, 19, 1, 2215, 4, 4714, 25646, 476, 2901, 2294, 916],
 

In [12]:
embeddings_index = {}
f = open('/kaggle/input/glove840b300dtxt/glove.840B.300d.txt','r')
for line in tqdm(f):
    words = line.split(' ')
    word = words[0]
    c = np.asarray([float(val) for val in words[1:]])
    embeddings_index[word] = c
f.close()

2196018it [06:29, 5644.85it/s]


In [13]:
embedding_mat = np.zeros((len(word_index) + 1, 300))
for word, i in tqdm(word_index.items()):
    embedding_vec = embeddings_index.get(word)
    if embedding_vec is not None:
        embedding_mat[i] = embedding_vec

100%|██████████| 71911/71911 [00:00<00:00, 197991.57it/s]


In [14]:
with strategy.scope():
    
    model = Sequential()
    model.add(Embedding(len(word_index) + 1,
                     300,
                     weights=[embedding_mat],
                     input_length=lenmax,
                     trainable=False))

    model.add(LSTM(100, dropout=0.3, recurrent_dropout=0.3))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])

In [17]:
model.fit(pad_trainx,train_y,nb_epoch=5, batch_size=64*strategy.num_replicas_in_sync)


  """Entry point for launching an IPython kernel.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x7fc5303b6e48>

In [None]:
predict = model.predict(pad_validx)
fpr, tpr, thresholds = metrics.roc_curve(valid_y, predict)
roc_auc = metrics.auc(fpr, tpr)
roc_auc