In [1]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
df = pd.read_csv("/content/drive/MyDrive/train.csv")
df.head

<bound method NDFrame.head of                       id                                       comment_text  \
0       0000997932d777bf  Explanation\nWhy the edits made under my usern...   
1       000103f0d9cfb60f  D'aww! He matches this background colour I'm s...   
2       000113f07ec002fd  Hey man, I'm really not trying to edit war. It...   
3       0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...   
4       0001d958c54c6e35  You, sir, are my hero. Any chance you remember...   
...                  ...                                                ...   
159566  ffe987279560d7ff  ":::::And for the second time of asking, when ...   
159567  ffea4adeee384e90  You should be ashamed of yourself \n\nThat is ...   
159568  ffee36eab5c267c9  Spitzer \n\nUmm, theres no actual article for ...   
159569  fff125370e4aaaf3  And it looks like it was actually you who put ...   
159570  fff46fc426af1f9a  "\nAnd ... I really don't think you understand...   

        toxic  severe

In [4]:
X = df['comment_text']
Y = df[df.columns[2:]].values

In [5]:
sentences = []
for s in X:
  sentences.append(s)

In [6]:
train_sentences = sentences[0:111700] # 70 percent of dataset
train_labels = Y[:111700,:]

val_sentences = sentences[111700:149997] # 20 percent of dataset
val_labels = Y[111700:149997,:]

test_sentences = sentences[149997:159571] # 10 percent of dataset
test_labels = Y[149997:159571,:]
print(len(train_sentences))
print(len(val_sentences))
print(len(test_sentences))

111700
38297
9574


In [7]:
len(sentences[10])

2875

In [8]:
print(train_labels.shape)
print(val_labels.shape)
print(test_labels.shape)

(111700, 6)
(38297, 6)
(9574, 6)


In [9]:
vocab_size = 200000
max_length = 1800
trunc_type='post'
oov_tok = "<OOV>"

tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)

tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences,maxlen=max_length, truncating=trunc_type)

val_sequences = tokenizer.texts_to_sequences(val_sentences)
val_padded = pad_sequences(val_sequences,maxlen=max_length)


In [10]:
print(len(val_padded))

38297


In [11]:
Train_dataset = tf.data.Dataset.from_tensor_slices((train_padded, train_labels))
Val_dataset = tf.data.Dataset.from_tensor_slices((val_padded, val_labels))

Train_dataset = Train_dataset.shuffle(111700)

Val_dataset = Val_dataset.shuffle(38297)

In [12]:
len(word_index)

210338

In [13]:
embedding_dim = 32
lstm_dim = 32
out_dim = 6

model_lstm = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size+1, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_dim)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(out_dim, activation='softmax'),
])

model_lstm.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

model_lstm.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1800, 32)          6400032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [14]:
NUM_EPOCHS = 1
hist_lstm = model_lstm.fit(train_padded,train_labels, epochs=NUM_EPOCHS, validation_data=(val_padded, val_labels))



In [25]:
model_lstm.save('model_lstm.h5')

In [None]:
import matplotlib.pyplot as plt

acc = hist_lstm.history['accuracy']
val_acc = hist_lstm.history['val_accuracy']
loss = hist_lstm.history['loss']
val_loss = hist_lstm.history['val_loss']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [None]:
# FOR EVALUATION AND PREDICTION CHECK "TESTING.PY"