In [1]:
import numpy as np
import pandas as pd


In [2]:
# Load the training CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path, so the cell fails
# on any other machine; consider reading it from a configurable data directory.
df = pd.read_csv('C:/Users/KIIT/Desktop/train.csv')

In [3]:
# Print column names, non-null counts, dtypes and memory usage of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [4]:
# (number of rows, number of columns)
df.shape

(159571, 8)

In [5]:
# Per column: True if the column contains at least one missing value.
df.isnull().any()

id               False
comment_text     False
toxic            False
severe_toxic     False
obscene          False
threat           False
insult           False
identity_hate    False
dtype: bool

In [6]:
# Number of rows that are exact duplicates of an earlier row.
# Uses the vectorised Series.sum() rather than the built-in sum(), which would
# iterate over every element in Python; int() keeps the displayed value a plain int.
int(df.duplicated().sum())

0

In [7]:
# Preview the first 30 rows of the frame.
df.head(30)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [8]:
# Column labels of the frame.
df.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [9]:
# Print every column name prefixed with its 1-based position.
# enumerate(..., start=1) produces the 1-based counter directly, so the loop
# variable does not have to be reassigned inside the body.
for i, c in enumerate(df.columns, start=1):
    print(f'{i}-{c}')

1-id
2-comment_text
3-toxic
4-severe_toxic
5-obscene
6-threat
7-insult
8-identity_hate


In [11]:
# Sizes for a 90/10 train/validation split.
samples = df.shape[0]        # total number of rows
train = int(samples * 0.9)   # training size: 90% of the rows, truncated to an int
val = samples - train        # validation size: whatever is left over

# The last label previously repeated "Training"; it reports the validation count.
print(f'Number of samples:{samples}\nNumber of Training Examples:{train}\nNumber of Validation Examples:{val}')

Number of samples:159571
Number of Training Examples:143613
Number of Training Examples:15958


In [12]:
# Training inputs: the comment text of the first `train` rows (positional slice).
X_train = df['comment_text'].iloc[:train]

# Training targets: the six label columns of the same rows, as a 2-D array.
y_train = df.iloc[:train][
    ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
].to_numpy()

In [13]:
# Shapes of the training inputs and the training label array.
print(X_train.shape, y_train.shape)

(143613,) (143613, 6)


In [14]:
# Validation inputs: the comment text of every row from position `train` onward.
X_val = df['comment_text'].iloc[train:]

# Validation targets: the six label columns of the same rows, as a 2-D array.
y_val = df.iloc[train:][
    ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
].to_numpy()


In [15]:
# Shapes of the validation inputs and the validation label array.
print(X_val.shape, y_val.shape)


(15958,) (15958, 6)


In [16]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [17]:
# --- Text vectorisation settings ---
vocab_size = 20_000      # passed to Tokenizer(num_words=...) and the Embedding input size
oov_tok = "<UNK>"        # token passed to Tokenizer(oov_token=...)
max_length = 350         # every sequence is padded/truncated to this many tokens
padding_type = 'post'    # pad at the end of short sequences
trunc_type = 'post'      # cut from the end of long sequences

# --- Model settings ---
embedding_dim = 16       # output width of the Embedding layer

In [18]:
# Fit the vocabulary on the training comments only, then reuse that same
# tokenizer for the validation comments.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

# Convert each comment to a list of integer token ids.
train_sequences = tokenizer.texts_to_sequences(X_train)
val_sequences = tokenizer.texts_to_sequences(X_val)

# Pad / truncate every sequence to exactly `max_length` ids.
train_padded = pad_sequences(
    train_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)
val_padded = pad_sequences(
    val_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

In [19]:

# Make sure the padded inputs are NumPy arrays before handing them to Keras.
# np.asarray returns an existing ndarray unchanged, whereas np.array would
# duplicate these (n_samples, max_length) matrices in memory.
train_padded = np.asarray(train_padded)
val_padded = np.asarray(val_padded)

In [20]:
# Seed TensorFlow's global random generator before the model is built.
tf.random.set_seed(30)

In [21]:
# Build the classifier layer by layer:
# embedding -> two stacked bidirectional LSTMs -> dropout -> dense head.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
# The first recurrent layer returns the full sequence so the second LSTM can consume it.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)))
model.add(tf.keras.layers.Dropout(0.25))
model.add(tf.keras.layers.Dense(64, activation='relu'))
# Six sigmoid units: one independent output per label column.
model.add(tf.keras.layers.Dense(6, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [22]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 350, 16)           320000    
_________________________________________________________________
bidirectional (Bidirectional (None, 350, 128)          41472     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                41216     
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 64)                4160      
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 390       
Total params: 407,238
Trainable params: 407,238
Non-trainable params: 0
__________________________________________________

In [23]:
# Train for a fixed number of epochs, evaluating on the held-out split after
# each one; the returned History object is kept in `history`.
num_epochs = 10
history = model.fit(
    x=train_padded,
    y=y_train,
    epochs=num_epochs,
    validation_data=(val_padded, y_val),
    verbose=1,
)

Train on 143613 samples, validate on 15958 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [25]:
# Write the trained model to "model.h5" (path is relative to the working directory).
model.save("model.h5")