Training 6 models, 1 For each Label.
Using an RNN with LSTM
Framework: Keras

In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import re
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils
from keras.preprocessing.sequence import pad_sequences
from keras.models import model_from_json
from keras.layers import Input, Dropout, Dense, LSTM, Embedding
from keras.models import Model, Sequential
from keras.optimizers import Adam

Using TensorFlow backend.


# Reading Data

In [3]:
train=pd.read_csv("train.csv")
test=pd.read_csv("test.csv")

# Cleaning Data

In [4]:
stop_words=['about','above','after','again','against','all','am','an','and','any','are','as','at','be','because','been','before'
            ,'being','below','between','both','by','can','could','did','do','does','doing','don','down','during','each','few','for',
            'from','further','had','has','have','having','he','ll','her','here','hers','herself','him','himself','his','how',
            'how','i','ve','if','in','into','is','it','its','itself','let','me','more','most','must','my','myself','of','off',
            'on','once','only','or','other','ought','our','ours','ourselves','out','over','own','same','shall','she','should',
            'so','some','such','than','that','the','their','theirs','them','themselves','then','there','these','they','re',
            'this','those','through','to','too','under','until','up','very','was','we','were','what','when','where','which',
            'while','who','whom','why','with','would','you','your','yours','yourself',
            'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
def remove_punctuation(snt):
    return re.sub(r'[\W]',' ',str(snt).lower())

def remove_stop_words(sent):
    words = sent.split()
    resultwords=[word.lower() for word in words if word.lower() not in stop_words]
    return ' '.join(resultwords)

In [5]:
def clean_data():
    train.comment_text=train.comment_text.apply(remove_punctuation)
    test.comment_text=test.comment_text.apply(remove_punctuation)
    train.comment_text=train.comment_text.apply(remove_stop_words)
    test.comment_text=test.comment_text.apply(remove_stop_words)

In [6]:
clean_data()

# Tokenizing Words

In [7]:
text_data=np.hstack([train.comment_text.str.lower(), 
                      test.comment_text.str.lower()])
tok=Tokenizer()
tok.fit_on_texts(text_data)
print("   Transforming text to seq...")
train["input"] = tok.texts_to_sequences(train.comment_text.str.lower())
test["input"] = tok.texts_to_sequences(test.comment_text.str.lower())

   Transforming text to seq...


In [8]:
MAX_LENGTH = 75
MAX_TOKEN = np.max([np.max(train.input.max()),np.max(test.input.max())]) + 180
print(MAX_LENGTH, MAX_TOKEN)

75 348630


## Seperating Training and Cross Validation Data

In [9]:
train.input.to_csv("train_data.csv",index=False)
test.input.to_csv("test_data.csv",index=False)

In [10]:
train_data=train[:-1000]
valid_data=train[-1000:]

In [11]:
train_x = pad_sequences(train_data.input, maxlen=MAX_LENGTH)
valid_x = pad_sequences(valid_data.input, maxlen=MAX_LENGTH)
test_x  = pad_sequences(test.input, maxlen=MAX_LENGTH)

In [12]:
sub_df=pd.read_csv("submission.csv")
sub_df.id=test.id
sub_df.to_csv("Submission.csv",index=False)

## One-Hot Encoding of target

In [13]:
train_y1 = np_utils.to_categorical(train_data.toxic.values, 2)
valid_y1 = np_utils.to_categorical(valid_data.toxic.values, 2)

train_y2 = np_utils.to_categorical(train_data.severe_toxic.values, 2)
valid_y2 = np_utils.to_categorical(valid_data.severe_toxic.values, 2)

train_y3 = np_utils.to_categorical(train_data.obscene.values, 2)
valid_y3 = np_utils.to_categorical(valid_data.obscene.values, 2)

train_y4 = np_utils.to_categorical(train_data.threat.values, 2)
valid_y4 = np_utils.to_categorical(valid_data.threat.values, 2)

train_y5 = np_utils.to_categorical(train_data.insult.values, 2)
valid_y5 = np_utils.to_categorical(valid_data.insult.values, 2)

train_y6 = np_utils.to_categorical(train_data.identity_hate.values, 2)
valid_y6 = np_utils.to_categorical(valid_data.identity_hate.values, 2)


# LSTM- RNN Model

In [14]:
A = Input(shape=[MAX_LENGTH], name="in")
B = Embedding(MAX_TOKEN, 128)(A)
C = LSTM(90) (B)
D = Dropout(0.6) (Dense(128, activation='relu') (C))
E = Dropout(0.4) (Dense(32, activation='relu') (D))
F = Dropout(0.4) (Dense(8, activation='relu') (E))
output = Dense(2, activation="softmax") (F)


In [15]:
model1 = Model(A, output)
model2 = Model(A, output)
model3 = Model(A, output)
model4 = Model(A, output)
model5 = Model(A, output)
model6 = Model(A, output)

In [17]:
print(model1.summary())
# print(model2.summary())
# print(model3.summary())
# print(model4.summary())
# print(model5.summary())
# print(model6.summary())
ad=Adam(0.001,decay=1e-6)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
in (InputLayer)              (None, 75)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 75, 128)           44624640  
_________________________________________________________________
lstm_1 (LSTM)                (None, 90)                78840     
_________________________________________________________________
dense_1 (Dense)              (None, 128)               11648     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                4128      
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0         
__________

# Training Model

In [18]:
model1.compile(optimizer=ad,loss='categorical_crossentropy',metrics = ['accuracy'])
model2.compile(optimizer=ad,loss='categorical_crossentropy',metrics = ['accuracy'])
model3.compile(optimizer=ad,loss='categorical_crossentropy',metrics = ['accuracy'])
model4.compile(optimizer=ad,loss='categorical_crossentropy',metrics = ['accuracy'])
model5.compile(optimizer=ad,loss='categorical_crossentropy',metrics = ['accuracy'])
model6.compile(optimizer=ad,loss='categorical_crossentropy',metrics = ['accuracy'])

In [19]:
model1.fit(train_x, train_y1, batch_size = 128, epochs = 1, 
                verbose = 1, validation_data = (valid_x, valid_y1))

Train on 158571 samples, validate on 1000 samples
Epoch 1/1


<keras.callbacks.History at 0x1e093275fd0>

In [20]:
 _model1=model1.to_json()
with open("Model1.json","w") as json_file:
    json_file.write(_model1)
model1.save_weights("weights1.h5")

In [21]:
json_file = open("Model1.json", "r")
model_json = json_file.read()
json_file.close()
model = model_from_json(model_json)
model.load_weights("weights1.h5")
model1.compile(optimizer=ad,loss='categorical_crossentropy',metrics = ['accuracy'])
preds=model.predict(test_x)

In [22]:
df=pd.DataFrame(preds,columns=['no','yes'])
sub_df.toxic=df.yes
sub_df.to_csv("Submission.csv",index=False)

In [23]:
model2.fit(train_x, train_y2, batch_size = 128, epochs = 1, 
                verbose = 1, validation_data = (valid_x, valid_y2))

Train on 158571 samples, validate on 1000 samples
Epoch 1/1


<keras.callbacks.History at 0x1e0a00fd780>

In [24]:
 _model2=model2.to_json()
with open("Model2.json","w") as json_file:
    json_file.write(_model2)
model2.save_weights("weights2.h5")

In [25]:
json_file = open("Model2.json", "r")
model_json = json_file.read()
json_file.close()
model = model_from_json(model_json)
model.load_weights("weights2.h5")
model1.compile(optimizer=ad,loss='categorical_crossentropy',metrics = ['accuracy'])
preds=model.predict(test_x)

In [26]:
df=pd.DataFrame(preds,columns=['no','yes'])
sub_df.severe_toxic=df.yes
sub_df.to_csv("Submission.csv",index=False)

In [27]:
model3.fit(train_x, train_y3, batch_size = 128, epochs = 1, 
                verbose = 1, validation_data = (valid_x, valid_y3))

Train on 158571 samples, validate on 1000 samples
Epoch 1/1


<keras.callbacks.History at 0x1e0a80f26a0>

In [28]:
_model3=model3.to_json()
with open("Model3.json","w") as json_file:
    json_file.write(_model3)
model3.save_weights("weights3.h5")

In [29]:
json_file = open("Model3.json", "r")
model_json = json_file.read()
json_file.close()
model = model_from_json(model_json)
model.load_weights("weights3.h5")
model1.compile(optimizer=ad,loss='categorical_crossentropy',metrics = ['accuracy'])
preds=model.predict(test_x)

In [30]:
df=pd.DataFrame(preds,columns=['no','yes'])
sub_df.obscene=df.yes
sub_df.to_csv("Submission.csv",index=False)

In [31]:
model4.fit(train_x, train_y4, batch_size = 128, epochs = 1, 
                verbose = 1, validation_data = (valid_x, valid_y4))

Train on 158571 samples, validate on 1000 samples
Epoch 1/1


<keras.callbacks.History at 0x1e0a9770b38>

In [32]:
 _model4=model4.to_json()
with open("Model4.json","w") as json_file:
    json_file.write(_model4)
model4.save_weights("weights4.h5")

In [33]:
json_file = open("Model4.json", "r")
model_json = json_file.read()
json_file.close()
model = model_from_json(model_json)
model.load_weights("weights4.h5")
model1.compile(optimizer=ad,loss='categorical_crossentropy',metrics = ['accuracy'])
preds=model.predict(test_x)

In [34]:
df=pd.DataFrame(preds,columns=['no','yes'])
sub_df.threat=df.yes
sub_df.to_csv("Submission.csv",index=False)

In [35]:
model5.fit(train_x, train_y5, batch_size = 128, epochs = 1, 
                verbose = 1, validation_data = (valid_x, valid_y5))

Train on 158571 samples, validate on 1000 samples
Epoch 1/1


<keras.callbacks.History at 0x1e0d090b710>

In [36]:
_model5=model5.to_json()
with open("Model5.json","w") as json_file:
    json_file.write(_model5)
model5.save_weights("weights5.h5")

In [37]:
json_file = open("Model5.json", "r")
model_json = json_file.read()
json_file.close()
model = model_from_json(model_json)
model.load_weights("weights5.h5")
model1.compile(optimizer=ad,loss='categorical_crossentropy',metrics = ['accuracy'])
preds=model.predict(test_x)

In [38]:
df=pd.DataFrame(preds,columns=['no','yes'])
sub_df.insult=df.yes
sub_df.to_csv("Submission.csv",index=False)

In [39]:
model6.fit(train_x, train_y6, batch_size = 128, epochs = 1, 
                verbose = 1, validation_data = (valid_x, valid_y6))

Train on 158571 samples, validate on 1000 samples
Epoch 1/1


<keras.callbacks.History at 0x1e0d3b9fc88>

In [40]:
 _model6=model6.to_json()
with open("Model6.json","w") as json_file:
    json_file.write(_model6)
model6.save_weights("weights6.h5")

In [41]:
json_file = open("Model6.json", "r")
model_json = json_file.read()
json_file.close()
model = model_from_json(model_json)
model.load_weights("weights6.h5")
model1.compile(optimizer=ad,loss='categorical_crossentropy',metrics = ['accuracy'])
preds=model.predict(test_x)

In [42]:
df=pd.DataFrame(preds,columns=['no','yes'])
sub_df.identity_hate=df.yes
sub_df.to_csv("Submission.csv",index=False)