In [None]:
import re
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus, then materialise the English list as a set
# so that membership checks during text cleaning are constant-time lookups.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
from sklearn.svm import LinearSVC
import pandas as pd


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Input
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Read the three input tables from the working directory: training comments,
# test comments, and the labels that belong to the test comments.
train, test, test_labels = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'test_labels.csv')
)

In [None]:
# Test rows whose 'toxic' label is -1 are dropped from both the comments and the
# labels (presumably -1 marks rows that were never scored -- confirm against the
# dataset description).
ids_to_discard = test_labels.loc[test_labels['toxic'] == -1, 'id']

# .copy() turns each filtered result into an independent frame. Without it the
# later `test["comment_text"] = ...` assignment writes into a slice of the
# original frame and pandas emits SettingWithCopyWarning.
# The original (non-contiguous) row index is intentionally kept.
test = test[~test['id'].isin(ids_to_discard)].copy()
test_labels = test_labels[~test_labels['id'].isin(ids_to_discard)].copy()

In [None]:
# Ordered (pattern, replacement) pairs that expand English contractions.
# Order matters: specific rules must run before the generic ones that would
# otherwise consume them ("what's" and "'scuse" before "'s", "can't" before "n't").
_CONTRACTION_RULES = (
    (r"what's", "what is "),
    (r"'scuse", " excuse "),
    (r"'s", " "),
    (r"'ve", " have "),
    (r"can't", "can not "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"'re", " are "),
    (r"'d", " would "),
    (r"'ll", " will "),
)


def clean_text(text, stopword_set=None):
    """Normalise one raw comment into a lowercase, letters-only, stop-word-free string.

    Processing order (each stage relies on the previous one):
      1. lowercase;
      2. drop URLs and ``wikipedia:`` references while their punctuation still exists;
      3. undo simple obfuscations (``a$$``, ``@``, ``fck``);
      4. expand contractions;
      5. replace every non-word character with a space;
      6. drop the stand-alone token ``utc`` and expand stand-alone ``u`` to ``you``;
      7. delete anything that is not an ASCII letter, apostrophe or space;
      8. collapse whitespace and remove stop words.

    Args:
        text: The raw comment. ``None`` and float NaN are treated as empty text;
            any other non-string value is converted with ``str()``.
        stopword_set: Optional container of words to remove. When omitted, the
            module-level ``stop_words`` set is used.

    Returns:
        The cleaned comment as a single space-separated string (possibly empty).
    """
    if not isinstance(text, str):
        # Missing values would otherwise crash on .lower().
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)
    if stopword_set is None:
        stopword_set = stop_words

    text = text.lower()

    # These rules depend on ':', '/', '$' and '@', so they must run before the
    # non-word characters are stripped.
    text = re.sub(r"https?://\S+", " ", text)
    text = re.sub(r"wikipedia:\S+", " ", text)
    text = re.sub(r"a\$\$", "ass", text)  # '$' escaped: unescaped it is the end-of-string anchor
    text = text.replace("@", "at")
    text = text.replace("fck", "fuck")

    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)

    text = re.sub(r"\W", " ", text)  # removing non-word characters

    # Word boundaries keep these rules from touching letters inside longer
    # words (e.g. the "utc" in "dutch") and let them match at the text edges.
    text = re.sub(r"\butc\b", " ", text)
    text = re.sub(r"\bu\b", "you", text)

    text = re.sub(r"[^A-Za-z' ]+", "", text)  # removing all non-letter values (except single quotes)
    text = re.sub(r"\s+", " ", text)
    text = text.strip(" ")

    # Stop-word removal
    return " ".join(word for word in text.split() if word not in stopword_set)

# Overwrite the raw comments of both splits with their cleaned form.
# Note: this replaces the column in place, so re-running the cell cleans
# already-cleaned text.
for comments_frame in (train, test):
    comments_frame["comment_text"] = comments_frame["comment_text"].apply(clean_text)

In [None]:
# Model inputs are the comment_text columns; the target is the single 'toxic' column.
train_data, test_data = train["comment_text"], test["comment_text"]
# Multi-label alternative, currently disabled:
# train_label=train[['toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']]
train_label = train['toxic']

In [None]:
### Build the word index from the training comments only, then use it to turn
### every comment into a sequence of integer word ids.
tokenizer = Tokenizer(num_words=40000)  # vocabulary cap passed to the Keras Tokenizer
tokenizer.fit_on_texts(train_data)

# Encode both splits with the same fitted tokenizer (train first, then test).
train_final, test_final = (
    tokenizer.texts_to_sequences(corpus) for corpus in (train_data, test_data)
)

In [None]:
# Bring every encoded comment to a fixed length of 150 ids so the splits
# become rectangular arrays (see the printed shapes).
train_padded, test_padded = (
    pad_sequences(encoded, maxlen=150) for encoded in (train_final, test_final)
)
print("Shape of training data", train_padded.shape)
print("Shape of testing data", test_padded.shape)

Shape of training data (159571, 150)
Shape of testing data (63978, 150)


# Vectorize words using a word2vec-style embedding (learned by the Keras Embedding layer)

# Keras

In [None]:
# Embedding (40000-entry table, 128-dim vectors) -> two stacked 64-unit LSTMs
# -> one sigmoid unit. The first LSTM returns the full sequence so the second
# LSTM receives one vector per time step.
model = Sequential([
    Embedding(40000, 128),
    LSTM(units=64, dropout=0.2, return_sequences=True),
    LSTM(units=64, dropout=0.2),
    Dense(units=1, activation='sigmoid'),
])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 128)         5120000   
_________________________________________________________________
lstm_6 (LSTM)                (None, None, 64)          49408     
_________________________________________________________________
lstm_7 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 5,202,497
Trainable params: 5,202,497
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Binary cross-entropy matches the single sigmoid output; AUC and accuracy are
# reported during training.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["AUC", "accuracy"],
)

# Hold out part of the training data for validation; the fixed random_state
# makes the shuffled split repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    train_padded,
    train_label,
    shuffle=True,
    random_state=123,
)

# Single pass over the training split, evaluated on the held-out split.
model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=1,
    validation_data=(x_val, y_val),
)



<tensorflow.python.keras.callbacks.History at 0x7fdce6cadd90>

In [None]:
### Score the padded test comments with the fitted model
predict = model.predict(x=test_padded)
print("Predicted values are", predict)

In [None]:
target_cols = np.array(['toxic','severe_toxic','obscene', 'threat','insult', 'identity_hate'])

# The model has one output column per trained label. Training currently uses
# only 'toxic' (the first entry of target_cols), so name just as many columns
# as the prediction array really has; passing all six names for a one-column
# array raises ValueError.
scores = np.asarray(predict)
scores = scores.reshape(len(scores), -1)
predicted_cols = target_cols[:scores.shape[1]]
final_predict_test = pd.DataFrame(scores, columns=predicted_cols)

# `test` keeps its original, non-contiguous row index after the unlabeled rows
# were filtered out, while the prediction frame has a fresh 0..n-1 index.
# Attach the ids by position (raw values) rather than by index so ids and
# scores stay row-aligned. A length mismatch raises ValueError.
t1 = test['id']
final_predict_test.insert(0, 'id', t1.to_numpy())
final_predict_test.head()