<a href="https://colab.research.google.com/github/ANUPRIYAR/Multilabel_Classification_NLP_LSTM/blob/main/Toxic_comment_Classification_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

In [2]:
!unzip /content/drive/MyDrive/Deep_learning/jigsaw-toxic-comment-classification-challenge.zip

Archive:  /content/drive/MyDrive/Deep_learning/jigsaw-toxic-comment-classification-challenge.zip
  inflating: sample_submission.csv.zip  
  inflating: test.csv.zip            
  inflating: test_labels.csv.zip     
  inflating: train.csv.zip           


In [3]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
data = pd.read_csv('/content/train.csv.zip')
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
data.shape

(159571, 8)

In [7]:
data.comment_text = data.comment_text.astype('string')
data.dtypes

id               object
comment_text     string
toxic             int64
severe_toxic      int64
obscene           int64
threat            int64
insult            int64
identity_hate     int64
dtype: object

In [8]:
data.isnull().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [9]:
import nltk
from nltk.tokenize import word_tokenize
import string

**Clean the text**

In [10]:
def clean_text(text):
  punc_chars = list(string.punctuation)
  punc_chars.remove('.')
  stop_words = stopwords.words('english')
  extra_chars = punc_chars + stop_words

  # Clean the text
  text = re.sub(r'[\r|\n|\r\n]+',' ',text)
  text = re.sub('[0-9]', '', text)
  text =[word for word in word_tokenize(text) if word not in punc_chars + stop_words]
  text = ' '.join(text)
  text.lower()
  return text

In [11]:
data['comment_text'] = data['comment_text'].apply(clean_text)

In [12]:
data['comment_text'][2]

"Hey man I 'm really trying edit war . It 's guy constantly removing relevant information talking edits instead talk page . He seems care formatting actual info ."

**Obtain x and y variables from the dataframes**

In [13]:
Comments = data.comment_text.values
y_fields = ['toxic',	'severe_toxic',	'obscene'	,'threat',	'insult'	,'identity_hate']
y = data[y_fields].values

In [14]:
# Processing test files
test_x = pd.read_csv('/content/test.csv.zip')
test_y = pd.read_csv('/content/test_labels.csv.zip')

In [15]:
x_test = test_x['comment_text'].values
y_test = test_y[y_fields].values

**Tokenization**

In [16]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(Comments)

# Tokenization of x_test
tokenizer.fit_on_texts(x_test)

**Text preprocessing**

In [17]:
sent_length = 100
voc_size = 10000
input_dim = 10

In [18]:
from keras.preprocessing import text, sequence

seq = tokenizer.texts_to_sequences(Comments)
pad = sequence.pad_sequences(seq, maxlen=100)

test_seq = tokenizer.texts_to_sequences(x_test)
test_pad = sequence.pad_sequences(test_seq, maxlen=100)

**Create the model**



In [27]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

model = Sequential()
model.add(Embedding(voc_size, input_dim, input_length=sent_length))
model.add(LSTM(128))
model.add(Dense(6, activation='sigmoid'))
model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 10)           100000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               71168     
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 774       
Total params: 171,942
Trainable params: 171,942
Non-trainable params: 0
_________________________________________________________________


In [20]:
len(pad), y.shape , len(test_pad), y_test.shape

(159571, (159571, 6), 153164, (153164, 6))

In [21]:
import numpy as np
x_train = np.array(pad)
y_train = np.array(y)

x_test = np.array(test_pad)
y_test = np.array(y_test)

In [22]:
x_train.shape , y_train.shape , x_test.shape, y_test.shape

((159571, 100), (159571, 6), (153164, 100), (153164, 6))

**Train the model**

In [28]:
# Fit on model
model.fit(x_train,y_train, validation_data=(x_test,y_test), epochs=5, batch_size=64)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f93d0d35630>

In [29]:
model.evaluate(x_test, y_test, batch_size=32, verbose=2)

4787/4787 - 16s - loss: -1.2300e+01 - accuracy: 0.9989


[-12.30030632019043, 0.9989357590675354]

In [30]:
# To save the model
model.save('LSTM_model.h5')



**Add Dropout**

In [31]:
from tensorflow.keras.layers import Dropout

model = Sequential()
model.add(Embedding(voc_size, input_dim, input_length=sent_length ))
model.add(Dropout(.3))
model.add(LSTM(100))
model.add(Dropout(.3))
model.add(Dense(6, activation='softmax'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

**Loading the previous model**

In [37]:
from keras.models import load_model

# To load the model
load_model('LSTM_model.h5')

<tensorflow.python.keras.engine.sequential.Sequential at 0x7f93d0598c50>