<a href="https://colab.research.google.com/github/Athurnm/MachineLearning/blob/master/NLP_Text_Classification_LSTMandFF_Keras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Importing Dependencies
# Using Recurrent neural network
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Embedding, SpatialDropout1D
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
import numpy as np

# Install for saving model
!pip install h5py pyyaml
!pip install tf_nightly

Using TensorFlow backend.




# Data Preparation

In [2]:
# Upload the dataset through Colab's file-upload helper.
# The result is indexed by file name below, and each value is decoded as bytes.
from google.colab import files
uploaded = files.upload()

Saving DB INTENT.csv to DB INTENT (3).csv


In [0]:
import pandas as pd
import io

# Name the dataset is expected to be uploaded under.
CSV_NAME = 'DB INTENT.csv'

# Colab can store a re-uploaded file under a suffixed name (the upload log
# shows "DB INTENT (3).csv"), so the key in `uploaded` may differ from
# CSV_NAME. Use the exact key when it exists; otherwise fall back to the
# only uploaded file rather than failing with a KeyError.
if CSV_NAME in uploaded:
    csv_bytes = uploaded[CSV_NAME]
elif len(uploaded) == 1:
    csv_bytes = next(iter(uploaded.values()))
else:
    raise KeyError(
        '%r not found among uploaded files: %s' % (CSV_NAME, sorted(uploaded))
    )

# The upload yields raw bytes; decode them as UTF-8 text before parsing.
df = pd.read_csv(io.StringIO(csv_bytes.decode('utf-8')))

In [4]:
# Preview the first rows to check that the CSV loaded with the expected columns
df.head()

Unnamed: 0,TEKS_KALIMAT,ID_KONTEKS
0,"bisa dibantu maaf, selamat pagi",33
1,"mbak, saya mau melaporkan ini gangguan telepon",12
2,"sering kali nggak bisa ditelepon, nggak bisa t...",12
3,kan sejak diganti sama ada dikasih modem itu,12
4,iya baik,19


In [5]:
# Load tools we need for preprocessing
# Tokenizer is used to split text into word tokens
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Length every tokenised sentence is padded/truncated to.
MAX_SEQUENCE_LENGTH = 250
# Size of each word-embedding vector.
EMBEDDING_DIM = 100

# Punctuation stripped before tokenising. The backslash is written as "\\" so
# the literal no longer contains the invalid escape "\]" (which triggers a
# warning on recent Python versions); the resulting characters are unchanged.
PUNCTUATION_FILTER = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters=PUNCTUATION_FILTER, lower=True)
tokenizer.fit_on_texts(df['TEKS_KALIMAT'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 6691 unique tokens.


In [6]:
# Encode each sentence as a list of word indices, then bring every list to
# MAX_SEQUENCE_LENGTH entries so the result is one rectangular array.
token_id_lists = tokenizer.texts_to_sequences(df['TEKS_KALIMAT'].values)
X = pad_sequences(token_id_lists, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (21174, 250)


In [7]:
# One-hot encode the intent ids: one indicator column per distinct ID_KONTEKS value.
intent_ids = df['ID_KONTEKS'].values
target = pd.get_dummies(intent_ids)
print('Shape of data tensor:', target.shape)

Shape of data tensor: (21174, 42)


In [0]:
#from sklearn.preprocessing import LabelBinarizer

In [0]:
#encoder = LabelBinarizer()
#encoder.fit(ta)
#target = encoder.transform(ta)

In [0]:
#print(ta[0])
#print(target[1])

In [0]:
# convert pandas dataframe to string type (for list)
#text = df['TEKS_KALIMAT']
#text = text.astype(str)


# target = df['ID_KONTEKS_ALT_1'].values.tolist()
#target = df['ID_KONTEKS']
#target = target.astype(str)

In [0]:
# change to list type
#t = text.tolist()
#ta = target.tolist()
#print(type(t))
#print(type(ta))

In [0]:
#VOCAB_SIZE = 20000 #define vocabulary size is 20000

In [0]:
#tokenizer = Tokenizer(num_words=VOCAB_SIZE) # Setup tokenizer

In [0]:
#tokenizer.fit_on_texts(t)

In [0]:
#text = tokenizer.texts_to_matrix(t, mode='tfidf')

In [0]:
#tokenizer.index_word #list words

In [0]:
#word_index = tokenizer.word_index
#print('Found %s unique tokens.' % len(word_index))

## Data Split for Train and Test

In [0]:
# Hold out 20% of the samples as a test set; the fixed seed keeps the split
# reproducible between runs.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X,
    target,
    test_size=0.2,
    random_state=1,
)

# RNN Model
The RNN model is implemented with LSTM layers, which mitigate the vanishing-gradient problem of plain recurrent networks.

In [0]:
# Instantiate an empty Sequential (linear stack of layers) model
model = Sequential()

## Create Model

In [0]:
from keras.layers import Bidirectional

# One output unit per intent class, i.e. per one-hot column of `target`.
NUM_CLASSES = target.shape[1]

# Build the whole network inside this cell. Appending layers to a model that
# was created in another cell meant that re-running this cell stacked a second
# copy of the layers onto the same object; rebuilding makes the cell idempotent.
model = Sequential([
    # Input layer with embedding words
    Embedding(MAX_NB_WORDS, EMBEDDING_DIM),
    SpatialDropout1D(0.1),
    # Hidden layers: two stacked bidirectional LSTMs. The first returns the
    # full sequence so the second has a sequence to consume.
    Bidirectional(LSTM(125, dropout=0.1, recurrent_dropout=0.1, return_sequences=True)),
    Bidirectional(LSTM(50, dropout=0.1)),
    # Output layer: softmax over the intent classes
    Dense(NUM_CLASSES, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Training hyper-parameters used by the LSTM fit cells
epochs = 80
batch_size = 64

model.summary()

## LSTM - with EarlyStopping Model Fit

In [0]:
# Train with early stopping: the callback watches the validation loss
# (patience of 3 epochs, minimum improvement of 1e-4).
stop_on_plateau = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=3)
history = model.fit(
    Xtrain,
    Ytrain,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
    callbacks=[stop_on_plateau],
)

Train on 13551 samples, validate on 3388 samples
Epoch 1/80
Epoch 2/80
Epoch 3/80
 2304/13551 [====>.........................] - ETA: 5:37 - loss: 2.5482 - acc: 0.2457

In [0]:
# Persist the model trained with early stopping to an HDF5 file
model.save('NLP_Classification_LSTM_EarlyStopping.h5')

### Evaluate LSTM (RNN Implementation) - with EarlyStopping Model

In [0]:
# Score the model on the held-out test split; the model was compiled with an
# accuracy metric, so the result is indexed as [loss, accuracy].
accr = model.evaluate(Xtest, Ytest)
test_loss, test_accuracy = accr[0], accr[1]
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(test_loss, test_accuracy))

In [0]:
# Plotting library; the later plotting cells rely on this import as well
import matplotlib.pyplot as plt

# Training vs. validation loss of the first run, used to judge over- and
# underfitting. The 'test' legend entry is the validation split of fit().
first_run = history.history
for metric_key, legend_name in (('loss', 'train'), ('val_loss', 'test')):
    plt.plot(first_run[metric_key], label=legend_name)
plt.title('Loss for first model')
plt.legend()
plt.show()

In [0]:
# Training vs. validation accuracy of the first run, used to judge over- and
# underfitting. The 'test' legend entry is the validation split of fit().
first_run_metrics = history.history
for metric_key, legend_name in (('acc', 'train'), ('val_acc', 'test')):
    plt.plot(first_run_metrics[metric_key], label=legend_name)
plt.title('Accuracy for First Model')
plt.legend()
plt.show()

## LSTM - without EarlyStopping Model Fit

In [0]:
# without early stopping
# Calling fit() on the existing `model` would continue from the weights already
# learned in the early-stopping run, so this would not be an independent
# experiment. clone_model() rebuilds the same architecture with newly
# initialised weights; the clone must be compiled again before training.
from keras.models import clone_model

model = clone_model(model)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# NOTE(review): this run validates on 10% of the training data whereas the
# early-stopping run used 20% - confirm the difference is intentional.
history2 = model.fit(
    Xtrain,
    Ytrain,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
)

In [0]:
# Persist the model trained without early stopping to an HDF5 file
model.save('NLP_Classification_LSTM.h5')

### Evaluate LSTM (RNN Implementation) - without EarlyStopping Model

In [0]:
# Measure loss and accuracy of the current `model` on the held-out test split
accr = model.evaluate(Xtest, Ytest)
loss_on_test = accr[0]
accuracy_on_test = accr[1]
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(loss_on_test, accuracy_on_test))

In [0]:
# Plot the second model's training and validation loss to check for over- and
# underfitting. The curves come from `history2` (the run without early
# stopping); reading `history` here re-plotted the first run by mistake.
plt.title('Loss for second model')
plt.plot(history2.history['loss'], label='train')
plt.plot(history2.history['val_loss'], label='test')
plt.legend()
plt.show();

In [0]:
# Plot the second model's training and validation accuracy to check for over-
# and underfitting. The curves come from `history2` (the run without early
# stopping); reading `history` here re-plotted the first run by mistake.
plt.title('Accuracy for Second Model')
plt.plot(history2.history['acc'], label='train')
plt.plot(history2.history['val_acc'], label='test')
plt.legend()
plt.show();

# Feed Forward DNN Model

In [0]:
# Instantiate model
model1 = Sequential()

# Create Model
# model1 is fitted on Xtrain, whose rows are token-id sequences padded to
# MAX_SEQUENCE_LENGTH, so the first layer must accept that width. Declaring
# MAX_NB_WORDS inputs made fit() reject Xtrain with a shape mismatch.
# NOTE(review): dense layers on raw token ids are a weak text representation.
# A bag-of-words / tf-idf matrix (as in the commented-out cells) would need
# its own train/test split and an input width equal to its column count.
model1.add(Dense(512, activation='relu', input_shape=(MAX_SEQUENCE_LENGTH,)))
model1.add(Dropout(0.3))
model1.add(Dense(512, activation='relu'))
model1.add(Dropout(0.3))
# Output layer: softmax over the 42 intent classes
model1.add(Dense(42, activation='softmax'))
model1.summary()

# Build Model
model1.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [0]:
# Training hyper-parameters for the feed-forward model.
# NOTE(review): this overwrites the `epochs` / `batch_size` values set for the
# LSTM model, so re-running an LSTM fit cell after this one uses these values.
epochs = 50
batch_size = 128

## FFNN - with EarlyStopping Model Fit

In [0]:
# Train the feed-forward network with an early-stopping callback that watches
# the validation loss (patience of 3 epochs, minimum improvement of 1e-4).
ffnn_early_stop = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=3)
history3 = model1.fit(
    Xtrain,
    Ytrain,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
    callbacks=[ffnn_early_stop],
)