# Multiclass Intent Classification using a Bi-GRU and a GloVe Embedding Matrix

## Dataset
- https://www.kaggle.com/stefanlarson/outofscope-intent-classification-dataset

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_rows', 700)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))


In [None]:
# Load the six JSON splits shipped with the dataset (is_* and oos_* files).
train = pd.read_json('/kaggle/input/outofscope-intent-classification-dataset/is_train.json')
val = pd.read_json('/kaggle/input/outofscope-intent-classification-dataset/is_val.json')
test = pd.read_json('/kaggle/input/outofscope-intent-classification-dataset/is_test.json')
oos_train = pd.read_json('/kaggle/input/outofscope-intent-classification-dataset/oos_train.json')
oos_val = pd.read_json('/kaggle/input/outofscope-intent-classification-dataset/oos_val.json')
oos_test = pd.read_json('/kaggle/input/outofscope-intent-classification-dataset/oos_test.json')
files = [(train,'train'),(val,'val'),(test,'test'),(oos_train,'oos_train'),(oos_val,'oos_val'),(oos_test,'oos_test')]
for file,name in files:
    # Give every split the same two column names.
    file.columns = ['text','intent']
    # Count nulls in the split being reported; the count used to be taken
    # from `train` for every split, so the other five were never checked.
    print(f'{name} shape:{file.shape}, {name} has {file.isna().sum().sum()} null values')

In [None]:
import re
from string import digits
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Bidirectional, GRU
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, OneHotEncoder
from tensorflow.keras.models import Sequential
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint

In [None]:
# Stack the train and test splits into one frame. `DataFrame.append` was
# removed in pandas 2.0; `pd.concat` is the supported replacement and keeps
# each split's original row labels exactly as `append` did.
df = pd.concat([train, test])
df.shape, train.shape, test.shape

In [None]:
# Number of rows per intent label in the combined train+test frame.
df.intent.value_counts()

In [None]:
# The ten intents kept for this experiment. `len(li)` is also used further
# down as the number of classes / output units.
li = ['play_music', 'smart_home', 'current_location', 'tell_joke', 'next_song',  'flip_coin', 'greeting', 'what_song', 'calendar', 'time']

In [None]:
# Shuffle all rows and renumber the index from 0. No random_state is passed,
# so the resulting order differs from run to run.
df = df.sample(frac=1).reset_index(drop=True)
df.head()

In [None]:
# Keep only the rows whose intent is listed in `li`.
# A boolean mask replaces the former row-by-row `DataFrame.append` loop, which
# copied the whole frame on every match (quadratic) and depends on a method
# removed in pandas 2.0. `reset_index(drop=True)` renumbers the rows from 0,
# matching what `ignore_index=True` produced before.
df_ = df[df['intent'].isin(li)].reset_index(drop=True)
df_.shape

In [None]:
# Preview the first rows of the filtered frame.
df_.head()

In [None]:
# Sanity check: the distinct intents left after filtering.
df_.intent.unique()

In [None]:
def preprocess(df):
    """Normalise the ``text`` column of ``df`` in place and return ``df``.

    Every text is lower-cased, the marks ``? . ! , ¿`` get a space on each
    side, each run of characters other than ASCII letters and those marks is
    replaced by one space, and leading, trailing and repeated spaces are
    removed. The frame that was passed in is modified and returned.
    """
    digit_table = str.maketrans('', '', digits)

    def _clean(text):
        text = text.lower()
        # isolate punctuation with a space on each side
        text = re.sub(r"([?.!,¿])", r" \1 ", text)
        # collapse runs made of spaces and double quotes into one space
        text = re.sub(r'[" "]+', " ", text)
        # everything except letters and the kept punctuation becomes a space
        text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text)
        # strip digits (the substitution above has already replaced them)
        text = text.translate(digit_table)
        # trim the ends, then squeeze repeated spaces
        return re.sub(" +", " ", text.strip())

    df['text'] = df['text'].apply(_clean)
    return df

In [None]:
# `preprocess` edits the frame it receives and returns that same frame, so
# `df` and `df_` refer to one object from here on.
df = preprocess(df_)
df.head()

In [None]:
# Row/column count of the cleaned frame.
df.shape

In [None]:
# creating instance of labelencoder
# Encoder that maps each intent string to an integer id.
labelencoder = LabelEncoder()
# Fit on the intent column and store the integer ids in a new `le_intent` column.
df['le_intent'] = labelencoder.fit_transform(df['intent'])
df

In [None]:
# Show the class names and save them to disk; the inference section loads
# 'classes.npy' to turn a predicted index back into an intent name.
print(labelencoder.classes_)
np.save('classes.npy', labelencoder.classes_)

In [None]:
# One-hot encode the integer intent ids, one column per entry of `li`.
# NOTE(review): the width comes from len(li), not from the fitted encoder —
# the two agree only while every intent in `li` occurs in the data; confirm.
labels = to_categorical(df['le_intent'], num_classes=len(li))
print(labels[:10])

In [None]:
# n_most_common_words = 8000
# max_len = 130
# tokenizer = Tokenizer(num_words=n_most_common_words, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
# tokenizer.fit_on_texts(df['text'].values)
# sequences = tokenizer.texts_to_sequences(df['text'].values)
# word_index = tokenizer.word_index
# print('Found %s unique tokens.' % len(word_index))

# X = pad_sequences(sequences, maxlen=max_len)

In [None]:
n_most_common_words = 8000
max_len = 40


def tokenization(data, maxlength = 100):
    """Fit a new ``Tokenizer`` on ``data`` and encode ``data`` with it.

    The tokenizer is created with ``num_words=n_most_common_words`` (module
    level), ``lower=True`` and the out-of-vocabulary token ``'oov'``. The
    encoded sequences are passed through ``pad_sequences`` with
    ``padding='post'`` and ``maxlen=maxlength``.

    Returns:
        ``(tokenizer, padded_sequences)``.
    """
    fitted = Tokenizer(oov_token='oov', lower=True, num_words=n_most_common_words)
    fitted.fit_on_texts(data)
    padded = pad_sequences(
        fitted.texts_to_sequences(data),
        padding='post',
        maxlen=maxlength,
    )
    return fitted, padded


# Fit the tokenizer on the cleaned texts; X holds the padded id sequences.
token, X = tokenization(df['text'].values, maxlength=max_len)

In [None]:
# Inverse of the tokenizer vocabulary: integer id -> word.
reverse_word_index = {idx: word for word, idx in token.word_index.items()}

In [None]:
# Location of the GloVe text file that get_glove_vector() reads.
path_glove = '/kaggle/input/glove6b200d/glove.6B.200d.txt'

# creating glove vectors
def get_glove_vector(path=None):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Args:
        path: Location of the GloVe file. When omitted, the module-level
            ``path_glove`` is used, as before.

    Returns:
        dict mapping each word (the first whitespace-separated field of a
        line) to a 1-D ``float32`` NumPy array built from the remaining
        fields of that line.
    """
    if path is None:
        path = path_glove

    glove_vectors = {}
    with open(path, "r", encoding="UTF-8") as glove:
        for line in glove:
            values = line.split()
            if not values:
                # A blank line has no word field; skip it instead of
                # failing on values[0].
                continue
            # Parse the components to floats here. Leaving them as strings
            # produces unicode arrays that take far more memory and cannot
            # be used for arithmetic.
            glove_vectors[values[0]] = np.asarray(values[1:], dtype="float32")
    return glove_vectors

# Load every vector from the GloVe file once and report how many words it holds.
glove_vectors = get_glove_vector()
total_words = len(glove_vectors.keys()) 
total_words

In [None]:
# Width of one embedding row; matches the 200d GloVe file named in path_glove.
emb_dim = 200


# create word vector matrix with glove vectors
def create_word_vector_matrix(token, glove_vectors, vocab_size, emb_dim):
    """Build the embedding matrix for the tokenizer vocabulary.

    Args:
        token: Object exposing ``word_index`` (word -> integer id).
        glove_vectors: Mapping from word to its vector.
        vocab_size: Largest id to make room for; the matrix has
            ``vocab_size + 1`` rows.
        emb_dim: Number of columns (vector length).

    Returns:
        ``np.ndarray`` of shape ``(vocab_size + 1, emb_dim)``. Row ``i`` holds
        the vector of the word with id ``i``; rows of words missing from
        ``glove_vectors`` (and any id that is never assigned) stay all zero.
        The number of missing words is printed.
    """
    matrix = np.zeros((vocab_size + 1, emb_dim))
    missing = 0

    for word, row in token.word_index.items():
        embedding = glove_vectors.get(word)
        if embedding is None:
            # no pretrained vector: leave the zero row and count the miss
            missing += 1
            continue
        matrix[row] = embedding

    print(f"Vector not found for {missing} words")
    return matrix

# One row per word known to the tokenizer; the function adds one extra row
# (vocab_size + 1) so that id 0 has a row as well.
vocab_size = len(token.word_index) 
emb_matrix = create_word_vector_matrix(token, glove_vectors, vocab_size, emb_dim)

In [None]:
# Expected: (vocab_size + 1, emb_dim).
emb_matrix.shape

In [None]:
# The matrix is created with np.zeros, so this shows a NumPy array type.
type(emb_matrix)

In [None]:
# Write the embedding matrix to 'emb_matrix.npy' in the working directory.
print("Saving emb_matrix")
np.save('emb_matrix.npy', emb_matrix)

In [None]:
# Hold out 25% of the samples for testing; random_state fixes this split.
X_train, X_test, y_train, y_test = train_test_split(X , labels, test_size=0.25, random_state=42)

In [None]:
# Shapes of the train/test inputs and targets.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

In [None]:
# emb_dim repeats the value set before the embedding matrix was built;
# batch_size is the mini-batch size passed to model.fit.
emb_dim = 200
batch_size = 256

In [None]:
# Frozen GloVe embeddings -> bidirectional GRU -> one probability per intent.
model = Sequential()
model.add(Embedding(input_dim=vocab_size+1, 
                    output_dim=emb_dim, 
                    input_length=X.shape[1],
                    weights=[emb_matrix],
                    trainable=False))
# model.add(Dropout(0.2))
# model.add(Input(shape=(50,)))
model.add(Bidirectional(GRU(128)))
# model.add(Dense(units=256, activation='relu'))
# model.add(Dropout(0.2))
# Every sample carries exactly one intent (the targets are one-hot rows from
# to_categorical), so the outputs must form a single distribution over the
# classes: softmax with categorical cross-entropy. The previous sigmoid +
# binary cross-entropy pairing treated the len(li) outputs as independent
# yes/no problems, which is the multi-label setup and also makes the
# 'accuracy' metric count each output separately, overstating the score.
model.add(Dense(units=len(li), activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

In [None]:
# print((X_train.shape, y_train.shape, X_test.shape, y_test.shape))

# model = Sequential()
# model.add(Embedding(n_most_common_words, emb_dim, input_length=X.shape[1]))
# model.add(SpatialDropout1D(0.7))
# model.add(LSTM(64, dropout=0.7, recurrent_dropout=0.7))
# model.add(Dense(150, activation='softmax'))
# model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
# print(model.summary())
# history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',patience=7, min_delta=0.0001)])

In [None]:
# Install tensorgram and create the callback object that is later passed to
# model.fit.
# NOTE(review): the first argument is "LSTM" although the model built in this
# notebook is a Bi-GRU — presumably just a display name; confirm.
# NOTE(review): `token_data` is a hard-coded identifier committed with the
# notebook; consider loading it from configuration instead.
!pip install -q tensorgram
from tensorgram import TensorGram
token_data = "479470573"
tg=TensorGram("LSTM",token_data)

In [None]:
# Checkpoint callback: writes weights only (save_weights_only=True) to
# `checkpoint_filepath`, and only when the monitored 'val_accuracy' reaches a
# new maximum (mode='max', save_best_only=True).
checkpoint_filepath = '/tmp/checkpoint'
model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

In [None]:
# Train for at most `epochs` epochs, validating on 20% of the training data.
# Early stopping watches val_loss (patience 7); the checkpoint and tensorgram
# callbacks run alongside it.
epochs = 40
history = model.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
    callbacks=[
        EarlyStopping(monitor='val_loss', patience=7, min_delta=0.0001),
        model_checkpoint_callback,
        tg,
    ],
)

In [None]:
# Evaluate on the held-out split; accr[0] is the loss and accr[1] the
# 'accuracy' metric requested in model.compile.
accr = model.evaluate(X_test,y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

In [None]:
import matplotlib.pyplot as plt

In [None]:
def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart per epoch.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch values (the value returned by ``model.fit``).
        string: Metric name, e.g. ``"accuracy"``; the validation series is
            read from ``'val_' + string``.
    """
    val_key = 'val_' + string
    for key in (string, val_key):
        plt.plot(history.history[key])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, val_key])
    plt.show()


plot_graphs(history, "accuracy")
plot_graphs(history, "loss")

In [None]:
# Save the trained model; the inference section loads it from this same path.
model.save("model/BI-GRU.h5")

In [None]:
# Confirm the model file was written.
os.listdir('model/')

In [None]:
# Predict for the first test sample. `X_test[:1]` keeps the batch axis
# (shape (1, max_len)); `X_test[0]` is a bare 1-D row, which is not a batch
# of one sequence.
ans = model.predict(X_test[:1])
print(ans.shape)
# Map the arg-max class index to its intent name through the fitted encoder.
# Indexing `labels` (the one-hot training targets) with it printed an
# unrelated one-hot row rather than a prediction.
print(labelencoder.classes_[np.argmax(ans, axis=-1)[0]])

# Save Tokenizer

In [None]:
import pickle

# Serialize the fitted tokenizer so the inference section can reload it from
# 'tokenizer.pickle'.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(token, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Inference

In [None]:
import pickle
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:

# Reload the artifacts written during training: class names, the fitted
# tokenizer and the saved model.
# NOTE: pickle.load / allow_pickle=True execute pickled data; only load files
# produced by this notebook.
classes = np.load('classes.npy', allow_pickle=True)
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

model = load_model('model/BI-GRU.h5')

# Query text to classify (a list with one sentence).
new_complaint = ['how is everything going for you']
# seq = tokenizer.texts_to_sequences(new_complaint)


# Same values as in the training section; max_len must equal the sequence
# length the model was built with.
n_most_common_words = 8000
max_len = 40


def tokenization(data, maxlength=100):
    """Fit a new ``Tokenizer`` on ``data`` and encode ``data`` with it.

    The tokenizer is created with ``num_words=n_most_common_words`` (module
    level), ``lower=True`` and the out-of-vocabulary token ``'oov'``. The
    encoded sequences are passed through ``pad_sequences`` with
    ``padding='post'`` and ``maxlen=maxlength``.

    Returns:
        ``(tokenizer, padded_sequences)``. The vocabulary is built from
        ``data`` alone on every call.
    """
    fitted = Tokenizer(oov_token='oov', lower=True, num_words=n_most_common_words)
    fitted.fit_on_texts(data)
    padded = pad_sequences(
        fitted.texts_to_sequences(data),
        padding='post',
        maxlen=maxlength,
    )
    return fitted, padded


# Encode the query with the tokenizer that was fitted on the training texts
# and reloaded from 'tokenizer.pickle'. Fitting a brand-new Tokenizer on the
# query alone (as was done before) numbers its words from scratch, so the ids
# fed to the model had nothing to do with the ids it was trained on.
# NOTE(review): the training texts went through `preprocess` before being
# tokenized; apply the same cleaning here if it is available in this session.
seq = tokenizer.texts_to_sequences(new_complaint)
# Pad on the right ('post'), the side used when the training data was built.
padded = pad_sequences(seq, maxlen=max_len, padding='post')
# Run the model once and reuse the probabilities for the arg-max.
pred = model.predict(padded)
res = np.argmax(pred, axis=-1)
print(classes[res[0]])


# Check saved files

In [None]:
# List the working directory to confirm the saved artifacts are present.
os.listdir()