In [None]:
import numpy as np
import pandas as pd
import time
from collections import defaultdict 
from tensorflow import keras
from keras import layers


In [None]:
!pip install tqdm

In [None]:
import gensim, re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import sys

from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Embedding

In [None]:
def _read_lines(path):
    """Return the UTF-8 text file at `path` as a list of lines without line endings."""
    with open(path, "r", encoding='UTF-8') as handle:
        return handle.read().splitlines()


# Inputs and labels are stored in parallel files, one entry per line.
X_train = _read_lines("/Data/viettel_train_input_no_tokenize.txt")
X_test = _read_lines("/Data/viettel_test_input_no_tokenize.txt")
y_train = _read_lines("/Data/viettel_train_label.txt")
y_test = _read_lines("/Data/viettel_test_label.txt")

In [None]:
# Train Word2Vec embeddings on the training texts.
# `vector_size` is the gensim >= 4.0 keyword; in that API the number of training passes
# is called `epochs` (the former `iter` keyword is rejected with a TypeError).
# NOTE(review): X_train holds raw strings (one per file line), so each "sentence" is
# iterated character by character — presumably intended, because the encoding cell
# further down iterates the texts the same way; confirm.
w2v_model = gensim.models.Word2Vec(sentences=X_train, vector_size=500, window=5, min_count=2, workers=4, epochs=50)
# Embedding matrix: one row per vocabulary entry, one column per embedding dimension.
w2v_weights = w2v_model.wv.vectors
vocab_size, embedding_size = w2v_weights.shape
print("Vocabulary Size: {} - Embedding Dim: {}".format(vocab_size, embedding_size))

In [None]:
def word2token(word):
    """Map a vocabulary entry to its integer index in the Word2Vec model.

    Args:
        word: The key to look up in the vocabulary of `w2v_model`.

    Returns:
        The integer index of `word`, or 0 when `word` is not in the vocabulary.
    """
    keyed_vectors = w2v_model.wv
    # gensim >= 4.0 exposes the mapping as `key_to_index`; `wv.vocab` was removed there
    # and raises AttributeError, which an `except KeyError` would not catch.
    key_to_index = getattr(keyed_vectors, "key_to_index", None)
    if key_to_index is not None:
        return key_to_index.get(word, 0)
    # gensim < 4.0 fallback: `vocab` maps each key to an object carrying `.index`.
    entry = keyed_vectors.vocab.get(word)
    # NOTE(review): 0 is also a valid vocabulary index, so unknown entries are
    # indistinguishable from the entry stored at index 0 — confirm this is acceptable.
    return 0 if entry is None else entry.index
def token2word(token):
    """Return the vocabulary entry stored at integer index `token` in `w2v_model`.

    Args:
        token: Integer index into the model's vocabulary.

    Returns:
        The vocabulary key at that index.
    """
    keyed_vectors = w2v_model.wv
    # gensim >= 4.0 renamed `index2word` to `index_to_key`; fall back for older versions.
    index_to_key = getattr(keyed_vectors, "index_to_key", None)
    if index_to_key is None:
        index_to_key = keyed_vectors.index2word
    return index_to_key[token]

In [None]:
from sklearn.manifold import TSNE
import random
import matplotlib.pyplot as plt

# Visualise a random subset of the Word2Vec vocabulary in 2-D with t-SNE.
# The sample size is capped at the vocabulary size: random.sample raises ValueError
# when asked for more items than the population contains.
n_samples = min(500, vocab_size)
# Sample random words from model dictionary
random_i = random.sample(range(vocab_size), n_samples)
random_w = [token2word(i) for i in random_i]

# Generate Word2Vec embeddings of each word.
# Vectors are read through `.wv`; indexing the model object itself is not supported
# by gensim >= 4.0.
word_vecs = np.array([w2v_model.wv[w] for w in random_w])

# Apply t-SNE to Word2Vec embeddings, reducing to 2 dims
tsne = TSNE()
tsne_e = tsne.fit_transform(word_vecs)

# Plot t-SNE result: one point per sampled entry, coloured by its position in the sample.
plt.figure(figsize=(32, 32))
plt.scatter(tsne_e[:, 0], tsne_e[:, 1], marker='o', c=range(len(random_w)), cmap=plt.get_cmap('Spectral'))

# Label every point with the vocabulary entry it represents.
for label, x, y in zip(random_w, tsne_e[:, 0], tsne_e[:, 1]):
    plt.annotate(label,
                 xy=(x, y), xytext=(0, 15),
                 textcoords='offset points', ha='right', va='bottom',
                 bbox=dict(boxstyle='round, pad=0.2', fc='yellow', alpha=0.1))

plt.show()

In [None]:
# Replace every item of every text with its vocabulary index (word2token returns 0 for
# items missing from the vocabulary), then pad the index lists into a 2-D array.
# NOTE: X_train / X_test are rebound from raw texts to index arrays here, so this cell
# must run exactly once after the data-loading cell.
X_train = pad_sequences(
    [[word2token(item) for item in sample] for sample in X_train]
)
# The test set is padded with maxlen equal to the width of the training array.
X_test = pad_sequences(
    [[word2token(item) for item in sample] for sample in X_test],
    maxlen=X_train.shape[1],
)

In [None]:
# Build the label vocabulary from the training labels.
# Sorting makes the label -> id mapping deterministic: the iteration order of a set of
# strings can differ between interpreter sessions (string hash randomisation), which
# would silently permute class ids relative to weights saved in an earlier session.
labels = sorted(set(y_train))
label2id = {label: idx for idx, label in enumerate(labels)}
y_train_vectorized = np.array([label2id[label] for label in y_train])
# A test label that never occurs in y_train raises KeyError on this line.
y_test_vectorized = np.array([label2id[label] for label in y_test])

In [None]:
# Embedding matrix of the trained Word2Vec model; the model cell below passes it to
# the Embedding layer as its initial weights.
w2v_weights = w2v_model.wv.vectors

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Training hyper-parameters (the values in effect when fit() is called).
batch = 64
epochs = 80
# Checkpoint file name template; {epoch} and {val_acc} are filled in per saved epoch.
checkpoint_filepath = "/Word2Vec+LSTM-weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5"

# Frozen Word2Vec embedding -> three stacked LSTMs -> softmax over the label set.
model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_size,
        weights=[w2v_weights],
        trainable=False,
        input_length=X_train.shape[1],
    ),
    LSTM(256, return_sequences=True),
    LSTM(64, return_sequences=True),
    LSTM(128, return_sequences=False),
    Dense(len(labels), activation="softmax"),
])
model.summary()
# Labels are integer class ids, hence the sparse variant of the cross-entropy loss.
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=['acc'])

# Keep only checkpoints that improve the validation accuracy.
checkpoint = ModelCheckpoint(
    checkpoint_filepath,
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    mode='max',
)
callbacks_list = [checkpoint]

# NOTE(review): the test split doubles as validation data, so checkpoint selection by
# val_acc is made on the same data later used for evaluation — confirm this is intended.
history = model.fit(
    X_train,
    y_train_vectorized,
    batch_size=batch,
    epochs=epochs,
    validation_data=(X_test, y_test_vectorized),
    callbacks=callbacks_list,
)

In [None]:
# Restore a saved checkpoint and time prediction over the whole test set.
# NOTE(review): this file name embeds the epoch and val_acc of one particular training
# run; a different run produces different file names — update the path accordingly.
model.load_weights("/Word2Vec+LSTM-weights-improvement-61-0.66.hdf5")
# perf_counter is the clock intended for measuring elapsed intervals.
start_time = time.perf_counter()
# Despite the name, these are the outputs of the final softmax layer.
logits = model.predict(X_test)
elapsed = time.perf_counter() - start_time
# The printed figure is the average over test samples, so the message says so
# (it previously read as if it were the total inference time).
print("Inference in {} seconds per sample".format(elapsed / len(X_test)))

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

# Turn the per-class scores returned by model.predict into predicted class ids.
# `pred` was not defined by any earlier cell, so this cell used to stop with NameError.
pred = np.argmax(logits, axis=1)

# Fix the class order explicitly so the matrix always has one row/column per known
# label, even if some class is absent from the test set or from the predictions.
class_ids = list(range(len(labels)))
cnf_matrix = confusion_matrix(y_test_vectorized, pred, labels=class_ids)
# `labels[i]` is the name of class id i, so it can name the rows and columns directly.
df_cm = pd.DataFrame(cnf_matrix, index=labels, columns=labels)

plt.figure(figsize=(30,10))
sn.set(font_scale=1.4) # for label size
# fmt="d" prints the integer counts as-is instead of the default 2-significant-digit format.
ax = sn.heatmap(df_cm, annot=True, fmt="d", annot_kws={"size": 16}) # font size
ax.set(xlabel="Predicted label", ylabel="True label")

plt.show()