In [1]:
import sys
import os
import numpy as np
import json
import tensorflow as tf
from tensorflow.keras.models import Sequential,load_model
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, TimeDistributed, Dense, Dropout ,LayerNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow_hub as hub

# from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
import pickle




In [2]:
root_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(root_dir)

In [3]:
from Preprocessor import Preprocessor
from embeddings import Glove

In [4]:
preprocessor = Preprocessor("../dataset/PIZZA_train_sampled.json", "../dataset/PIZZA_dev.json", "../dataset/PIZZA_test.json", 
                            "../dataset/preprocessed_PIZZA_train.json", "../dataset/preprocessed_PIZZA_dev.json", 
                            "../dataset/preprocessed_PIZZA_test.json")

In [5]:
glove50="../dataset/glove.6B.50d.txt"
glove100="../dataset/glove.6B.100d.txt"
glove200="../dataset/glove.6B.200d.txt"
glove300="../dataset/glove.6B.300d.txt"
Glove= Glove("../dataset/PIZZA_train_sampled.json",glove300,300)
Glove.save_tokenizer("word_tokenizer.pkl")

In [6]:
def read_file(doc_path,dataset_type):
    with open(doc_path, 'r') as f:
            labels=[]
            sentences=[]
            for line in f:
                parsed_line = json.loads(line.strip())
                if f"{dataset_type}.SRC" in parsed_line:
                    sentences.append(parsed_line[f"{dataset_type}.SRC"])
                if f"{dataset_type}.LABELS" in parsed_line:
                    labels.append(parsed_line[f"{dataset_type}.LABELS"])
    return sentences, labels
sentences, labels = read_file("../dataset/PIZZA_train_sampled.json","train")

# tokenizing words
with open('word_tokenizer.pkl', 'rb') as f:
    word_tokenizer = pickle.load(f)
X_words_sequences = word_tokenizer.texts_to_sequences(sentences)
vocab_size = len(word_tokenizer.word_index)+1 # +1 for padding

label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)
y_label_sequences = label_tokenizer.texts_to_sequences(labels)

with open('label_tokenizer.pkl', 'wb') as f:
    pickle.dump(label_tokenizer, f)
    
max_length = max([len(seq) for seq in X_words_sequences])
x_words_padded = pad_sequences(X_words_sequences, maxlen=max_length, padding='post')
y_words_padded = pad_sequences(y_label_sequences, maxlen=max_length, padding='post')
num_classes_of_labels = len(label_tokenizer.word_index)+1 # +1 for padding
print(vocab_size, num_classes_of_labels)
print(x_words_padded[0], y_words_padded[0])
y_words_cat = to_categorical(y_words_padded, num_classes=num_classes_of_labels)
X_word_train = np.array(x_words_padded)
y_word_train = np.array(y_words_cat) 

417 18
[26 14  3 46 36  7  3 20 34  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0] [ 4  1  1  5 11  1  1  6  9  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]


In [9]:
# Part 3: Build the model for sequence labeling (one label per timestep)
embedding_dim=Glove.embedding_dim
embedding_matrix=Glove.embedding_matrix
model = Sequential()
model.add(Embedding(input_dim=vocab_size,
                    output_dim=embedding_dim,
                    weights=[embedding_matrix],  # Load pre-trained GloVe embeddings
                    trainable=True)) # Fine-tune the embeddings
model.add(Bidirectional(LSTM(units=256, return_sequences=True)))  # return_sequences=True for sequence labeling
LayerNormalization() # we use it to normalize the activations of the previous layer at each step
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(units=128, return_sequences=True, recurrent_dropout=0.2)))
model.add(TimeDistributed(Dense(num_classes_of_labels, activation='softmax')))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_word_train, y_word_train, batch_size=16, epochs=6, validation_split=0.1)
model.save('sequence_labeling_glove_model.keras')
model_path="sequence_labeling_glove_model.keras"

Epoch 1/6
[1m1529/1529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 43ms/step - accuracy: 0.9769 - loss: 0.0989 - val_accuracy: 0.9301 - val_loss: 0.4141
Epoch 2/6
[1m1529/1529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 42ms/step - accuracy: 1.0000 - loss: 9.1968e-05 - val_accuracy: 0.9328 - val_loss: 0.4670
Epoch 3/6
[1m1529/1529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 48ms/step - accuracy: 1.0000 - loss: 1.3101e-04 - val_accuracy: 0.9373 - val_loss: 0.4227
Epoch 4/6
[1m1529/1529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 40ms/step - accuracy: 1.0000 - loss: 3.1017e-05 - val_accuracy: 0.9321 - val_loss: 0.5043
Epoch 5/6
[1m1529/1529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 41ms/step - accuracy: 1.0000 - loss: 1.7502e-05 - val_accuracy: 0.9370 - val_loss: 0.5204
Epoch 6/6
[1m1529/1529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 43ms/step - accuracy: 1.0000 - loss: 5.3654e-06 - val_accuracy: 0.9381 - val_los

In [8]:
def predict_labels(sentences, model_path, tokenizer_path, labels_tokenizer_path, max_length):
    # Load the pre-trained model
    model = load_model(model_path)
    
    # Load tokenizers
    with open(tokenizer_path, 'rb') as f:
        tokenizer = pickle.load(f)
    with open(labels_tokenizer_path, 'rb') as f:
        label_tokenizer = pickle.load(f)
    
    # Tokenize and pad sentences
    sequences = tokenizer.texts_to_sequences(sentences)
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')
    X_input = tf.convert_to_tensor(padded_sequences)  # Use TensorFlow tensors for efficiency

    # Make predictions
    predictions = model.predict(X_input, verbose=0)

    # Convert predictions to labels
    predicted_labels = np.argmax(predictions, axis=-1)

    # Map indices back to labels
    index_to_label = {v: k.upper() for k, v in label_tokenizer.word_index.items()}
    predicted_labels_mapped = [
        [index_to_label.get(idx) for idx in seq if idx != 0] for seq in predicted_labels
    ]

    return predicted_labels_mapped
sentences, true_labels = read_file("../dataset/PIZZA_test2.json","test")
tokenizer_path = "word_tokenizer.pkl"
label_tokenizer_path = "label_tokenizer.pkl"
max_length = max_length

predicted_labels = predict_labels(sentences, model_path, tokenizer_path, label_tokenizer_path, max_length)
i=0
false_count=0
true_count=0
for sentence, labels in zip(sentences, predicted_labels):
    IsEqual=np.array_equal(labels, true_labels[i])
    if(IsEqual==False):
        false_count+=1
    else:
        true_count+=1
    i+=1
accuracy= true_count/(true_count+false_count)
print("Sentence Accuracy",accuracy*100)

Sentence Accuracy 35.68129330254042
