In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os

from utils import *
from gensim.models import KeyedVectors

# Load the Word2Vec model (binary format).
# The file location is machine-specific, so it can be overridden through the
# WORD2VEC_PATH environment variable. The original local path is kept as the
# fallback so the existing setup keeps working without any configuration.
model_path = os.environ.get(
    "WORD2VEC_PATH",
    "C:\\Users\\Marwan\\Desktop\\College\\5_1\\NLP\\Food-Order-Parsing\\word2vec\\GoogleNews-vectors-negative300.bin",
)
word2vec = KeyedVectors.load_word2vec_format(model_path, binary=True)

In [None]:
import gensim.downloader as api

# Download the model (if not already cached) and load it.
# Skipped when an earlier cell has already bound `word2vec`, so a top-to-bottom
# run does not load a second multi-gigabyte copy and overwrite the first one.
# NOTE(review): assumes the local .bin file and 'word2vec-google-news-300'
# are the same pretrained vectors - confirm before relying on either loader.
if "word2vec" not in globals():
    word2vec = api.load('word2vec-google-news-300')

In [None]:
from utils import *
def path(x):
    """Build the relative path of the LSTM dataset JSON file for split name `x`."""
    return f"datasets/LSTM_{x}.json"


# Each loader call yields a pair (inputs, label sequences) for one split.
X_train, y_train = load_lstm_dataset_from_json(path("train_12000"))
X_dev, y_dev = load_lstm_dataset_from_json(path("dev"))

# Quick sanity check on how many examples were loaded per split.
print("Train size:", len(X_train))
print("Dev size:", len(X_dev))

In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, TimeDistributed, Masking
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import KeyedVectors
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split


# Length of each word vector. It is passed to sentence_to_embeddings (size of
# the zero vector used for out-of-vocabulary words) and is the per-timestep
# feature size of the model input.
embedding_dim = 300


# Function to get Word2Vec vectors for a sentence
def sentence_to_embeddings(sentence, word2vec, embedding_dim):
    """
    Convert a whitespace-tokenised sentence into a matrix of word vectors.

    Args:
        sentence: The input text; it is split on whitespace into words.
        word2vec: A mapping-like object supporting `word in word2vec` and
            `word2vec[word]` (e.g. gensim KeyedVectors).
        embedding_dim: The length of each word vector.

    Returns:
        A float32 array of shape (number_of_words, embedding_dim). Words
        missing from `word2vec` are represented by an all-zero row. An empty
        sentence yields an array of shape (0, embedding_dim).
    """
    words = sentence.split()

    # Preallocating keeps dtype and rank stable: the result is always a 2-D
    # float32 matrix, even for an empty sentence or when OOV words are present.
    embeddings = np.zeros((len(words), embedding_dim), dtype=np.float32)
    for position, word in enumerate(words):
        if word in word2vec:
            embeddings[position] = word2vec[word]
        # OOV words keep their zero vector
    return embeddings

# Turn every sentence of each split into its matrix of Word2Vec vectors.
# The generator is consumed by the tuple unpacking: train first, then dev.
X_train_embeddings, X_dev_embeddings = (
    [sentence_to_embeddings(text, word2vec, embedding_dim) for text in split]
    for split in (X_train, X_dev)
)

In [None]:
def path(x):
    """Build the relative path of the LSTM dataset JSON file for split name `x`."""
    return f"datasets/LSTM_{x}.json"


# NOTE(review): this rebinds X_train / y_train from the "train" file (limited
# with size=12000). Embeddings built earlier from a different load would then
# be paired with these labels - confirm both loads return the same examples
# in the same order, or rebuild the embeddings after running this cell.
X_train, y_train = load_lstm_dataset_from_json(path("train"), size=12000)
X_dev, y_dev = load_lstm_dataset_from_json(path("dev"))

# Quick sanity check on how many examples were loaded per split.
print("Train size:", len(X_train))
print("Dev size:", len(X_dev))

In [None]:
# Pad the sequences
# Every sentence is right-padded with zero vectors up to the longest sentence
# found in either split.
max_len = max(
    max(len(seq) for seq in X_train_embeddings),
    max(len(seq) for seq in X_dev_embeddings)
)
X_train_padded = pad_sequences(X_train_embeddings, maxlen=max_len, dtype='float32', padding='post')
X_dev_padded = pad_sequences(X_dev_embeddings, maxlen=max_len, dtype='float32', padding='post')

# Pad and one-hot encode the labels
PAD_LABEL = -1  # marks padded timesteps in the label sequences
y_train_padded = pad_sequences(y_train, maxlen=max_len, padding='post', value=PAD_LABEL)
y_dev_padded = pad_sequences(y_dev, maxlen=max_len, padding='post', value=PAD_LABEL)

# Label ids are used as column indices of the one-hot vectors, so the number
# of classes is the largest id + 1 (equal to the count of distinct labels
# whenever the ids are the contiguous range 0..K-1).
num_classes = int(max(label for seq in y_train + y_dev for label in seq)) + 1


def one_hot_encode_padded(label_ids, num_classes, pad_label=PAD_LABEL):
    """
    One-hot encode a padded 2-D array of label ids.

    `to_categorical` cannot be used here: it indexes with the raw label, so
    the padding value -1 would select the LAST class and every padded
    timestep would be trained as if it carried that label. Padded positions
    instead get an all-zero row, which contributes zero to the
    categorical cross-entropy loss.

    Args:
        label_ids: Integer array of shape (num_sequences, max_len).
        num_classes: The number of real classes (width of each one-hot row).
        pad_label: The value marking padded positions in `label_ids`.

    Returns:
        A float32 array of shape (num_sequences, max_len, num_classes).
    """
    label_ids = np.asarray(label_ids)
    one_hot = np.zeros(label_ids.shape + (num_classes,), dtype='float32')
    is_real = label_ids != pad_label
    # Boolean mask picks the (sequence, timestep) pairs; the label id picks the class column.
    one_hot[is_real, label_ids[is_real]] = 1.0
    return one_hot


# NOTE: the 'accuracy' metric still counts padded timesteps (the argmax of an
# all-zero target row is class 0), so use the per-sentence evaluation for
# reporting rather than the Keras accuracy figure.
y_train_one_hot = one_hot_encode_padded(y_train_padded, num_classes)
y_dev_one_hot = one_hot_encode_padded(y_dev_padded, num_classes)

In [None]:
import tensorflow as tf

# TensorFlow 2 already executes eagerly by default. `run_functions_eagerly(True)`
# goes further and disables graph compilation of the Keras train/predict
# functions - a debugging aid that makes training much slower. It is therefore
# off by default; flip the flag only while stepping through the model.
RUN_EAGERLY_FOR_DEBUGGING = False
tf.config.run_functions_eagerly(RUN_EAGERLY_FOR_DEBUGGING)

# Build the model
model = Sequential([
    # Declaring the input up front builds the model immediately, so
    # `model.summary()` below can report output shapes and parameter counts.
    tf.keras.Input(shape=(max_len, embedding_dim)),
    Bidirectional(LSTM(64, return_sequences=True)),  # BiLSTM layer
    TimeDistributed(Dense(num_classes, activation='softmax'))  # Output layer
])

# NOTE(review): no Masking layer is used, so zero-padded timesteps are still
# fed through the BiLSTM. A Masking(mask_value=0.0) layer would also mask
# out-of-vocabulary words (they are zero vectors too) - decide deliberately.

# Compile the model
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()

In [None]:
# Train the model
# Hyperparameters are named here so they are easy to find and tune.
EPOCHS = 10
BATCH_SIZE = 32
dev_split = (X_dev_padded, y_dev_one_hot)  # evaluated at the end of every epoch

model.fit(
    x=X_train_padded,
    y=y_train_one_hot,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=dev_split,
)

In [None]:
from tensorflow import keras

# Load a previously saved model from disk.
# NOTE(review): no visible cell writes 'my_lstm_model.keras' - confirm the file
# exists (e.g. saved in an earlier session) before running this cell.
loaded_model = keras.models.load_model('my_lstm_model.keras')

# Assuming 'new_sentence' is your input sentence
# NOTE(review): `new_sentence` is not defined in any visible cell; it must be
# set beforehand (presumably a whitespace-separated string, since it is passed
# to sentence_to_embeddings) or this line raises NameError.
new_sentence_embeddings = sentence_to_embeddings(new_sentence, word2vec, embedding_dim)
# Pad to the same max_len used for the training data, wrapped in a list to form a batch of one.
new_sentence_padded = pad_sequences([new_sentence_embeddings], maxlen=max_len, dtype='float32', padding='post')

# argmax over the last axis picks the highest-scoring class per timestep;
# the result covers all max_len positions, including the padded ones.
predictions = loaded_model.predict(new_sentence_padded)
predicted_labels = np.argmax(predictions, axis=-1)

In [None]:
import numpy as np

def calculate_accuracy_with_tolerance(model, X_dev, y_dev, word2vec, embedding_dim, max_len, tolerance=2, batch_size=256):
    """
    Calculates sentence-level accuracy with a tolerance for incorrect elements.

    A sentence counts as correct when at most `tolerance` of its labels are
    predicted wrongly. Sentences are embedded, padded and predicted in chunks
    of `batch_size` rather than one `model.predict` call per sentence.

    Args:
        model: The trained LSTM model.
        X_dev: The original unpadded input data for the development set.
        y_dev: The true labels for the development set.
        word2vec: The Word2Vec model used for embeddings.
        embedding_dim: The embedding dimension.
        max_len: The maximum sequence length used for padding during training.
        tolerance: The maximum number of incorrect elements allowed for a sentence to be considered correct.
        batch_size: How many sentences are embedded and predicted together.

    Returns:
        The accuracy score (fraction of sentences within the tolerance), or
        0.0 when `X_dev` is empty.

    Raises:
        ValueError: If `batch_size` is smaller than 1, or if `y_dev` has fewer
            entries than `X_dev`.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    total_predictions = len(X_dev)
    if len(y_dev) < total_predictions:
        raise ValueError("y_dev must provide a label sequence for every sentence in X_dev")
    if total_predictions == 0:
        # Nothing to evaluate; avoid dividing by zero.
        return 0.0

    correct_predictions = 0

    for start in range(0, total_predictions, batch_size):
        stop = min(start + batch_size, total_predictions)
        sentences = X_dev[start:stop]
        label_sequences = y_dev[start:stop]

        # Get embeddings for every sentence of the chunk
        chunk_embeddings = [
            sentence_to_embeddings(sentence, word2vec, embedding_dim)
            for sentence in sentences
        ]

        # Pad the sentence embeddings to match the model's input shape
        chunk_padded = pad_sequences(chunk_embeddings, maxlen=max_len, dtype='float32', padding='post')

        # One predict call per chunk; verbose=0 avoids a progress bar per call
        chunk_probabilities = model.predict(chunk_padded, verbose=0)
        chunk_predicted = np.argmax(chunk_probabilities, axis=-1)

        for predicted_labels, true_labels in zip(chunk_predicted, label_sequences):
            true_labels = np.asarray(true_labels)

            # Compare only positions that exist on both sides; labels beyond
            # the model's output length cannot be predicted and count as errors.
            compared = min(len(true_labels), len(predicted_labels))
            num_incorrect = int(np.sum(predicted_labels[:compared] != true_labels[:compared]))
            num_incorrect += len(true_labels) - compared

            # Check if the number of incorrect elements is within the tolerance
            if num_incorrect <= tolerance:
                correct_predictions += 1

    return correct_predictions / total_predictions

# Example usage with tolerance=2:
# The tolerance is named once so the reported message always matches the
# value actually used for the evaluation.
TOLERANCE = 2
accuracy = calculate_accuracy_with_tolerance(model, X_dev, y_dev, word2vec, embedding_dim, max_len, tolerance=TOLERANCE)
print(f"Accuracy with tolerance {TOLERANCE}: {accuracy}")