In [63]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.utils import to_categorical

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding,LSTM,Dense,Input
from nltk.corpus import stopwords

pd.set_option("display.max_columns", None)
np.random.seed(42)
tf.random.set_seed(42)

df_train = pd.read_json("dataset/PIZZA_train.json", lines=True,)
df_train = df_train.sample(100000)

In [64]:
# Tokenize input
input_texts = df_train['train.SRC']
output_texts = df_train['train.EXR']

input_tokenizer = Tokenizer()
input_tokenizer.fit_on_texts(input_texts)
input_sequences = input_tokenizer.texts_to_sequences(input_texts)
input_vocab_size = len(input_tokenizer.word_index) + 1

# Pad input sequences
max_input_length = max(len(seq) for seq in input_sequences)
input_sequences_padded = pad_sequences(input_sequences, maxlen=max_input_length, padding='post')

output_tokenizer = Tokenizer(filters="")
output_tokenizer.fit_on_texts(output_texts)
output_sequences = output_tokenizer.texts_to_sequences(output_texts)
output_vocab_size = len(output_tokenizer.word_index) + 1

# Pad output sequences
max_output_length = max(len(seq) for seq in output_sequences)
output_sequences_padded = pad_sequences(output_sequences, maxlen=max_output_length, padding='post')

# Ensure both input and output sequences have the same length
max_length = max(max_input_length, max_output_length)
input_sequences_padded = pad_sequences(input_sequences, maxlen=max_length, padding='post')
output_sequences_padded = pad_sequences(output_sequences, maxlen=max_length, padding='post')

# Encoder
encoder_input = Input(shape=(max_length,))
encoder_embedding = Embedding(input_dim=input_vocab_size, output_dim=256)(encoder_input)
encoder_lstm, state_h, state_c = LSTM(128, return_state=True)(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_input = Input(shape=(max_length-1,))
decoder_embedding = Embedding(input_dim=output_vocab_size, output_dim=256)(decoder_input)
decoder_lstm = LSTM(128, return_sequences=True, return_state=True)
decoder_output, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(output_vocab_size, activation='softmax')
decoder_output = decoder_dense(decoder_output)

In [65]:
rare_tokens = sorted(output_tokenizer.word_counts, key=output_tokenizer.word_counts.get)[:30]
rare_tokens

['mexican',
 'supreme',
 'hawaiian',
 'vegan',
 'mediterranean',
 'coffee',
 'margherita',
 'chicago_style',
 'cauliflower_crust',
 'combination',
 'sourdough_crust',
 'all_vegetables',
 'cherry_coke',
 'vegetarian',
 'keto_crust',
 'neapolitan',
 'deep_dish',
 'cheese_lover',
 'gluten_free_crust',
 'cherry_pepsi',
 'new_york_style',
 'all_toppings',
 'olive_oil',
 'parsley',
 'italian_sausage',
 'roasted_garlic',
 'rosemary',
 'salami',
 'spicy_red_sauce',
 'lettuce']

In [66]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy

class WeightedLoss(SparseCategoricalCrossentropy):
    def __init__(self, class_weights, **kwargs):
        super().__init__(**kwargs)
        self.class_weights = tf.convert_to_tensor(class_weights, dtype=tf.float32)  # Ensure weights are float32

    def __call__(self, y_true, y_pred, sample_weight=None):
        # Gather the weights for the true labels
        weights = tf.gather(self.class_weights, tf.cast(y_true, tf.int32))
        weights = tf.cast(weights, dtype=tf.float32)  # Ensure weights are float32

        # Calculate the base loss
        loss = super().__call__(y_true, y_pred)

        # Multiply the loss by the weights
        weighted_loss = loss * weights

        # If sample_weight is provided, apply it
        if sample_weight is not None:
            weighted_loss *= tf.cast(sample_weight, dtype=tf.float32)

        return weighted_loss



# Assign higher weights to rare tokens
class_weights = np.ones(output_vocab_size)
for word, index in output_tokenizer.word_index.items():
    if word in rare_tokens:
        class_weights[index] = 2.0 + 0.2*index  # Higher weight for rare tokens

loss = WeightedLoss(class_weights=class_weights)


In [67]:
model = Model([encoder_input, decoder_input], decoder_output)
model.compile(optimizer=Adam(learning_rate=10**-4), loss='sparse_categorical_crossentropy', metrics=['accuracy',])

# Correctly shape the decoder input and output data
decoder_input_data = output_sequences_padded[:, :-1]  # Remove the last token
decoder_output_data = output_sequences_padded[:, 1:]  # Remove the first token


model.fit(
    [input_sequences_padded, decoder_input_data],
    decoder_output_data,
    batch_size=16,
    epochs=5,
    # validation_split=0.2,
    shuffle=True,
)


Epoch 1/5
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m272s[0m 43ms/step - accuracy: 0.7990 - loss: 1.0045
Epoch 2/5
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m268s[0m 43ms/step - accuracy: 0.9261 - loss: 0.2573
Epoch 3/5
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m294s[0m 47ms/step - accuracy: 0.9460 - loss: 0.1914
Epoch 4/5
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m288s[0m 46ms/step - accuracy: 0.9586 - loss: 0.1524
Epoch 5/5
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m269s[0m 43ms/step - accuracy: 0.9655 - loss: 0.1274


<keras.src.callbacks.history.History at 0x130f3429150>

In [77]:
df_dev = pd.read_json("dataset/PIZZA_dev.json", lines=True)

In [75]:
# df_dev = pd.read_json("dataset/PIZZA_dev.json", lines=True)

# Encoder Model
encoder_model = Model(encoder_input, encoder_states)

# Decoder Model
decoder_state_input_h = Input(shape=(128,))
decoder_state_input_c = Input(shape=(128,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_lstm_output, state_h, state_c = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs
)
decoder_states = [state_h, state_c]
decoder_output = decoder_dense(decoder_lstm_output)

decoder_model = Model(
    [decoder_input] + decoder_states_inputs,
    [decoder_output] + decoder_states
)

def beam_search_decode(input_seq, beam_width=3):
    # Ensure the input sequence is padded and has the correct shape
    input_seq = pad_sequences(input_seq, maxlen=max_length, padding='post')  # Pad to max_length
    input_seq = np.array(input_seq)  # Convert to NumPy array

    # Pass the processed input sequence to the encoder
    states_value = encoder_model.predict(input_seq)

    # Beam search initialization
    sequences = [([output_tokenizer.word_index["(order"]], 0.0, states_value)]
    max_steps = max_output_length

    # Beam search decoding loop
    for _ in range(max_steps):
        all_candidates = []
        for seq, score, states in sequences:
            target_seq = np.array(seq[-1]).reshape(1, 1)  # Last token in the sequence
            output_tokens, h, c = decoder_model.predict([target_seq] + states)
            states = [h, c]

            # Expand to all possible next tokens
            for i in range(len(output_tokenizer.word_index)):
                candidate = (seq + [i], score - np.log(output_tokens[0, 0, i]), states)
                all_candidates.append(candidate)

        # Order all candidates by score
        ordered = sorted(all_candidates, key=lambda x: x[1])
        # Select the top `beam_width` sequences
        sequences = ordered[:beam_width]

        # Stop if all sequences in the beam encounter a 0 word index
        if all(seq[-1] == 0 for seq, _, _ in sequences):
            break

    # Return the best sequence (lowest score)
    best_sequence = sequences[0][0]
    decoded_sentence = " ".join(output_tokenizer.index_word[i] for i in best_sequence if i != 0)
    return decoded_sentence


test_input_sequence = input_tokenizer.texts_to_sequences(['i wish to have two pizzas'])
output = beam_search_decode(test_input_sequence)
print(output)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 168ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 184ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 

In [76]:
from collections import defaultdict


def parse_sequence(sequence):
    """
    Parses a sequence into a nested structure for comparison.

    Args:
        sequence (str): Input sequence to parse.

    Returns:
        dict: Parsed structure with keys as parent elements and lists of children.
    """
    stack = []
    structure = defaultdict(list)
    tokens = re.findall(r"\(|\)|\w+", sequence)

    current_parent = None
    for token in tokens:
        if token == "(":
            stack.append(current_parent)
        elif token == ")":
            if stack:  # Check if stack is not empty
                stack.pop()
            else:
                raise ValueError("Unmatched closing parenthesis in sequence.")
        else:
            if stack:  # Only add to structure if there's a valid parent
                parent = stack[-1]
                if parent:
                    structure[parent].append(token)
            current_parent = token if token not in structure else None

    if stack:  # Check if there are unmatched opening parentheses
        return defaultdict(list)

    return structure


def compare_sequences_with_modulo(gt_structure, pred_structure):
    for key in gt_structure:
        if key not in pred_structure:
            return False
        if sorted(gt_structure[key]) != sorted(pred_structure[key]):
            return False
    return True


def evaluate_with_em(predictions, ground_truths):
    total = len(predictions)
    matches = 0

    for pred, gt in zip(predictions, ground_truths):
        pred_structure = parse_sequence(pred)
        gt_structure = parse_sequence(gt)
        if compare_sequences_with_modulo(gt_structure, pred_structure):
            matches += 1

    return matches / total if total > 0 else 0.0

In [78]:
predictions = []

for i in range(100):
    test_input_sequence = input_tokenizer.texts_to_sequences([df_dev["dev.SRC"].loc[i]])
    output = beam_search_decode(test_input_sequence)[:-1]
    predictions.append(output)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44

In [79]:
ground_truths = []
for i in range(100):
    ground_truths.append(df_dev["dev.EXR"].loc[i].lower())

In [80]:
# Evaluate EM with Modulo Sibling Order
accuracy = evaluate_with_em(predictions, ground_truths)
print(f"Exact Match Accuracy: {accuracy * 100:.2f}%")

Exact Match Accuracy: 1.00%
