In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Layer, Embedding, LSTM, Dense, Input, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
import re

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
np.random.seed(42)
tf.random.set_seed(42)

In [2]:
main_train = pd.read_json("dataset/PIZZA_train.json", lines=True,)
df_dev = pd.read_json("dataset/PIZZA_dev.json", lines=True,)

In [3]:
df_train = main_train.sample(10000)

In [4]:
# Data Preparation
x_train = df_train["train.SRC"]
y_train = df_train["train.EXR"]

x_dev = df_dev["dev.SRC"]
y_dev = df_dev["dev.EXR"]

In [5]:
def clean_text(text):
    text = re.sub(r"[^\w']", " ", text)  # Remove non-word characters
    text = re.sub(r"\s+", " ", text)  # Remove multiple spaces
    text = text.lower().strip()  # Lowercase and strip whitespace
    return text

In [6]:
lemmatizer = WordNetLemmatizer()

def lemma(text):
    text = text.split()
    text = [lemmatizer.lemmatize(word) for word in text]
    return " ".join(text)

In [25]:
class Encoder(Layer):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.lstm = LSTM(hidden_dim, return_sequences=True, return_state=True)

    def call(self, inputs):
        embedded = self.embedding(inputs)
        outputs, state_h, state_c = self.lstm(embedded)
        return outputs, [state_h, state_c]

class Attention(Layer):
    def __init__(self, units):
        super().__init__()
        self.alignment_layer = Dense(units, activation='tanh')
        self.score_layer = Dense(1)

    def call(self, query, encoder_outputs, mask=None):
        # Expand query to match encoder_outputs dimensions
        query = tf.expand_dims(query, 1)  # Shape: (batch_size, 1, hidden_dim)
        
        # Repeat query to match encoder_outputs sequence length
        query = tf.repeat(query, encoder_outputs.shape[1], axis=1)
        
        # Concatenate query and encoder_outputs
        concatenated = tf.concat([query, encoder_outputs], axis=-1)
        
        # Compute alignment scores
        score = self.score_layer(self.alignment_layer(concatenated))
        score = tf.squeeze(score, axis=-1)
        
        # Apply softmax to get attention weights
        if mask is not None:
            score = tf.where(mask, score, float('-inf'))
        attention_weights = tf.nn.softmax(score, axis=1)
        
        # Compute context vector
        context_vector = tf.reduce_sum(
            attention_weights[..., tf.newaxis] * encoder_outputs, 
            axis=1
        )
        
        return context_vector, attention_weights

class PointerGeneratorNetwork(Model):
    def __init__(self, 
                 src_vocab_size, 
                 tgt_vocab_size, 
                 embedding_dim=256, 
                 hidden_dim=512, 
                 max_length=100):
        super().__init__()
        
        # Encoder
        self.encoder = Encoder(src_vocab_size, embedding_dim, hidden_dim)
        
        # Embedding for decoder
        self.embedding = Embedding(tgt_vocab_size, embedding_dim)
        
        # Attention Mechanism
        self.attention = Attention(hidden_dim)
        
        # Decoder LSTM
        self.decoder_lstm = LSTM(hidden_dim, return_sequences=True, return_state=True)
        
        # Output Layer
        self.output_layer = Dense(tgt_vocab_size, activation='softmax')
        
        # Hyperparameters
        self.hidden_dim = hidden_dim
        self.max_length = max_length
        self.tgt_vocab_size = tgt_vocab_size

    def call(self, inputs, training=None):
        # Split inputs into source and target
        source_input = inputs[:, :inputs.shape[1]//2]
        
        # Encoder processing
        encoder_outputs, encoder_states = self.encoder(source_input)
        
        # Initial decoder state
        decoder_states = encoder_states
        
        # Placeholder for decoder outputs
        decoder_outputs = []
        
        # Decoder input (use target part of input for teacher forcing)
        decoder_input = inputs[:, inputs.shape[1]//2:]
        
        for t in range(self.max_length):
            # Select current timestep input
            current_input = decoder_input[:, t:t+1] if t < decoder_input.shape[1] else tf.fill([tf.shape(inputs)[0], 1], 0)
            
            # Embedding decoder input
            decoder_embedded = self.embedding(current_input)
            
            # Attention
            context_vector, _ = self.attention(
                decoder_states[0], encoder_outputs
            )
            
            # Concatenate embedded input, context vector
            lstm_input = tf.concat([decoder_embedded, tf.expand_dims(context_vector, 1)], axis=-1)
            
            # Decoder LSTM
            decoder_output, state_h, state_c = self.decoder_lstm(
                lstm_input, initial_state=decoder_states
            )
            
            # Output layer
            output = self.output_layer(decoder_output)  # Remove this if necessary
            # output = tf.squeeze(output, axis=-2)  # Remove the extra dimension

            
            decoder_outputs.append(output)
            
            # Update decoder states
            decoder_states = [state_h, state_c]
        
        # Stack decoder outputs
        decoder_outputs = tf.stack(decoder_outputs, axis=1)
        
        return decoder_outputs

    def train_step(self, data):
        x, y = data
        
        with tf.GradientTape() as tape:
            # Forward pass
            y_pred = self(x, training=True)
            
            # Reshape y to match model output
            y_reshaped = y[:, :y_pred.shape[1]]
            y_reshaped = tf.one_hot(y_reshaped, depth=self.tgt_vocab_size)
            
            
            # Compute loss
            loss = tf.keras.losses.categorical_crossentropy(y_reshaped, y)
            loss = tf.reduce_mean(loss)
        
        # Compute gradients
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        
        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        
        return {"loss": loss}

# Tokenizers and data preparation
class StructuredOutputTokenizer:
    def __init__(self):
        self.word_to_index = {}
        self.index_to_word = {}
        self.next_index = 1  # Start from 1 as 0 is typically reserved for padding

    def fit_on_texts(self, texts):
        # Tokenize and create vocabulary
        for text in texts:
            tokens = text.split()
            for token in tokens:
                if token not in self.word_to_index:
                    self.word_to_index[token] = self.next_index
                    self.index_to_word[self.next_index] = token
                    self.next_index += 1
        return self

    def texts_to_sequences(self, texts):
        sequences = []
        for text in texts:
            sequence = [self.word_to_index.get(token, 0) for token in text.split()]
            sequences.append(sequence)
        return sequences

# Prepare data
input_tokenizer = Tokenizer(oov_token='UNK')
output_tokenizer = StructuredOutputTokenizer()

# Prepare data
X_train = [
    "i'd like three large pies with pestos and yellow peppers",
]
y_train = [
    "(ORDER (PIZZAORDER (NUMBER 3 ) (SIZE LARGE ) (TOPPING PESTO ) (TOPPING YELLOW_PEPPERS ) ) )",
]

# Fit tokenizers
input_tokenizer.fit_on_texts(X_train)
output_tokenizer.fit_on_texts(y_train)

# Convert to sequences
X_train_seq = input_tokenizer.texts_to_sequences(X_train)
y_train_seq = output_tokenizer.texts_to_sequences(y_train)


# Get vocabulary sizes
input_vocab_size = len(input_tokenizer.word_index) + 1
output_vocab_size = output_tokenizer.next_index

max_length = max(len(seq) for seq in X_train_seq)
max_length = max(max_length, max(len(seq) for seq in y_train_seq))

# Pad sequences
X_train_padded = pad_sequences(X_train_seq,max_length, padding='post')
y_train_padded = pad_sequences(y_train_seq, max_length,padding='post')

# Combine input and output sequences
# X_train_combined = np.concatenate([X_train_padded, y_train_padded], axis=1)

# Create model
model = PointerGeneratorNetwork(
    src_vocab_size=input_vocab_size,
    tgt_vocab_size=output_vocab_size
)

# Compile the model
model.compile(
    optimizer='adam',
    loss=None  # We're using custom train_step
)


# Prepare target data for training
# Note: We need to reshape the target to match the model's output shape
y_train_sparse = np.expand_dims(y_train_padded, axis=-1)

# Train
model.fit(
    X_train_padded, 
    y_train_sparse, 
    epochs=5, 
    batch_size=1  # Use batch size 1 for this small example
)

Epoch 1/5


TypeError: in user code:

    File "c:\Users\Yara\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1377, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\Yara\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1360, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\Yara\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1349, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\Yara\AppData\Local\Temp\ipykernel_14168\1359058002.py", line 138, in train_step
        loss = tf.keras.losses.categorical_crossentropy(y_reshaped, y)
    File "c:\Users\Yara\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\losses.py", line 2199, in categorical_crossentropy
        label_smoothing = tf.convert_to_tensor(label_smoothing, dtype=y_pred.dtype)

    TypeError: Expected int32, but got 0.0 of type 'float'.


In [26]:
class Encoder(Layer):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, **kwargs):
        super(Encoder, self).__init__(**kwargs)
        self.embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)
        self.lstm = LSTM(hidden_dim, return_sequences=True, return_state=True)
    
    def call(self, inputs):
        # Embedding
        embedded = self.embedding(inputs)
        
        # LSTM Encoding
        encoded_sequence, state_h, state_c = self.lstm(embedded)
        
        return encoded_sequence, [state_h, state_c]

class Attention(Layer):
    def __init__(self, units):
        super().__init__()
        self.alignment_layer = Dense(units, activation='tanh')
        self.score_layer = Dense(1)

    def call(self, query, encoder_outputs, mask=None):
        # Expand query to match encoder_outputs dimensions
        query = tf.expand_dims(query, 1)  # Shape: (batch_size, 1, hidden_dim)
        
        # Repeat query to match encoder_outputs sequence length
        query = tf.repeat(query, encoder_outputs.shape[1], axis=1)
        
        # Concatenate query and encoder_outputs
        concatenated = tf.concat([query, encoder_outputs], axis=-1)
        
        # Compute alignment scores
        score = self.score_layer(self.alignment_layer(concatenated))
        score = tf.squeeze(score, axis=-1)
        
        # Apply softmax to get attention weights
        if mask is not None:
            score = tf.where(mask, score, float('-inf'))
        attention_weights = tf.nn.softmax(score, axis=1)
        
        # Compute context vector
        context_vector = tf.reduce_sum(
            attention_weights[..., tf.newaxis] * encoder_outputs, 
            axis=1
        )
        
        return context_vector, attention_weights

class PointerGeneratorNetwork(Model):
    def __init__(self, 
                 src_vocab_size, 
                 tgt_vocab_size, 
                 embedding_dim=256, 
                 hidden_dim=512, 
                 max_length=100):
        super().__init__()
        
        # Encoder
        self.encoder = Encoder(src_vocab_size, embedding_dim, hidden_dim)
        
        # Embedding for decoder
        self.embedding = Embedding(tgt_vocab_size, embedding_dim)
        
        # Attention Mechanism
        self.attention = Attention(hidden_dim)
        
        # Decoder LSTM
        self.decoder_lstm = LSTM(hidden_dim, return_sequences=True, return_state=True)
        
        # Generation Probability Layer
        self.generation_prob = Dense(1, activation='sigmoid')
        
        # Output Layer
        self.output_layer = Dense(tgt_vocab_size, activation='softmax')
        
        # Hyperparameters
        self.hidden_dim = hidden_dim
        self.max_length = max_length
        self.tgt_vocab_size = tgt_vocab_size

    def call(self, inputs, training=None):
        # Encoder processing
        encoder_outputs, encoder_states = self.encoder(inputs)
        
        # Initial decoder state
        decoder_states = encoder_states
        
        # Placeholder for decoder outputs
        decoder_outputs = []
        
        # Initial decoder input (start token)
        decoder_input = tf.fill([tf.shape(inputs)[0], 1], 1)  # Assuming 1 is start token
        
        for t in range(1, self.max_length):
            # Embedding decoder input
            decoder_embedded = self.embedding(decoder_input)
            
            # Attention
            context_vector, attention_weights = self.attention(
                decoder_states[0], encoder_outputs
            )
            
            # Concatenate embedded input, context vector
            lstm_input = tf.concat([decoder_embedded, tf.expand_dims(context_vector, 1)], axis=-1)
            
            # Decoder LSTM
            decoder_output, state_h, state_c = self.decoder_lstm(
                lstm_input, initial_state=decoder_states
            )
            
            # Generation probability
            p_gen = self.generation_prob(decoder_output)
            
            # Output layer
            output = self.output_layer(decoder_output)
            
            decoder_outputs.append(output)
            
            # Update decoder input and states for next timestep
            decoder_input = tf.argmax(output, axis=-1)
            decoder_states = [state_h, state_c]
        
        # Stack decoder outputs
        decoder_outputs = tf.stack(decoder_outputs, axis=1)
        
        return decoder_outputs

    
    def train_step(self, data):
        # Unpack the data
        x, y = data
        
        with tf.GradientTape() as tape:
            # Forward pass
            y_pred = self(x, training=True)
            
            # Compute loss
            loss = self.compiled_loss(y, y_pred, regularization_losses=self.losses)
        
        # Compute gradients
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        
        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        
        # Update metrics
        self.compiled_metrics.update_state(y, y_pred)
        
        return {m.name: m.result() for m in self.metrics}

def create_pointer_generator(src_vocab_size, tgt_vocab_size):
    model = PointerGeneratorNetwork(
        src_vocab_size=src_vocab_size, 
        tgt_vocab_size=tgt_vocab_size
    )
    
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    
    return model

# Example usage
# src_vocab_size = ...
# tgt_vocab_size = ...
# model = create_pointer_generator(src_vocab_size, tgt_vocab_size)
# model.fit(x_train, y_train, epochs=10, batch_size=32)

In [28]:
class StructuredOutputTokenizer:
    def __init__(self):
        self.token_to_index = {
            'PAD': 0,
            'UNK': 1,
            'START': 2,
            'END': 3
        }
        self.index_to_token = {v: k for k, v in self.token_to_index.items()}
        self.next_index = 4

    def _tokenize_structured_output(self, output):
        # Tokenize the structured output
        tokens = []
        current_token = ''
        for char in output:
            if char in ['(', ')', ' ']:
                if current_token:
                    # Add token if it exists
                    if current_token not in self.token_to_index:
                        self.token_to_index[current_token] = self.next_index
                        self.index_to_token[self.next_index] = current_token
                        self.next_index += 1
                    tokens.append(current_token)
                    current_token = ''
                
                if char in ['(', ')']:
                    tokens.append(char)
            else:
                current_token += char
        
        return tokens

    def fit_on_texts(self, outputs):
        for output in outputs:
            self._tokenize_structured_output(output)
        return self

    def texts_to_sequences(self, outputs):
        sequences = []
        for output in outputs:
            tokens = self._tokenize_structured_output(output)
            sequence = [self.token_to_index.get(token, self.token_to_index['UNK']) for token in tokens]
            sequences.append(sequence)
        return sequences

    def sequences_to_texts(self, sequences):
        return [[self.index_to_token.get(idx, 'UNK') for idx in seq] for seq in sequences]

# Example usage
input_tokenizer = Tokenizer(oov_token='UNK')
output_tokenizer = StructuredOutputTokenizer()

# Prepare data
X_train = [
    "i'd like three large pies with pestos and yellow peppers",
]

y_train = [
    "(ORDER (PIZZAORDER (NUMBER 3 ) (SIZE LARGE ) (TOPPING PESTO ) (TOPPING YELLOW_PEPPERS ) ) )",
]

# # Prepare data
# input_tokenizer = Tokenizer(oov_token='UNK')
# output_tokenizer = StructuredOutputTokenizer()

# # Prepare data
# X_train = [
#     "i'd like three large pies with pestos and yellow peppers",
# ]
# y_train = [
#     "(ORDER (PIZZAORDER (NUMBER 3 ) (SIZE LARGE ) (TOPPING PESTO ) (TOPPING YELLOW_PEPPERS ) ) )",
# ]

# Fit tokenizers
input_tokenizer.fit_on_texts(X_train)
output_tokenizer.fit_on_texts(y_train)

# Convert to sequences
X_train_seq = input_tokenizer.texts_to_sequences(X_train)
y_train_seq = output_tokenizer.texts_to_sequences(y_train)

# Pad sequences
X_train_padded = pad_sequences(X_train_seq, padding='post', maxlen=30)
y_train_padded = pad_sequences(y_train_seq, padding='post',maxlen=30)

# Get vocabulary sizes
input_vocab_size = len(input_tokenizer.word_index) + 1
output_vocab_size = output_tokenizer.next_index




In [29]:

# Modify data preparation
# Combine source and target sequences
# X_train_combined = np.concatenate([X_train_padded, y_train_padded], axis=1)

# Create model
model = create_pointer_generator(
    src_vocab_size=input_vocab_size, 
    tgt_vocab_size=output_vocab_size
)



# Prepare target data for training
# Note: We need to reshape the target to match the model's output shape
y_train_sparse = np.expand_dims(y_train_padded, axis=-1)

# Train
model.fit(
    X_train_padded, 
    y_train_sparse, 
    epochs=5, 
    batch_size=1  # Use batch size 1 for this small example
)

Epoch 1/5


ValueError: in user code:

    File "c:\Users\Yara\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1377, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\Yara\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1360, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\Yara\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1349, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\Yara\AppData\Local\Temp\ipykernel_14168\3865676413.py", line 138, in train_step
        loss = self.compiled_loss(y, y_pred, regularization_losses=self.losses)
    File "c:\Users\Yara\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\compile_utils.py", line 277, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "c:\Users\Yara\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\losses.py", line 143, in __call__
        losses = call_fn(y_true, y_pred)
    File "c:\Users\Yara\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\losses.py", line 270, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "c:\Users\Yara\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\losses.py", line 2221, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "c:\Users\Yara\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\backend.py", line 5575, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (1, 30, 1) and (1, 99, 1, 4) are incompatible


In [9]:
# Create model
model = create_pointer_generator(
    src_vocab_size=input_vocab_size, 
    tgt_vocab_size=output_vocab_size
)

# Train
model.fit(X_train_padded, y_train_padded, epochs=5, batch_size=32)

Epoch 1/5


OperatorNotAllowedInGraphError: in user code:

    File "c:\Users\Yara\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1377, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\Yara\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1360, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\Yara\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1349, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\Yara\AppData\Local\Temp\ipykernel_19860\1901776023.py", line 131, in train_step
        y_pred = self(x, training=True)
    File "c:\Users\Yara\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None

    OperatorNotAllowedInGraphError: Exception encountered when calling layer 'pointer_generator_network' (type PointerGeneratorNetwork).
    
    in user code:
    
        File "C:\Users\Yara\AppData\Local\Temp\ipykernel_19860\1901776023.py", line 74, in call  *
            source_seq, target_seq = inputs
    
        OperatorNotAllowedInGraphError: Iterating over a symbolic `tf.Tensor` is not allowed. You can attempt the following resolutions to the problem: If you are running in Graph mode, use Eager execution mode or decorate this function with @tf.function. If you are using AutoGraph, you can try decorating this function with @tf.function. If that does not work, then you may be using an unsupported feature or your source code may not be visible to AutoGraph. See https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/autograph/g3doc/reference/limitations.md#access-to-source-code for more information.
    
    
    Call arguments received by layer 'pointer_generator_network' (type PointerGeneratorNetwork):
      • inputs=tf.Tensor(shape=(None, 10), dtype=int32)
