In [5]:
from gensim.models import Word2Vec,FastText
import numpy as np
import pandas as pd
import tensorflow as tf
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from tensorflow import keras
import re
from nltk.stem import WordNetLemmatizer


from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding,LSTM,Dense,Input,Bidirectional,Attention,Concatenate,TimeDistributed

pd.set_option("display.max_columns", None)
np.random.seed(42)
tf.random.set_seed(42)


In [2]:
main_train = pd.read_json("dataset/PIZZA_train.json", lines=True,)
df_dev = pd.read_json("dataset/PIZZA_dev.json", lines=True,)

In [3]:
df_train = main_train.sample(10000)

In [46]:
CONTRACTIONS = {
    "n't": "not",
    "'s": "is",
    "'re": "are",
    "'m": "am",
    "'ll": "will",
    "'ve": "have",
    "'d": "would",
    "'em": "them",
    "'all": "all",
    "'cause": "because",
    "'clock": "oclock",
    "'tis": "it is",
    "'twas": "it was",
    "'tween": "between",
    "'twere": "it were",
    "'twould": "it would",
    "'twixt": "betwixt",
    "'twill": "it will",
    "'til": "until",
    "'bout": "about",
    "'cept": "except",
    "'cos": "because",
    "'fore": "before",
    "'round": "around",
    "'n'": "and",
    "'neath": "beneath",
    "'nother": "another",
    "'nuff": "enough",
}
negation_words = {
    "no",
    "not",
    "none",
    "never",
    "without",
    "avoid",
    "neither",
    "nor",
    "hate",
    "hold",
}
pizza = {"pizza", "pizzas", "pie", "pies"}

stop_negation_words = {"and", "but"}
stop_words = set(stopwords.words("english"))
stop_words = stop_words - negation_words - stop_negation_words - {'all' , 'a','an'}
stop_words.update({"would", "like", "get", "want"})
stop_words.update(pizza)

lemmatizer = WordNetLemmatizer()  # WordNet Lemmatizer


In [6]:
def clean_text(text):
    text = re.sub(r"[^\w']", " ", text)  # Remove non-word characters
    text = re.sub(r"\s+", " ", text)  # Remove multiple spaces
    text = text.lower().strip()  # Lowercase and strip whitespace
    return text

In [7]:
def expnad_abb2(text):

    pattern = re.compile(
        r"(" + "|".join(re.escape(key) for key in CONTRACTIONS.keys()) + r")"
    )
    expanded_text = pattern.sub(lambda x: " " + CONTRACTIONS[x.group()], text)
    return expanded_text

In [8]:
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return " ".join(filtered_words)

In [18]:
def handle_negation(text):
    # Look for patterns like "no [word1] [word2] ..." and transform them
    words = text.split()
    transformed_words = []
    negation_flag = False  # To track if we're negating

    for i, word in enumerate(words):
        if word.lower() in negation_words:  # Trigger negation
            negation_flag = True
            continue  # Skip adding "no" to the transformed text
        elif negation_flag and (
            not re.match(r"[a-zA-Z]+", word) or word.lower() in stop_negation_words
        ):  # End negation on punctuation or 'and'
            negation_flag = False

        # Prefix "NOT_" if negation flag is set
        if negation_flag:
            transformed_words.append(f"not_{word}")
            if word in ["much"]:
                negation_flag = False
        else:
            if word.lower() not in stop_negation_words:
                transformed_words.append(word)

    return " ".join(transformed_words)

In [19]:
def lemma(text):
    text = text.split()
    text = [lemmatizer.lemmatize(word) for word in text]
    return " ".join(text)

In [215]:
def tokenize_adj_noun(text):
    # Tokenize text into words
    words = word_tokenize(text)
    
    # Perform POS tagging
    pos_tags = pos_tag(words)
    
    tokens = []
    skip_next = False
    
    for i in range(len(pos_tags) - 1):
        if skip_next:
            skip_next = False
            continue
        
        word, tag = pos_tags[i]
        next_word, next_tag = pos_tags[i + 1]

        if next_word in pizza :
            tokens.append(word)
            continue
        
        # If current word is an adjective and the next is a noun, combine them
        if (word not in ['extra' , 'pineapple'] and tag in ['JJ', 'JJR', 'JJS'] or word == 'all') and next_tag in ['NN', 'NNS', 'NNP', 'NNPS']:
            tokens.append(f"{word}_{next_word}")
            skip_next = True
        if (word == 'seven' and next_word in ['up' , 'ups']):
            tokens.append(f"{word}_{next_word}")
            skip_next = True
        else:
            tokens.append(word)
    
    # Add the last word if it wasn't part of an adj+noun pair
    if not skip_next:
        tokens.append(pos_tags[-1][0])
    
    return " ".join(tokens)


In [53]:
def preprocess(text):
    text = clean_text(text)
    text = lemma(text)
    # text = tokenize_adj_noun(text)
    text = expnad_abb2(text)
    text = remove_stopwords(text)
    text = handle_negation(text)
    return text

In [54]:
# tokenize_adj_noun('i\'d like a all veggies pizza with garlic green olive and hams')
preprocess('pepperoni pizza with extra cheese and extra ham pepperoni pepper')

'pepperoni extra cheese extra ham pepperoni pepper'

In [166]:
df_train['train.INPUT'] = df_train['train.SRC'].apply(preprocess)

In [57]:
toppings_regex = re.compile(r'(?<=\(TOPPING\s)[^)]*(?=\s)')
number_regex = re.compile(r'(?<=\(NUMBER\s)[^)]*(?=\s)')
size_regex = re.compile(r'(?<=\(SIZE\s)[^)]*(?=\s)')
quantity_regex = re.compile(r'(?<=\(QUANTITY\s)[^)]*(?=\s)')
style_regex = re.compile(r'(?<=\(STYLE\s)[^)]*(?=\s)')
drink_type_regex = re.compile(r'(?<=\(DRINKTYPE\s)[^)]*(?=\s)')
volume_regex = re.compile(r'(?<=\(VOLUME\s)[^)]*(?=\s)')
container_type_regex = re.compile(r'(?<=\(CONTAINERTYPE\s)[^)]*(?=\s)')   

In [58]:
sizes = set()
toppings = set()
numbers = set()
quantities = set()
styles = set()
drink_types = set()
container_types = set()
volumes = set()
none = set()

In [59]:
def get_none_match(text):
    order_regex = re.compile(r'(?<=ORDER\s)[^(]*(?=\s\()')
    pizzaorder_regex = re.compile(r'(?<=PIZZAORDER\s)[^(]*(?=\s\()')
    drinkorder_regex = re.compile(r'(?<=DRINKORDER\s)[^(]*(?=\s\()')
    between_parentheses_regex = re.compile(r'(?<=\)\s)[^()]+(?=\s\()')
    uncleaned_none_match = re.findall(order_regex, text)
    uncleaned_none_match.extend(re.findall(pizzaorder_regex, text))
    uncleaned_none_match.extend(re.findall(drinkorder_regex, text))
    uncleaned_none_match.extend(re.findall(between_parentheses_regex, text))
    none_match=[]
    for sentence in uncleaned_none_match:
        sen = expnad_abb2(sentence).upper()
        none_match.extend([word.lower() for word in sen.split()])
    return none_match

In [60]:
sizes.update([item.lower() for sublist in df_train['train.EXR'].apply(lambda x: size_regex.findall(x)) for item in sublist])
toppings.update([item.lower() for sublist in df_train['train.EXR'].apply(lambda x: toppings_regex.findall(x)) for item in sublist])
numbers.update([item.lower() for sublist in df_train['train.EXR'].apply(lambda x: number_regex.findall(x)) for item in sublist])
quantities.update([item.lower() for sublist in df_train['train.EXR'].apply(lambda x: quantity_regex.findall(x)) for item in sublist])
styles.update([item.lower() for sublist in df_train['train.EXR'].apply(lambda x: style_regex.findall(x)) for item in sublist])
drink_types.update([item.lower() for sublist in df_train['train.EXR'].apply(lambda x: drink_type_regex.findall(x)) for item in sublist])
container_types.update([item.lower() for sublist in df_train['train.EXR'].apply(lambda x: container_type_regex.findall(x)) for item in sublist])
volumes.update([item.lower() for sublist in df_train['train.EXR'].apply(lambda x: volume_regex.findall(x)) for item in sublist])

In [61]:
sizes.update([item.lower() for sublist in df_train['train.TOP'].apply(lambda x: size_regex.findall(x)) for item in sublist])
toppings.update([item.lower() for sublist in df_train['train.TOP'].apply(lambda x: toppings_regex.findall(x)) for item in sublist])
numbers.update([item.lower() for sublist in df_train['train.TOP'].apply(lambda x: number_regex.findall(x)) for item in sublist])
quantities.update([item.lower() for sublist in df_train['train.TOP'].apply(lambda x: quantity_regex.findall(x)) for item in sublist])
styles.update([item.lower() for sublist in df_train['train.TOP'].apply(lambda x: style_regex.findall(x)) for item in sublist])
drink_types.update([item.lower() for sublist in df_train['train.TOP'].apply(lambda x: drink_type_regex.findall(x)) for item in sublist])
container_types.update([item.lower() for sublist in df_train['train.TOP'].apply(lambda x: container_type_regex.findall(x)) for item in sublist])
volumes.update([item.lower() for sublist in df_train['train.TOP'].apply(lambda x: volume_regex.findall(x)) for item in sublist])
none.update([item.lower() for sublist in df_train['train.TOP'].apply(get_none_match) for item in sublist])

In [62]:
sizes = list(sizes)
toppings = list(toppings)
numbers = list(numbers)
quantities = list(quantities)
styles = list(styles)
drink_types = list(drink_types)
container_types = list(container_types)
volumes = list(volumes)
none = list(none)

In [167]:
fast_text_model = FastText(sentences=df_train['train.INPUT'].apply(lambda x : x.split()), vector_size=300, window=5, min_count=1, workers=4, sg=1)

In [168]:
fast_text_model.wv.key_to_index


{'a': 0,
 'three': 1,
 'four': 2,
 'party': 3,
 'five': 4,
 'cheese': 5,
 'sized': 6,
 'pepper': 7,
 'sprite': 8,
 'one': 9,
 'tea': 10,
 'two': 11,
 'size': 12,
 'american': 13,
 'ice': 14,
 'glaze': 15,
 'ounce': 16,
 'balsamic': 17,
 'large': 18,
 'not_cheese': 19,
 'not_american': 20,
 'ups': 21,
 'personal': 22,
 'extra': 23,
 'diet': 24,
 'medium': 25,
 'seven': 26,
 'onion': 27,
 'also': 28,
 '500': 29,
 'sauce': 30,
 'olive': 31,
 'green': 32,
 'not_pepper': 33,
 'pecorino': 34,
 'not_crust': 35,
 'not_thin': 36,
 'bottle': 37,
 'peperonni': 38,
 'milliliter': 39,
 'ginger': 40,
 'ale': 41,
 'crust': 42,
 'need': 43,
 'liter': 44,
 'fantas': 45,
 'roasted': 46,
 'little': 47,
 'red': 48,
 'lunch': 49,
 'chicken': 50,
 'mozzarella': 51,
 'coke': 52,
 'banana': 53,
 '20': 54,
 'tomato': 55,
 'bit': 56,
 'lot': 57,
 'pineapple': 58,
 'ml': 59,
 'pesto': 60,
 'regular': 61,
 'pellegrino': 62,
 'fl': 63,
 'iced': 64,
 'san': 65,
 'thin': 66,
 'eight': 67,
 'pellegrinos': 68,
 'bacon

In [169]:
fast_text_model.save("fast_text_model.model")

KeyboardInterrupt: 

In [44]:
def Similarity(w1,w2,model):
    if w1 not in model.wv or w2 not in model.wv:
        return 0
    A = model.wv[w1]; B = model.wv[w2]
    return sum(A*B)/(pow(sum(pow(A,2)),0.5)*pow(sum(pow(B,2)),0.5))

In [45]:
def cosine_similarity(A, B):
    dot_product = np.dot(A, B)
    norm_A = np.linalg.norm(A)
    norm_B = np.linalg.norm(B)
    if norm_A > 0 and norm_B > 0:
        return dot_product / (norm_A * norm_B)
    else:
        return 0.0  # Return 0 if either vector is zero

In [100]:
categories = {
    # 'PIZZAORDER': pizza,  # Assume `pizza` is a list of relevant entities
    'NUMBER': numbers,    # Assume `numbers` is a list of relevant entities
    'SIZE': sizes,        # Assume `sizes` is a list of relevant entities
    'TOPPING': toppings,  # Assume `toppings` is a list of relevant entities
    'STYLE': styles,      # Assume `styles` is a list of relevant entities
    'QUANTITY': quantities,  # Assume `quantities` is a list of relevant entities
    'DRINKTYPE': drink_types,  # Assume `drink_types` is a list of relevant entities
    'CONTAINERTYPE': container_types,  # Assume `container_types` is a list of relevant entities
    'VOLUME': volumes,  # Assume `volumes` is a list of relevant entities
    'NONE': none  # Default category, assumes `none` is a list of entities or empty list
}


def get_best_match(token, model):

    # Check if the token is in any of the relevant entities
    for category, entity_list in categories.items():
        
        if token in entity_list:
            return category
        
    # If no exact match, find the best match based on cosine similarity
    best_category = None
    best_similarity = 0.0

    for category, entity_list in categories.items():
        for entity in entity_list:
            similarity = model.wv.similarity(token, entity)
            # print("For token:", token, "and entity:", entity, "similarity:", similarity)
            if similarity > best_similarity:
                best_similarity = similarity
                # print("Best similarity:", best_similarity)
                # print("Best category:", category)
                # print("Best entity:", entity)
                best_category = category

    return best_category
    # return best_entity

In [101]:
def construct_input(text):
    tokens = word_tokenize(text.lower())
    s = ''
    neg = False
    for token in tokens:
        if token[:3] == 'not':
            token = token[4:]
            neg = True
            s+=f"(NOT "
        best_macth = get_best_match(token,fast_text_model)
        if best_macth != 'NONE' and best_macth != 'PIZZAORDER':
            s+=f"({best_macth} {token}) "
            if neg:
                s+=f") "
                neg = False 
        # elif best_macth == 'PIZZAORDER':
        #     s = "(PIZZAORDER) " + s
        # elif best_macth == 'DRINKTYPE' or best_macth == 'CONTAINERTYPE':
        #     s+=f"(DRINKORDER) "
    return s
                                     

In [104]:
input_text = "two pizzas with balsamic glaze and i want two medium pizzas no american cheese"
input_tokens = word_tokenize(input_text.lower())

In [105]:
construct_input(preprocess(input_text))

'(NUMBER two) (TOPPING balsamic) (TOPPING glaze) (NUMBER two) (SIZE medium) (NOT (CONTAINERTYPE american) ) (NOT (TOPPING cheese) ) '

In [112]:
df_train['train.INPUT'] = df_train['train.INPUT'].apply(lambda x: construct_input(x))

In [170]:
input_texts = df_train['train.INPUT'].apply(lambda x: x.lower())
output_texts = ["<sos> " + text.lower() + " <eos>" for text in df_train["train.EXR"].tolist()] 

In [171]:
input_tokenizer = Tokenizer(filters="",)  # Don't filter out any characters
input_tokenizer.fit_on_texts(input_texts)
input_sequences = input_tokenizer.texts_to_sequences(input_texts)
input_vocab_size = len(input_tokenizer.word_index) + 1

# Pad input sequences
max_input_length = max(len(seq) for seq in input_sequences)
input_sequences_padded = pad_sequences(input_sequences, maxlen=max_input_length, padding='post')

output_tokenizer = Tokenizer(filters="")
output_tokenizer.fit_on_texts(output_texts)
output_sequences = output_tokenizer.texts_to_sequences(output_texts)
output_vocab_size = len(output_tokenizer.word_index) + 1

# Pad output sequences
max_output_length = max(len(seq) for seq in output_sequences)
output_sequences_padded = pad_sequences(output_sequences, maxlen=max_output_length, padding='post')

# Ensure both input and output sequences have the same length
max_length = max(max_input_length, max_output_length)
input_sequences_padded = pad_sequences(input_sequences, maxlen=max_length, padding='post')
output_sequences_padded = pad_sequences(output_sequences, maxlen=max_length, padding='post')

In [172]:
# View sample mappings
word_to_index = input_tokenizer.word_index
index_to_word = {idx: word for word, idx in word_to_index.items()}

In [173]:
embedd_model = FastText(sentences=input_texts.apply(lambda x : x.split()), vector_size=300, window=5, min_count=1, workers=4, sg=1)

In [174]:

embedding_dim = embedd_model.wv.vector_size  # Dimension of Word2Vec vectors
hidden_units = 128

# Initialize the embedding matrix
embedding_matrix = np.zeros((input_vocab_size, embedding_dim))

for word, idx in word_to_index.items():
    if word in embedd_model.wv.key_to_index.keys():
        # print("found" , word)
        embedding_matrix[idx] = embedd_model.wv[word]
    else:
        print("not found" , word)
        embedding_matrix[idx] = np.random.uniform(-0.01, 0.01, embedding_dim)


In [175]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.initializers import Constant

embedding_layer = Embedding(
    input_dim=input_vocab_size,
    output_dim=embedding_dim,
    embeddings_initializer=Constant(embedding_matrix),
    trainable=False  # Freeze embeddings if you don’t want to fine-tune them
)


In [197]:
# # Tokenize input
# input_texts = df_train['train.INPUT'].apply(lambda x : x.upper())
# output_texts = ["<sos> " + text + " <eos>" for text in df_train["train.EXR"].tolist()]  # Add start and end tokens

# input_tokenizer = Tokenizer(filters="")  # Don't filter out any characters
# input_tokenizer.fit_on_texts(input_texts)
# input_sequences = input_tokenizer.texts_to_sequences(input_texts)
# input_vocab_size = len(input_tokenizer.word_index) + 1

# # Pad input sequences
# max_input_length = max(len(seq) for seq in input_sequences)
# input_sequences_padded = pad_sequences(input_sequences, maxlen=max_input_length, padding='post')

# output_tokenizer = Tokenizer(filters="")
# output_tokenizer.fit_on_texts(output_texts)
# output_sequences = output_tokenizer.texts_to_sequences(output_texts)
# output_vocab_size = len(output_tokenizer.word_index) + 1

# # Pad output sequences
# max_output_length = max(len(seq) for seq in output_sequences)
# output_sequences_padded = pad_sequences(output_sequences, maxlen=max_output_length, padding='post')

# # Ensure both input and output sequences have the same length
# max_length = max(max_input_length, max_output_length)
# input_sequences_padded = pad_sequences(input_sequences, maxlen=max_length, padding='post')
# output_sequences_padded = pad_sequences(output_sequences, maxlen=max_length, padding='post')

# Correctly shape the decoder input and output data
decoder_input_data = output_sequences_padded[:, :-1]  # Remove the last token
decoder_output_data = output_sequences_padded[:, 1:]  # Remove the first token

# Encoder
encoder_input = Input(shape=(max_length,))
encoder_embedding = embedding_layer(encoder_input)
encoder_lstm, state_h, state_c = LSTM(256, return_state=True, return_sequences=True)(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_input = Input(shape=(max_length-1,))
decoder_embedding = embedding_layer(decoder_input)
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_lstm_output, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

# Attention Mechanism
attention_layer = Attention()  # Using scaled dot-product attention
context_vector = attention_layer([decoder_lstm_output, encoder_lstm])

# Concatenate Context Vector and Decoder LSTM Output
decoder_combined_context = Concatenate()([decoder_lstm_output, context_vector])

# Dense Layer for Output
decoder_dense = Dense(output_vocab_size, activation='softmax')
decoder_output = decoder_dense(decoder_combined_context)


# Define Model
model = Model([encoder_input, decoder_input], decoder_output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(
    [input_sequences_padded, decoder_input_data],
    decoder_output_data,
    batch_size=32,
    epochs=5,
    validation_split=0.2,
    shuffle=True,
)

# Define Encoder Model
encoder_model = Model(encoder_input, [encoder_lstm, state_h, state_c])

# Define Decoder Model with Attention
decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_lstm_output, state_h, state_c = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs
)
decoder_states = [state_h, state_c]

context_vector = attention_layer([decoder_lstm_output, encoder_lstm])
decoder_combined_context = Concatenate()([decoder_lstm_output, context_vector])
decoder_output = decoder_dense(decoder_combined_context)

decoder_model = Model(
    [decoder_input, encoder_lstm] + decoder_states_inputs,
    [decoder_output] + decoder_states
)


Epoch 1/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 92ms/step - accuracy: 0.7070 - loss: 1.5402 - val_accuracy: 0.8994 - val_loss: 0.3687
Epoch 2/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 94ms/step - accuracy: 0.9098 - loss: 0.3298 - val_accuracy: 0.9263 - val_loss: 0.2603
Epoch 3/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 96ms/step - accuracy: 0.9302 - loss: 0.2483 - val_accuracy: 0.9394 - val_loss: 0.2155
Epoch 4/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 85ms/step - accuracy: 0.9426 - loss: 0.2054 - val_accuracy: 0.9501 - val_loss: 0.1803
Epoch 5/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 89ms/step - accuracy: 0.9527 - loss: 0.1692 - val_accuracy: 0.9606 - val_loss: 0.1422


In [198]:
# # Define the encoder (using Bidirectional LSTM)
# input_sequence = Input(shape=(None,))  # Input sequence (e.g., source sentence)
# embedding_layer = Embedding(input_dim=input_vocab_size, output_dim=256)(input_sequence)
# encoder = Bidirectional(LSTM(units=256, return_state=False, return_sequences=True))(embedding_layer)
# encoder_outputs = encoder  # Shape: (batch_size, input_sequence_length, 512)

# # Define the decoder (with LSTM and Attention mechanism)
# decoder_input_sequence = Input(shape=(None,))  # Decoder input sequence (e.g., target sentence)
# decoder_embedding = Embedding(input_dim=output_vocab_size, output_dim=256)(decoder_input_sequence)
# decoder_lstm = LSTM(units=512, return_sequences=True, return_state=True)
# decoder_lstm_output, _, _ = decoder_lstm(decoder_embedding, initial_state=None)

# # Attention mechanism: decoder output (3D tensor) and encoder outputs (3D tensor)
# attention = Attention()([decoder_lstm_output, encoder_outputs])

# # Context vector: concatenation of decoder output and attention output
# context_vector = Concatenate()([decoder_lstm_output, attention])

# # Output layer (dense layer with softmax activation)
# output = Dense(output_vocab_size, activation='softmax')(context_vector)

# # Model definition
# model = Model([input_sequence, decoder_input_sequence], output)

# # Summary of the model to check the shapes
# model.summary()

# # model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# # Compile the model with categorical crossentropy and Adam optimizer
# model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

# def scheduled_sampling_step(epoch, model, input_data, target_data, sampling_rate=1.0):
#     # Initialize the decoder input with the first token (start token)
#     decoder_input = np.zeros_like(target_data)
#     decoder_input[:, 0] = target_data[:, 0]  # Use the first token of the ground truth as input to start

#     # Prepare to store the decoder's predictions
#     decoder_predictions = []

#     # Iterate through the sequence length
#     for t in range(1, target_data.shape[1]):  # Excluding the first token (start token)
        
#         # Sample a random value to decide whether to use the true token or predicted token
#         use_true_token = np.random.rand() < sampling_rate
        
#         if use_true_token:
#             # Use the true token from the target sequence
#             decoder_input[:, t] = target_data[:, t]
#         else:
#             # Use the model's previous prediction as the input to the next timestep
#             pred_token = model.predict([input_data, decoder_input])[:, t - 1, :]
#             decoder_input[:, t] = np.argmax(pred_token, axis=-1)

#         # Collect the predicted token for each timestep
#         decoder_predictions.append(decoder_input[:, t])

#     # Convert list of predictions into an array of shape (batch_size, target_length)
#     decoder_predictions = np.array(decoder_predictions).T  # Transpose to (batch_size, target_length)
#     return decoder_predictions



# def get_batch(batch_index, batch_size, input_sequences, target_sequences, max_input_length, max_target_length):
   
#     # Calculate the start and end indices of the batch
#     start_idx = batch_index * batch_size
#     end_idx = min((batch_index + 1) * batch_size, len(input_sequences))
    
#     # Extract a slice of the input and target sequences
#     input_batch = input_sequences[start_idx:end_idx]
#     target_batch = target_sequences[start_idx:end_idx]
    
#     # Pad input sequences to `max_input_length`
#     input_batch_padded = np.zeros((len(input_batch), max_input_length), dtype=np.int32)
#     for i, seq in enumerate(input_batch):
#         input_batch_padded[i, :len(seq)] = seq[:max_input_length]
    
#     # Pad target sequences to `max_target_length`
#     target_batch_padded = np.zeros((len(target_batch), max_target_length), dtype=np.int32)
#     for i, seq in enumerate(target_batch):
#         target_batch_padded[i, :len(seq)] = seq[:max_target_length]
    
#     return input_batch_padded, target_batch_padded

# from tensorflow.keras.utils import to_categorical

# def one_hot_encode_sequences(sequences, vocab_size):
#     # Converts sequences of token indices into one-hot encoded format
#     return to_categorical(sequences, num_classes=vocab_size)


# # Set initial sampling rate (start with 1.0, and gradually decrease it)
# initial_sampling_rate = 1.0
# decay_rate = 0.95  # Controls how quickly the sampling rate decays
# min_sampling_rate = 0.2  # Minimum sampling rate to avoid dropping too early
# batch_size = 32
# num_batches = len(input_sequences_padded) // batch_size
# num_epochs = 5

# sampling_rate = initial_sampling_rate


# # Training loop with scheduled sampling
# for epoch in range(num_epochs):
#     for batch_index in range(num_batches):
#         # Get a batch of data
#         input_batch, target_batch = get_batch(batch_index, batch_size, input_sequences_padded, output_sequences_padded, max_input_length, max_output_length)
        
#         # One-hot encode the target batch
#         target_batch_one_hot = one_hot_encode_sequences(target_batch, vocab_size=output_vocab_size)
        
#         # Perform scheduled sampling step to generate predicted sequence
#         predicted_sequence = scheduled_sampling_step(epoch, model, input_batch, target_batch, sampling_rate)
        
#         # Trim the predicted sequence to match the target length
#         predicted_sequence = predicted_sequence[:, :target_batch.shape[1]]  # Trim to target length
        
#         # Train the model on the batch using model.fit
#         model.fit([input_batch, target_batch], target_batch_one_hot, batch_size=batch_size, epochs=1,verbose=0)

#     # Update the sampling rate after each epoch
#     sampling_rate = max(min_sampling_rate, sampling_rate * decay_rate)
#     print(f'Epoch {epoch+1}/{num_epochs} | Sampling rate: {sampling_rate}')



In [177]:
def decode_sequence(input_seq):
    # Ensure input sequence is padded correctly
    input_seq = pad_sequences(input_seq, maxlen=max_length, padding="post")

    # Get encoder output and initial states
    encoder_output, state_h, state_c = encoder_model.predict(input_seq, verbose=0)
    states_value = [state_h, state_c]

    # Start token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = output_tokenizer.word_index["<sos>"]  # Use a start token
    
    stop_condition = False
    decoded_sentence = ""

    while not stop_condition:
        # Predict the next token with attention
        output_tokens, h, c = decoder_model.predict(
            [target_seq, encoder_output] + states_value, verbose=0
        )

        # Get the predicted token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = output_tokenizer.index_word.get(sampled_token_index, "<unk>")
        decoded_sentence += " " + sampled_token

        # Stop if end token is generated
        if sampled_token == "<eos>":
            stop_condition = True

        # Update target sequence (input for the next timestep)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update decoder states
        states_value = [h, c]

    return decoded_sentence.strip()


In [178]:
# def decode_sequence(input_seq):
#     # Ensure input sequence is padded correctly
#     input_seq = pad_sequences(input_seq, maxlen=max_input_length, padding="post")
    
#     # Get encoder output and initial states
#     encoder_output, state_h, state_c = encoder_model.predict(input_seq, verbose=0)
#     states_value = [state_h, state_c]

#     # Start token
#     target_seq = np.zeros((1, 1))  # Shape (1, 1)
#     target_seq[0, 0] = output_tokenizer.word_index["<sos>"]  # Use start token "<sos>"
    
#     stop_condition = False
#     decoded_sentence = ""

#     while not stop_condition:
#         # Predict the next token with attention
#         output_tokens, h, c = decoder_model.predict(
#             [target_seq, encoder_output] + states_value, verbose=0
#         )

#         # Apply a Dense layer to match the dimensionality of encoder_output
#         # The Dense layer will project output_tokens to the same dimensionality as encoder_output (256).
#         attention_input = Dense(256)(output_tokens)  # Project output_tokens to match the encoder's hidden size

#         # Apply attention mechanism
#         attention_output = Attention()([attention_input, encoder_output])
#         context_vector = Concatenate()([attention_output, attention_input])

#         # Get the predicted token
#         sampled_token_index = np.argmax(output_tokens[0, -1, :])  # Get the most probable token
#         sampled_token = output_tokenizer.index_word.get(sampled_token_index, "<unk>")
#         decoded_sentence += " " + sampled_token

#         # Stop if end token is generated
#         if sampled_token == "<eos>":
#             stop_condition = True

#         # Update target sequence (input for the next timestep)
#         target_seq = np.zeros((1, 1))
#         target_seq[0, 0] = sampled_token_index  # Use the predicted token as the next input
        
#         # Update decoder states
#         states_value = [h, c]

#     return decoded_sentence.strip()

In [199]:
df_dev['dev.INPUT'] = df_dev['dev.SRC'].apply(preprocess)
# df_dev['dev.INPUT'] = df_dev['dev.INPUT'].apply(lambda x: construct_input(x).lower())

In [204]:
test_index = 2

In [205]:
df_dev['dev.INPUT'].loc[test_index]

'need order one large vegetarian extra banana pepper'

In [206]:
df_dev['dev.EXR'].loc[test_index]

'(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (STYLE VEGETARIAN ) (COMPLEX_TOPPING (QUANTITY EXTRA ) (TOPPING BANANA_PEPPERS ) ) ) )'

In [207]:
test_input_sequence = input_tokenizer.texts_to_sequences([df_dev['dev.INPUT'].loc[test_index]])
output = decode_sequence(test_input_sequence)
print(output)


(order (pizzaorder (number 1 ) (size large ) (style thick_crust ) (topping banana_peppers ) ) ) <eos>


In [165]:
for i in range(10,20):
    print("Input:", df_train['train.INPUT'].iloc[i])
    print("Expected:", df_train['train.EXR'].iloc[i])
    test_input_sequence = input_tokenizer.texts_to_sequences([df_train['train.INPUT'].iloc[test_index].upper()])
    output = decode_sequence(test_input_sequence)
    print("Predicted:", output)
    print()

# test_input_sequence = input_tokenizer.texts_to_sequences([df_dev['dev.INPUT'].loc[test_index].upper()])
# output = decode_sequence(test_input_sequence)
# print(output)


Input: (NUMBER a) (CONTAINERTYPE bottle) (DRINKTYPE ice) (DRINKTYPE tea) (NUMBER one) (VOLUME dr) (TOPPING pepper) (NUMBER one) (NUMBER eight) (VOLUME ounce) (TOPPING pineapple) (DRINKTYPE soda) 
Expected: (ORDER (DRINKORDER (NUMBER 1 ) (VOLUME 8 OZ ) (DRINKTYPE PINEAPPLE_SODA ) ) (DRINKORDER (NUMBER 1 ) (DRINKTYPE DR_PEPPER ) ) (DRINKORDER (NUMBER 1 ) (CONTAINERTYPE BOTTLE ) (DRINKTYPE ICE_TEA ) ) )
Predicted: (order (pizzaorder (number 1 ) (topping artichokes ) (topping meatballs ) (topping ricotta_cheese ) (not (style thin_crust ) ) ) ) <eos>

Input: (NUMBER four) (NUMBER seven) (DRINKTYPE ups) (NUMBER three) (VOLUME dr) (DRINKTYPE pepers) (NUMBER four) (NUMBER 8) (VOLUME ounce) (TOPPING pineapple) (DRINKTYPE soda) 
Expected: (ORDER (DRINKORDER (NUMBER 4 ) (VOLUME 8 OZ ) (DRINKTYPE PINEAPPLE_SODA ) ) (DRINKORDER (NUMBER 3 ) (DRINKTYPE DR_PEPPER ) ) (DRINKORDER (NUMBER 4 ) (DRINKTYPE SEVEN_UP ) ) )
Predicted: (order (pizzaorder (number 1 ) (topping artichokes ) (topping meatballs ) (