In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from nltk.corpus import stopwords
import re


In [92]:
# Load the PIZZA training split (JSON lines) and drop exact duplicate rows.
df = pd.read_json("/kaggle/input/pizza-dataset/PIZZA_train.json", lines=True)
df = df.drop_duplicates()
# Work on a fixed-size random subset. The sample is seeded so that a fresh
# "Restart & Run All" trains on the same rows, and capped at the number of rows
# available so a smaller input file does not raise inside DataFrame.sample.
df = df.sample(n=min(5*10**5, len(df)), random_state=42)

In [2]:
# Contraction fragment -> expansion. expnad_abb2 replaces every matched fragment
# with a single space followed by the expansion (e.g. "i'd" -> "i would",
# "don't" -> "do not").
CONTRACTIONS = {
    "n't": "not",
    "'s": "is",
    "'re": "are",
    "'m": "am",
    "'ll": "will",
    "'ve": "have",
    "'d": "would",
    "'em": "them",
    "'all": "all",
    "'cause": "because",
    "'clock": "oclock",
    "'tis": "it is",
    "'twas": "it was",
    "'tween": "between",
    "'twere": "it were",
    "'twould": "it would",
    "'twixt": "betwixt",
    "'twill": "it will",
    "'til": "until",
    "'bout": "about",
    "'cept": "except",
    "'cos": "because",
    "'fore": "before",
    "'round": "around",
    "'n'": "and",
    "'neath": "beneath",
    "'nother": "another",
    "'nuff": "enough",
}
# Tokens that switch negation on in handle_negation. The trigger token itself
# is not copied to the output; the tokens after it receive a "not_" prefix.
negation_words = {
    "no",
    "not",
    "none",
    "never",
    "without",
    "avoid",
    "neither",
    "nor",
    "hate",
    "hold",
}

# Tokens that end an active negation in handle_negation and are themselves
# omitted from its output.
stop_negation_words = {"and"}
# Start from NLTK's English stop-word list, but keep the negation triggers and
# "and" out of it so that remove_stopwords does not delete the tokens
# handle_negation looks for.
stop_words = set(stopwords.words("english"))
stop_words = stop_words - negation_words - stop_negation_words
# Additional request/politeness words that remove_stopwords should also drop.
stop_words.update({"would", "like", "get", "want", "order" , "please" , 'could' , 'prefer' ,
                    'handle' , 'take' , 'bring' , 'need' , 'make' , 'love', 'let', 'absolutely',
                    "arrange" })

In [94]:
def expnad_abb2(text, contractions=None):
    """Expand English contractions in ``text``.

    Every occurrence of a contraction fragment (for example ``"n't"`` or
    ``"'d"``) is replaced by one space followed by its expansion, so
    ``"i'd"`` becomes ``"i would"`` and ``"don't"`` becomes ``"do not"``.

    A fragment is only expanded when it is not immediately followed by another
    word character, so ``'em`` no longer fires inside ``'empty``. Matching is
    case-insensitive; the replacement text is taken verbatim from the table.

    Args:
        text: String to process.
        contractions: Optional mapping ``fragment -> expansion``. When omitted,
            the module-level ``CONTRACTIONS`` table is used.

    Returns:
        ``text`` with all matched fragments expanded.

    Note:
        The generic ``"n't" -> "not"`` rule turns irregular forms such as
        ``"can't"`` into ``"ca not"``; that behaviour is unchanged here.
    """
    if contractions is None:
        contractions = CONTRACTIONS

    # Lower-cased view of the table so that case-insensitive matches can be
    # looked up regardless of how the fragment was written in the text.
    lookup = {fragment.lower(): expansion for fragment, expansion in contractions.items()}
    if not lookup:
        # An empty alternation would match the empty string everywhere.
        return text

    # Longest fragments first: if one fragment is a prefix of another, the
    # longer (more specific) one must get the first chance to match.
    alternatives = "|".join(
        re.escape(fragment) for fragment in sorted(lookup, key=len, reverse=True)
    )
    pattern = re.compile(r"(?:" + alternatives + r")(?!\w)", re.IGNORECASE)
    return pattern.sub(lambda match: " " + lookup[match.group().lower()], text)

In [None]:
def remove_stopwords(text):
    """Return ``text`` without the tokens listed in ``stop_words``.

    The text is split on whitespace, each token is compared in lowercase
    against the module-level ``stop_words`` set, and the surviving tokens are
    re-joined with single spaces (original casing of kept tokens is preserved).
    """
    kept_tokens = (token for token in text.split() if token.lower() not in stop_words)
    return " ".join(kept_tokens)

In [None]:
def handle_negation(text):
    """Fold negation cues into the tokens they negate.

    The text is split on whitespace and scanned left to right:

    * A token whose lowercase form is in ``negation_words`` starts a negated
      span; the cue itself is not emitted.
    * While a span is active, each token is emitted as ``not_<token>``.
    * The span ends at a token that does not start with an ASCII letter or
      whose lowercase form is in ``stop_negation_words``; it also ends right
      after the literal token ``much`` (which is still emitted prefixed).
    * Tokens in ``stop_negation_words`` are never emitted outside a span.

    Returns the emitted tokens joined by single spaces.
    """
    emitted = []
    negating = False

    for token in text.split():
        lowered = token.lower()

        # A negation cue opens (or re-opens) the span and is swallowed.
        if lowered in negation_words:
            negating = True
            continue

        # Span boundary: a connector word or something not starting with a letter.
        is_boundary = (
            lowered in stop_negation_words
            or re.match(r"[a-zA-Z]+", token) is None
        )
        if negating and is_boundary:
            negating = False

        if not negating:
            # Outside a span, connector words are dropped, everything else kept.
            if lowered not in stop_negation_words:
                emitted.append(token)
            continue

        emitted.append(f"not_{token}")
        if token == "much":
            negating = False

    return " ".join(emitted)

In [95]:
df_copy = df.copy()

In [96]:
df_copy['train.SRC'] = df_copy['train.SRC'].apply(expnad_abb2)
df_copy.head()

Unnamed: 0,train.SRC,train.EXR,train.TOP,train.TOP-DECOUPLED
10614,pizza with balsamic glaze and mozzarella and r...,(ORDER (PIZZAORDER (NUMBER 1 ) (TOPPING BALSAM...,(ORDER (PIZZAORDER pizza with (TOPPING balsami...,(ORDER (PIZZAORDER (TOPPING balsamic glaze ) (...
1607428,four seven ups and five 500 ml coke zeroes and...,(ORDER (DRINKORDER (NUMBER 1 ) (VOLUME 200 ML ...,(ORDER (DRINKORDER (NUMBER four ) (DRINKTYPE s...,(ORDER (DRINKORDER (NUMBER four ) (DRINKTYPE s...
18844,i would like a pizza with american cheese and ...,(ORDER (PIZZAORDER (NUMBER 1 ) (TOPPING AMERIC...,(ORDER i'd like (PIZZAORDER (NUMBER a ) pizza ...,(ORDER (PIZZAORDER (NUMBER a ) (TOPPING americ...
1982886,i would like a pizza with red peppers shrimp a...,(ORDER (PIZZAORDER (NUMBER 1 ) (TOPPING RED_PE...,(ORDER i'd like (PIZZAORDER (NUMBER a ) pizza ...,(ORDER (PIZZAORDER (NUMBER a ) (TOPPING red pe...
2353930,i would like a pizza with red pepper flakes mo...,(ORDER (PIZZAORDER (NUMBER 1 ) (TOPPING RED_PE...,(ORDER i'd like (PIZZAORDER (NUMBER a ) pizza ...,(ORDER (PIZZAORDER (NUMBER a ) (TOPPING red pe...


In [97]:
df_copy['train.SRC'] = df_copy['train.SRC'].apply(handle_negation)
df_copy.head()

Unnamed: 0,train.SRC,train.EXR,train.TOP,train.TOP-DECOUPLED
10614,pizza with balsamic glaze mozzarella ranch sauce,(ORDER (PIZZAORDER (NUMBER 1 ) (TOPPING BALSAM...,(ORDER (PIZZAORDER pizza with (TOPPING balsami...,(ORDER (PIZZAORDER (TOPPING balsamic glaze ) (...
1607428,four seven ups five 500 ml coke zeroes a 200 m...,(ORDER (DRINKORDER (NUMBER 1 ) (VOLUME 200 ML ...,(ORDER (DRINKORDER (NUMBER four ) (DRINKTYPE s...,(ORDER (DRINKORDER (NUMBER four ) (DRINKTYPE s...
18844,i would like a pizza with american cheese with...,(ORDER (PIZZAORDER (NUMBER 1 ) (TOPPING AMERIC...,(ORDER i'd like (PIZZAORDER (NUMBER a ) pizza ...,(ORDER (PIZZAORDER (NUMBER a ) (TOPPING americ...
1982886,i would like a pizza with red peppers shrimp g...,(ORDER (PIZZAORDER (NUMBER 1 ) (TOPPING RED_PE...,(ORDER i'd like (PIZZAORDER (NUMBER a ) pizza ...,(ORDER (PIZZAORDER (NUMBER a ) (TOPPING red pe...
2353930,i would like a pizza with red pepper flakes mo...,(ORDER (PIZZAORDER (NUMBER 1 ) (TOPPING RED_PE...,(ORDER i'd like (PIZZAORDER (NUMBER a ) pizza ...,(ORDER (PIZZAORDER (NUMBER a ) (TOPPING red pe...


In [98]:
df_copy['train.SRC'] = df_copy['train.SRC'].apply(remove_stopwords)
df_copy.head()

Unnamed: 0,train.SRC,train.EXR,train.TOP,train.TOP-DECOUPLED
10614,pizza balsamic glaze mozzarella ranch sauce,(ORDER (PIZZAORDER (NUMBER 1 ) (TOPPING BALSAM...,(ORDER (PIZZAORDER pizza with (TOPPING balsami...,(ORDER (PIZZAORDER (TOPPING balsamic glaze ) (...
1607428,four seven ups five 500 ml coke zeroes 200 mil...,(ORDER (DRINKORDER (NUMBER 1 ) (VOLUME 200 ML ...,(ORDER (DRINKORDER (NUMBER four ) (DRINKTYPE s...,(ORDER (DRINKORDER (NUMBER four ) (DRINKTYPE s...
18844,pizza american cheese olive oil,(ORDER (PIZZAORDER (NUMBER 1 ) (TOPPING AMERIC...,(ORDER i'd like (PIZZAORDER (NUMBER a ) pizza ...,(ORDER (PIZZAORDER (NUMBER a ) (TOPPING americ...
1982886,pizza red peppers shrimp green pepper,(ORDER (PIZZAORDER (NUMBER 1 ) (TOPPING RED_PE...,(ORDER i'd like (PIZZAORDER (NUMBER a ) pizza ...,(ORDER (PIZZAORDER (NUMBER a ) (TOPPING red pe...
2353930,pizza red pepper flakes mozzarella cheese cara...,(ORDER (PIZZAORDER (NUMBER 1 ) (TOPPING RED_PE...,(ORDER i'd like (PIZZAORDER (NUMBER a ) pizza ...,(ORDER (PIZZAORDER (NUMBER a ) (TOPPING red pe...


In [99]:
from gensim.models import FastText

In [101]:
fast_text_model = FastText(sentences=df_copy["train.SRC"].apply(lambda x : x.split()), vector_size=50, window=5, min_count=1, workers=4, sg=1)

In [102]:
fast_text_model.wv.key_to_index

{'three': 0,
 'pizzas': 1,
 'pizza': 2,
 'four': 3,
 'pies': 4,
 'party': 5,
 'five': 6,
 'cheese': 7,
 'sized': 8,
 'one': 9,
 'two': 10,
 'size': 11,
 'sprite': 12,
 'american': 13,
 'ice': 14,
 'glaze': 15,
 '-': 16,
 'large': 17,
 'balsamic': 18,
 'ounce': 19,
 'pie': 20,
 'not_cheese': 21,
 'pepper': 22,
 'not_american': 23,
 'peppers': 24,
 'tea': 25,
 'ups': 26,
 'extra': 27,
 'diet': 28,
 'seven': 29,
 'medium': 30,
 'also': 31,
 'personal': 32,
 'teas': 33,
 'sauce': 34,
 'ginger': 35,
 'pecorino': 36,
 'peperonni': 37,
 'not_crust': 38,
 'not_thin': 39,
 'cans': 40,
 'green': 41,
 'red': 42,
 'fantas': 43,
 'little': 44,
 'ale': 45,
 'lunch': 46,
 'roasted': 47,
 'not_any': 48,
 'bottle': 49,
 '500': 50,
 'sprites': 51,
 '20': 52,
 'coke': 53,
 'crust': 54,
 'banana': 55,
 'chicken': 56,
 'mozzarella': 57,
 'milliliter': 58,
 'bit': 59,
 'not_the': 60,
 'pellegrino': 61,
 'san': 62,
 'onions': 63,
 'fl': 64,
 'regular': 65,
 'liter': 66,
 'pineapple': 67,
 'onion': 68,
 'oliv

In [103]:
# Build the training pairs: preprocessed utterances (train.SRC from df_copy) as
# source, and the train.TOP annotation from df as target.
src_texts = df_copy["train.SRC"].tolist()
tgt_texts = ["<sos> " + text + " <eos>" for text in df["train.TOP"].tolist()]  # Wrap each target in start/end markers

# One tokenizer per side. filters="" turns off Keras' default character
# filtering; unknown tokens map to "<oov>".
src_tokenizer = Tokenizer(filters="" , oov_token = "<oov>")
tgt_tokenizer = Tokenizer(filters="" , oov_token = "<oov>")

src_tokenizer.fit_on_texts(src_texts)
tgt_tokenizer.fit_on_texts(tgt_texts)

# Vocabulary sizes: one more than the number of entries in word_index
# (presumably to leave id 0 free for padding — confirm against Tokenizer docs).
src_vocab_size = len(src_tokenizer.word_index) + 1
tgt_vocab_size = len(tgt_tokenizer.word_index) + 1

# Convert texts to id sequences
src_sequences = src_tokenizer.texts_to_sequences(src_texts)
tgt_sequences = tgt_tokenizer.texts_to_sequences(tgt_texts)

# Longest sequence on each side; max_src_len is also read by translate().
max_src_len = max(len(seq) for seq in src_sequences)
max_tgt_len = max(len(seq) for seq in tgt_sequences)

# Right-pad every sequence to the longest one on its side
src_sequences = pad_sequences(src_sequences, maxlen=max_src_len, padding="post")
tgt_sequences = pad_sequences(tgt_sequences, maxlen=max_tgt_len, padding="post")

# Dataset Preparation
BATCH_SIZE = 64

BUFFER_SIZE = 10000

# drop_remainder=True makes every batch exactly BATCH_SIZE rows, which
# train_step relies on when it builds its start-token column.
dataset = tf.data.Dataset.from_tensor_slices((src_sequences, tgt_sequences))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)


In [104]:
import numpy as np

In [105]:
# Build the encoder's initial embedding table from the trained FastText model.
embedding_dim = fast_text_model.wv.vector_size  # width of one FastText vector

# Row i holds the vector of the source token with id i; rows that are never
# assigned below stay all-zero.
embedding_matrix = np.zeros((src_vocab_size, embedding_dim))

for word, idx in src_tokenizer.word_index.items():
    if word not in fast_text_model.wv.key_to_index:
        # Token unknown to FastText: report it and use a small random vector.
        print("not found" , word)
        embedding_matrix[idx] = np.random.uniform(-0.01, 0.01, embedding_dim)
        continue
    embedding_matrix[idx] = fast_text_model.wv[word]

not found <oov>


In [5]:

# Model Parameters
EMBED_SIZE = 128   # decoder embedding width
HIDDEN_SIZE = 256  # LSTM units per encoder direction

# Encoder
class Encoder(Model):
    """Bidirectional-LSTM encoder over source token ids.

    Args:
        vocab_size: Number of rows of the embedding table.
        embed_size: Width of each embedding vector.
        hidden_size: Units of each LSTM direction. The returned states are the
            forward/backward states concatenated, i.e. ``2 * hidden_size`` wide.
        pretrained_embeddings: Optional array of shape
            ``(vocab_size, embed_size)`` used as the initial embedding weights.
            The embedding stays trainable either way.

    Raises:
        ValueError: If ``pretrained_embeddings`` does not have shape
            ``(vocab_size, embed_size)``.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, pretrained_embeddings=None):
        super(Encoder, self).__init__()
        # Size the layer from the constructor arguments rather than from
        # notebook globals, so the class does what its signature says.
        embedding_kwargs = {}
        if pretrained_embeddings is not None:
            if tuple(pretrained_embeddings.shape) != (vocab_size, embed_size):
                raise ValueError(
                    "pretrained_embeddings has shape "
                    f"{tuple(pretrained_embeddings.shape)}, expected "
                    f"{(vocab_size, embed_size)}"
                )
            embedding_kwargs["weights"] = [pretrained_embeddings]
        self.embedding = Embedding(input_dim=vocab_size,
                            output_dim=embed_size,
                            trainable=True,
                            **embedding_kwargs)
        self.lstm = Bidirectional(LSTM(hidden_size, return_sequences=True, return_state=True))

    def call(self, x):
        """Return ``(outputs, state_h, state_c)`` for a batch of token ids."""
        x = self.embedding(x)
        outputs, forward_h, forward_c, backward_h, backward_c = self.lstm(x)
        # Merge both directions so the states can seed the decoder LSTM.
        state_h = tf.concat([forward_h, backward_h], axis=-1)
        state_c = tf.concat([forward_c, backward_c], axis=-1)
        return outputs, state_h, state_c

# Attention Layer
class Attention(tf.keras.layers.Layer):
    """Additive attention: ``score = V(tanh(W1(enc_outputs) + W2(hidden)))``."""

    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.W1 = Dense(hidden_size)
        self.W2 = Dense(hidden_size)
        self.V = Dense(1)

    def call(self, encoder_outputs, hidden):
        """Return ``(context_vector, attention_weights)``.

        ``hidden`` gets a length-1 axis inserted at position 1 so it broadcasts
        against ``encoder_outputs``; weights are normalised over axis 1 and the
        context vector is the weighted sum over that same axis.
        """
        hidden_with_time_axis = tf.expand_dims(hidden, 1)
        score = self.V(tf.nn.tanh(self.W1(encoder_outputs) + self.W2(hidden_with_time_axis)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * encoder_outputs
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights

# Decoder
class Decoder(Model):
    """Single-step attention decoder.

    The LSTM has ``hidden_size * 2`` units so that it can be initialised with
    the concatenated forward/backward states produced by ``Encoder``.
    """

    def __init__(self, vocab_size, embed_size, hidden_size):
        super(Decoder, self).__init__()
        self.embedding = Embedding(vocab_size, embed_size, trainable=True)
        self.lstm = LSTM(hidden_size * 2, return_sequences=True, return_state=True)
        self.fc = Dense(vocab_size)
        self.attention = Attention(hidden_size)

    def call(self, x, encoder_outputs, hidden, cell):
        """Run one decoding step.

        Returns ``(logits, state_h, state_c, attention_weights)``; ``logits``
        keeps the time axis of ``x``.
        """
        context_vector, attention_weights = self.attention(encoder_outputs, hidden)
        x = self.embedding(x)
        # Prepend the context vector to the embedded input along the feature axis.
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        outputs, state_h, state_c = self.lstm(x, initial_state=[hidden, cell])
        logits = self.fc(outputs)
        return logits, state_h, state_c, attention_weights

# Define the model.
# The encoder embedding is seeded with the FastText matrix, so its width is
# embedding_dim (not EMBED_SIZE). This cell therefore needs the cell that
# builds embedding_dim / embedding_matrix to have run first.
encoder = Encoder(src_vocab_size, embedding_dim, HIDDEN_SIZE,
                  pretrained_embeddings=embedding_matrix)
decoder = Decoder(tgt_vocab_size, EMBED_SIZE, HIDDEN_SIZE)

# Loss function
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    """Cross-entropy for one decoding step, with padding (id 0) zeroed out.

    ``pred`` carries a length-1 axis at position 1, which is squeezed away
    before the per-example loss is computed. Positions where ``real == 0``
    contribute zero; the result is the mean over all positions in ``real``.
    """
    # Squeeze the predictions to remove the extra dimension
    pred = tf.squeeze(pred, axis=1)
    loss = loss_object(real, pred)
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    mask = tf.cast(mask, dtype=loss.dtype)
    loss *= mask
    return tf.reduce_mean(loss)

# Optimizer
optimizer = tf.keras.optimizers.Adam()

NameError: name 'embedding_dim' is not defined

In [107]:
# Training step
@tf.function
def train_step(src, tgt):
    """Run one teacher-forced optimisation step on a batch.

    Args:
        src: Batch of padded source id sequences.
        tgt: Batch of padded target id sequences; column 0 is the start token
            and columns 1.. are the tokens to predict.

    Returns:
        The summed per-step loss divided by the target length ``tgt.shape[1]``.

    Side effects:
        Applies one optimizer update to the encoder and decoder variables.
    """
    loss = 0
    sos_id = tgt_tokenizer.word_index["<sos>"]
    with tf.GradientTape() as tape:
        encoder_outputs, enc_hidden, enc_cell = encoder(src)
        dec_hidden, dec_cell = enc_hidden, enc_cell
        # Size the start-token column from the batch actually received instead
        # of the global BATCH_SIZE, so a batch of any size can be processed.
        dec_input = tf.fill([tf.shape(tgt)[0], 1], sos_id)

        for t in range(1, tgt.shape[1]):
            predictions, dec_hidden, dec_cell, _ = decoder(dec_input, encoder_outputs, dec_hidden, dec_cell)
            loss += loss_function(tgt[:, t], predictions)
            # Teacher forcing: the ground-truth token is the next input.
            dec_input = tf.expand_dims(tgt[:, t], 1)

    batch_loss = loss / int(tgt.shape[1])
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss
# Training loop: EPOCHS full passes over `dataset`
EPOCHS = 5

for epoch in range(EPOCHS):
    # Running sum of the per-batch losses returned by train_step for this epoch.
    total_loss = 0
    for src, tgt in dataset:
        total_loss += train_step(src, tgt)

    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {total_loss.numpy():.4f}")

Epoch 1/5, Loss: 375.1225
Epoch 2/5, Loss: 101.5145
Epoch 3/5, Loss: 96.2576
Epoch 4/5, Loss: 93.8160
Epoch 5/5, Loss: 92.2122


In [132]:
# NOTE(review): as the recorded output of this cell shows, this call raises
# TypeError ("Could not locate class 'Encoder'") because the custom Encoder
# class is not registered with @keras.saving.register_keras_serializable().
# When it raises, `encoder` keeps whatever value it had before this cell ran.
encoder = tf.keras.models.load_model("/kaggle/working/enocder_more.keras")

TypeError: Could not locate class 'Encoder'. Make sure custom classes are decorated with `@keras.saving.register_keras_serializable()`. Full object config: {'module': None, 'class_name': 'Encoder', 'config': {'trainable': True, 'dtype': 'float32'}, 'registered_name': 'Encoder', 'build_config': {'input_shape': [64, 20]}}

In [122]:
def translate(sentence, encoder, decoder, src_tokenizer, tgt_tokenizer, max_tgt_len):
    """Greedy-decode ``sentence`` into a target token string.

    The sentence is converted to ids with ``src_tokenizer``, right-padded to
    the module-level ``max_src_len`` and encoded once. Decoding starts from
    ``<sos>`` and, for at most ``max_tgt_len`` steps, feeds the arg-max token
    back in until ``<eos>`` is predicted. Ids without an entry in
    ``tgt_tokenizer.index_word`` are rendered as ``"<oov>"``.

    Returns:
        The decoded tokens joined by single spaces (without ``<sos>``/``<eos>``).
    """
    padded_ids = pad_sequences(
        src_tokenizer.texts_to_sequences([sentence]),
        maxlen=max_src_len,
        padding="post",
    )
    encoder_outputs, state_h, state_c = encoder(tf.convert_to_tensor(padded_ids))

    # First decoder input is the start token; states come from the encoder.
    step_input = tf.expand_dims([tgt_tokenizer.word_index["<sos>"]], 0)

    decoded = []
    for _ in range(max_tgt_len):
        logits, state_h, state_c, _ = decoder(step_input, encoder_outputs, state_h, state_c)
        next_id = tf.argmax(logits[0, 0]).numpy()

        if next_id == tgt_tokenizer.word_index["<eos>"]:
            break

        decoded.append(tgt_tokenizer.index_word.get(next_id, "<oov>"))
        step_input = tf.expand_dims([next_id], 0)

    return " ".join(decoded)


In [109]:
input_sentence = "i'd like to get a small pepperoni and tuna pizza and i don't want cheese"

input_sentence = remove_stopwords(handle_negation(expnad_abb2(input_sentence)))

# Translate the input
predicted_output = translate(input_sentence, encoder, decoder, src_tokenizer, tgt_tokenizer, max_tgt_len)
print("Input Sentence:", input_sentence)
print("Predicted Output:", predicted_output)

Input Sentence: small pepperoni tuna pizza not_want not_cheese
Predicted Output: (order i'd like (pizzaorder (number a ) (size small ) (topping pepperoni ) (topping tuna ) pizza ) and (topping mozzarella cheese ) ) )


In [110]:
df_test  = pd.read_csv("/kaggle/input/pizza-test/test_set.csv")
df_test

Unnamed: 0,id,order
0,0,let me prefer a extra large sauteed spinach an...
1,1,could you give me a exta large pizza with roas...
2,2,can you handle this order a pizza make it etra...
3,3,let me try 5 lnch pizzas with garlic onions an...
4,4,i need you to order me a pie in med size along...
...,...,...
3995,3995,i'd have medium pizza along with thin crust ol...
3996,3996,i want a chicken and pepperoni pie with extra ...
3997,3997,get me a sausage pizza with extra cheese and n...
3998,3998,i'd like a large pepperoni pizza with extra ch...


In [111]:
# Preprocess exactly as the training data was: expand contractions, then tag
# negations, then remove stop words. Removing stop words first deletes tokens
# such as "want" or "the" before they can receive the "not_" prefix, which
# yields inputs that differ from what the model was trained on.
df_test['input'] = df_test['order'].apply(expnad_abb2).apply(handle_negation).apply(remove_stopwords)

In [112]:
df_test

Unnamed: 0,id,order,input
0,0,let me prefer a extra large sauteed spinach an...,extra large sauteed spinach tomatoes pizza not...
1,1,could you give me a exta large pizza with roas...,give exta large pizza roasted red peppers chee...
2,2,can you handle this order a pizza make it etra...,pizza etra large new york style tomato sauce n...
3,3,let me try 5 lnch pizzas with garlic onions an...,try 5 lnch pizzas garlic onions pesto
4,4,i need you to order me a pie in med size along...,pie med size along pesto pan ham
...,...,...,...
3995,3995,i'd have medium pizza along with thin crust ol...,medium pizza along thin crust olives not_onions
3996,3996,i want a chicken and pepperoni pie with extra ...,chicken pepperoni pie extra cheese
3997,3997,get me a sausage pizza with extra cheese and n...,sausage pizza extra cheese not_pesto
3998,3998,i'd like a large pepperoni pizza with extra ch...,large pepperoni pizza extra cheese two large c...


In [123]:
from tqdm import tqdm

y_pred = []

# Add tqdm for the progress bar
for x_test in tqdm(df_test['input'], desc="Processing test inputs"):
    predicted_output = translate(x_test, encoder, decoder, src_tokenizer, tgt_tokenizer, max_tgt_len)
    y_pred.append(predicted_output)

Processing test inputs: 100%|██████████| 4000/4000 [24:33<00:00,  2.71it/s]


In [124]:
# Write the test predictions as (id, output) rows, with id = position in y_pred.
# A dedicated name is used so that `df` keeps holding the training frame that
# earlier cells (df.copy(), df["train.TOP"]) depend on when they are re-run.
test_predictions = pd.DataFrame({"id": range(len(y_pred)), "output": y_pred})
test_predictions.to_csv("9.csv", index=False)

In [125]:
import re
import csv

# Function to capitalize keywords
def capitalize_keywords(input_text):
    """Upper-case the node labels of a bracketed parse string.

    A label is only converted when it directly follows an opening parenthesis,
    e.g. ``"(order"`` -> ``"(ORDER"``. Ordinary words that merely share a
    label's spelling ("order", "size", ...) elsewhere in the string are left
    untouched.

    Args:
        input_text: Lower-cased parse string, e.g.
            ``"(order (pizzaorder (number a ) pizza ) )"``.

    Returns:
        The string with every recognised label in upper case.
    """
    # Labels to convert. "not" and "complex_topping" are included as well so
    # negated and quantity-qualified toppings are converted too.
    keywords = ["order", "pizzaorder", "number", "size", "style", "topping", "quantity", "drinkorder", "containertype", "drinktype", "volume", "not", "complex_topping"]

    # "(" anchors the match to a label position; \b stops at the label's end.
    label_pattern = re.compile(r"\((" + "|".join(re.escape(keyword) for keyword in keywords) + r")\b")
    return label_pattern.sub(lambda match: "(" + match.group(1).upper(), input_text)

# Source CSV (columns "id" and "output") and destination for the converted copy
input_csv_file = "/kaggle/working/9.csv"
output_csv_file = "9-new.csv"

# Stream the rows through capitalize_keywords, keeping the input's columns
with open(input_csv_file, "r") as infile, open(output_csv_file, "w", newline="") as outfile:
    reader = csv.DictReader(infile)
    writer = csv.DictWriter(outfile, fieldnames=reader.fieldnames)
    writer.writeheader()
    # Only the "output" field is rewritten; all other fields pass through.
    writer.writerows(
        {**row, "output": capitalize_keywords(row["output"])} for row in reader
    )

print(f"Capitalized text saved to {output_csv_file}.")


Capitalized text saved to 9-new.csv.


In [126]:
df_dev = pd.read_json("/kaggle/input/pizza-dataset/PIZZA_dev.json", lines=True)
df_dev

Unnamed: 0,dev.SRC,dev.EXR,dev.TOP,dev.PCFG_ERR
0,i want to order two medium pizzas with sausage...,(ORDER (PIZZAORDER (NUMBER 2 ) (SIZE MEDIUM ) ...,(ORDER i want to order (PIZZAORDER (NUMBER two...,False
1,five medium pizzas with tomatoes and ham,(ORDER (PIZZAORDER (NUMBER 5 ) (SIZE MEDIUM ) ...,(ORDER (PIZZAORDER (NUMBER five ) (SIZE medium...,False
2,i need to order one large vegetarian pizza wit...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER i need to order (PIZZAORDER (NUMBER one...,False
3,i'd like to order a large onion and pepper pizza,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER i'd like to order (PIZZAORDER (NUMBER a...,False
4,i'll have one pie along with pesto and ham but...,(ORDER (PIZZAORDER (NOT (TOPPING OLIVES ) ) (N...,(ORDER i'll have (PIZZAORDER (NUMBER one ) pie...,False
...,...,...,...,...
343,i would like to get five mushroom and onion pi...,(ORDER (PIZZAORDER (NOT (TOPPING SAUSAGE ) ) (...,(ORDER i would like to get (PIZZAORDER (NUMBER...,False
344,order one large pizza with sausage and mushrooms,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER order (PIZZAORDER (NUMBER one ) (SIZE l...,False
345,order a pizza with sausage peppers and no chicken,(ORDER (PIZZAORDER (NOT (TOPPING CHICKEN ) ) (...,(ORDER order (PIZZAORDER (NUMBER a ) pizza wit...,False
346,place a rush order for two large pepperoni piz...,(ORDER (DRINKORDER (CONTAINERTYPE CAN ) (DRINK...,(ORDER place a rush order for (PIZZAORDER (NUM...,False


In [127]:
# Preprocess exactly as the training data was: expand contractions, then tag
# negations, then remove stop words. Removing stop words first deletes tokens
# such as "want" or "the" before they can receive the "not_" prefix, which
# yields inputs that differ from what the model was trained on.
df_dev['input'] = df_dev['dev.SRC'].apply(expnad_abb2).apply(handle_negation).apply(remove_stopwords)

In [128]:
from tqdm import tqdm

y_pred_dev = []

# Add tqdm for the progress bar
for x_test in tqdm(df_dev['input'], desc="Processing test inputs"):
    predicted_output = translate(x_test, encoder, decoder, src_tokenizer, tgt_tokenizer, max_tgt_len)
    y_pred_dev.append(predicted_output)

Processing test inputs: 100%|██████████| 348/348 [02:00<00:00,  2.88it/s]


In [129]:
# Write the dev predictions as (id, output) rows, with id = position in y_pred_dev.
# A dedicated name is used so that `df` is not rebound to a predictions table,
# which would break earlier cells that read it as the training frame.
dev_predictions = pd.DataFrame({"id": range(len(y_pred_dev)), "output": y_pred_dev})
dev_predictions.to_csv("dev-new.csv", index=False)

In [130]:
import re
import csv

# Function to capitalize keywords
def capitalize_keywords(input_text):
    """Upper-case the node labels of a bracketed parse string.

    A label is only converted when it directly follows an opening parenthesis,
    e.g. ``"(order"`` -> ``"(ORDER"``. Ordinary words that merely share a
    label's spelling ("order", "size", ...) elsewhere in the string are left
    untouched.

    Args:
        input_text: Lower-cased parse string, e.g.
            ``"(order (pizzaorder (number a ) pizza ) )"``.

    Returns:
        The string with every recognised label in upper case.
    """
    # Labels to convert. "not" and "complex_topping" are included as well so
    # negated and quantity-qualified toppings are converted too.
    keywords = ["order", "pizzaorder", "number", "size", "style", "topping", "quantity", "drinkorder", "containertype", "drinktype", "volume", "not", "complex_topping"]

    # "(" anchors the match to a label position; \b stops at the label's end.
    label_pattern = re.compile(r"\((" + "|".join(re.escape(keyword) for keyword in keywords) + r")\b")
    return label_pattern.sub(lambda match: "(" + match.group(1).upper(), input_text)

# Source CSV (columns "id" and "output") and destination for the converted copy
input_csv_file = "/kaggle/working/dev-new.csv"
output_csv_file = "dev_output-new.csv"

# Stream the rows through capitalize_keywords, keeping the input's columns
with open(input_csv_file, "r") as infile, open(output_csv_file, "w", newline="") as outfile:
    reader = csv.DictReader(infile)
    writer = csv.DictWriter(outfile, fieldnames=reader.fieldnames)
    writer.writeheader()
    # Only the "output" field is rewritten; all other fields pass through.
    writer.writerows(
        {**row, "output": capitalize_keywords(row["output"])} for row in reader
    )

print(f"Capitalized text saved to {output_csv_file}.")


Capitalized text saved to dev_output-new.csv.
