## Imports


In [None]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.utils import to_categorical


from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Embedding,
    LSTM,
    Dense,
    Input,
)
from nltk.corpus import stopwords

# Show every column when a DataFrame is displayed instead of eliding the middle ones.
pd.set_option("display.max_columns", None)
# Seed NumPy's and TensorFlow's global random generators with the same fixed value
# so repeated runs start from the same random state.
np.random.seed(42)
tf.random.set_seed(42)

## Constants


In [None]:
# Path prefix that is string-concatenated with the dataset file names when loading.
dataset_dir = "dataset/"
# Contraction fragment -> expansion. `expnad_abb` replaces every occurrence of a
# key with a space followed by the mapped value.
CONTRACTIONS = {
    "n't": "not",
    "'s": "is",
    "'re": "are",
    "'m": "am",
    "'ll": "will",
    "'ve": "have",
    "'d": "would",
    "'em": "them",
    "'all": "all",
    "'cause": "because",
    "'clock": "oclock",
    "'tis": "it is",
    "'twas": "it was",
    "'tween": "between",
    "'twere": "it were",
    "'twould": "it would",
    "'twixt": "betwixt",
    "'twill": "it will",
    "'til": "until",
    "'bout": "about",
    "'cept": "except",
    "'cos": "because",
    "'fore": "before",
    "'round": "around",
    "'n'": "and",
    "'neath": "beneath",
    "'nother": "another",
    "'nuff": "enough",
}
# Words that switch `handle_negation` into negation mode. They are also kept out
# of the stop-word set below so that stop-word removal does not delete them.
negation_words = {
    "no",
    "not",
    "none",
    "never",
    "without",
    "avoid",
    "neither",
    "nor",
    "hate",
    "hold",
}

# NOTE(review): `vectorizer` is not referenced by any other cell visible here.
vectorizer = TfidfVectorizer()
# tokenizer = MWETokenizer() # Multi-Word Expression Tokenizer
# Words that end a negation scope in `handle_negation` (and are dropped from its
# output); like the negation words they are excluded from the stop-word set.
stop_negation_words = {"and", "but"}
# NLTK's English stop words, minus the negation-related words above, plus a few
# extra filler words.
stop_words = set(stopwords.words("english"))
stop_words = stop_words - negation_words - stop_negation_words
stop_words.update({"would", "like", "get", "want"})

## Text Processing


In [None]:
def clean_text(text):
    """Normalise raw text for downstream processing.

    Every character that is neither a word character nor an apostrophe is
    replaced by a space, runs of whitespace are collapsed to a single space,
    and the result is lowercased with surrounding whitespace removed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    without_symbols = re.sub(r"[^\w']", " ", text)  # apostrophes are kept
    single_spaced = re.sub(r"\s+", " ", without_symbols)
    return single_spaced.lower().strip()

In [None]:
def expnad_abb(text):
    """Expand contraction fragments using the ``CONTRACTIONS`` table.

    Every occurrence of a ``CONTRACTIONS`` key in ``text`` is replaced by a
    single space followed by the mapped expansion (for example ``"n't"``
    becomes ``" not"``). Keys are matched literally and anywhere in the text.

    Args:
        text: The string to expand.

    Returns:
        The text with all matching fragments expanded.
    """
    # Escape each key so the apostrophes and letters are matched literally.
    alternatives = "|".join(map(re.escape, CONTRACTIONS))
    matcher = re.compile(r"(" + alternatives + r")")

    def _expansion(match):
        return " " + CONTRACTIONS[match.group()]

    return matcher.sub(_expansion, text)

In [None]:
def remove_stopwords(text):
    """Drop every whitespace-separated word whose lowercase form is in ``stop_words``.

    Args:
        text: The string to filter.

    Returns:
        The remaining words joined by single spaces, in their original order
        and original casing.
    """
    kept = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

In [None]:
def handle_negation(text):
    """Mark the words that fall inside a negation scope with a ``NOT_`` prefix.

    The text is split on whitespace and scanned left to right:

    * a word found in ``negation_words`` opens a negation scope and is itself
      dropped from the output;
    * while a scope is open, each word is emitted as ``NOT_<word>``;
    * the scope closes on a word that does not start with an ASCII letter, on
      a word in ``stop_negation_words``, or right after the word ``"much"``;
    * words in ``stop_negation_words`` are never emitted.

    Args:
        text: The string to transform.

    Returns:
        The transformed words joined by single spaces.
    """
    result = []
    negating = False

    for token in text.split():
        lowered = token.lower()

        # A negation trigger opens the scope and is not copied to the output.
        if lowered in negation_words:
            negating = True
            continue

        is_scope_stopper = lowered in stop_negation_words
        starts_with_letter = re.match(r"[a-zA-Z]+", token)
        if negating and (is_scope_stopper or not starts_with_letter):
            negating = False

        if negating:
            result.append(f"NOT_{token}")
            # "much" is negated itself but closes the scope for what follows.
            if token == "much":
                negating = False
        elif not is_scope_stopper:
            result.append(token)

    return " ".join(result)

In [None]:
def add_tags(sentence):
    """Wrap a sentence in start (``<s>``) and end (``</s>``) markers.

    Args:
        sentence: The string to wrap; surrounding whitespace is removed first.

    Returns:
        The stripped sentence with ``<s>`` before it and ``</s>`` after it,
        each separated from the sentence by one space.
    """
    trimmed = sentence.strip()
    return f"<s> {trimmed} </s>"

## Read Data


In [None]:
# Read the training split as JSON Lines, loading only the first 10,000 records.
df_train = pd.read_json(dataset_dir + "PIZZA_train.json", lines=True, nrows=10000)

# Training Set Preparation


### Take portion of data


In [None]:
# The SRC column feeds the encoder; the EXR column is the target the decoder learns to emit.
input_sentences = df_train["train.SRC"]
output_sentences = df_train["train.EXR"]

In [None]:
# Wrap every target in "<s> ... </s>" so the decoder has explicit start and end tokens.
output_sentences = output_sentences.apply(add_tags)

### Tokenize input


In [None]:
# Build the source vocabulary from the training inputs and convert every
# sentence into a list of integer word indices.
input_tokenizer = Tokenizer()
input_tokenizer.fit_on_texts(input_sentences)
input_sequences = input_tokenizer.texts_to_sequences(input_sentences)
# +1 because word indices start at 1; index 0 is left for padding.
input_vocab_size = len(input_tokenizer.word_index) + 1

### Tokenize output


In [None]:
# filters="" turns off the tokenizer's default character stripping, so tokens
# such as "<s>" and "</s>" are kept intact in the target vocabulary.
output_tokenizer = Tokenizer(filters="")
output_tokenizer.fit_on_texts(output_sentences)
output_sequences = output_tokenizer.texts_to_sequences(output_sentences)
# +1 because word indices start at 1; index 0 is left for padding.
output_vocab_size = len(output_tokenizer.word_index) + 1

## Pad the sequence

### Pad every input and output sequence to one common length: the longest sequence found in either set


In [None]:
# Longest tokenised sentence on each side, and the common length both sides are padded to.
max_input_length = max(map(len, input_sequences))
max_output_length = max(map(len, output_sequences))
max_length = max(max_input_length, max_output_length)

# Zero-pad at the end ("post") so every sequence has exactly `max_length` entries.
input_sequences = pad_sequences(input_sequences, padding="post", maxlen=max_length)
output_sequences = pad_sequences(output_sequences, padding="post", maxlen=max_length)

# Model Building


### Building the encoder

We mainly have <span style="color:lime">3</span> layers

1. The input layer
2. The embedding layer, which maps each integer word index to a dense vector
3. The LSTM layer to fit the data and generate the context vector

Input --> Embeddings --> LSTM


In [None]:
# Encoder
encoder_input = Input(shape=(max_length,))
encoder_embedding = Embedding(input_vocab_size, 128)(encoder_input)
# With return_state=True the LSTM returns its output followed by the final
# hidden state and the final cell state, in that order.
encoder_lstm, state_short_term, state_long_term = LSTM(256, return_state=True)(
    encoder_embedding
)
# The context vector handed to the decoder is the pair [hidden state, cell state].
# Both states must be forwarded: passing the hidden state twice would drop the
# cell state and initialise the decoder's cell state with the wrong tensor.
encoder_context_vector = [state_short_term, state_long_term]

### Building the decoder

We mainly have <span style="color:lime">4</span> layers

1. The input layer
2. The embedding layer, which maps each integer word index to a dense vector
3. The LSTM layer, which generates the output sequence; its initial hidden and cell states are set to the encoder's final states [ this is the core idea of the encoder-decoder architecture ]
4. The dense layer for getting the output


In [None]:
# Decoder
# The time dimension is left unspecified (None) so this one input tensor serves
# both training (sequences of max_length - 1 tokens) and the step-by-step
# inference decoder, which reuses it and is fed a single token per call.
decoder_input = Input(shape=(None,))
decoder_embedding = Embedding(output_vocab_size, 128)(decoder_input)
# return_sequences=True yields one output per time step; return_state=True
# additionally exposes the final states, which the inference decoder needs.
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
# The returned states are not needed for training, only the per-step outputs.
decoder_output, _, _ = decoder_lstm(
    decoder_embedding, initial_state=encoder_context_vector
)
# Per-step probability distribution over the target vocabulary.
decoder_dense = Dense(output_vocab_size, activation="softmax")
decoder_output = decoder_dense(decoder_output)

### Formulate the full model


In [None]:
# Training model: takes the source sequence and the shifted target sequence and
# predicts a distribution over the target vocabulary at every decoder step.
model = Model(inputs=[encoder_input, decoder_input], outputs=decoder_output)
# The sparse loss lets the labels stay as integer token ids (no one-hot encoding).
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# Print the layer-by-layer overview of the model
model.summary()

In [None]:
# Teacher forcing: the decoder input is the target shifted by one step, so the
# token at position t of the input is paired with the token at t + 1 as its label.
decoder_input_data = output_sequences[:, :-1]  # Drop the last column
decoder_output_data = output_sequences[:, 1:]  # Drop the first column

### Fit the model


In [None]:
# Train on (source sequence, shifted target) pairs against the next-token labels.
model.fit(
    [input_sequences, decoder_input_data],
    decoder_output_data,
    batch_size=32,
    epochs=10,
)

# Validation Set Preparation


In [None]:
# Read the whole development split (JSON Lines) used for evaluation.
df_dev = pd.read_json(dataset_dir + "PIZZA_dev.json", lines=True)

In [None]:
# Source sentences to decode and the reference outputs they are scored against.
validation_input_sentences = df_dev["dev.SRC"]
validation_output_sentences = df_dev["dev.EXR"]

In [None]:
# Inference-time encoder: maps a padded source sequence to the pair of states
# used to initialise the decoder.
encoder_model = Model(encoder_input, encoder_context_vector)

In [None]:
# Inference-time decoder. It reuses the trained embedding, LSTM and dense layers
# but takes its initial states as explicit inputs, so decoding can proceed one
# step at a time while the states are carried over between calls.

# Size the state inputs from the shared LSTM so they always match its width.
decoder_state_input_h = Input(shape=(decoder_lstm.units,))
decoder_state_input_c = Input(shape=(decoder_lstm.units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_lstm_output, state_h, state_c = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs
)
decoder_states = [state_h, state_c]
# A dedicated name keeps the training graph's `decoder_output` intact, so the
# cell that builds the training model can still be re-run after this one.
decoder_inference_output = decoder_dense(decoder_lstm_output)

# Inputs: [token ids, hidden state, cell state].
# Outputs: [token probabilities, new hidden state, new cell state].
decoder_model = Model(
    [decoder_input] + decoder_states_inputs,
    [decoder_inference_output] + decoder_states,
)

In [None]:
def decode_sequence(input_seq, max_decode_steps=None):
    """Greedily decode one tokenised source sentence into an output string.

    The source is padded to ``max_length`` and encoded once. Starting from the
    ``<s>`` token, the decoder is then run one step at a time, always taking
    the most probable next token and feeding it back in together with the
    updated LSTM states.

    Decoding stops when the end token ``</s>`` is produced, when the padding
    index 0 is predicted, or after ``max_decode_steps`` steps. The step limit
    guarantees termination even if the model never emits an end token.

    Args:
        input_seq: A list holding one list of source token ids, as returned by
            ``input_tokenizer.texts_to_sequences([sentence])``.
        max_decode_steps: Upper bound on the number of generated tokens.
            Defaults to ``2 * max_length``.

    Returns:
        The generated tokens, each preceded by a single space. The string
        therefore starts with a space and includes ``</s>`` when the end token
        was reached. It is empty if nothing was generated.
    """
    if max_decode_steps is None:
        max_decode_steps = 2 * max_length

    # Pad the source to the fixed length the encoder was built for.
    input_seq = pad_sequences(input_seq, maxlen=max_length, padding="post")
    states_value = encoder_model.predict(input_seq, verbose=0)

    # The first decoder input is the start token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = output_tokenizer.word_index["<s>"]

    decoded_tokens = []
    for _ in range(max_decode_steps):
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value, verbose=0
        )

        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        # Index 0 is padding and has no entry in index_word.
        if sampled_token_index == 0:
            break

        sampled_token = output_tokenizer.index_word[sampled_token_index]
        decoded_tokens.append(sampled_token)
        if sampled_token == "</s>":
            break

        # Feed the chosen token and the new states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return "".join(" " + token for token in decoded_tokens)

In [None]:
accuracy = 0
temp = validation_output_sentences.tolist()
# Drop the first seven characters and the last character of every reference,
# then trim and lowercase it.
# NOTE(review): presumably this removes an outer "(ORDER " ... ")" wrapper - confirm against the data.
expected = [sen[7:-1].strip().lower() for sen in temp]

In [None]:
from nltk.translate.bleu_score import sentence_bleu


# Compare predictions with actual outputs

# Reset the counters here so re-running this cell does not accumulate results.
accuracy = 0
total_bleu = 0.0
sample_count = len(validation_input_sentences)

for val_input, reference_text, expected_text in zip(
    validation_input_sentences, validation_output_sentences, expected
):
    val_seq = input_tokenizer.texts_to_sequences([val_input])
    predicted_output = decode_sequence(val_seq)

    # The output tokenizer lowercases its vocabulary, so the reference is
    # lowercased as well; otherwise no predicted token could ever match it.
    reference = reference_text.lower().split()
    # Remove the start/end markers, which are not part of the reference.
    candidate = [
        token for token in predicted_output.split() if token not in ("<s>", "</s>")
    ]

    bleu_score = sentence_bleu([reference], candidate)
    total_bleu += bleu_score

    print("Expected:", expected_text)
    print("Predicted:", predicted_output)

    # Exact match on token sequences. The model is trained on the complete
    # target strings, so the prediction is compared with the complete reference;
    # comparing tokens also ignores differences in whitespace.
    if candidate == reference:
        accuracy += 1

if sample_count:
    print(accuracy / sample_count)
    print("Mean BLEU:", total_bleu / sample_count)
else:
    print("No validation samples to evaluate.")