## Imports


In [1]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer


from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TreebankWordTokenizer, word_tokenize, MWETokenizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

pd.set_option("display.max_columns", None)
np.random.seed(42)
tf.random.set_seed(42)

## Constants


In [2]:
dataset_dir = "dataset/"
CONTRACTIONS = {
    "n't": "not",
    "'s": "is",
    "'re": "are",
    "'m": "am",
    "'ll": "will",
    "'ve": "have",
    "'d": "would",
    "'em": "them",
    "'all": "all",
    "'cause": "because",
    "'clock": "oclock",
    "'tis": "it is",
    "'twas": "it was",
    "'tween": "between",
    "'twere": "it were",
    "'twould": "it would",
    "'twixt": "betwixt",
    "'twill": "it will",
    "'til": "until",
    "'bout": "about",
    "'cept": "except",
    "'cos": "because",
    "'fore": "before",
    "'round": "around",
    "'n'": "and",
    "'neath": "beneath",
    "'nother": "another",
    "'nuff": "enough",
}
negation_words = {
    "no",
    "not",
    "none",
    "never",
    "without",
    "avoid",
    "neither",
    "nor",
    "hate",
    "hold",
}
tokenizer = TreebankWordTokenizer()  # Treebank Word Tokenizer
lemmatizer = WordNetLemmatizer()  # WordNet Lemmatizer
vectorizer = TfidfVectorizer()
# tokenizer = MWETokenizer() # Multi-Word Expression Tokenizer
stemmer = (
    PorterStemmer()
)  # Porter Stemmer  # changes ordering to order and cheese to chees it may help us
# stemmer = SnowballStemmer("english") # ! The same as porter stemmer
stop_negation_words = {"and", "but"}
stop_words = set(stopwords.words("english"))
stop_words = stop_words - negation_words - stop_negation_words
stop_words.update({"would", "like", "get", "want"})

## Text Processing


- Clean text from any unneeded characters


In [3]:
def clean_text(text):
    text = re.sub(r"[^\w']", " ", text)  # Remove non-word characters
    text = re.sub(r"\s+", " ", text)  # Remove multiple spaces
    text = text.lower().strip()  # Lowercase and strip whitespace
    return text

In [4]:
def expnad_abb(text):
    text = text.replace("can't", "can not")
    text = text.replace("won't", "will not")
    text = text.replace("n't", " not")
    text = text.replace("'ll", " will")
    text = text.replace("'ve", " have")
    text = text.replace("'re", " are")
    text = text.replace("'m", " am")
    text = text.replace("'d", " would")
    return text


def expnad_abb2(text):

    pattern = re.compile(
        r"(" + "|".join(re.escape(key) for key in CONTRACTIONS.keys()) + r")"
    )
    expanded_text = pattern.sub(lambda x: " " + CONTRACTIONS[x.group()], text)
    return expanded_text

In [5]:
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return " ".join(filtered_words)

In [6]:
def handle_negation(text):
    # Look for patterns like "no [word1] [word2] ..." and transform them
    words = text.split()
    transformed_words = []
    negation_flag = False  # To track if we're negating

    for i, word in enumerate(words):
        if word.lower() in negation_words:  # Trigger negation
            negation_flag = True
            continue  # Skip adding "no" to the transformed text
        elif negation_flag and (
            not re.match(r"[a-zA-Z]+", word) or word.lower() in stop_negation_words
        ):  # End negation on punctuation or 'and'
            negation_flag = False

        # Prefix "NOT_" if negation flag is set
        if negation_flag:
            transformed_words.append(f"NOT_{word}")
            if word in ["much"]:
                negation_flag = False
        else:
            if word.lower() not in stop_negation_words:
                transformed_words.append(word)

    return " ".join(transformed_words)

In [7]:
def tokenize_and_lemmatize(text: str):
    tokens = tokenizer.tokenize(text)
    final_string = ""
    # stemmed_tokens = []
    for token in tokens:
        stemmed_token = stemmer.stem(token, False) + " "
        # I think this is better than lemmatization
        final_string += stemmed_token + " "
        # stemmed_tokens.append(stemmed_token)
        # final_string += lemmatizer.lemmatize(token) + " "  # Didn't do anything
    return final_string

In [None]:
cleaned_text = handle_negation(
    remove_stopwords(
        expnad_abb2(
            clean_text(
                # "i want a lunch size pizza with no apple wood bacon but extra cheese" #Success
                # "I'll order some pizza for lunch and I don't eat spaghetti I eat Pasta WIth white sauce" # Failed
                # "i didn't eat from yesterday can you please order me 2 pizzas? I don't love pepperoni What about mushroom?" #Failed
                "I'm not ordering pizza with extra cheese I want Soda and I don't like your organization"
                # "I need pizza with extra cheese without cucumber or tomatoes add only pepperoni 'cause I love peperoni"  # Failed
            )
        )
    )
)
print(cleaned_text)

NOT_ordering NOT_pizza NOT_extra NOT_cheese NOT_soda NOT_organization


In [9]:
stemmed_tokens = tokenize_and_lemmatize(cleaned_text)
# print("Tokens:", tokens)
print("Stemmed Tokens:", stemmed_tokens)
# print("Processed String:", processed_string)

Stemmed Tokens: NOT_order  NOT_pizza  NOT_extra  NOT_chees  NOT_soda  NOT_organ  


In [10]:
df_train = pd.read_json(dataset_dir + "PIZZA_train.json", lines=True)
df_dev = pd.read_json(dataset_dir + "PIZZA_dev.json", lines=True)

In [11]:
df_train.head()

Unnamed: 0,train.SRC,train.EXR,train.TOP,train.TOP-DECOUPLED
0,can i have a large bbq pulled pork,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
1,large pie with green pepper and with extra pep...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER (PIZZAORDER (SIZE large ) pie with (TOP...,(ORDER (PIZZAORDER (SIZE large ) (TOPPING gree...
2,i'd like a large vegetarian pizza,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER i'd like (PIZZAORDER (NUMBER a ) (SIZE ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
3,party size stuffed crust pie with american che...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZ...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...
4,can i have one personal sized artichoke,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PERSONAL_...,(ORDER can i have (PIZZAORDER (NUMBER one ) (S...,(ORDER (PIZZAORDER (NUMBER one ) (SIZE persona...


In [12]:
df_dev.head()

Unnamed: 0,dev.SRC,dev.EXR,dev.TOP,dev.PCFG_ERR
0,i want to order two medium pizzas with sausage...,(ORDER (PIZZAORDER (NUMBER 2 ) (SIZE MEDIUM ) ...,(ORDER i want to order (PIZZAORDER (NUMBER two...,False
1,five medium pizzas with tomatoes and ham,(ORDER (PIZZAORDER (NUMBER 5 ) (SIZE MEDIUM ) ...,(ORDER (PIZZAORDER (NUMBER five ) (SIZE medium...,False
2,i need to order one large vegetarian pizza wit...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER i need to order (PIZZAORDER (NUMBER one...,False
3,i'd like to order a large onion and pepper pizza,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER i'd like to order (PIZZAORDER (NUMBER a...,False
4,i'll have one pie along with pesto and ham but...,(ORDER (PIZZAORDER (NOT (TOPPING OLIVES ) ) (N...,(ORDER i'll have (PIZZAORDER (NUMBER one ) pie...,False


In [13]:
df_train["train.SRC"] = df_train["train.SRC"].apply(clean_text)
df_dev["dev.SRC"] = df_dev["dev.SRC"].apply(clean_text)

df_train.head()

Unnamed: 0,train.SRC,train.EXR,train.TOP,train.TOP-DECOUPLED
0,can i have a large bbq pulled pork,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
1,large pie with green pepper and with extra pep...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER (PIZZAORDER (SIZE large ) pie with (TOP...,(ORDER (PIZZAORDER (SIZE large ) (TOPPING gree...
2,i'd like a large vegetarian pizza,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER i'd like (PIZZAORDER (NUMBER a ) (SIZE ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
3,party size stuffed crust pie with american che...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZ...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...
4,can i have one personal sized artichoke,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PERSONAL_...,(ORDER can i have (PIZZAORDER (NUMBER one ) (S...,(ORDER (PIZZAORDER (NUMBER one ) (SIZE persona...


In [14]:
# lemmatizer = WordNetLemmatizer()
# df_train['train.SRC'] = df_train['train.SRC'].apply(lemmatizer.lemmatize)
# df_train.head()

In [15]:
df_train["train.SRC"] = df_train["train.SRC"].apply(expnad_abb2)
df_dev["dev.SRC"] = df_dev["dev.SRC"].apply(expnad_abb2)

df_train.head()

Unnamed: 0,train.SRC,train.EXR,train.TOP,train.TOP-DECOUPLED
0,can i have a large bbq pulled pork,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
1,large pie with green pepper and with extra pep...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER (PIZZAORDER (SIZE large ) pie with (TOP...,(ORDER (PIZZAORDER (SIZE large ) (TOPPING gree...
2,i would like a large vegetarian pizza,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER i'd like (PIZZAORDER (NUMBER a ) (SIZE ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
3,party size stuffed crust pie with american che...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZ...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...
4,can i have one personal sized artichoke,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PERSONAL_...,(ORDER can i have (PIZZAORDER (NUMBER one ) (S...,(ORDER (PIZZAORDER (NUMBER one ) (SIZE persona...


In [16]:
df_train["train.SRC"] = df_train["train.SRC"].apply(remove_stopwords)
df_dev["dev.SRC"] = df_dev["dev.SRC"].apply(remove_stopwords)

df_train.head()

Unnamed: 0,train.SRC,train.EXR,train.TOP,train.TOP-DECOUPLED
0,large bbq pulled pork,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
1,large pie green pepper and extra peperonni,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER (PIZZAORDER (SIZE large ) pie with (TOP...,(ORDER (PIZZAORDER (SIZE large ) (TOPPING gree...
2,large vegetarian pizza,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER i'd like (PIZZAORDER (NUMBER a ) (SIZE ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
3,party size stuffed crust pie american cheese a...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZ...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...
4,one personal sized artichoke,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PERSONAL_...,(ORDER can i have (PIZZAORDER (NUMBER one ) (S...,(ORDER (PIZZAORDER (NUMBER one ) (SIZE persona...


In [17]:
df_train["train.SRC"] = df_train["train.SRC"].apply(handle_negation)
df_dev["dev.SRC"] = df_dev["dev.SRC"].apply(handle_negation)

df_train.head()

Unnamed: 0,train.SRC,train.EXR,train.TOP,train.TOP-DECOUPLED
0,large bbq pulled pork,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
1,large pie green pepper extra peperonni,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER (PIZZAORDER (SIZE large ) pie with (TOP...,(ORDER (PIZZAORDER (SIZE large ) (TOPPING gree...
2,large vegetarian pizza,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER i'd like (PIZZAORDER (NUMBER a ) (SIZE ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
3,party size stuffed crust pie american cheese m...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZ...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...
4,one personal sized artichoke,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PERSONAL_...,(ORDER can i have (PIZZAORDER (NUMBER one ) (S...,(ORDER (PIZZAORDER (NUMBER one ) (SIZE persona...


In [18]:
df_train["train.SRC"] = df_train["train.SRC"].apply(tokenize_and_lemmatize)
df_dev["dev.SRC"] = df_dev["dev.SRC"].apply(tokenize_and_lemmatize)

df_train.head()

Unnamed: 0,train.SRC,train.EXR,train.TOP,train.TOP-DECOUPLED
0,larg bbq pull pork,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
1,larg pie green pepper extra peperonni,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER (PIZZAORDER (SIZE large ) pie with (TOP...,(ORDER (PIZZAORDER (SIZE large ) (TOPPING gree...
2,larg vegetarian pizza,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER i'd like (PIZZAORDER (NUMBER a ) (SIZE ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
3,parti size stuf crust pie american chees...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZ...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...
4,one person size artichok,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PERSONAL_...,(ORDER can i have (PIZZAORDER (NUMBER one ) (S...,(ORDER (PIZZAORDER (NUMBER one ) (SIZE persona...


### TF-IDF


In [None]:
tf_idf_data = df_train["train.SRC"]

tf_idf_data = vectorizer.fit_transform(tf_idf_data)
# tf_idf_data = tf_idf_data.apply(vectorizer.fit_transform)
# X_train_tfidf = vectorizer.fit_transform(temp_data)
# X_test_tfidf = vectorizer.transform(temp_data)
# print(X_train_tfidf)
# print(tf_idf_data)
# df_train.head()

### Bag of words


In [20]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the CountVectorizer
count_vectroizer = CountVectorizer()

bow = df_train["train.SRC"]

# Fit and transform the corpus to a document-term matrix
X = count_vectroizer.fit_transform(bow)

vocab = count_vectroizer.get_feature_names_out()

X_dense = X.toarray()
# Print the vocabulary and document-term matrix
print("Vocabulary:", vocab)
print("Document-Term Matrix:\n", X_dense)

Vocabulary: ['10' '11' '12' '13' '14' '15' '16' '20' '200' '500' 'ale' 'alfredo'
 'also' 'american' 'anchovi' 'appl' 'applewood' 'artichok' 'arugula'
 'bacon' 'balsam' 'balzam' 'banana' 'barbecu' 'basil' 'bay' 'bbq' 'bean'
 'beef' 'big' 'bit' 'black' 'bottl' 'broccoli' 'brocoli' 'buffalo' 'can'
 'caramel' 'carrot' 'cauliflow' 'cheddar' 'chees' 'cheeseburg' 'cherri'
 'chicago' 'chicken' 'chorizo' 'chorrizo' 'coffe' 'coke' 'combin' 'crust'
 'cumin' 'deep' 'deepdish' 'dew' 'diet' 'dish' 'doctor' 'dough' 'dr' 'dri'
 'eight' 'eleven' 'everi' 'everyth' 'extra' 'fanta' 'fat' 'feta' 'fifteen'
 'five' 'fl' 'flake' 'fluid' 'four' 'fourteen' 'free' 'fri' 'garlic'
 'ginger' 'glaze' 'gluten' 'green' 'grill' 'ground' 'ham' 'hawaiian'
 'high' 'hot' 'ice' 'italian' 'jalapeno' 'kalamata' 'keto' 'larg' 'leav'
 'lemon' 'lettuc' 'liter' 'littl' 'lot' 'lover' 'low' 'lunch' 'margarita'
 'margherita' 'meat' 'meatbal' 'meatlov' 'med' 'mediterranean' 'medium'
 'mexican' 'millilit' 'ml' 'mountain' 'mozarella' '

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Initialize OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)
temp_data = df_train["train.SRC"].head(20)
# Fit and transform the categorical columns
one_hot_encoded = encoder.fit([temp_data])
print(one_hot_encoded)

OneHotEncoder(sparse_output=False)


In [None]:
from gensim.models import Word2Vec, KeyedVectors

pre_trained = KeyedVectors.load_word2vec_format(
    "GoogleNews-vectors-negative300.bin", binary=True
)

In [None]:
# # Load pre-trained Word2Vec
# test_sentences = df_train["train.SRC"].head(10000)

# # Convert to Word2Vec model for fine-tuning
# pre_trained_model = Word2Vec(min_count=1,workers=14,vector_size=300)

# pre_trained_model.build_vocab(test_sentences)

# # Build vocabulary from the pre-trained model's keys
# pre_trained_model.build_vocab([list(pre_trained.key_to_index.keys())], update=True)

# Initialize weights with the pre-trained vectors
pre_trained_model.wv.vectors = pre_trained_model.wv.vectors[
    : len(pre_trained_model.wv.index_to_key)
]

# Step 4: Intersect with pre-trained embeddings
pre_trained_model.wv.intersect_word2vec_format(
    "GoogleNews-vectors-negative300.bin", binary=True, lockf=1.0
)


# Train further on your dataset
pre_trained_model.train(test_sentences, total_examples=len(custom_sentences), epochs=5)

IndexError: index 31 is out of bounds for axis 0 with size 1

In [None]:
custom_sentences = df_train["train.SRC"].head(10000)

tokenizer = Tokenizer(num_words=500, oov_token="<OOV>")
tokenizer.fit_on_texts(
    [" ".join(tokens) for tokens in custom_sentences]
)  # Flatten sentences for Keras
word_index = tokenizer.word_index

sequences_train = tokenizer.texts_to_sequences(
    [" ".join(tokens) for tokens in custom_sentences]
)  # transforms each text in texts to a sequence of integers
padded_sequences_train = pad_sequences(sequences_train, maxlen=10, padding="post")

print("Word Index:", word_index)
print("Padded Sequences:", padded_sequences_train)

Word Index: {'<OOV>': 1, 'e': 2, 'p': 3, 'a': 4, 'i': 5, 'r': 6, 'n': 7, 'o': 8, 't': 9, 's': 10, 'z': 11, 'c': 12, 'l': 13, 'h': 14, 'm': 15, 'u': 16, 'g': 17, 'b': 18, 'd': 19, 'x': 20, 'v': 21, 'f': 22, 'k': 23, 'w': 24, 'y': 25, 'q': 26, 'j': 27}
Padded Sequences: [[18 26  3 ...  8  6 23]
 [ 4  3  2 ...  7  7  5]
 [ 4  6  5 ... 11 11  4]
 ...
 [ 9  3  2 ...  7  7  5]
 [ 2 13  6 ...  5  8  7]
 [15  8 11 ... 13 13  4]]


In [None]:
w2v_model = Word2Vec(
    sentences=custom_sentences, min_count=1, workers=14, vector_size=300
)

embedding_dim = w2v_model.vector_size  # Size of Word2Vec embeddings
vocab_size = len(word_index) + 1  # Add 1 for padding token

embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]  # Assign pre-trained vector
    else:
        embedding_matrix[i] = np.random.normal(
            size=(embedding_dim,)
        )  # Random vector for unknown words

In [None]:
test_sentences = df_train["train.EXR"].head(10000)

tokenizer = Tokenizer(num_words=500, oov_token="<OOV>")
tokenizer.fit_on_texts(
    [" ".join(tokens) for tokens in test_sentences]
)  # Flatten sentences for Keras
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(
    [" ".join(tokens) for tokens in test_sentences]
)  # transforms each text in texts to a sequence of integers
padded_sequences = pad_sequences(sequences, maxlen=10, padding="post")

print("Word Index:", word_index)
print("Padded Sequences:", padded_sequences)
embedding_dim = w2v_model.vector_size  # Size of Word2Vec embeddings
vocab_size = len(word_index) + 1  # Add 1 for padding token

embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]  # Assign pre-trained vector
    else:
        embedding_matrix[i] = np.random.normal(
            size=(embedding_dim,)
        )  # Random vector for unknown words

Word Index: {'<OOV>': 1, 'e': 2, 'p': 3, 'r': 4, 'i': 5, 'o': 6, 'n': 7, 't': 8, 'a': 9, 'z': 10, 's': 11, 'g': 12, 'd': 13, 'm': 14, 'c': 15, 'l': 16, 'u': 17, 'b': 18, 'y': 19, '1': 20, 'h': 21, 'x': 22, 'q': 23, 'f': 24, 'k': 25, 'v': 26, 'w': 27, 'j': 28}
Padded Sequences: [[ 3 17 16 ...  6  4 25]
 [12  3  2 ...  6  7  5]
 [26  2 12 ...  5  9  7]
 ...
 [12  3  2 ...  6  7  5]
 [ 5 10  2 ...  6  7 11]
 [ 2 16 16 ...  2 11  2]]


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

model = Sequential(
    [
        Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            weights=[embedding_matrix],
            input_length=padded_sequences_train.shape[1],
            trainable=False,
        ),
        LSTM(128, return_sequences=False),
        Dense(padded_sequences_train.shape[1], activation="softmax"),
    ]
)

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()

In [None]:
history = model.fit(
    padded_sequences_train,  # Input sequences (numerical)
    padded_sequences,
    epochs=10,
    batch_size=32,
    # validation_data=(test_padded, test_labels),  # Validation data
    verbose=2,
)

Epoch 1/10


313/313 - 3s - 10ms/step - accuracy: 0.2338 - loss: 236.8394
Epoch 2/10
313/313 - 2s - 5ms/step - accuracy: 0.2360 - loss: 244.8951
Epoch 3/10
313/313 - 2s - 5ms/step - accuracy: 0.2360 - loss: 247.0565
Epoch 4/10
313/313 - 2s - 5ms/step - accuracy: 0.2360 - loss: 248.9971
Epoch 5/10
313/313 - 2s - 5ms/step - accuracy: 0.2360 - loss: 250.9110
Epoch 6/10
313/313 - 2s - 5ms/step - accuracy: 0.2360 - loss: 252.8540
Epoch 7/10
313/313 - 2s - 5ms/step - accuracy: 0.2360 - loss: 254.8490
Epoch 8/10
313/313 - 2s - 5ms/step - accuracy: 0.2360 - loss: 256.9070
Epoch 9/10
313/313 - 2s - 5ms/step - accuracy: 0.2360 - loss: 259.0325
Epoch 10/10
313/313 - 2s - 6ms/step - accuracy: 0.2360 - loss: 261.2292


### RNN


In [None]:
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense

# Define the model architecture
model = Sequential()
model.add(SimpleRNN(units=32, input_shape=(None, 1)))
model.add(Dense(units=1, activation="sigmoid"))
# Compile the model
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
# Train the model
model.fit(x_train, y_train, epochs=10, batch_size=32)
# Evaluate the model
loss, accuracy = model.evaluate(x_test, y_test)
# Make predictions using the model
predictions = model.predict(x_new)

### LSTM


In [None]:
# define model
model = Sequential()
model.add(LSTM(100, activation="relu", input_shape=(n_input, n_features)))
model.add(Dense(1))
model.compile(optimizer="adam", loss="mse")
model.summary()
model.fit(generator, epochs=5)

In [None]:
tokenizer = Tokenizer(num_words=500, oov_token="<OOV>")
tokenizer.fit_on_texts(
    # df_train["train.SRC"]
    df_train["train.SRC"].head(10)
)  # updates internal vocabulary based on a list of texts

sequences = tokenizer.texts_to_sequences(
    df_train["train.SRC"]
)  # transforms each text in texts to a sequence of integers
padded_sequences = pad_sequences(sequences, maxlen=100, padding="post")

In [None]:
tokenizer.word_index

In [None]:
df_train["padded_seq"] = list(padded_sequences)
df_train.head()