In [47]:
import pandas as pd
import spacy

# Loading spaCy's English language model 
nlp = spacy.load('en_core_web_sm')

# adjusting display options
pd.set_option('display.max_colwidth', None)

df = pd.read_csv('../Dataset/sample_submission.csv')
n_rows = df.shape[0]
df

Unnamed: 0,id,text
0,0,advent chimney elf family fireplace gingerbread mistletoe ornament reindeer scrooge
1,1,advent chimney elf family fireplace gingerbread mistletoe ornament reindeer scrooge walk give jump drive bake the sleep night laugh and
2,2,yuletide decorations gifts cheer holiday carol magi nutcracker polar grinch sleigh chimney workshop stocking ornament holly jingle beard naughty nice
3,3,yuletide decorations gifts cheer holiday carol magi nutcracker polar grinch sleigh chimney workshop stocking ornament holly jingle beard naughty nice sing cheer and of the is eat visit relax unwrap
4,4,hohoho candle poinsettia snowglobe peppermint eggnog fruitcake chocolate candy puzzle game doll toy workshop wonder believe dream hope peace joy merry season greeting card wrapping paper bow fireplace night cookie milk star wish wreath angel the to of and in that have it not with as you from we kaggle
5,5,advent chimney elf family fireplace gingerbread mistletoe ornament reindeer scrooge walk give jump drive bake the sleep night laugh and yuletide decorations gifts cheer holiday carol magi nutcracker polar grinch sleigh chimney workshop stocking ornament holly jingle beard naughty nice sing cheer and of the is eat visit relax unwrap hohoho candle poinsettia snowglobe peppermint eggnog fruitcake chocolate candy puzzle game doll toy workshop wonder believe dream hope peace joy merry season greeting card wrapping paper bow fireplace night cookie milk star wish wreath angel the to of and in that have it not with as you from we kaggle


### Tokenizing the text

In [48]:
# from nltk.tokenize import word_tokenize

# # Tokenize the text and convert to lowercase
# df['tokens'] = df['text'].apply(lambda x: word_tokenize(x.lower()))

# # Display the tokenized words
# print("Tokenized Data:\n", df.head())

# # Save tokenized text to a text file
# with open('tokenized_text.txt', 'w') as f:
#     for tokens in df['tokens']:
#         # Join the tokens back into a single string for each line
#         line = ' '.join(tokens)
#         f.write(line + '\n')

# print("Tokenized text saved to tokenized_text.txt")

In [49]:
# Tokenizing the text using spaCy
def spacy_tokenize(text):
    doc = nlp(text.lower())
    return [token.text for token in doc]

df['tokens'] = df['text'].apply(spacy_tokenize)


In [50]:
# Saving the tokenized text to a text file
with open('../Tokens/tokenized_text.txt', 'w') as f:
    for tokens in df['tokens']:
        line = ' '.join(tokens)
        f.write(line + '\n')

print("Tokenized text saved to tokenized_text.txt")

Tokenized text saved to tokenized_text.txt


In [51]:
# Load the tokenized text from the file
with open('../Tokens/tokenized_text.txt', 'r') as f:
    tokenized_lines = [line.strip().split() for line in f]

# Build vocabulary
vocab = {word: i for i, word in enumerate(set(word for tokens in tokenized_lines for word in tokens))}

# Convert sequences to indices
indices_sequences = [[vocab[word] for word in tokens] for tokens in tokenized_lines]

# Find the maximum length of the sequences
max_len = max(len(tokens) for tokens in indices_sequences)

# Pad the sequences
padded_sequences = pad_sequences(indices_sequences, maxlen=max_len, padding='post')

print("Padded Sequences Shape:", padded_sequences.shape)

Padded Sequences Shape: (6, 100)


### Creating the Vocabulary and Convert Sequences to Indices

In [52]:
# # Building vocabulary
# vocab = {word: i for i, word in enumerate(set(word for tokens in tokenized_lines for word in tokens))}

# # Convert sequences to indices
# indices_sequences = [[vocab[word] for word in tokens] for tokens in tokenized_lines]
# print("Sequences of Indices:\n", indices_sequences)


### Padding the sequences

In [53]:
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# max_len = max(len(tokens) for tokens in indices_sequences)
# padded_sequences = pad_sequences(indices_sequences, maxlen=max_len, padding='post')
# print("Padded Sequences:\n", padded_sequences)

In [54]:
# Correct sequence (manually labeled)
correct_sequence = ["advent", "chimney", "fireplace", "family", "elf", "reindeer", "scrooge", "ornament", "mistletoe", "gingerbread", "walk", "drive", "bake", "give", "jump", "laugh", "sleep", "night", "the", "and"]

# Convert correct sequence to indices using the vocabulary
correct_indices = [vocab[word] for word in correct_sequence]

# Pad the correct indices to match the maximum sequence length
padded_correct_indices = pad_sequences([correct_indices], maxlen=max_len, padding='post')[0]

# One-hot encode the padded correct sequence
def one_hot_encode_sequence(sequence, vocab_size):
    one_hot = np.zeros((len(sequence), vocab_size))
    for i, index in enumerate(sequence):
        one_hot[i, index] = 1
    return one_hot

one_hot_target = one_hot_encode_sequence(padded_correct_indices, len(vocab))

# Ensure the extended targets have the correct shape
extended_targets = np.array([one_hot_target] * len(padded_sequences))

print("Padded Sequences Shape:", padded_sequences.shape)
print("Extended Targets Shape:", extended_targets.shape)

Padded Sequences Shape: (6, 100)
Extended Targets Shape: (6, 100, 89)


### Preparing the data

In [55]:
# import numpy as np

# # Correct sequence (manually labeled)
# correct_sequence = ["advent", "chimney", "fireplace", "family", "elf", "reindeer", "scrooge", "ornament", "mistletoe", "gingerbread", "walk", "drive", "bake", "give", "jump", "laugh", "sleep", "night", "the", "and"]

# # Convert correct sequence to indices using the vocabulary
# correct_indices = [vocab[word] for word in correct_sequence]

# # Pad the correct indices to match the maximum sequence length
# padded_correct_indices = pad_sequences([correct_indices], maxlen=max_len, padding='post')[0]

# # One-hot encode the padded correct sequence
# def one_hot_encode_sequence(sequence, vocab_size):
#     one_hot = np.zeros((len(sequence), vocab_size))
#     for i, index in enumerate(sequence):
#         one_hot[i, index] = 1
#     return one_hot

# one_hot_target = one_hot_encode_sequence(padded_correct_indices, len(vocab))

# # Ensure the extended targets have the correct shape
# extended_targets = np.array([one_hot_target] * len(padded_sequences))

# print("Padded Sequences Shape:", padded_sequences.shape)
# print("Extended Targets Shape:", extended_targets.shape)

In [56]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Define the LSTM model
model = Sequential()
model.add(Embedding(input_dim=len(vocab), output_dim=128, input_length=max_len))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(128, return_sequences=True))  # Ensure this returns sequences
model.add(Dense(len(vocab), activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Display the model summary
model.summary()



In [57]:
model.fit(padded_sequences, extended_targets, epochs=10, batch_size=32)

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.0033 - loss: 4.4777
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step - accuracy: 0.8000 - loss: 4.3615
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step - accuracy: 0.8000 - loss: 4.1743
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step - accuracy: 0.8000 - loss: 3.8283
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step - accuracy: 0.8000 - loss: 3.2575
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step - accuracy: 0.8000 - loss: 2.5505
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step - accuracy: 0.8000 - loss: 1.9138
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step - accuracy: 0.8000 - loss: 1.3681
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

<keras.src.callbacks.history.History at 0x1405f20fbc0>