In [46]:
import pandas as pd
import spacy

# Loading spaCy's English language model 
nlp = spacy.load('en_core_web_sm')

# adjusting display options
pd.set_option('display.max_colwidth', None)

df = pd.read_csv('../Dataset/sample_submission.csv')
n_rows = df.shape[0]
df

Unnamed: 0,id,text
0,0,advent chimney elf family fireplace gingerbread mistletoe ornament reindeer scrooge
1,1,advent chimney elf family fireplace gingerbread mistletoe ornament reindeer scrooge walk give jump drive bake the sleep night laugh and
2,2,yuletide decorations gifts cheer holiday carol magi nutcracker polar grinch sleigh chimney workshop stocking ornament holly jingle beard naughty nice
3,3,yuletide decorations gifts cheer holiday carol magi nutcracker polar grinch sleigh chimney workshop stocking ornament holly jingle beard naughty nice sing cheer and of the is eat visit relax unwrap
4,4,hohoho candle poinsettia snowglobe peppermint eggnog fruitcake chocolate candy puzzle game doll toy workshop wonder believe dream hope peace joy merry season greeting card wrapping paper bow fireplace night cookie milk star wish wreath angel the to of and in that have it not with as you from we kaggle
5,5,advent chimney elf family fireplace gingerbread mistletoe ornament reindeer scrooge walk give jump drive bake the sleep night laugh and yuletide decorations gifts cheer holiday carol magi nutcracker polar grinch sleigh chimney workshop stocking ornament holly jingle beard naughty nice sing cheer and of the is eat visit relax unwrap hohoho candle poinsettia snowglobe peppermint eggnog fruitcake chocolate candy puzzle game doll toy workshop wonder believe dream hope peace joy merry season greeting card wrapping paper bow fireplace night cookie milk star wish wreath angel the to of and in that have it not with as you from we kaggle


### Tokenizing the text

In [47]:
# from nltk.tokenize import word_tokenize

# # Tokenize the text and convert to lowercase
# df['tokens'] = df['text'].apply(lambda x: word_tokenize(x.lower()))

# # Display the tokenized words
# print("Tokenized Data:\n", df.head())

# # Save tokenized text to a text file
# with open('tokenized_text.txt', 'w') as f:
#     for tokens in df['tokens']:
#         # Join the tokens back into a single string for each line
#         line = ' '.join(tokens)
#         f.write(line + '\n')

# print("Tokenized text saved to tokenized_text.txt")

In [48]:
# Tokenizing the text using spaCy
def spacy_tokenize(text):
    doc = nlp(text.lower())
    return [token.text for token in doc]

df['tokens'] = df['text'].apply(spacy_tokenize)


In [49]:
# Saving the tokenized text to a text file
with open('../Tokens/tokenized_text.txt', 'w') as f:
    for tokens in df['tokens']:
        line = ' '.join(tokens)
        f.write(line + '\n')

print("Tokenized text saved to tokenized_text.txt")

Tokenized text saved to tokenized_text.txt


In [50]:
# Loading the tokenized text from the file
with open('../Tokens/tokenized_text.txt', 'r') as f:
    tokenized_lines = [line.strip().split() for line in f]

print("Tokenized Lines:\n", tokenized_lines)

Tokenized Lines:
 [['advent', 'chimney', 'elf', 'family', 'fireplace', 'gingerbread', 'mistletoe', 'ornament', 'reindeer', 'scrooge'], ['advent', 'chimney', 'elf', 'family', 'fireplace', 'gingerbread', 'mistletoe', 'ornament', 'reindeer', 'scrooge', 'walk', 'give', 'jump', 'drive', 'bake', 'the', 'sleep', 'night', 'laugh', 'and'], ['yuletide', 'decorations', 'gifts', 'cheer', 'holiday', 'carol', 'magi', 'nutcracker', 'polar', 'grinch', 'sleigh', 'chimney', 'workshop', 'stocking', 'ornament', 'holly', 'jingle', 'beard', 'naughty', 'nice'], ['yuletide', 'decorations', 'gifts', 'cheer', 'holiday', 'carol', 'magi', 'nutcracker', 'polar', 'grinch', 'sleigh', 'chimney', 'workshop', 'stocking', 'ornament', 'holly', 'jingle', 'beard', 'naughty', 'nice', 'sing', 'cheer', 'and', 'of', 'the', 'is', 'eat', 'visit', 'relax', 'unwrap'], ['hohoho', 'candle', 'poinsettia', 'snowglobe', 'peppermint', 'eggnog', 'fruitcake', 'chocolate', 'candy', 'puzzle', 'game', 'doll', 'toy', 'workshop', 'wonder', 'be

### Creating the Vocabulary and Convert Sequences to Indices

In [51]:
# Building vocabulary
vocab = {word: i for i, word in enumerate(set(word for tokens in tokenized_lines for word in tokens))}

# Convert sequences to indices
indices_sequences = [[vocab[word] for word in tokens] for tokens in tokenized_lines]
print("Sequences of Indices:\n", indices_sequences)


Sequences of Indices:
 [[0, 68, 16, 79, 41, 5, 76, 15, 81, 19], [0, 68, 16, 79, 41, 5, 76, 15, 81, 19, 42, 64, 38, 32, 66, 56, 73, 65, 12, 80], [69, 83, 82, 33, 8, 49, 43, 11, 24, 4, 46, 68, 70, 48, 15, 53, 78, 45, 61, 30], [69, 83, 82, 33, 8, 49, 43, 11, 24, 4, 46, 68, 70, 48, 15, 53, 78, 45, 61, 30, 84, 33, 80, 26, 56, 85, 27, 57, 39, 51], [17, 60, 47, 18, 58, 1, 44, 88, 71, 62, 34, 23, 7, 70, 52, 2, 77, 13, 86, 54, 36, 3, 35, 14, 9, 29, 6, 41, 65, 75, 40, 72, 10, 50, 21, 56, 63, 26, 80, 28, 67, 20, 31, 55, 37, 22, 59, 87, 74, 25], [0, 68, 16, 79, 41, 5, 76, 15, 81, 19, 42, 64, 38, 32, 66, 56, 73, 65, 12, 80, 69, 83, 82, 33, 8, 49, 43, 11, 24, 4, 46, 68, 70, 48, 15, 53, 78, 45, 61, 30, 84, 33, 80, 26, 56, 85, 27, 57, 39, 51, 17, 60, 47, 18, 58, 1, 44, 88, 71, 62, 34, 23, 7, 70, 52, 2, 77, 13, 86, 54, 36, 3, 35, 14, 9, 29, 6, 41, 65, 75, 40, 72, 10, 50, 21, 56, 63, 26, 80, 28, 67, 20, 31, 55, 37, 22, 59, 87, 74, 25]]


### Padding the sequences

In [52]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len = max(len(tokens) for tokens in indices_sequences)
padded_sequences = pad_sequences(indices_sequences, maxlen=max_len, padding='post')
print("Padded Sequences:\n", padded_sequences)

Padded Sequences:
 [[ 0 68 16 79 41  5 76 15 81 19  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0]
 [ 0 68 16 79 41  5 76 15 81 19 42 64 38 32 66 56 73 65 12 80  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0]
 [69 83 82 33  8 49 43 11 24  4 46 68 70 48 15 53 78 45 61 30  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0]
 [69 83 82 33  8 49 43 11 24  4 46 68 70 48 15 5

### Preparing the data

In [53]:
# Correct sequence (manually labeled)
correct_sequence = ["advent", "chimney", "fireplace", "family", "elf", "reindeer", "scrooge", "ornament", "mistletoe", "gingerbread", "walk", "drive", "bake", "give", "jump", "laugh", "sleep", "night", "the", "and"]

# Convert correct sequence to indices using the vocabulary
correct_indices = [vocab[word] for word in correct_sequence]

# Pad the correct indices to match the maximum sequence length
padded_correct_indices = pad_sequences([correct_indices], maxlen=max_len, padding='post')[0]

# One-hot encode the padded correct sequence
def one_hot_encode_sequence(sequence, vocab_size):
    one_hot = np.zeros((len(sequence), vocab_size))
    for i, index in enumerate(sequence):
        one_hot[i, index] = 1
    return one_hot

one_hot_target = one_hot_encode_sequence(padded_correct_indices, len(vocab))

# Ensure the extended targets have the correct shape
extended_targets = np.array([one_hot_target] * len(padded_sequences))

print("Padded Sequences Shape:", padded_sequences.shape)
print("Extended Targets Shape:", extended_targets.shape)

Padded Sequences Shape: (6, 100)
Extended Targets Shape: (6, 100, 89)


In [54]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Defining the LSTM model
model = Sequential()
model.add(Embedding(input_dim=len(vocab), output_dim=128, input_length=max_len))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(128))
model.add(Dense(len(vocab), activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(padded_sequences, extended_targets, epochs=10, batch_size=32)
model.summary()

Epoch 1/10




ValueError: Arguments `target` and `output` must have the same rank (ndim). Received: target.shape=(None, 100, 89), output.shape=(None, 89)