In [1]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout, BatchNormalization, LayerNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import LearningRateScheduler
from keras.regularizers import l2
from sklearn.model_selection import train_test_split

import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

2024-06-05 15:01:45.906371: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-05 15:01:45.906571: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-05 15:01:46.090470: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# 1. Read the raw phrases into a single-column DataFrame.
#    (The first rows are inspected after cleaning, not in this cell.)
file_path = "/kaggle/input/phrase-data/phrases_data.txt"
data = pd.read_csv(
    file_path,
    sep='\t',
    names=['phrases'],
    dtype=str,              # phrases are free text; never infer numeric dtypes
    quoting=3,              # 3 == csv.QUOTE_NONE: a phrase that starts with '"' is
                            # plain text, not a quoted (possibly multi-line) field
    keep_default_na=False,  # keep literal phrases such as "NA" or "null" as text
                            # instead of converting them to NaN
)

In [3]:
# 2. Clean training data
def clean_data(data):
    """Add a normalised ``cleaned`` text column to ``data`` (modified in place).

    Every entry of ``data['phrases']`` is processed as follows:

    1. punctuation is removed (any character that is neither a word
       character nor whitespace),
    2. the text is lower-cased,
    3. leading and trailing whitespace is stripped.

    Missing phrases (NaN / None) are treated as empty strings instead of
    raising ``TypeError``.

    Stop-word removal and WordNet lemmatisation are not applied; both steps
    previously existed here only as commented-out code.

    Args:
        data: DataFrame with a ``phrases`` column.

    Returns:
        The same DataFrame object, now carrying the ``cleaned`` column.
        Callers that ignore the return value keep working unchanged.
    """
    # Coerce to str up front so the vectorised .str accessors never see NaN.
    phrases = data['phrases'].fillna('').astype(str)
    data['cleaned'] = (
        phrases
        .str.replace(r'[^\w\s]', '', regex=True)
        .str.lower()
        .str.strip()
    )
    return data
# Add the 'cleaned' column to `data` in place, then preview the first 10 rows
# (the bare last expression is what the notebook displays).
clean_data(data)
data.head(10)

Unnamed: 0,phrases,cleaned
0,Let's try something.,lets try something
1,I have to go to sleep.,i have to go to sleep
2,Today is June 18th and it is Muiriel's birthday!,today is june 18th and it is muiriels birthday
3,Muiriel is 20 now.,muiriel is 20 now
4,"The password is ""Muiriel"".",the password is muiriel
5,I will be back soon.,i will be back soon
6,I'm at a loss for words.,im at a loss for words
7,This is never going to end.,this is never going to end
8,I just don't know what to say.,i just dont know what to say
9,That was an evil bunny.,that was an evil bunny


In [None]:
# vocabulary = list(set(' '.join(data['cleaned']).replace('\n','').split(' ')))
# vocab_dictionary = {}
# for strings, texts in enumerate(vocabulary):
#     vocab_dictionary[texts] = strings
# total_words = len(vocab_dictionary) + 1
# print(total_words)

In [None]:
# tokenizer = Tokenizer(oov_token='<oov>') # For those words which are not found in word_index
# tokenizer.fit_on_texts(data['cleaned'])
# total_words = len(tokenizer.word_index) + 1

# print(f"Total number of words: {total_words}")
# print(f"All words: {tokenizer.word_index.keys()}")

In [4]:
def create_input_output_pairs(sentences):
    """Build (context, next-word) pairs from whitespace-tokenised sentences.

    For a sentence with more than three words, one pair is produced for every
    prefix of length 3, 4, ..., n-1: the prefix (joined with single spaces) is
    the input and the word that follows it is the output. A sentence with one
    to three words yields a single pair: all words but the last as the input
    (an empty string for a one-word sentence) and the last word as the output.
    Empty or whitespace-only sentences are skipped; previously they raised
    ``IndexError``.

    Args:
        sentences: iterable of strings.

    Returns:
        A tuple ``(inputs, outputs, max_sequence_length)`` where ``inputs`` and
        ``outputs`` are equally long lists of strings and
        ``max_sequence_length`` is the word count of the longest non-empty
        sentence (0 if there is none). Short sentences are now counted too,
        which only changes the result when no sentence exceeds three words.
    """
    max_sequence_length = 0
    inputs, outputs = [], []
    for sentence in sentences:
        all_words = sentence.split()
        if not all_words:
            # Nothing to predict from; all_words[-1] would raise IndexError.
            continue
        max_sequence_length = max(max_sequence_length, len(all_words))
        if len(all_words) <= 3:
            inputs.append(' '.join(all_words[:-1]))
            outputs.append(all_words[-1])
            continue
        for i in range(3, len(all_words)):
            inputs.append(' '.join(all_words[:i]))
            outputs.append(all_words[i])
    return inputs, outputs, max_sequence_length

# Build the training pairs from the cleaned phrases, then show the first ten
# contexts, the first ten target words, and the size of each list.
inputs, outputs, max_sequence_length = create_input_output_pairs(data['cleaned'])
print(inputs[:10], outputs[:10], len(inputs), len(outputs), sep='\n')

['lets try', 'i have to', 'i have to go', 'i have to go to', 'today is june', 'today is june 18th', 'today is june 18th and', 'today is june 18th and it', 'today is june 18th and it is', 'today is june 18th and it is muiriels']
['something', 'go', 'to', 'sleep', '18th', 'and', 'it', 'is', 'muiriels', 'birthday']
51500
51500


In [6]:
# Load the encoder module from TF Hub and embed both the context strings and
# the target words; each call result is converted to a NumPy array.
# NOTE(review): output_embeddings is not referenced by the later cells visible
# here - confirm it is still needed before paying for the second pass.
import tensorflow_hub as hub

module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
get_embedding = hub.load(module_url)

input_embeddings, output_embeddings = (
    np.array(get_embedding(texts)) for texts in (inputs, outputs)
)
print(input_embeddings.shape, output_embeddings.shape, sep='\n')

(51500, 512)
(51500, 512)


In [8]:
# Fit a word -> integer-id vocabulary on the target words only.
tokenizer = Tokenizer(oov_token='<oov>') # For those words which are not found in word_index
tokenizer.fit_on_texts(outputs)
total_words = len(tokenizer.word_index) + 1  # +1 because word ids start at 1

print(f"Total number of words: {total_words}")

# Map every target word to its class id. This assignment must live in an
# executed line: the one-hot cell below reads `tokenized_labels`, so leaving it
# commented out raises NameError on Restart & Run All.
# Words the Tokenizer filtered or split while fitting (its default filters
# include '_', which the cleaning regex keeps) are absent from word_index;
# send them to the <oov> id instead of failing with KeyError.
oov_index = tokenizer.word_index[tokenizer.oov_token]
tokenized_labels = [tokenizer.word_index.get(each_word, oov_index) for each_word in outputs]
print(tokenized_labels[:2])

Total number of words: 6112
[135, 36]


In [11]:
# One-hot encode the integer word ids: one row per label, `total_words` columns.
# NOTE(review): the recorded output shape is (51500, 6112), i.e. a large dense
# array; integer labels with a sparse categorical loss would presumably avoid
# it - verify against the model's compile() call before changing.
one_hot_labels = tf.keras.utils.to_categorical(tokenized_labels, num_classes=total_words)
print(one_hot_labels.shape)

(51500, 6112)


In [None]:
# pad sequences
# input_sequences = np.array(pad_sequences(tokenized_phrases,
#                                          maxlen=max_sequence_length - 1,
#                                          padding='pre'))
# print(input_sequences[0])
# print(len(input_sequences))

In [23]:
def get_train_validation_test_data(input_sequences, one_hot_labels,
                                   test_size=0.20, val_size=0.5, random_state=42):
    """Split features and labels into train, validation and test subsets.

    The data is split twice with ``train_test_split``: first ``test_size`` of
    all samples is held out as the test set, then ``val_size`` of the
    remainder becomes the validation set. With the defaults this gives
    20% test, 40% validation and 40% training data (0.5 of the remaining 80%).

    Args:
        input_sequences: feature array, one row per sample.
        one_hot_labels: label array aligned with ``input_sequences``.
        test_size: fraction of all samples held out for testing.
        val_size: fraction of the non-test samples used for validation.
        random_state: seed passed to both splits for reproducibility.

    Returns:
        ``(X_train, y_train, X_val, y_val, X_test, y_test)``.
    """
    # Hold out the test set first.
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        input_sequences, one_hot_labels,
        test_size=test_size, random_state=random_state)

    # Divide what is left into training and validation data.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val,
        test_size=val_size, random_state=random_state)

    return X_train, y_train, X_val, y_val, X_test, y_test

# Split the embedded contexts and their one-hot labels, then print one label
# row followed by the shapes of the training features and training labels.
X_train, y_train, X_val, y_val, X_test, y_test = get_train_validation_test_data(
    input_embeddings, one_hot_labels
)
# print(X_train[0])
print(y_train[0], X_train.shape, y_train.shape, sep='\n')

[0. 0. 0. ... 0. 0. 0.]
(20600, 512)
(20600, 6112)


In [32]:
def create_model(total_words, hidden_size, optimizer, input_dim=512):
    """Build and compile a one-hidden-layer softmax classifier.

    The network is: input vector of width ``input_dim`` -> Dense(``hidden_size``,
    ReLU) -> Dense(``total_words``, softmax). It is compiled with categorical
    cross-entropy (so labels must be one-hot rows of width ``total_words``)
    and reports the ``'acc'`` metric.

    Args:
        total_words: number of output classes.
        hidden_size: number of units in the hidden layer.
        optimizer: Keras optimizer instance (or name) passed to ``compile``.
        input_dim: width of each input vector. Defaults to 512, the value
            that was previously hard-coded.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential([
        # Declare the input explicitly rather than passing input_shape to a layer.
        tf.keras.Input(shape=(input_dim,)),
        Dense(hidden_size, activation='relu'),
        Dense(total_words, activation='softmax'),
    ])
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['acc'])
    return model

# Learning-rate schedule: hold the rate, then decay it exponentially to a floor
def scheduler(epoch, lr, min_lr=0.001, hold_epochs=10, decay_rate=0.1):
    """Return the learning rate to use for ``epoch``.

    For the first ``hold_epochs`` epochs (0-based) the incoming rate is
    returned unchanged. Afterwards the rate is multiplied by
    ``exp(-decay_rate)`` on every call and never allowed below ``min_lr``.
    With the default ``decay_rate=0.1`` the factor is about 0.905, i.e. a
    reduction of roughly 9.5% per epoch (not 1%).

    Args:
        epoch: 0-based epoch index.
        lr: current learning rate.
        min_lr: lower bound applied once decay has started.
        hold_epochs: number of initial epochs without decay.
        decay_rate: exponent of the per-epoch decay factor.

    Returns:
        The new learning rate; a plain Python float once decay has started.
    """
    if epoch < hold_epochs:
        return lr
    # float() so the result is a built-in float rather than a NumPy scalar.
    return max(float(lr * np.exp(-decay_rate)), min_lr)

# Set the initial learning rate
initial_learning_rate = 0.01

# Create the optimizer; the model itself is compiled inside create_model()
optimizer = Adam(learning_rate=initial_learning_rate)

# Ask Keras to call scheduler(epoch, lr) to obtain each epoch's learning rate
lr_scheduler = LearningRateScheduler(scheduler)

# Single checkpoint file: the name has no epoch placeholder, and with
# save_best_only=True it is only rewritten when val_loss improves.
# NOTE(review): the file is called "model_lstm" but create_model() as shown
# builds only Dense layers - confirm whether the name should change.
checkpoint_path = "/kaggle/working/model_lstm.keras"
callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_best_only=True,
    monitor='val_loss',
    verbose=1)

# Build the classifier and train it, validating on the held-out validation
# split after every epoch; `history` keeps the object returned by fit().
model = create_model(total_words, hidden_size=128, optimizer=optimizer)
history = model.fit(X_train, y_train, epochs=50, batch_size=64,
                    validation_data=(X_val, y_val), 
                    callbacks=[lr_scheduler, callback], verbose=1)

Epoch 1/50
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - acc: 0.0613 - loss: 7.1281
Epoch 1: val_loss improved from inf to 6.61400, saving model to /kaggle/working/model_lstm.keras
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 25ms/step - acc: 0.0614 - loss: 7.1272 - val_acc: 0.0697 - val_loss: 6.6140 - learning_rate: 0.0100
Epoch 2/50
[1m319/322[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 14ms/step - acc: 0.0766 - loss: 6.0105
Epoch 2: val_loss did not improve from 6.61400
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 21ms/step - acc: 0.0767 - loss: 6.0097 - val_acc: 0.0765 - val_loss: 6.6631 - learning_rate: 0.0100
Epoch 3/50
[1m321/322[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 14ms/step - acc: 0.0914 - loss: 5.3729
Epoch 3: val_loss did not improve from 6.61400
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 21ms/step - acc: 0.0914 - loss: 5.3727 - val_acc: 0.0800 - val_

In [None]:
# evaluate() returns the loss followed by the compiled metrics; the model is
# compiled with metrics=['acc'], so the second value is accuracy, not MSE.
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=1)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_acc}")