In [1]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout, BatchNormalization, LayerNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import LearningRateScheduler
from keras.regularizers import l2
from sklearn.model_selection import train_test_split

import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

2024-06-05 16:02:54.114761: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-05 16:02:54.114848: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-05 16:02:54.118489: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# 1. Read the data (a single 'phrases' column; the first 10 rows are previewed below)
file_path = "/kaggle/input/phrase-data/phrases_data.txt"
# dtype=str + keep_default_na=False keep every cell as literal text. With the
# pandas defaults a line such as "NA", "null" or "nan" is parsed as a missing
# value (float NaN), which would make the regex clean-up in clean_data() raise.
data = pd.read_csv(
    file_path,
    sep='\t',
    names=['phrases'],
    dtype=str,
    keep_default_na=False,
)

# 2. Clean training data
def clean_data(data):
    """Add a normalised ``cleaned`` column derived from ``data['phrases']``.

    Every character that is neither a word character nor whitespace is
    removed (so "Let's" becomes "lets"), the text is lower-cased, and
    leading/trailing whitespace is stripped. Missing phrases (NaN/None)
    become the empty string and non-string values are converted with
    ``str`` first, so a single bad cell no longer aborts the whole pass.

    Stop-word removal and lemmatization were disabled in the original
    notebook and are not applied here.

    Args:
        data: DataFrame with a ``phrases`` column. It is modified in place.

    Returns:
        The same DataFrame object, now carrying the ``cleaned`` column.
    """
    data['cleaned'] = (
        data['phrases']
        .fillna('')
        .astype(str)
        # Remove punctuation: anything that is not a word char or whitespace
        .str.replace(r'[^\w\s]', '', regex=True)
        # Convert to lowercase
        .str.lower()
        # Remove surrounding whitespace
        .str.strip()
    )
    return data
# Add the 'cleaned' column to `data` in place, then preview the first ten rows.
clean_data(data)
data.iloc[:10]

Unnamed: 0,phrases,cleaned
0,Let's try something.,lets try something
1,I have to go to sleep.,i have to go to sleep
2,Today is June 18th and it is Muiriel's birthday!,today is june 18th and it is muiriels birthday
3,Muiriel is 20 now.,muiriel is 20 now
4,"The password is ""Muiriel"".",the password is muiriel
5,I will be back soon.,i will be back soon
6,I'm at a loss for words.,im at a loss for words
7,This is never going to end.,this is never going to end
8,I just don't know what to say.,i just dont know what to say
9,That was an evil bunny.,that was an evil bunny


In [None]:
import gensim
# Load the pre-trained vectors stored in binary word2vec format.
word2vec_model_path = '/kaggle/input/word2vec-model/GoogleNews-vectors-negative300.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(
    word2vec_model_path,
    binary=True,
)

In [None]:
# Sanity check on the loaded vectors: the five nearest neighbours of 'sleep'.
similar_words = model.most_similar(positive='sleep', topn=5)
print(similar_words)

In [3]:
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

# Tokenizing the corpus.
# Whitespace splitting is used on purpose: the pair-building and embedding
# cells tokenize with str.split(), whereas word_tokenize breaks contractions
# such as 'cannot' into 'can' + 'not'. With mismatched tokenizers those words
# would be missing from the trained vocabulary when they are looked up later.
# It also avoids depending on the NLTK 'punkt' resource, which this notebook
# never downloads.
tokenized_corpus = [each_sentence.split() for each_sentence in data['cleaned']]
print(tokenized_corpus[:4])

# Training the Word2Vec model
customized_model = Word2Vec(sentences=tokenized_corpus, vector_size=200, window=10, min_count=1, workers=4)

# Saving the model
customized_model.save("/kaggle/working/word2vec.model")

# Quick sanity check of the learned space, then report the vocabulary size
similar_words = customized_model.wv.most_similar('butterfly', topn=5)
print(similar_words)
vocabulary = customized_model.wv
print(len(vocabulary))

[['lets', 'try', 'something'], ['i', 'have', 'to', 'go', 'to', 'sleep'], ['today', 'is', 'june', '18th', 'and', 'it', 'is', 'muiriels', 'birthday'], ['muiriel', 'is', '20', 'now']]
[('ended', 0.9704582095146179), ('wash', 0.9702301025390625), ('island', 0.9702083468437195), ('looks', 0.9701722264289856), ('schools', 0.969989538192749)]
7247


In [4]:
def create_input_output_pairs(sentences):
    """Build (context, next-word) training pairs from cleaned sentences.

    Each sentence is split on whitespace.

    * Sentences of one to three words yield a single pair: every word but
      the last as the context and the last word as the target. A one-word
      sentence therefore yields an empty-string context.
    * Longer sentences yield one pair per position from the fourth word on:
      the context is all preceding words, the target is the word at that
      position.
    * Empty or whitespace-only sentences (e.g. a phrase that consisted only
      of punctuation before cleaning) are skipped instead of raising
      ``IndexError``.

    Args:
        sentences: Iterable of strings.

    Returns:
        ``(inputs, outputs, max_sequence_length)`` where ``inputs`` and
        ``outputs`` are equally long lists of strings and
        ``max_sequence_length`` is the word count of the longest non-empty
        sentence seen (0 if there was none).
    """
    max_sequence_length = 0
    inputs, outputs = [], []
    for sentence in sentences:
        all_words = sentence.split()
        if not all_words:
            # Nothing to predict from or for.
            continue

        # Track the longest sentence regardless of which branch handles it;
        # previously short sentences were ignored, giving 0 for a corpus made
        # up only of sentences with three words or fewer.
        max_sequence_length = max(max_sequence_length, len(all_words))

        if len(all_words) <= 3:
            inputs.append(' '.join(all_words[:-1]))
            outputs.append(all_words[-1])
            continue

        for i in range(3, len(all_words)):
            inputs.append(' '.join(all_words[:i]))
            outputs.append(all_words[i])
    return inputs, outputs, max_sequence_length

inputs, outputs, max_sequence_length = create_input_output_pairs(data['cleaned'])

# Preview the first ten contexts and their targets, one list per line.
print(inputs[:10], outputs[:10], sep='\n')

# Both lists must be the same length: one target per context.
print(len(inputs), len(outputs), sep='\n')

['lets try', 'i have to', 'i have to go', 'i have to go to', 'today is june', 'today is june 18th', 'today is june 18th and', 'today is june 18th and it', 'today is june 18th and it is', 'today is june 18th and it is muiriels']
['something', 'go', 'to', 'sleep', '18th', 'and', 'it', 'is', 'muiriels', 'birthday']
51500
51500


In [None]:
def sentence_embedding(sentence, model):
    """Return the mean word vector of ``sentence`` using ``model``.

    Args:
        sentence: Whitespace-separated text; a single word is also accepted.
        model: Source of the word vectors. Either an object exposing them
            through a ``.wv`` attribute (as the ``Word2Vec`` model trained in
            this notebook does) or the keyed vectors themselves. The vectors
            must support ``word in vectors``, ``vectors[word]`` and
            ``vectors.vector_size``.

    Returns:
        A 1-D array of length ``vector_size``: the element-wise mean of the
        vectors of all in-vocabulary words, or all zeros when no word of
        ``sentence`` is in the vocabulary.
    """
    # Use the model that was passed in. The previous version read the global
    # ``customized_model`` for the vectors but ``model`` for the fallback
    # size, so the two could disagree on dimensionality.
    vectors = getattr(model, 'wv', model)

    # Tokenize the sentence into words
    words = sentence.split()

    # Keep the vector of each word that exists in the vocabulary
    word_vectors = [vectors[word] for word in words if word in vectors]

    if not word_vectors:
        # None of the words were in the vocabulary: fall back to a zero vector
        return np.zeros(vectors.vector_size)

    # Compute the mean of the word vectors
    return np.mean(word_vectors, axis=0)

# Embed every context and every target word with the Word2Vec model trained on
# this corpus. Passing `customized_model` (rather than the GoogleNews `model`)
# keeps the zero-vector fallback the same size as the word vectors and removes
# the dependency on the optional GoogleNews cell having been run.
# The results are stacked into 2-D arrays so the split data exposes `.shape`.
input_embeddings = np.array([sentence_embedding(sentence, customized_model) for sentence in inputs])
output_embeddings = np.array([sentence_embedding(word, customized_model) for word in outputs])

In [None]:
def get_train_validation_test_data(input_sequences, one_hot_labels,
                                   test_size=0.20, val_size=0.5, random_state=42):
    """Split features and labels into training, validation and test subsets.

    The data is split twice with ``train_test_split``: first ``test_size`` of
    all samples is held out as the test set, then ``val_size`` of the
    remainder is held out as the validation set.

    With the defaults this yields 40% training, 40% validation and 20% test.
    NOTE(review): a 40/40/20 split is unusual and the earlier comments
    described a different one; pass ``val_size`` (e.g. 0.25 for 60/20/20) if
    a larger training share is intended. The defaults are left unchanged so
    existing results stay comparable.

    Args:
        input_sequences: Features, one entry per sample.
        one_hot_labels: Targets, aligned with ``input_sequences``.
        test_size: Fraction of all samples held out for testing.
        val_size: Fraction of the non-test samples held out for validation.
        random_state: Seed used for both shuffles.

    Returns:
        ``(X_train, y_train, X_val, y_val, X_test, y_test)``.
    """
    # First, hold out the test set from the whole dataset
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        input_sequences, one_hot_labels,
        test_size=test_size, random_state=random_state)

    # Next, split the remaining samples into training and validation
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val,
        test_size=val_size, random_state=random_state)

    # A third split of `input_sequences` used to be computed here; its results
    # were never used, so it has been removed.
    return X_train, y_train, X_val, y_val, X_test, y_test

X_train, y_train, X_val, y_val, X_test, y_test = get_train_validation_test_data(input_embeddings, output_embeddings)
# np.shape works whether the split returned lists or arrays; `.shape` raises
# AttributeError on the plain lists produced from list inputs.
print(np.shape(X_train))
print(np.shape(y_train))