In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import LSTM, RepeatVector, TimeDistributed, Dense, Bidirectional,Input, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.metrics import CosineSimilarity
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import KeyedVectors
from tensorflow.keras.regularizers import l2
from gensim.models import Word2Vec
# import nltk
from nltk.tokenize import word_tokenize

In [6]:
# Load the MSR Paraphrase training split.
# The CSV is read from the current working directory; rows that pandas cannot
# parse are skipped rather than raising.
data = pd.read_csv('msr_paraphrase_train.csv', on_bad_lines='skip')

# Pull both sentence columns out as string arrays (missing cells become ""),
# then pool them into one flat list: every sentence of every pair, first
# column followed by second column.
sentences1, sentences2 = (
    data[column].fillna("").astype(str).values
    for column in ('#1 String', '#2 String')
)
sentences = [*sentences1, *sentences2]

In [3]:
# Lower-case and tokenize every pooled sentence; this token corpus is what the
# Word2Vec model is fitted on.
tokenized_sentences = [
    word_tokenize(lowered) for lowered in (text.lower() for text in sentences)
]

# 300-dimensional vectors, context window of 5, and min_count=1 so that every
# token seen in the corpus receives a vector.
# NOTE(review): gensim documents that training with workers > 1 is not
# bit-for-bit repeatable between runs — confirm whether that matters here.
word2vec = Word2Vec(
    sentences=tokenized_sentences,
    vector_size=300,
    window=5,
    min_count=1,
    workers=4,
)

In [4]:
# Sequence / model hyperparameters shared by the preprocessing and model code.
max_sentence_len = 20  # every sentence is padded or cut to this many tokens
embedding_dim = 300    # width of one word vector (Word2Vec is trained with vector_size=300)
hidden_dim = 128       # intended LSTM state size

# Preprocess: Tokenize and embed sentences
def embed_sentence(sentence):
    """Return the Word2Vec vectors for the tokens of ``sentence``.

    The sentence is lower-cased and split with ``word_tokenize``. Tokens that
    are not present in ``word2vec.wv`` are dropped, so the returned list can be
    shorter than the token count and is empty when no token is known.

    Args:
        sentence: The raw sentence text.

    Returns:
        A list with one embedding vector per known token, in sentence order.
    """
    vectors = word2vec.wv
    embeddings = []
    for token in word_tokenize(sentence.lower()):
        if token in vectors:
            embeddings.append(vectors[token])
    return embeddings

# Embed every sentence, then drop the ones that produced no vectors at all.
per_sentence_vectors = [embed_sentence(text) for text in sentences]
non_empty_matrices = [
    np.array(vectors) for vectors in per_sentence_vectors if len(vectors) > 0
]

# Bring all sentences to max_sentence_len steps. Short sentences are
# zero-padded at the end; long sentences lose their *leading* tokens, which is
# the pad_sequences default (truncating='pre') written out explicitly here.
embedded_sentences = pad_sequences(
    non_empty_matrices,
    maxlen=max_sentence_len,
    dtype='float32',
    padding='post',
    truncating='pre',
    value=0.0,
)

# Hold out 20% of the sentences; the fixed random_state keeps the split stable.
X_train, X_test = train_test_split(
    embedded_sentences, test_size=0.2, random_state=42
)

# Build the Autoencoder Model
# Seed Python, NumPy and the Keras backend before any layer is created, so that
# weight initialisation, dropout masks and batch shuffling are repeatable
# across kernel restarts (GPU kernels may still add small nondeterminism).
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# One sample is a (timesteps, features) matrix of word vectors.
input_dim = (max_sentence_len, embedding_dim)

input_layer = Input(shape=input_dim)

# Encoder: compress the whole sequence into a single hidden_dim-sized vector.
encoded = LSTM(hidden_dim, return_sequences=False, kernel_regularizer=l2(0.01))(input_layer)
encoded = Dropout(0.3)(encoded)
encoded = BatchNormalization()(encoded)

# Decoder: repeat that vector once per timestep, unroll it with a second LSTM
# and project every step back to the embedding width.
encoded = RepeatVector(max_sentence_len)(encoded)
decoded = LSTM(hidden_dim, return_sequences=True, kernel_regularizer=l2(0.01))(encoded)
decoded = Dropout(0.3)(decoded)
output_layer = TimeDistributed(Dense(embedding_dim))(decoded)

# Reconstruction is trained with MSE; cosine similarity along the embedding
# axis is tracked as an additional metric.
autoencoder = Model(input_layer, output_layer)
autoencoder.compile(optimizer='adam', loss='mse', metrics=[CosineSimilarity(axis=-1)])

# Define the model
# input_layer = tf.keras.Input(shape=input_dim)

# encoded = Bidirectional(LSTM(hidden_dim, return_sequences=False))(input_layer)
# encoded = LSTM(hidden_dim, return_sequences=False)(input_layer)
# encoded = RepeatVector(max_sentence_len)(encoded)
# decoded = LSTM(hidden_dim, return_sequences=True)(encoded)
# output_layer = TimeDistributed(Dense(embedding_dim))(decoded)

# autoencoder = Model(inputs=input_layer, outputs=output_layer)
# autoencoder.compile(optimizer='adam', loss='mse',metrics=[CosineSimilarity(axis=-1)])
# Train the Autoencoder
num_epochs = 10
batch_size = 32

# The inputs are also the targets: the network learns to reconstruct its own
# input. The held-out split is passed as validation data, so its loss and
# metric are reported after every epoch.
history = autoencoder.fit(
    x=X_train,
    y=X_train,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_data=(X_test, X_test),
)

# Signal that fitting has finished (no separate evaluation is run here).
print("Training complete.")


Epoch 1/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 36ms/step - cosine_similarity: 0.5612 - loss: 2.1451 - val_cosine_similarity: 0.9014 - val_loss: 0.0476
Epoch 2/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 32ms/step - cosine_similarity: 0.8981 - loss: 0.0452 - val_cosine_similarity: 0.9020 - val_loss: 0.0441
Epoch 3/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 32ms/step - cosine_similarity: 0.9017 - loss: 0.0430 - val_cosine_similarity: 0.9019 - val_loss: 0.0426
Epoch 4/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 34ms/step - cosine_similarity: 0.9012 - loss: 0.0417 - val_cosine_similarity: 0.9018 - val_loss: 0.0405
Epoch 5/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 30ms/step - cosine_similarity: 0.8991 - loss: 0.0410 - val_cosine_similarity: 0.8914 - val_loss: 0.0471
Epoch 6/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 29ms/step - cosin

In [5]:
# Per-epoch curves recorded by fit(). The "accuracy" names below hold the
# cosine-similarity metric, not a classification accuracy.
epoch_log = history.history
training_loss, validation_loss = epoch_log['loss'], epoch_log['val_loss']
training_accuracy, validation_accuracy = (
    epoch_log['cosine_similarity'],
    epoch_log['val_cosine_similarity'],
)

# Show which series were recorded, then the value from the last epoch of each.
print(epoch_log.keys())
final_report = [
    f"Final Training Loss: {training_loss[-1]}",
    f"Final Validation Loss: {validation_loss[-1]}",
    f"Final Training Accuracy (Cosine Similarity): {training_accuracy[-1]}",
    f"Final Validation Accuracy (Cosine Similarity): {validation_accuracy[-1]}",
]
print(*final_report, sep="\n")

dict_keys(['cosine_similarity', 'loss', 'val_cosine_similarity', 'val_loss'])
Final Training Loss: 0.038399066776037216
Final Validation Loss: 0.03698806092143059
Final Training Accuracy (Cosine Similarity): 0.9025304913520813
Final Validation Accuracy (Cosine Similarity): 0.9021615386009216
