In [10]:
import gensim.downloader as api
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import LSTM, RepeatVector, TimeDistributed, Dense
from tensorflow.keras.models import Model
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load pre-trained Word2Vec model
word2vec = api.load("word2vec-google-news-300.bin")
embedding_dim = 300
hidden_dim = 64
max_sentence_len = 20  # Max length for padding sequences

# Assuming `sentences` is a list of strings with your dataset sentences
# Replace with actual data loading
sentences = ["This is a sentence", "Another example sentence", "This is the third sentence"]

# Tokenize and embed sentences
def embed_sentence(sentence):
    tokens = word_tokenize(sentence.lower())
    embeddings = [word2vec[word] for word in tokens if word in word2vec]
    return embeddings

embedded_sentences = [embed_sentence(sent) for sent in sentences]
embedded_sentences = pad_sequences(embedded_sentences, maxlen=max_sentence_len, dtype='float32', padding='post')

# Split data into training and testing sets
X_train, X_test = train_test_split(embedded_sentences, test_size=0.2, random_state=42)

# Model definition using Keras
input_dim = (max_sentence_len, embedding_dim)

input_layer = tf.keras.Input(shape=input_dim)
# Encoder
encoded = LSTM(hidden_dim, return_sequences=False)(input_layer)
encoded = RepeatVector(max_sentence_len)(encoded)

# Decoder
decoded = LSTM(hidden_dim, return_sequences=True)(encoded)
output_layer = TimeDistributed(Dense(embedding_dim))(decoded)

# Compile model
autoencoder = Model(inputs=input_layer, outputs=output_layer)
autoencoder.compile(optimizer='adam', loss='mse')

# Model summary
autoencoder.summary()

# Train the model
num_epochs = 20
batch_size = 32

history = autoencoder.fit(X_train, X_train,
                          epochs=num_epochs,
                          batch_size=batch_size,
                          validation_data=(X_test, X_test))

# After training, you can inspect the model’s ability to reconstruct sentences


ValueError: Incorrect model/corpus name

In [5]:
# import gensim.downloader as api
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import LSTM, RepeatVector, TimeDistributed, Dense
from tensorflow.keras.models import Model
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import KeyedVectors

In [7]:
# Load the Word2Vec model
word2vec = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
embedding_dim = 300
hidden_dim = 64
max_sentence_len = 20  # Max sequence length for padding

# Load MSRP Dataset
# Replace 'msrp_train.txt' with the actual path to your MSRP dataset
msrp_path = "msr_paraphrase_data.csv"
msrp_data = pd.read_csv('msr_paraphrase_train.csv', on_bad_lines='skip')

# Extract sentences
# sentences = list(msrp_data['sentence1']) + list(msrp_data['sentence2'])
sentences1 = data['#1 String'].fillna("").astype(str).values 
sentences2 = data['#2 String'].fillna("").astype(str).values
sentences = list(sentences1) + list(sentences2)

MemoryError: Unable to allocate 3.35 GiB for an array with shape (3000000, 300) and data type float32

In [None]:
# Preprocess: Tokenize and embed sentences
def embed_sentence(sentence):
    tokens = word_tokenize(sentence.lower())
    embeddings = [word2vec[word] for word in tokens if word in word2vec]
    return embeddings

# Convert sentences to embeddings
embedded_sentences = [embed_sentence(sent) for sent in sentences]
# Pad the sentences to have uniform length
embedded_sentences = pad_sequences(
    [np.array(e) for e in embedded_sentences if len(e) > 0], 
    maxlen=max_sentence_len, 
    dtype='float32', 
    padding='post'
)

# Split into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(embedded_sentences, test_size=0.2, random_state=42)

# Build the Autoencoder Model
input_dim = (max_sentence_len, embedding_dim)

# Define the model
input_layer = tf.keras.Input(shape=input_dim)
encoded = LSTM(hidden_dim, return_sequences=False)(input_layer)
encoded = RepeatVector(max_sentence_len)(encoded)
decoded = LSTM(hidden_dim, return_sequences=True)(encoded)
output_layer = TimeDistributed(Dense(embedding_dim))(decoded)

autoencoder = Model(inputs=input_layer, outputs=output_layer)
autoencoder.compile(optimizer='adam', loss='mse')

# Train the Autoencoder
num_epochs = 20
batch_size = 32

history = autoencoder.fit(X_train, X_train,
                          epochs=num_epochs,
                          batch_size=batch_size,
                          validation_data=(X_test, X_test))

# Evaluate
print("Training complete.")


In [None]:
training_loss = history.history['loss']
validation_loss = history.history['val_loss']
training_accuracy = history.history['cosine_similarity']
validation_accuracy = history.history['val_cosine_similarity']

# Print final results
print(f"Final Training Loss: {training_loss[-1]}")
print(f"Final Validation Loss: {validation_loss[-1]}")
print(f"Final Training Accuracy (Cosine Similarity): {training_accuracy[-1]}")
print(f"Final Validation Accuracy (Cosine Similarity): {validation_accuracy[-1]}")