In [1]:
from gensim.models import Word2Vec,KeyedVectors
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
class Autoencoder(nn.Module):
    """Fully connected autoencoder with a mirrored encoder/decoder pair.

    The encoder maps ``input_dim -> 200 -> 100 -> bottleneck_dim`` and the
    decoder maps ``bottleneck_dim -> 100 -> 200 -> input_dim``. A ReLU follows
    every linear layer except the last one of each half.

    Args:
        input_dim: Size of the last dimension of the tensors fed to the model.
        bottleneck_dim: Size of the compressed representation produced by the
            encoder.
    """

    def __init__(self, input_dim, bottleneck_dim):
        super().__init__()

        # The encoder is built before the decoder so the layers are created
        # (and therefore randomly initialised) in a fixed order.
        self.encoder = self._stack([input_dim, 200, 100, bottleneck_dim])
        self.decoder = self._stack([bottleneck_dim, 100, 200, input_dim])

    @staticmethod
    def _stack(widths):
        """Chain Linear layers through ``widths`` with a ReLU between each pair."""
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        # Drop the trailing ReLU: the final layer of each half stays linear.
        return nn.Sequential(*layers[:-1])

    def forward(self, x):
        """Encode ``x`` and return its reconstruction from the decoder."""
        return self.decoder(self.encoder(x))

    def get_encoder_output(self, x):
        """Return only the encoder's output (the bottleneck representation) for ``x``."""
        latent = self.encoder(x)
        return latent

In [3]:
import json

# Load Word2Vec vectors from the JSON file.
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII words.
with open('word_to_vector.json', 'r', encoding='utf-8') as file:
    word_vectors_dict = json.load(file)

In [4]:
# Define vector size of input (embedding size of each word in Word2Vec model)
input_dim = 400

# Seed PyTorch's global RNG before the model is built so that the random
# weight initialisation is the same on every fresh run of the notebook.
SEED = 42
torch.manual_seed(SEED)

# Define the AutoEncoder model: input_dim-sized vectors are compressed to a
# 40-dimensional bottleneck.
autoencoder = Autoencoder(input_dim=input_dim, bottleneck_dim=40)

In [5]:
# Reconstruction objective: mean squared error between the decoder output and the input
criterion = nn.MSELoss()
# Adam updates every parameter of the autoencoder, using a learning rate of 1e-3
optimizer = torch.optim.Adam(autoencoder.parameters(), lr=1e-3)

In [7]:
# Number of training epochs, i.e. how many full passes the training loop makes
# over every entry of word_vectors_dict
num_epochs = 3

In [8]:
# Training loop
if not word_vectors_dict:
    # Fail early with a clear message instead of a ZeroDivisionError when the
    # per-epoch average is computed below.
    raise ValueError("word_vectors_dict is empty; there is nothing to train on")

# Convert all word vectors to one float32 tensor up front instead of
# re-converting every Python list on every epoch. The explicit row count in
# view() makes a vector whose size is not input_dim raise here.
training_vectors = torch.tensor(
    list(word_vectors_dict.values()), dtype=torch.float32
).view(len(word_vectors_dict), input_dim)

# NOTE(review): weights are updated after every single word, in dictionary
# order, exactly as before; mini-batching or shuffling would change the
# results and is intentionally not introduced here.
for epoch in range(num_epochs):
    total_loss = 0.0

    # split(1) yields one (1, input_dim) tensor per word, in dictionary order
    for input_data in training_vectors.split(1):
        # Zero the gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass
        reconstructed_input = autoencoder(input_data)

        # Calculate the loss (MSE between input and reconstructed output)
        loss = criterion(reconstructed_input, input_data)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Accumulate the total loss
        total_loss += loss.item()

    # Calculate and print the average loss for the epoch
    average_loss = total_loss / len(word_vectors_dict)
    print(f"Epoch [{epoch+1}/{num_epochs}], Average Loss: {average_loss:.4f}")

Epoch [1/3], Average Loss: 0.0300
Epoch [2/3], Average Loss: 0.0417
Epoch [3/3], Average Loss: 0.0360


In [9]:
# Get the new embeddings from the trained autoencoder
new_embeddings = {}
# Inference only: no_grad() stops autograd from recording a graph for every
# forward pass, which the export does not need.
with torch.no_grad():
    for word, word_vector in word_vectors_dict.items():
        input_data = torch.tensor(word_vector, dtype=torch.float32).view(1, input_dim)
        # The encoder output has a leading batch dimension of size 1; take
        # row 0 and convert it to a plain list of Python floats for JSON.
        new_embeddings[word] = autoencoder.get_encoder_output(input_data)[0].tolist()

# Save the new embeddings to a new JSON file
with open('reduced_word2vec_embeddings.json', 'w') as file:
    json.dump(new_embeddings, file)