In [10]:
import torch
import torch.nn as nn

In [11]:
class SelfAttentionLayer(nn.Module):
    def __init__(self, input_dim, num_heads=1):
        super(SelfAttentionLayer, self).__init__()
        self.input_dim = input_dim
        self.num_heads = num_heads
        self.head_dim = input_dim // num_heads
        
        self.query = nn.Linear(input_dim, input_dim)
        self.key = nn.Linear(input_dim, input_dim)
        self.value = nn.Linear(input_dim, input_dim)
        
    def forward(self, x):
        batch_size, seq_len, _ = x.shape
        
        # Split input into heads
        query = self.query(x).view(batch_size, seq_len, self.num_heads, self.head_dim)
        key = self.key(x).view(batch_size, seq_len, self.num_heads, self.head_dim)
        value = self.value(x).view(batch_size, seq_len, self.num_heads, self.head_dim)
        
        # Compute attention scores
        attention_scores = torch.matmul(query, key.transpose(-2, -1)) / torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32))
        attention_weights = torch.softmax(attention_scores, dim=-1)
        
        # Apply attention to values
        attention_output = torch.matmul(attention_weights, value)
        attention_output = attention_output.view(batch_size, seq_len, -1)
        
        return attention_output

class Encoder(nn.Module):
    def __init__(self, input_dim, num_layers=1, num_heads=1, hidden_dim=64):
        super(Encoder, self).__init__()
        self.input_dim = input_dim
        self.num_layers = num_layers
        self.self_attention_layers = nn.ModuleList([
            SelfAttentionLayer(input_dim, num_heads) for _ in range(num_layers)
        ])
        self.feedforward_layers = nn.ModuleList([
            nn.Sequential(
                nn.Linear(input_dim, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, input_dim)
            ) for _ in range(num_layers)
        ])
        self.layer_norms = nn.ModuleList([nn.LayerNorm(input_dim) for _ in range(num_layers)])
        
    def forward(self, x):
        for i in range(self.num_layers):
            # Self-attention layer
            attention_output = self.self_attention_layers[i](x)
            # Add residual connection and apply layer normalization
            x = self.layer_norms[i](x + attention_output)
            # Feedforward layer
            feedforward_output = self.feedforward_layers[i](x)
            # Add residual connection and apply layer normalization
            x = self.layer_norms[i](x + feedforward_output)
        return x

In [12]:
# Init model with input_dim
input_dim = 64

### Read Data

In [89]:
import pickle
import numpy as np
from datasets import load_dataset
import tqdm
import math

In [90]:
dataset = load_dataset("glue", "sst2")
train_dataset = dataset["train"]
val_dataset = dataset["validation"]
test_dataset = dataset["test"]

Using the latest cached version of the dataset since glue couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'sst2' at /home/dhruv/.cache/huggingface/datasets/glue/sst2/0.0.0/bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c (last modified on Thu Apr 11 16:26:56 2024).


In [91]:
# Read pkl file 
with open('sst-train-64.pkl', 'rb') as f:
    vectors = pickle.load(f)

In [92]:
vectors.shape

(67349, 64)

In [100]:
def average_distance_between_vectors(vectors):
    """
    Calculate the average distance between all pairs of vectors within a set.

    Parameters:
    vectors (numpy.ndarray): A 2D numpy array where each row represents a vector.

    Returns:
    float: The average distance between all pairs of vectors.
    """
    # Calculate pairwise distances using broadcasting
    pairwise_distances = np.linalg.norm(vectors[:, None] - vectors, axis=-1)
    
    # Exclude diagonal elements (distances between the same vectors)
    pairwise_distances = np.triu(pairwise_distances, k=1)
    
    # Calculate the average distance
    average_distance = np.mean(pairwise_distances)
    
    return average_distance

In [None]:
N = 128
X = []
encoder = Encoder(input_dim, num_layers = N, num_heads = 8, hidden_dim = 64)
for i in tqdm.tqdm(range(len(train_dataset))):
    # Sample usage
    input_sequence = torch.tensor(vectors[i].reshape(1, 1, 64))
    output_sequence = encoder(input_sequence)[0][0].detach().numpy()
    X.append(output_sequence)

 23%|██▎       | 15637/67349 [04:39<16:57, 50.80it/s]

In [103]:
X_np = np.array(X)
dist = average_distance_between_vectors(np.mean(X_np, axis = 0))
print(dist)

5.348753
