In [1]:
from transformers import BertTokenizer, BertModel
import numpy as np
import torch

In [2]:
# Run on the GPU whenever CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [5]:
# Fetch the pre-trained BERT checkpoint: first its tokenizer, then the
# encoder itself, which is moved onto the selected device.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)
model = model.to(device)

In [6]:
# Read the cleaned SQuAD records from the JSON file one directory up.
import json
with open("../cleaned_squad_data.json", "r", encoding="utf-8") as squad_file:
    squad_data = json.loads(squad_file.read())

In [7]:
batch_size = 8

def generate_embeddings(text_list):
    """Embed each text as the mean of its BERT token vectors.

    Texts are processed in chunks of the module-level ``batch_size`` using the
    module-level ``tokenizer``, ``model`` and ``device``.

    Each chunk is padded to its longest text, so the average is weighted by
    the tokenizer's attention mask. Padding positions therefore do not
    contribute, and a text's embedding does not depend on which other texts
    happen to share its batch.

    Args:
        text_list: Sliceable sequence of texts accepted by ``tokenizer``.

    Returns:
        A NumPy array with one row per input text. An empty ``text_list``
        yields an empty 1-D array.
    """
    embeddings = []
    for i in range(0, len(text_list), batch_size):
        batch_texts = text_list[i:i + batch_size]
        inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model(**inputs)
            hidden = outputs.last_hidden_state
            # 1 for real tokens, 0 for padding; trailing axis added so it
            # broadcasts across the hidden dimension.
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            token_sum = (hidden * mask).sum(dim=1)
            # clamp guards against division by zero for an all-padding row.
            token_count = mask.sum(dim=1).clamp(min=1)
            batch_embeddings = (token_sum / token_count).cpu().numpy()
        embeddings.extend(batch_embeddings)
    return np.array(embeddings)

In [8]:
# Collect the context and question text of the first 5000 records,
# walking the slice once and filling both lists side by side.
contexts, questions = [], []
for record in squad_data[:5000]:
    contexts.append(record["context"])
    questions.append(record["question"])

In [7]:
# Embed every context and question with generate_embeddings.
context_embeddings = generate_embeddings(contexts)
question_embeddings = generate_embeddings(questions)

# Write both matrices to .npy files in the working directory.
for npy_path, matrix in (
    ("context_embeddings.npy", context_embeddings),
    ("question_embeddings.npy", question_embeddings),
):
    np.save(npy_path, matrix)

# Report the shape of each matrix.
for label, matrix in (
    ("Context Embeddings:", context_embeddings),
    ("Question Embeddings:", question_embeddings),
):
    print(label, matrix.shape)


  attn_output = torch.nn.functional.scaled_dot_product_attention(


KeyboardInterrupt: 

In [None]:
# Read the saved context embeddings back from disk and print the array.
data = np.load("context_embeddings.npy")
print(data)

[[-0.2642473  -0.23228052  0.28082693 ...  0.11196642  0.01162038
  -0.2506955 ]
 [-0.2642473  -0.23228052  0.28082693 ...  0.11196642  0.01162038
  -0.2506955 ]
 [-0.2642473  -0.23228052  0.28082693 ...  0.11196642  0.01162038
  -0.2506955 ]
 ...
 [ 0.123763    0.12276594 -0.08655506 ...  0.18965627  0.00387318
   0.33899373]
 [ 0.123763    0.12276594 -0.08655506 ...  0.18965627  0.00387318
   0.33899373]
 [ 0.123763    0.12276594 -0.08655506 ...  0.18965627  0.00387318
   0.33899373]]


In [9]:
from sentence_transformers import SentenceTransformer

# Two sample sentences; only the commented-out example below refers to them.
sentences = [
    "This is an example sentence",
    "Each sentence is converted",
]

# NOTE: this rebinds `model`, which until now referred to the BertModel
# that generate_embeddings reads as a global.
model = SentenceTransformer("sentence-transformers/sentence-t5-base")
# Example usage: embeddings = model.encode(sentences)



In [11]:
def generate_sbert_embeddings(model, texts, batch_size=32):
    """Encode ``texts`` in batches with ``model.encode`` and stack the results.

    Each batch is encoded on the module-level ``device`` and moved to the CPU
    straight away, so only one batch of embeddings is held on the device at a
    time.

    Args:
        model: Object exposing ``encode(batch, convert_to_tensor=True,
            device=...)`` that returns a 2-D tensor with one row per text.
        texts: Sliceable sequence of texts to encode.
        batch_size: Number of texts passed to ``model.encode`` per call;
            must be at least 1.

    Returns:
        A NumPy array with one row per input text. For empty ``texts`` the
        result has shape ``(0, dim)``, where ``dim`` comes from
        ``model.get_sentence_embedding_dimension()`` when the model provides
        it and is 0 otherwise.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    if len(texts) == 0:
        # torch.cat rejects an empty list, so return an empty matrix directly.
        dim_getter = getattr(model, "get_sentence_embedding_dimension", None)
        dim = dim_getter() if callable(dim_getter) else 0
        return np.empty((0, dim or 0), dtype=np.float32)

    embeddings = []
    for start_idx in range(0, len(texts), batch_size):
        # Slicing already stops at the end of the sequence.
        batch_texts = texts[start_idx:start_idx + batch_size]
        batch_embeddings = model.encode(batch_texts, convert_to_tensor=True, device=device)
        embeddings.append(batch_embeddings.cpu())
    return torch.cat(embeddings).numpy()

In [12]:
# Embed every context and question with the sentence-transformers model.
context_embeddings = generate_sbert_embeddings(model, contexts)
question_embeddings = generate_sbert_embeddings(model, questions)

# Write both matrices to .npy files in the working directory.
for npy_path, matrix in (
    ("context_embeddings_sbert.npy", context_embeddings),
    ("question_embeddings_sbert.npy", question_embeddings),
):
    np.save(npy_path, matrix)

# Report the shape of each matrix.
for label, matrix in (
    ("Context Embeddings:", context_embeddings),
    ("Question Embeddings:", question_embeddings),
):
    print(label, matrix.shape)

Context Embeddings: (5000, 768)
Question Embeddings: (5000, 768)


In [13]:
# Read the saved sentence-transformers context embeddings back from disk
# and print the array.
data = np.load("context_embeddings_sbert.npy")
print(data)

[[-0.02565339 -0.02768765  0.03099243 ... -0.023538   -0.02609218
  -0.03554582]
 [-0.02565339 -0.02768765  0.03099243 ... -0.023538   -0.02609218
  -0.03554582]
 [-0.02565339 -0.02768765  0.03099243 ... -0.023538   -0.02609218
  -0.03554582]
 ...
 [-0.00315945 -0.04531334  0.016718   ... -0.00831344 -0.0091006
  -0.00221333]
 [-0.00315945 -0.04531334  0.016718   ... -0.00831344 -0.0091006
  -0.00221333]
 [-0.00315945 -0.04531334  0.016718   ... -0.00831344 -0.0091006
  -0.00221333]]
