In [5]:
import os
import sys

# Repository root: two directory levels above the current working directory.
# It is put at the front of sys.path (once) so project packages can be imported.
PROJECT_ROOT = os.path.abspath(
    os.path.join(os.getcwd(), "..", "..")
)
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

# Absolute path of the raw books CSV inside the project's data folder.
second_semester_fucken_dataset = os.path.abspath(
    os.path.join(
        PROJECT_ROOT,
        "data",
        "raw_data",
        "kaggle_second_sem",
        "books_data.csv",
    )
)

import pandas as pd

# Load the whole CSV into a DataFrame used by the later cells.
data = pd.read_csv(second_semester_fucken_dataset)

In [None]:
import torch
from transformers import BertTokenizer, BertModel
import numpy as np

class EmbeddingsProducer:
    """Turn texts into fixed-size vectors by mean-pooling BERT token states.

    The ``bert-base-uncased`` tokenizer and model are loaded once in the
    constructor; the model is placed on CUDA when available, otherwise on CPU.
    """

    def __init__(self):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertModel.from_pretrained('bert-base-uncased').to(self.device)

    def create_embeddings_batch(self, texts: list[str]) -> list[np.ndarray]:
        """Embed a batch of texts.

        Args:
            texts: Strings to embed. Each is padded to the longest item in the
                batch and truncated to at most 512 tokens.

        Returns:
            One NumPy vector per input text, in input order: the average of
            the model's last hidden states over positions where the attention
            mask is 1. An empty ``texts`` yields an empty list.
        """
        # Nothing to embed: skip the tokenizer/model round trip entirely.
        if len(texts) == 0:
            return []

        encoded = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt'
        )
        input_ids = encoded['input_ids'].to(self.device)
        attention_mask = encoded['attention_mask'].to(self.device)

        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
            token_embeddings = outputs.last_hidden_state

            # (batch, seq, 1) mask broadcasts over the hidden dimension so
            # padded positions contribute nothing to the sum.
            mask = attention_mask.unsqueeze(-1).float()
            summed = (token_embeddings * mask).sum(dim=1)
            # Clamp the divisor so a row with no attended tokens gives zeros
            # instead of NaN (0 / 0).
            counts = mask.sum(dim=1).clamp(min=1e-9)
            mean_pooled = summed / counts

        return [emb.cpu().numpy() for emb in mean_pooled]
    
    

In [24]:
# Location of the .npz archive that the embedding step writes and later cells read.
embeddings_path = os.path.join(
    PROJECT_ROOT,
    "data",
    "embeddings",
    "embeddings_structured.npz",
)

def produce_embeddings_batched(data, batch_size=16):
    """Embed every row's description and save titles + embeddings to disk.

    Args:
        data: DataFrame with a ``description`` column (missing values are
            embedded as the text "No description") and a ``Title`` column.
            The frame itself is not modified.
        batch_size: Number of descriptions sent to the model per call.

    Returns:
        A float32 array with one row per input row. The same array is written,
        together with the titles, to the module-level ``embeddings_path`` via
        ``np.savez`` under the keys ``titles`` and ``embeddings``.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    from tqdm import tqdm

    # Fail before loading the model rather than inside range()/np.stack.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Fill missing descriptions on a derived Series: the caller's frame (often
    # a slice of a larger one) is left untouched.
    descriptions = data["description"].fillna("No description").tolist()
    titles = data["Title"].to_numpy()

    embeddings_producer = EmbeddingsProducer()
    embeddings = []

    for start in tqdm(range(0, len(descriptions), batch_size), desc="Embedding..."):
        batch_texts = descriptions[start:start + batch_size]
        embeddings.extend(embeddings_producer.create_embeddings_batch(batch_texts))

    if embeddings:
        embeddings = np.stack(embeddings).astype(np.float32)
    else:
        # np.stack rejects an empty list; return a well-formed empty matrix
        # whose width matches the model's hidden size.
        hidden_size = embeddings_producer.model.config.hidden_size
        embeddings = np.empty((0, hidden_size), dtype=np.float32)

    # Create the target folder on first use so np.savez does not fail.
    output_dir = os.path.dirname(embeddings_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    np.savez(embeddings_path, titles=titles, embeddings=embeddings)
    print("Embeddings saved to embeddings_structured.npz")
    return embeddings

# Trial run on the first 30 rows only; the returned matrix is discarded
# because the function also writes it to disk.
produce_embeddings_batched(data.iloc[:30])
print("FINISHED")

Embedding...: 100%|██████████| 2/2 [00:07<00:00,  3.90s/it]

Embeddings saved to embeddings_structured.npz
FINISHED





In [28]:
# Read every array inside a context manager so the archive's file handle is
# closed immediately; `data` becomes a plain dict keyed by array name.
# NOTE: this rebinds `data`, which the first cell uses for the books DataFrame.
# NOTE(review): allow_pickle=True can execute arbitrary code from an untrusted
# file. It is presumably needed because `titles` is stored as an object array;
# TODO confirm and only load archives produced by this notebook.
with np.load(embeddings_path, allow_pickle=True) as archive:
    data = {name: archive[name] for name in archive.files}

In [30]:
# Pull the two stored arrays out of the loaded archive by key.
titles, embeddings = (data[key] for key in ("titles", "embeddings"))

(30, 768)

: 