# Create embeddings for topic modeling

In [1]:
%pip install datasets pandas psycopg2 python-dotenv torch transformers scikit-learn sqlalchemy

Note: you may need to restart the kernel to use updated packages.


## Create base dataset

Ideally you'll only have to do this once.

In [4]:
from dotenv import load_dotenv
from sqlalchemy import create_engine
import pandas as pd
import os

# Load variables from a local .env file into the process environment.
load_dotenv()

# Make sure there's a .env file with PG_URL set to the connection string!
# Note for the unwary: make sure to restart the kernel if you change this.
# (By default load_dotenv() does not override variables that are already set
# in the running process, so an edited .env is not picked up on re-run.)
# Don't ask me how I know.
pg_url = os.getenv("PG_URL")
if not pg_url:
    # Fail fast with an actionable message instead of handing None to SQLAlchemy.
    raise RuntimeError(
        "PG_URL is not set. Add PG_URL=<connection string> to a .env file "
        "and restart the kernel."
    )
engine = create_engine(pg_url)

In [5]:
# Pull every titled item with a score of at least 20 into a DataFrame.
df = pd.read_sql(
    sql="SELECT * FROM items WHERE title IS NOT NULL AND score >= 20;",
    con=engine,
)

In [7]:
# Cache the query result on disk so the embedding section below can reload it
# without going back to the database.
df.to_parquet("../datasets/raw-story-data.parquet")

## Create title embeddings

In [1]:
import pandas as pd
# Reload the story data written by the "Create base dataset" section.
df = pd.read_parquet("../datasets/raw-story-data.parquet")

In [2]:
import torch

def choose_device():
    """Pick the best available torch device.

    Returns:
        str: "cuda" if a CUDA device is available, otherwise "mps" if the
        Apple Metal backend is available, otherwise "cpu".
    """
    if torch.cuda.is_available():
        return "cuda"
    # `torch.has_mps` is a deprecated boolean attribute, not a function, so
    # calling it raises TypeError. The supported check lives under
    # torch.backends.mps; getattr guards torch builds that predate it.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return "mps"
    return "cpu"

We use the `BAAI/bge-base-en-v1.5` (BGE base) embedding model, which at the time of writing sits atop the embedding benchmark leaderboard and is a reasonable compromise on model size:

In [3]:
from transformers import AutoTokenizer, AutoModel

# Load the tokenizer and the encoder from the same checkpoint name so the two
# cannot drift apart.
MODEL_NAME = "BAAI/bge-base-en-v1.5"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Move the model onto whichever device choose_device() selects.
device = choose_device()
model = model.to(device)
# Put the model in evaluation mode. As the last expression in the cell, this
# also displays the model architecture.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [5]:
# Keep only rows that have a title (a safety net on top of the SQL filter).
df = df.dropna(subset=['title'])

TODO: principled ablations on document creation strategy

In [6]:
# The text that gets embedded is currently the title alone. A title + URL
# variant is kept here, disabled, for the ablations mentioned above:
# df['document'] = "TITLE: " + df['title'] + "\n" + df['url'].fillna("")
df = df.assign(document=df['title'])

Now let's actually embed the data:

In [7]:
from datasets import Dataset

# Wrap the DataFrame in a Hugging Face Dataset so embeddings can be computed
# in batches with Dataset.map.
dataset = Dataset.from_pandas(df)

In [8]:
def generate_embeddings(batch):
    """Embed a batch of documents using the [CLS] token representation.

    Args:
        batch: Mapping with a 'document' entry (the texts to embed) and an
            'id' entry (passed through as 'uid').

    Returns:
        dict: 'embeddings' holds one L2-normalized vector per document as a
        NumPy array; 'uid' holds the corresponding ids from the batch.
    """
    encoded = tokenizer(
        batch['document'],
        padding=True,
        truncation=True,
        return_tensors='pt',
        max_length=128,
    )
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        hidden_states = model(**encoded)[0]
        # CLS pooling: take the first token's hidden state for each sequence.
        cls_vectors = hidden_states[:, 0]
        unit_vectors = torch.nn.functional.normalize(cls_vectors, p=2, dim=1)

    # Hand back CPU NumPy arrays so the result can be stored in the dataset.
    return {'embeddings': unit_vectors.cpu().numpy(), 'uid': batch['id']}

In [20]:
def generate_mean_pooled_embeddings(batch):
    """Embed a batch of documents by mean-pooling their content tokens.

    Only real content tokens contribute to the mean: special tokens
    ([CLS], [SEP]) and padding are masked out per sequence. A plain
    `[:, 1:-1]` slice is not enough for a padded batch, because shorter
    sequences would average in [PAD] vectors and their own [SEP].

    Args:
        batch: Mapping with a 'document' entry (the texts to embed) and an
            'id' entry (passed through as 'uid').

    Returns:
        dict: 'embeddings' holds one L2-normalized vector per document as a
        NumPy array; 'uid' holds the corresponding ids from the batch. A
        document with no content tokens (e.g. an empty string) gets an
        all-zero vector rather than NaN.
    """
    inputs = tokenizer(
        batch['document'],
        padding=True,
        truncation=True,
        return_tensors='pt',
        max_length=128,
        return_special_tokens_mask=True,
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}
    # The special-token mask is only needed for pooling; the model's forward
    # pass does not accept it, so remove it before the call.
    special_tokens_mask = inputs.pop('special_tokens_mask')

    with torch.no_grad():
        outputs = model(**inputs)
        token_embeddings = outputs[0]

        # 1 where a token is attended to AND is not a special token.
        content_mask = inputs['attention_mask'] * (1 - special_tokens_mask)
        content_mask = content_mask.unsqueeze(-1).to(token_embeddings.dtype)

        summed = (token_embeddings * content_mask).sum(dim=1)
        # Clamp so sequences without content tokens do not divide by zero.
        counts = content_mask.sum(dim=1).clamp(min=1.0)
        embeddings = summed / counts
        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

    # Convert to numpy array on CPU
    return {'embeddings': embeddings.cpu().numpy(), 'uid': batch['id']}

In [10]:
# Compute CLS-pooled embeddings for the whole dataset, passing 128 rows at a
# time to generate_embeddings.
dataset_with_embeddings = dataset.map(generate_embeddings, batched=True, batch_size=128)

Map: 100%|██████████| 450733/450733 [01:17<00:00, 5846.21 examples/s]


In [11]:
# Persist the CLS-pooled embeddings with the datasets library's on-disk format.
dataset_with_embeddings.save_to_disk("../datasets/post-title-bge-cls_pooled")

# Disabled alternative: export to a single parquet file instead. Kept as
# comments rather than a bare string literal, which the notebook would echo
# back as cell output. NOTE(review): the file name below refers to a
# title+URL, mean-pooled variant; rename it before re-enabling.
# df = dataset_with_embeddings.to_pandas()
# df.to_parquet("../datasets/post-title-url-bge-mean_pooled.parquet")

Saving the dataset (4/4 shards): 100%|██████████| 450733/450733 [00:01<00:00, 346435.42 examples/s]


'\ndf = dataset_with_embeddings.to_pandas()\ndf.to_parquet("../datasets/post-title-url-bge-mean_pooled.parquet")\n'