In [5]:
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel

# Install a blanket "ignore" filter so that warnings raised by the libraries
# used below do not clutter the cell outputs.
import warnings
warnings.filterwarnings(action='ignore')

In [6]:
# Read the labelled, cleaned dataset from disk; the cells below use its
# 'title' and 'body' columns.
data = pd.read_csv(filepath_or_buffer='../data/labeled_cleaned_data.csv')

In [8]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Tokenizer and encoder are both loaded from the 'bert-base-cased' checkpoint;
# the encoder is then moved onto the selected device.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
model = AutoModel.from_pretrained('bert-base-cased')
model = model.to(device)

# Put the module in evaluation mode. As the last expression of the cell, the
# returned module is also what gets displayed as the cell output.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [9]:
def get_bert_sentence_embedding(text):
    """Embed a single text and return the hidden state of its first token.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device`` objects.

    Parameters
    ----------
    text : object
        The text to embed. Anything that is not a non-blank ``str`` (``None``,
        NaN, numbers, lists/arrays, empty or whitespace-only strings) is
        treated as missing.

    Returns
    -------
    numpy.ndarray
        A 1-D vector of length ``model.config.hidden_size``. For usable text
        it is the last-layer hidden state at token position 0 (the ``[CLS]``
        slot for BERT-style tokenizers); for missing input it is an all-zero
        ``float32`` vector.
    """
    # The type check must come first. ``pd.isna`` on a list/array returns an
    # element-wise array whose truth value is ambiguous, which used to raise
    # ValueError before the isinstance test could reject the value. Every
    # missing-value marker (None, NaN, pd.NA) is a non-str, so this single
    # check also covers what ``pd.isna`` was guarding against.
    if not isinstance(text, str) or not text.strip():
        return np.zeros(model.config.hidden_size, dtype=np.float32)

    # Inputs longer than 512 tokens are truncated to the given max_length.
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # The batch holds exactly one sequence, so [0, 0] picks the first token of
    # that sequence and yields a 1-D tensor directly.
    return outputs.last_hidden_state[0, 0].cpu().numpy()

In [10]:
# Replace the raw text in each column with its per-row embedding vector.
# NOTE: this overwrites the original text, so the cell is not idempotent:
# running it a second time would feed arrays, not text, to the embedder.
# Reload `data` before re-running.
data['title'] = data['title'].apply(get_bert_sentence_embedding)
data['body'] = data['body'].apply(get_bert_sentence_embedding)

In [25]:
# Expand the per-row title vectors into a frame with one column per embedding
# component (title_0, title_1, ...), aligned to the index of `data`.
title_bert_features = (
    pd.DataFrame(data['title'].tolist(), index=data.index)
    .add_prefix('title_')
)

In [26]:
# Expand the per-row body vectors into a frame with one column per embedding
# component (body_0, body_1, ...), aligned to the index of `data`.
body_bert_features = (
    pd.DataFrame(data['body'].tolist(), index=data.index)
    .add_prefix('body_')
)

In [27]:
# Sanity check: report (rows, columns) of both expanded feature frames.
print(
    f"Shape of expanded title BERT features: {title_bert_features.shape}",
    f"Shape of expanded body BERT features: {body_bert_features.shape}",
    sep="\n",
)

Shape of expanded title BERT features: (498, 768)
Shape of expanded body BERT features: (498, 768)
