## **Making Embeddings for PubMed**

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import numpy as np

import gensim
import gensim.downloader as api
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import torch
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

warnings.filterwarnings("ignore")

In [3]:
# Read the PubMed table (path is relative to the notebook's working directory)
dataset = pd.read_csv("../PubMed_dataset.csv")

# Load the pretrained GoogleNews Word2Vec vectors from their binary file
model = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [12]:
# Get embedding for a single abstract
def abstract_to_w2v(abstract, w2v_model, vector_size=300):
    """Average the word vectors of the in-vocabulary tokens of one abstract.

    Parameters
    ----------
    abstract : str
        Text to embed. Any non-string value (e.g. the NaN/None pandas uses
        for a missing cell) is treated as an empty abstract.
    w2v_model : mapping-like
        Object supporting ``token in w2v_model`` and ``w2v_model[token]``,
        where the lookup returns a 1-D vector.
    vector_size : int, default 300
        Length of the zero vector returned when nothing can be embedded.

    Returns
    -------
    numpy.ndarray
        Mean of the vectors of all tokens found in ``w2v_model``, or
        ``np.zeros(vector_size)`` when there is no such token.
    """
    # A missing abstract is not a str and has no .lower(); fall back to the
    # same zero vector used for "no known tokens" instead of raising.
    if not isinstance(abstract, str):
        return np.zeros(vector_size)

    # Tokenize by simple whitespace and lowercase.
    # NOTE(review): punctuation stays attached to tokens ("rats." != "rats")
    # and lowercasing may miss case-sensitive vocabulary entries — confirm
    # this tokenisation is intended for the chosen vectors.
    tokens = abstract.lower().split()

    # Keep only tokens that exist in the Word2Vec vocabulary
    valid_tokens = [t for t in tokens if t in w2v_model]

    if not valid_tokens:  # If no tokens are in the vocab, return zeros
        return np.zeros(vector_size)

    # Average embeddings
    embeddings = np.array([w2v_model[t] for t in valid_tokens])
    return embeddings.mean(axis=0)

# Embed every abstract and store one vector per row in a new column
vector_size = 300
dataset["Abstract_W2V"] = dataset["Abstract"].apply(
    abstract_to_w2v, args=(model, vector_size)
)

# Peek at the first rows to confirm the column was filled
print(dataset[["PMID", "Abstract_W2V"]].head())

       PMID                                       Abstract_W2V
0  12187484  [-0.03999741, 0.062728785, -0.002549661, 0.049...
1   2344352  [0.014284923, 0.03845855, 0.020136734, 0.05924...
2  14654069  [-0.020149924, 0.047227394, 0.012617389, 0.035...
3  16443886  [-0.045007102, 0.025433676, -0.005037438, 0.06...
4   2684155  [-0.028360292, 0.05821858, 0.015442131, 0.0421...


In [13]:
# Show the first three rows, including the new Abstract_W2V column
dataset.head(n=3)

Unnamed: 0,PMID,Title,Abstract,Key_words,Authors,label,TFIDF,summary_words,Abstract_W2V
0,12187484,Retinal metabolic abnormalities in diabetic mo...,PURPOSE: Dogs and rats are commonly used to ex...,"Animals; Diabetes Mellitus, Experimental/*meta...","Kowluru, Renu A",1,"{'rat': 0.09393489570187145, 'common': 0.02869...","['rat', 'common', 'use', 'examin', 'pathogenes...","[-0.03999741, 0.062728785, -0.002549661, 0.049..."
1,2344352,Spatially resolved changes in diabetic rat ske...,Phase-modulated rotating-frame imaging (p.m.r....,Adenosine Triphosphate/metabolism; Animals; Di...,"Challiss, R A; Blackledge, M J; Radda, G K",1,"{'rat': 0.023617916633613394, 'use': 0.0147841...","['rat', 'use', 'anim', 'metabol', 'investig', ...","[0.014284923, 0.03845855, 0.020136734, 0.05924..."
2,14654069,Mitochondria respiration and susceptibility to...,Cardiovascular complications are the primary c...,Animals; Body Weight/physiology; Cell Respirat...,"Lashin, Ossama; Romani, Andrea",1,"{'rat': 0.10226314418677966, 'use': 0.01066898...","['rat', 'use', 'anim', 'contribut', 'develop',...","[-0.020149924, 0.047227394, 0.012617389, 0.035..."


In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd

# Directory the fine-tuned checkpoint was saved to
MODEL_FOLDER = "./daberta_finetune_head_PubMed"

# Run on the GPU when torch can see one, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(MODEL_FOLDER)

# AutoModel loads the base encoder only (without classification head).
# NOTE: this rebinds `model`, which until now held the Word2Vec vectors, so
# the Word2Vec cell cannot be re-run after this one without reloading them.
model = AutoModel.from_pretrained(MODEL_FOLDER)
model = model.to(device)
model.eval()  # inference mode for layers such as dropout

# =========================
# Function to get embeddings
# =========================
def get_bert_embedding(text, tokenizer, model, device, max_length=512):
    """Return the first-token (CLS) embedding of ``text`` as a NumPy array.

    Parameters
    ----------
    text : str
        Text to embed.
    tokenizer : callable
        Called as ``tokenizer(text, truncation=..., padding=..., max_length=...,
        return_tensors="pt")``; must return a mapping with ``"input_ids"`` and
        ``"attention_mask"`` tensors.
    model : callable
        Called with ``input_ids`` and ``attention_mask``; its result must
        expose ``last_hidden_state``.
    device : torch.device or str
        Device the input tensors are moved to before the forward pass.
    max_length : int, default 512
        Inputs longer than this many tokens are truncated.

    Returns
    -------
    numpy.ndarray
        The hidden state at sequence position 0 with the size-1 batch axis
        removed, i.e. shape ``(hidden_size,)`` for a single text.
    """
    encoding = tokenizer(
        text,
        truncation=True,
        # Pad only up to the longest sequence in this call. For a single text
        # that means no padding at all, so the model no longer processes
        # (and masks out) up to `max_length` filler positions per abstract.
        padding=True,
        max_length=max_length,
        return_tensors="pt",
    )
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    # Inference only: skip building the autograd graph
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # Position 0 of every sequence is the CLS token
    cls_embedding = outputs.last_hidden_state[:, 0, :]
    # squeeze(0) removes only the batch axis (and only when it has size 1)
    return cls_embedding.squeeze(0).cpu().numpy()

# =========================
# Compute embeddings for all abstracts
# =========================
cls_vectors = []

# Deliberately an explicit loop: if this long-running cell is interrupted,
# the vectors computed so far are still available in `cls_vectors`.
for abstract in tqdm(dataset['Abstract']):
    emb = get_bert_embedding(abstract, tokenizer, model, device)
    cls_vectors.append(emb)

# Convert to a NumPy array
embeddings = np.array(cls_vectors)
print("Embeddings shape:", embeddings.shape)  # (num_abstracts, hidden_size)

# Persist the numeric matrix in NumPy's binary format; row i belongs to row i
# of `dataset`. The CSV written below stores each array only as its str()
# text, which cannot be read back as numbers without custom parsing.
np.save("PubMed_dataset_BERT_embeddings.npy", embeddings)

# Optionally, add embeddings to the DataFrame (one array per row)
dataset['bert_embedding'] = list(embeddings)

dataset.to_csv("PubMed_dataset_EMBS.csv", index=False, encoding="utf-8")

 33%|███▎      | 6523/19716 [2:56:15<3:44:21,  1.02s/it] 