## **Making Embeddings for PubMed**

In [10]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import numpy as np

import gensim
import gensim.downloader as api
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import torch
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

warnings.filterwarnings("ignore")

In [11]:
# Read the PubMed table (one row per article) from the parent directory.
DATASET_PATH = "../PubMed_dataset.csv"
dataset = pd.read_csv(DATASET_PATH)

# Load the pretrained Word2Vec vectors from their binary word2vec file.
W2V_PATH = 'GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(W2V_PATH, binary=True)

### **Word2Vec Embeddings**

In [12]:
# Word2Vec embeddings

# Get embedding for a single abstract
def abstract_to_w2v(abstract, w2v_model, vector_size=300):
    """Return the mean Word2Vec vector of the in-vocabulary tokens of one abstract.

    Parameters
    ----------
    abstract : str
        Raw abstract text. Any non-string value (``None``, or the float NaN
        that pandas produces for an empty CSV cell) is treated as an empty
        abstract instead of raising on ``.lower()``.
    w2v_model : mapping-like
        Object supporting ``token in w2v_model`` and ``w2v_model[token]``;
        the notebook passes the loaded KeyedVectors.
    vector_size : int, default 300
        Length of the zero vector returned when there is nothing to average.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all in-vocabulary tokens, or
        ``np.zeros(vector_size)`` when no token is in the vocabulary.
    """
    # Missing abstracts are not strings; without this guard .lower() raises.
    if not isinstance(abstract, str):
        return np.zeros(vector_size)

    # Tokenize by simple whitespace and lowercase
    tokens = abstract.lower().split()
    # Keep only tokens that exist in the Word2Vec vocabulary
    valid_tokens = [t for t in tokens if t in w2v_model]

    if not valid_tokens:  # Nothing to average: fall back to a zero vector
        return np.zeros(vector_size)

    # Average the token embeddings into a single document vector
    embeddings = np.array([w2v_model[t] for t in valid_tokens])
    return embeddings.mean(axis=0)

# Length of the zero vector used when an abstract has no in-vocabulary token.
vector_size = 300


def _embed_abstract(text):
    """Embed one abstract with the Word2Vec vectors loaded above."""
    return abstract_to_w2v(text, model, vector_size)


# One averaged vector per abstract, stored as an array in each cell.
dataset["Abstract_W2V"] = dataset["Abstract"].apply(_embed_abstract)

# Check the result
preview_columns = ["PMID", "Abstract_W2V"]
print(dataset[preview_columns].head())

       PMID                                       Abstract_W2V
0  12187484  [-0.03999741, 0.062728785, -0.002549661, 0.049...
1   2344352  [0.014284923, 0.03845855, 0.020136734, 0.05924...
2  14654069  [-0.020149924, 0.047227394, 0.012617389, 0.035...
3  16443886  [-0.045007102, 0.025433676, -0.005037438, 0.06...
4   2684155  [-0.028360292, 0.05821858, 0.015442131, 0.0421...


### **BERT Embeddings**

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd

# Local folder holding the tokenizer and encoder weights.
MODEL_FOLDER = "./daberta_finetune_head_PubMed"

# Use the GPU when torch can see one, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(MODEL_FOLDER)

# NOTE: this rebinds `model`, which earlier in the notebook referred to the
# Word2Vec vectors; re-running the Word2Vec cell after this one would pass the
# transformer to abstract_to_w2v.
model = AutoModel.from_pretrained(MODEL_FOLDER)
model = model.to(device)
model.eval()  # inference mode

# Function to get embeddings
def get_bert_embedding(text, tokenizer, model, device, max_length=512):
    """Embed a text as the last-layer hidden state of its first token.

    Parameters
    ----------
    text : str
        Text to embed; it is truncated to ``max_length`` tokens.
    tokenizer : callable
        Tokenizer returning ``input_ids`` and ``attention_mask`` tensors when
        called with ``return_tensors="pt"``.
    model : callable
        Encoder whose output exposes ``last_hidden_state``.
    device : torch.device
        Device the input tensors are moved to before the forward pass.
    max_length : int, default 512
        Truncation length in tokens.

    Returns
    -------
    numpy.ndarray
        The hidden state at position 0 (the CLS position for BERT-style
        tokenizers), with size-1 dimensions squeezed out.
    """
    # Pad only to the longest sequence in the call. For a single text that is
    # no padding at all, so the model no longer processes max_length positions
    # for every abstract; padded positions were masked out anyway, so the
    # embedding is expected to match up to floating-point rounding.
    encoding = tokenizer(
        text,
        truncation=True,
        padding="longest",
        max_length=max_length,
        return_tensors="pt",
    )
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # (batch, seq_len, hidden_size) -> first-token vector per sequence
    cls_embedding = outputs.last_hidden_state[:, 0, :]
    return cls_embedding.squeeze().cpu().numpy()

# Embed every abstract, one forward pass per row, and stack the results
# into a single (num_abstracts, hidden_size) array.
embeddings = np.array([
    get_bert_embedding(abstract, tokenizer, model, device)
    for abstract in tqdm(dataset['Abstract'])
])
print("Embeddings shape:", embeddings.shape)

# Store one embedding array per row in the DataFrame.
dataset['bert_embedding'] = list(embeddings)

# NOTE(review): to_csv appears to write each array cell as its text
# representation (the reloaded preview shows space-separated numbers in
# brackets), so readers of this file presumably need to parse the column
# back into arrays. Confirm before relying on it.
dataset.to_csv("PubMed_dataset_EMBS.csv", index=False, encoding="utf-8")

100%|██████████| 19716/19716 [8:14:00<00:00,  1.50s/it]  


Embeddings shape: (19716, 768)


### **Get all Embeddings**

In [18]:
# Reload the table that was saved together with the BERT embeddings.
BERT_embs = pd.read_csv("./PubMed_dataset_EMBS.csv")

# When the notebook is run top to bottom, `dataset` already contains
# Abstract_W2V at the time the CSV is written, so the file carries its own
# copy of that column. Merging would then yield Abstract_W2V_x / Abstract_W2V_y
# and later selections of 'Abstract_W2V' would fail. Keep only the in-memory
# arrays from `dataset`; errors="ignore" makes this a no-op for CSVs that
# lack the column.
BERT_embs = BERT_embs.drop(columns=["Abstract_W2V"], errors="ignore")

# Attach the in-memory Word2Vec vectors to the reloaded rows by article id.
df_with_EMBS = pd.merge(dataset[['PMID', 'Abstract_W2V']], BERT_embs, on='PMID')

print(f"Columns of the dataset: {df_with_EMBS.columns}")

Columns of the dataset: Index(['PMID', 'Abstract_W2V', 'Title', 'Abstract', 'Key_words', 'Authors',
       'label', 'TFIDF', 'summary_words', 'bert_embedding'],
      dtype='object')


In [19]:
# Preview the first five rows of the text, TF-IDF and embedding columns.
preview_columns = ['PMID', 'Abstract', 'TFIDF', 'Abstract_W2V', 'bert_embedding']
df_with_EMBS[preview_columns].head(5)

Unnamed: 0,PMID,Abstract,TFIDF,Abstract_W2V,bert_embedding
0,12187484,PURPOSE: Dogs and rats are commonly used to ex...,"{'rat': 0.09393489570187145, 'common': 0.02869...","[-0.03999741, 0.062728785, -0.002549661, 0.049...",[ 5.83067238e-02 -7.34468549e-03 -7.10018054e-...
1,2344352,Phase-modulated rotating-frame imaging (p.m.r....,"{'rat': 0.023617916633613394, 'use': 0.0147841...","[0.014284923, 0.03845855, 0.020136734, 0.05924...",[ 8.21016803e-02 -9.57140699e-03 -8.19785297e-...
2,14654069,Cardiovascular complications are the primary c...,"{'rat': 0.10226314418677966, 'use': 0.01066898...","[-0.020149924, 0.047227394, 0.012617389, 0.035...",[ 6.06562942e-02 -2.23077759e-02 -8.84668827e-...
3,16443886,OBJECTIVE: Mean blood glucose (MBG) over 2-3 m...,"{'model': 0.038714646134547365, 'develop': 0.0...","[-0.045007102, 0.025433676, -0.005037438, 0.06...",[ 5.10609150e-02 -4.73604724e-03 -7.53103644e-...
4,2684155,Hepatocytes were derived from 2-3-day streptoz...,"{'rat': 0.030615817858387732, 'anim': 0.080179...","[-0.028360292, 0.05821858, 0.015442131, 0.0421...",[ 6.99397400e-02 -1.28354207e-02 -8.73763263e-...
