In [1]:
import os
import numpy as np
import pandas as pd

x_file_path = './CellTOG/brain_sc_output/processed_data/brain/alzheimer\'s_disease/alzheimer\'s_disease_X_partition_0.npy'
y_file_path = './CellTOG/brain_sc_output/processed_data/brain/alzheimer\'s_disease/alzheimer\'s_disease_Y_partition_0.npy'

x = np.load(x_file_path)
y = np.load(y_file_path)
print(x.shape, y.shape)

(291, 483817) (291,)


In [10]:
edge_index_path = './CellTOG/brain_sc_output/edge_index.npy'
x_desc_path = './CellTOG/brain_sc_output/X_descriptions.npy'

edge_index = np.load(edge_index_path)
x_desc = np.load(x_desc_path, allow_pickle=True)
print(edge_index.shape, x_desc.shape)
print(edge_index)

(2, 33349084) (483817, 1)
[[     0      1      2 ... 483091 483091 483091]
 [278326 278327 278328 ... 386373 398848 437378]]


In [11]:
x_desc[0]

array(['ADP ribosylation factor 5 [Source:HGNC Symbol;Acc:HGNC:658]'],
      dtype=object)

In [13]:
# convert x_desc to list of strings
x_desc = [str(i) for i in x_desc]

In [14]:
x_desc[:2]

["['ADP ribosylation factor 5 [Source:HGNC Symbol;Acc:HGNC:658]']",
 "['mannose-6-phosphate receptor, cation dependent [Source:HGNC Symbol;Acc:HGNC:6752]']"]

#### Name emb

In [None]:
transcript_name_emb_path = './BioMedGraphica/embeddings/Entity/transcript_embeddings.npy'
protein_name_emb_path = './BioMedGraphica/embeddings/Entity/protein_embeddings.npy'

transcript_name_emb = np.load(transcript_name_emb_path)
protein_name_emb = np.load(protein_name_emb_path)
name_emb = np.concatenate((transcript_name_emb, protein_name_emb), axis=0)
print(name_emb.shape)
print(transcript_name_emb.shape, protein_name_emb.shape)

(483817,)
(278326,) (205491,)


#### Desc emb

In [15]:
from typing import List, Tuple, Dict
from torch.utils.data import DataLoader, Dataset

class SentenceDataset(Dataset):
    def __init__(self, sentences: List[str]):
        self.sentences = sentences

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx]

In [16]:
import os
import torch
import matplotlib.pyplot as plt
import torch.nn.functional as F
import numpy as np
import pandas as pd
from torch_geometric.nn import GATConv
from torch_geometric.data import Data
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from typing import List, Tuple, Dict
from pykeen.pipeline import pipeline
from pykeen.triples import TriplesFactory
from torch.utils.data import DataLoader, Dataset
from pykeen.training.callbacks import TrainingCallback  # Updated base class for callbacks

from pykeen.models import TransR
from pykeen.training import SLCWATrainingLoop

  from .autonotebook import tqdm as notebook_tqdm


In [24]:
class TextEmb():
    def __init__(self, model_path: str = "microsoft/deberta-v3-small", device: str = "cuda"):
        """
        Args:
            model_path (str, optional): Path to the deberta model. Defaults to 'microsoft/deberta-v3-small'.
            device (str, optional): Device to run the model on ('cpu' or 'cuda'). Defaults to 'cpu'.
        """
        self.model_path = model_path
        self.device = device
        self.model = None
        self.tokenizer = None

    def load_model(self):
        """
        Load the deberta model and tokenizer from the specified model path.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
        self.model = AutoModel.from_pretrained(self.model_path).to(self.device)

    def generate_embeddings(self, sentences: List[str], batch_size: int = 32, text_emb_dim: int = 64) -> List[float]:
        """
        Generate a single-dimensional embedding for each sentence.

        Args:
            sentences (List[str]): List of sentences to embed.
            batch_size (int, optional): Batch size for DataLoader. Defaults to 32.

        Returns:
            List[float]: List of single-dimensional embeddings.
        """
        dataset = SentenceDataset(sentences)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

        embeddings = []
        for batch in tqdm(dataloader, desc="Embedding sentences", unit="batch"):
            inputs = self.tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512).to(self.device)
            with torch.no_grad():
                outputs = self.model(**inputs)
            batch_embeddings = torch.mean(outputs.last_hidden_state, dim=1).squeeze().cpu()
            # Adaptive pooling to 64 dimensions (pooling over the hidden dimension)
            projected = torch.nn.functional.adaptive_avg_pool1d(batch_embeddings.unsqueeze(1), output_size=text_emb_dim).squeeze(1)
            embeddings.extend(projected.tolist())
        return embeddings

    def save_embeddings(self, embeddings: List[float], ids: List[str], output_npy_path: str, output_csv_path: str) -> None:
        """
        Save embeddings to a .npy file and IDs to a CSV file.

        Args:
            embeddings (List[float]): List of single-dimensional embeddings.
            ids (List[str]): List of corresponding IDs.
            output_npy_path (str): Path to save the .npy file.
            output_csv_path (str): Path to save the index CSV file.
        """
        # Save the embeddings to .npy file
        np.save(output_npy_path, np.array(embeddings))
        print(f"Embeddings saved at {output_npy_path} with shape {np.array(embeddings).shape}")

        # Save the IDs to CSV
        index_df = pd.DataFrame(ids, columns=["biomedgraphica_id"])
        index_df.to_csv(output_csv_path, index=False)

    def process_data_and_generate_embeddings(self, sentences: List, batch_size: str, text_emb_dim: int) -> Tuple[List[float], List[str]]:

        embeddings = self.generate_embeddings(sentences, batch_size, text_emb_dim)
        # print generated embeddings at shape
        print(f"Generated embeddings at shape {np.array(embeddings).shape}")
        return embeddings

In [21]:
mapping_table = pd.read_csv('./CellTOG/brain_sc_output/mapping_table.csv')
display(mapping_table.head())
ids = mapping_table['BioMedGraphica_ID'].tolist()
print(len(ids))
print(ids[:2])

  mapping_table = pd.read_csv('./CellTOG/brain_sc_output/mapping_table.csv')


Unnamed: 0,index,original_id,original_index,BioMedGraphica_ID
0,0,ARF5,ARF5,BMG_TS000001
1,1,M6PR,M6PR,BMG_TS000002
2,2,ESRRA,ESRRA,BMG_TS000003
3,3,FKBP4,FKBP4,BMG_TS000004
4,4,,,BMG_TS000005


483817
['BMG_TS000001', 'BMG_TS000002']


In [22]:
model_path = "microsoft/deberta-v3-small"
device = "cuda"
batch_size = 128
desc_emb_dim = 1
output_npy_path = "./CellTOG/x_desc_emb.npy"

In [25]:
encoder = TextEmb(model_path, device)
encoder.load_model()



In [None]:
embeddings = encoder.process_data_and_generate_embeddings(x_desc, batch_size, desc_emb_dim)

#### Seq emb