In [1]:
import warnings

# Ignore all warnings from this point on. The filter is installed before the
# imports below, so warnings emitted while those modules load are hidden too.
warnings.filterwarnings("ignore")
import pandas as pd

from gene_benchmark.descriptor import NCBIDescriptor
from gene_benchmark.encoder import SentenceTransformerEncoder
from gene_benchmark.tasks import EntitiesTask

# Create embeddings & descriptions
This package turns biological text data into numerical encodings using language models. The package also includes means to retrieve biological data.
See the README for further details about the creation of descriptions and encodings.

In [2]:
# Describe each gene symbol as free text with an NCBIDescriptor.
prompts_maker = NCBIDescriptor()
gene_symbols = pd.Series(["BRCA1", "FOSL2"])
prompts = prompts_maker.describe(entities=gene_symbols)

# Show the description produced for the first symbol (index label 0).
first_prompt = prompts.loc[0]
print(first_prompt)

Gene symbol BRCA1 full name BRCA1 DNA repair associated with the summary This gene encodes a 190 kD nuclear phosphoprotein that plays a role in maintaining genomic stability, and it also acts as a tumor suppressor. The BRCA1 gene contains 22 exons spanning about 110 kb of DNA. The encoded protein combines with other tumor suppressors, DNA damage sensors, and signal transducers to form a large multi-subunit protein complex known as the BRCA1-associated genome surveillance complex (BASC). This gene product associates with RNA polymerase II, and through the C-terminal domain, also interacts with histone deacetylase complexes. This protein thus plays a role in transcription, DNA repair of double-stranded breaks, and recombination. Mutations in this gene are responsible for approximately 40% of inherited breast cancers and more than 80% of inherited breast and ovarian cancers. Alternative splicing plays a role in modulating the subcellular localization and physiological function of this gen

We can look at the object summary

In [3]:
# summary() reports the descriptor's settings and missing-entity count.
descriptor_summary = prompts_maker.summary()
print(descriptor_summary)

{'allow_partial': False, 'allow_missing': True, 'description class': 'NCBIDescriptor', 'num_missing_entities': 0, 'description columns': 'summary,name,symbol'}


Now we can take the text and encode it

In [4]:
# Name of the sentence-transformers model used to embed the descriptions;
# kept in a variable because a later cell reuses it.
mpnet_name = "sentence-transformers/all-mpnet-base-v2"
single_symbol_encoding = SentenceTransformerEncoder(mpnet_name)
# Encode the descriptions built above: the result has one embedding vector
# per description.
encoding = single_symbol_encoding.encode(prompts)
print(encoding)

0    [0.018238682, -0.090244554, -0.02368962, -0.03...
1    [-0.008200348, -0.089555584, -0.03201792, -0.0...
dtype: object


We can look at the encoder summary

In [5]:
# summary() reports the encoder class and the model name it was built with.
encoder_summary = single_symbol_encoding.summary()
print(encoder_summary)

{'encoder class': 'SentenceTransformerEncoder', 'encoder_model_name': 'sentence-transformers/all-mpnet-base-v2'}


The package comes with a set of pre-defined tasks with entities and outcomes, and a pipeline object that goes from entities (currently just one gene symbol) to prompts to embeddings. The following is a simple example:

In [6]:
# Here the encoder is passed as a plain model-name string rather than as an
# encoder object.
biolink_name = "pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb"
task = EntitiesTask(
    "long vs short range TF",
    description_builder=NCBIDescriptor(),
    encoder=biolink_name,
)

# run() returns a dict of per-fold results: fit/score times, the fitted
# estimators and the test ROC AUC of each fold.
res = task.run()
print(res)

{'fit_time': array([0.0275681 , 0.02069569, 0.02062321, 0.05764556, 0.02548075]), 'score_time': array([0.00319576, 0.00227213, 0.00226951, 0.00228214, 0.00228786]), 'estimator': [LogisticRegression(max_iter=2000), LogisticRegression(max_iter=2000), LogisticRegression(max_iter=2000), LogisticRegression(max_iter=2000), LogisticRegression(max_iter=2000)], 'test_roc_auc': array([0.48      , 0.33760684, 0.38034188, 0.53418803, 0.64444444])}


# Existing tasks
See the README for further details about the tasks.

In [7]:
# Build the task from ready-made objects: the descriptor created in an
# earlier cell and an encoder built from the model name defined there.
mpnet_encoder = SentenceTransformerEncoder(mpnet_name)
task = EntitiesTask(
    task="long vs short range TF",
    encoder=mpnet_encoder,
    description_builder=prompts_maker,
)

# The per-fold results are discarded here; the summary below carries the
# aggregated scores together with the descriptor and encoder settings.
_ = task.run()
task.summary()

{'task_name': 'long vs short range TF',
 'base_prediction_model': 'LogisticRegression(max_iter=2000)',
 'sample_size': 174,
 'class_sizes': '128,46',
 'classes_names': 'short_range,long_range',
 'allow_partial': False,
 'allow_missing': True,
 'description class': 'NCBIDescriptor',
 'num_missing_entities': 0,
 'description columns': 'summary,name,symbol',
 'encoder class': 'SentenceTransformerEncoder',
 'encoder_model_name': 'sentence-transformers/all-mpnet-base-v2',
 'test_roc_auc': '0.664,0.2222222222222222,0.42735042735042733,0.26495726495726496,0.3288888888888889',
 'mean_roc_auc': 0.38148376068376066,
 'sd_roc_auc': 0.15724379731299665,
 'exclude_symbols_num': 0,
 'model_name': None}