In [5]:
!python -m spacy download en_core_web_md -q
!pip install gliner-spacy -q
!pip install sentence-transformers -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.1/42.1 kB[0m [31m704.9 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [6]:
import re
from collections import Counter
import spacy
from gliner_spacy.pipeline import GlinerSpacy
import pandas as pd
from sentence_transformers import SentenceTransformer, util

## Scoring functionality
- the scoring should be broken down by three paragraphs
- Paragraph 1:
  - diagnosis has to match
  - the biomarkers mentioned in the ground truth have to be mentioned in the generated paragraph
  - there cannot be any extra biomarkers in the generated summary that are not mentioned in the ground truth
  

## User provides
- diagnosis
- biomarker-list
- for example:
  gene_dict = {
    'diagnosis': str, 'biomarkers': list
  }
## Ground truth
- NER cancer diagnosis
- NER-extracted biomarker list (to be compared against the gene whitelist)
### issues
- canine osteosarcoma vs osteosarcome




In [33]:
# Sentence-embedding model; diagnose_similarity() reads this module-level name
# to encode the two diagnosis strings it compares.
model = SentenceTransformer('all-MiniLM-L6-v2')
# List of gene symbols. It is passed as the `whitelist` argument of score_para1()
# in the "Testing Paragraph 1" cell.
# NOTE: the backslashes below are line continuations, so no comment may follow
# them on the same line.
gene_whitelist = ['AKT1', 'AKT3', 'ALK', 'APC', 'ARID1A', 'ASXL1', 'ATM', 'ATR', 'ATRX', 'BAP1', 'BARD1', 'BRAF', 'BRCA1', 'BRCA2', 'BRIP1', 'CALR', 'CBL', 'CCND1', \
                    'CCND2', 'CCND3', 'CCNE1', 'CDK12', 'CDK4', 'CDK6', 'CDKN2A', 'CDKN2B', 'CHEK1', 'CHEK2', 'CRKL', 'CTNNB1', 'DNMT3A', 'EGFR', 'ERBB2', 'ERRFI1', 'ESR1',\
                    'EZH2', 'FANCA', 'FANCC', 'FANCL', 'FBXW7', 'FGF3', 'FGFR1', 'FGFR2', 'FGFR3', 'FLCN', 'FLT3', 'GNAQ', 'GNAS', 'GNB1', 'HRAS', 'IDH1', 'IDH2', 'IKZF1', 'KDR', \
                    'KIT', 'KMT2D', 'KRAS', 'MAP2K1', 'MAP2K2', 'MAPK1', 'MDM2', 'MDM4', 'MEN1', 'MET', 'MLH1', 'MSH2', 'MSH3', 'MSH6', 'MTOR', 'MYC', 'MYCN', 'MYD88', 'NF1', 'NF2', 'NFE2L2',\
                    'NOTCH1', 'NPM1', 'NRAS', 'NT5C2', 'PALB2', 'PDGFRA', 'PIK3CA', 'PIK3R1', 'PMS2', 'POLE', 'POT1', 'PPP2R2A', 'PTCH1', 'PTEN', 'PTPN11', 'RAC1', 'RAD51B', 'RAD51C', 'RAD54L', 'RAF1',\
                    'RB1', 'REL', 'RET', 'RICTOR', 'RUNX1', 'SDHB', 'SDHD', 'SETD2', 'SF3B1', 'SMAD4', 'SMARCA4', 'SMARCB1', 'SMO', 'STK11', 'TET2', 'TP53', 'TRAF3', 'TSC1', 'TSC2', 'VEGFA', 'VHL']

def diagnose_similarity(diag1, diag2, threshold=0.6):
    """Return the cosine similarity between the embeddings of two diagnoses.

    Both strings are stripped of surrounding whitespace, encoded with the
    module-level sentence-embedding ``model`` and compared with cosine
    similarity.

    Args:
        diag1: First diagnosis string (may be ``None`` or empty).
        diag2: Second diagnosis string (may be ``None`` or empty).
        threshold: Not used by this function; kept so existing calls that
            pass it keep working. Callers compare the returned similarity
            against their own cut-off.

    Returns:
        The similarity as a Python float, or ``0.0`` when either input is
        ``None``, empty, or contains only whitespace.
    """
    if not diag1 or not diag2:
        return 0.0

    diag1 = diag1.strip()
    diag2 = diag2.strip()
    # A whitespace-only diagnosis is as uninformative as an empty one; without
    # this check the empty string would be embedded and yield an arbitrary
    # similarity value.
    if not diag1 or not diag2:
        return 0.0

    diag1_embedding = model.encode(diag1, convert_to_tensor=True)
    diag2_embedding = model.encode(diag2, convert_to_tensor=True)
    return util.pytorch_cos_sim(diag1_embedding, diag2_embedding).item()

def score_para1(gene_dict, grnd_truth, whitelist):
  """Score the generated paragraph-1 summary against the ground-truth text.

  Half of the score is awarded when the generated diagnosis is similar enough
  (cosine similarity >= 0.6 via ``diagnose_similarity``) to the diagnosis
  extracted from the ground truth; the other half when the generated
  biomarker set equals the set of entities labelled ``gene`` in the ground
  truth.

  Args:
    gene_dict: Dict with keys ``'diagnosis'`` (str) and ``'biomarkers'``
      (list of gene symbols) describing the generated paragraph.
    grnd_truth: Ground-truth paragraph text. The diagnosis is read from the
      phrase "supports the diagnosis of <diagnosis>.".
    whitelist: Currently not used by this function; kept in the signature for
      existing callers.

  Returns:
    Tuple ``(score, comments)`` where ``score`` is 0.0, 0.5 or 1.0 and
    ``comments`` concatenates the failed checks ("Diagnosis  ",
    "Extra Biomarkers  ", "Missing Biomarkers  ").
  """
  # NOTE: the spaCy + GLiNER pipeline is rebuilt on every call, which is slow;
  # callers that score many paragraphs may want to cache it.
  nlp = spacy.load("en_core_web_sm")
  nlp.add_pipe("gliner_spacy", config={"labels":["cancer","biomarkers","gene","drug"]})
  grd_ner = nlp(grnd_truth)

  # Group the recognised entity texts by their label.
  grd_entity_dict = {}
  for ent in grd_ner.ents:
    grd_entity_dict.setdefault(ent.label_, []).append(ent.text)

  # Establish the ground truth. If the expected phrase is absent there is no
  # diagnosis to compare against, so the diagnosis check simply fails instead
  # of raising AttributeError on a None match.
  diag_match = re.search(r"supports the diagnosis of (.+?)(?:\.|$)", grnd_truth)
  grn_diag = diag_match.group(1) if diag_match else None

  # Compare gene symbols ignoring case and surrounding whitespace, in the same
  # spirit as the drug-name normalisation in score_para2.
  grn_genes = {gene.strip().upper() for gene in grd_entity_dict.get('gene', [])}
  gen_genes = {gene.strip().upper() for gene in gene_dict['biomarkers']}
  missing_genes = grn_genes - gen_genes
  extra_genes = gen_genes - grn_genes

  # Scoring: one point per passed check, normalised to the range [0, 1].
  comments = ""
  score = 0
  if diagnose_similarity(gene_dict['diagnosis'], grn_diag) >= 0.6:
    score += 1
  else:
    comments += "Diagnosis  "
  if extra_genes:
    comments += "Extra Biomarkers  "
  if missing_genes:
    comments += "Missing Biomarkers  "
  if not extra_genes and not missing_genes:
    score += 1
  return score / 2, comments

def score_para2(gene_dict, grnd_truth):
  """Score the generated paragraph-2 drug list against the ground-truth text.

  Entities are extracted from ``grnd_truth`` with the spaCy + GLiNER pipeline,
  printed grouped by label, and the ones labelled ``drug`` are compared
  (lower-cased, stripped) with ``gene_dict['drug']``.

  Args:
    gene_dict: Dict whose ``'drug'`` key holds the list of generated drug names.
    grnd_truth: Ground-truth paragraph text.

  Returns:
    Tuple ``(score, comments)``: ``score`` is 1.0 when both drug sets are
    equal and 0.0 otherwise; ``comments`` lists "Extra Drugs  " and/or
    "Missing Drugs  ".
  """
  pipeline = spacy.load("en_core_web_sm")
  pipeline.get_pipe("ner")
  pipeline.add_pipe("gliner_spacy", config={"labels":["cancer","biomarkers","gene","drug"]})

  # Collect entity texts per label, preserving first-seen label order.
  entities_by_label = {}
  for span in pipeline(grnd_truth).ents:
    entities_by_label.setdefault(span.label_, []).append(span.text)

  for label, entities in entities_by_label.items():
    print(f"{label}: {entities}")

  # Case- and whitespace-insensitive comparison of the two drug collections.
  truth_drugs = {name.strip().lower() for name in entities_by_label.get('drug', [])}
  generated_drugs = {name.strip().lower() for name in gene_dict['drug']}

  has_extra = bool(generated_drugs - truth_drugs)
  has_missing = bool(truth_drugs - generated_drugs)

  comments = ''
  if has_extra:
    comments += "Extra Drugs  "
  if has_missing:
    comments += "Missing Drugs  "

  score = 0.0 if (has_extra or has_missing) else 1.0
  return score, comments

def score_para3(gene_dict, grnd_truth):
  """Score the generated paragraph-3 (ABCB1-1delta) text against the ground truth.

  The two texts must be identical after whitespace normalisation (leading and
  trailing whitespace removed, internal runs collapsed to a single space).
  The comparison stays case-sensitive.

  Args:
    gene_dict: Dict whose ``'mutation'`` key holds the generated paragraph.
    grnd_truth: Ground-truth paragraph text.

  Returns:
    Tuple ``(score, comments)``: ``(1.0, "")`` on a match, otherwise
    ``(0.0, "ABCB1-1delta detection")``.
  """
  def _normalise(text):
    # Non-string values (e.g. None) are compared as-is so they count as a
    # mismatch instead of raising.
    return " ".join(text.split()) if isinstance(text, str) else text

  gen_para = gene_dict['mutation']

  if _normalise(gen_para) == _normalise(grnd_truth):
    return 1.0, ""
  return 0.0, 'ABCB1-1delta detection'

## Testing Paragraph 1
- **Ground truth:**
An integrated review of the genomic data, as well as clinical history and pathology review, supports the diagnosis of  osteosarcoma. Specifically, copy number losses of CDKN2B, as well as copy number gains of MDM2, KDR, KIT and PDGFRA have been frequently found in canine osteosarcoma.
- **generated dict:** {'diagnosis':'osteosarcome', 'biomarkers':['CDKN2B','MDM2','PDGFRA','BRCA2']}


In [23]:
# Paragraph 1 check: ground-truth text versus the generated diagnosis/biomarkers.
grnd_truth = "An integrated review of the genomic data, as well as clinical history and pathology review, supports the diagnosis of  osteosarcoma. Specifically, copy number losses of CDKN2B, as well as copy number gains of MDM2, KDR, KIT and PDGFRA have been frequently found in canine osteosarcoma."
para1 = {'diagnosis':'osteosarcome', 'biomarkers':['CDKN2B','MDM2','PDGFRA','BRCA2']}

check = score_para1(para1, grnd_truth, gene_whitelist)
score, comments = check
# Only mention comments when at least one check failed.
if comments:
    print("Score:", score, "Comments:", comments)
else:
    print("Score:", score)


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Score: 0.5 Comments: Extra Biomarkers  Missing Biomarkers  


## Testing Paragraph 2
- **Ground truth:**
Notably, we identified mutations with therapeutic and prognostic associations based on FDA approval or well-powered studies in humans and/or dogs, as described on page 2. In addition, similar to Gly507Val, PTPN11 Gly507Ala is also a gain-of-function mutation that might be associated with sensitivity to trametinib based on a preclinical study. (PMID:32212266) Trametinib and sirolimus are available through veterinary compounding pharmacies. Monographs describing published data on the use of these agents in dogs are available upon request, or you can find them on our website (https://vidiumah.com/monographs/).
- **generated dict:** {'drug':['sirolimus','trametinib']}

In [24]:
# Paragraph 2 check: drugs named in the ground truth versus the generated drug list.
grnd_truth = "Notably, we identified mutations with therapeutic and prognostic associations based on FDA approval or well-powered studies in humans and/or dogs, as described on page 2. In addition, similar to Gly507Val, PTPN11 Gly507Ala is also a gain-of-function mutation that might be associated with sensitivity to trametinib based on a preclinical study. (PMID:32212266) Trametinib and sirolimus are available through veterinary compounding pharmacies. Monographs describing published data on the use of these agents in dogs are available upon request, or you can find them on our website (https://vidiumah.com/monographs/)."
para2 = { 'drug':['sirolimus','trametinib']}

check = score_para2(para2, grnd_truth)
score, comments = check
# Only mention comments when at least one check failed.
if comments:
    print("Score:", score, "Comments:", comments)
else:
    print("Score:", score)


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


gene: ['PTPN11']
drug: ['trametinib', 'Trametinib', 'sirolimus']
Score: 1.0


## Testing Paragraph 3
- **Ground truth:**
This test evaluated 120 cancer genes in the submitted sample. The ABCB1-1delta (MDR1-1delta) mutation was not detected, indicating that the patient is unlikely to experience the ABCB1-1delta-related adverse effects of chemotherapy.
- **generated dict:** {'mutation':'This test evaluated 120 cancer genes in the submitted sample. The ABCB1-1delta (MDR1-1delta) mutation was not detected, indicating that the patient is unlikely to experience the ABCB1-1delta-related adverse effects of chemotherapy.'}

In [35]:
# Paragraph 3 check: the generated mutation paragraph versus the ground-truth paragraph.
grnd_truth = "This test evaluated 120 cancer genes in the submitted sample. The ABCB1-1delta (MDR1-1delta) mutation was not detected, indicating that the patient is unlikely to experience the ABCB1-1delta-related adverse effects of chemotherapy."
para3 = { 'mutation':'This test evaluated 120 cancer genes in the submitted sample. The ABCB1-1delta (MDR1-1delta) mutation was not detected, indicating that the patient is unlikely to experience the ABCB1-1delta-related adverse effects of chemotherapy.'}
check = score_para3(para3, grnd_truth)
score, comments = check
# Only mention comments when the check failed.
if comments:
    print("Score:", score, "Comments:", comments)
else:
    print("Score:", score)

Score: 1.0
