In [25]:
!pip install -q --upgrade pip
!pip install -q gliner-spacy
!pip install -q spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [26]:
import os 
import pandas as pd
import spacy
from gliner_spacy.pipeline import GlinerSpacy

In [27]:
def extract_fpaths(dir_path):
    """Build the expected mutation-summary CSV path for each report folder.

    Every entry of ``dir_path`` whose name contains ``_clinical_report`` is
    treated as a report folder. The text before that marker is the case
    prefix, and the returned path is
    ``<dir_path>/<prefix>_clinical_report/<prefix>_mutsumm.csv``.

    Entries that do not contain the marker (hidden files, stray CSVs, ...)
    are skipped; previously they produced paths to folders that cannot exist.
    The paths are only constructed, not checked for existence.

    Args:
        dir_path: Directory containing the ``*_clinical_report`` folders.

    Returns:
        List of CSV paths, ordered by sorted directory entry name.
    """
    marker = "_clinical_report"
    list_fpaths = []
    # os.listdir order is arbitrary; sort so repeated runs give the same list.
    for entry in sorted(os.listdir(dir_path)):
        if marker not in entry:
            continue
        f_name = entry.split(marker)[0]
        folder_path = os.path.join(dir_path, f"{f_name}{marker}")
        list_fpaths.append(os.path.join(folder_path, f"{f_name}_mutsumm.csv"))
    return list_fpaths


### Format of the csv 
Headers: case_num | summ | cancer | mutation | gene | med | organization | quantity

In [47]:
def ner_df(nlp, text):
    """Run ``nlp`` over ``text`` and group the entity texts by their label.

    Args:
        nlp: Callable that takes the text and returns an object exposing
            ``ents``, each entity having ``label_`` and ``text`` attributes.
        text: The string to analyse.

    Returns:
        A plain dict mapping each label seen to the list of entity texts
        carrying that label, in the order the entities are yielded.
    """
    grouped = {}
    for entity in nlp(text).ents:
        # setdefault creates the label's list on first sight, then reuses it.
        grouped.setdefault(entity.label_, []).append(entity.text)
    return grouped

In [48]:
# Spot check: run ner_df on one hard-coded summary and print the texts that
# were tagged with the "cancer" label.
# NOTE(review): `nlp` is not created in this cell or any cell above it; it is
# only built in later cells, so on a fresh kernel run top-to-bottom this cell
# would raise NameError — confirm the intended cell order.
summ = "An integrated review of the genomic data, as well as clinical history and pathology reports, supports the diagnosis of pulmonary adenocarcinoma. Specifically, CHEK2 and NF2 copy number losses as identified in this sample frequently occur in human and/or canine pulmonary adenocarcinoma.\r\n\r\nNotably, we identified mutations with therapeutic associations based on FDA approval or well-powered studies in humans and/or dogs, as described on page 2. A monograph describing published data on the use of olaparib in dogs is available upon request or you can find it on our website (https://vidiumah.com/monographs/).\r\n\r\nThis test evaluated 120 cancer genes in the submitted sample. The ABCB1-1delta (MDR1-1delta) mutation was not detected, indicating that the patient is unlikely to experience the ABCB1-1delta-related adverse effects of chemotherapy."
entity_dict = ner_df(nlp, summ)
# .get returns None (rather than raising) when no "cancer" entity was found.
print(entity_dict.get("cancer"))

['pulmonary adenocarcinoma', 'canine pulmonary adenocarcinoma']


### Creating the output file with NER entities

In [49]:
# Build the pipeline: load en_core_web_sm and add the gliner_spacy component
# configured with the labels to extract from the report summaries.
nlp = spacy.load("en_core_web_sm")
ner = nlp.get_pipe("ner")  # handle to the stock "ner" pipe; not used below
label_list = ["cancer", "gene", "mutation","medication", "organization", "quantity"]
nlp.add_pipe("gliner_spacy", config={"labels": label_list})

# NOTE(review): `summ_df` is not defined in the visible cells — presumably it
# is loaded elsewhere with "Case Number" and "Searchlight CommentsSasha"
# columns; confirm.
structured_data = []
for i, row in summ_df.iterrows():
    case_num = row["Case Number"]
    print("Processing case: ", case_num)
    summ = row["Searchlight CommentsSasha"]
    if pd.isna(summ):
        # No summary text for this case: skip it. The append used to sit
        # outside the guard, which raised NameError when the first row was
        # empty and otherwise re-appended the previous case's row.
        continue
    entity_dict = ner_df(nlp, summ)
    # One output row per case; a label with no matches is stored as None.
    structured_row = {
        "case_num": case_num,
        "summ": summ,
        "cancer": entity_dict.get("cancer"),
        "mutation": entity_dict.get("mutation"),
        "gene": entity_dict.get("gene"),
        "med": entity_dict.get("medication"),
        "organization": entity_dict.get("organization"),
        "quantity": entity_dict.get("quantity")
    }
    structured_data.append(structured_row)
structured_df = pd.DataFrame(structured_data)
print(structured_df)
output_file_path = 'Searchlight Info/SearchlightReports/NER_summ.csv'
structured_df.to_csv(output_file_path, index=False)

    

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Processing case:  SL22-000616
Processing case:  S22-004322
Processing case:  SL22-000658
Processing case:  SL23-000001
Processing case:  SL23-000020
Processing case:  SL23-000031
Processing case:  SL23-000033
Processing case:  SL23-000034
Processing case:  SL23-000035
Processing case:  SL23-000037
Processing case:  SL23-000038
Processing case:  SL23-000041
Processing case:  SL23-000044
Processing case:  SL23-000046
Processing case:  C23-000017
Processing case:  SL23-000053
Processing case:  SL23-000054
Processing case:  SL23-000065
Processing case:  SL23-000069
Processing case:  SL23-000073
Processing case:  SL23-000074
Processing case:  SL23-000075
Processing case:  SL23-000076
Processing case:  SL23-000078
Processing case:  SL23-000079
Processing case:  SL23-000083
Processing case:  SL23-000084
Processing case:  SL23-000085
Processing case:  SL23-000105
Processing case:  SL23-000088
Processing case:  SL23-000090
Processing case:  SL23-000091
Processing case:  SL23-000104
Processing c

### Extracting NER entities from the clinical data

In [62]:
# Build the pipeline: load en_core_web_sm and add the gliner_spacy component
# configured with the labels to extract from the clinical notes.
nlp = spacy.load("en_core_web_sm")
ner = nlp.get_pipe("ner")  # handle to the stock "ner" pipe; not used below
# NOTE(review): "Spaying" is requested here but never written to the output
# rows below — confirm whether a column for it is wanted.
label_list = ["cancer","medication", "organization", "quantity", "body part", "gender", "breed", "Spaying", "condition"]
nlp.add_pipe("gliner_spacy", config={"labels": label_list})

# NOTE(review): `summ_df` is not defined in the visible cells — presumably it
# is loaded elsewhere with "Case Number" and "Clinical Data" columns; confirm.
structured_data = []
for i, row in summ_df.iterrows():
    case_num = row["Case Number"]
    print("Processing case: ", case_num)
    notes = row["Clinical Data"]
    if pd.isna(notes):
        # No clinical notes for this case: skip it. The append used to sit
        # outside the guard, which raised NameError when the first row was
        # empty and otherwise re-appended the previous case's row.
        continue
    entity_dict = ner_df(nlp, notes)
    # One output row per case; a label with no matches is stored as None.
    structured_row = {
        "case_num": case_num,
        "notes": notes,
        "cancer": entity_dict.get("cancer"),
        "med": entity_dict.get("medication"),
        "organization": entity_dict.get("organization"),
        "quantity": entity_dict.get("quantity"),
        # The requested label is "body part" (with a space); looking up
        # "body_part" never matched, so this column was always empty.
        "body part": entity_dict.get("body part"),
        "gender": entity_dict.get("gender"),
        "breed": entity_dict.get("breed"),
        "condition": entity_dict.get("condition")
    }
    structured_data.append(structured_row)
structured_df = pd.DataFrame(structured_data)
print(structured_df)
output_file_path = 'Searchlight Info/SearchlightReports/NER_clndata.csv'
structured_df.to_csv(output_file_path, index=False)


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Processing case:  SL22-000616
Processing case:  S22-004322
Processing case:  SL22-000658
Processing case:  SL23-000001
Processing case:  SL23-000020
Processing case:  SL23-000031
Processing case:  SL23-000033
Processing case:  SL23-000034
Processing case:  SL23-000035
Processing case:  SL23-000037
Processing case:  SL23-000038
Processing case:  SL23-000041
Processing case:  SL23-000044
Processing case:  SL23-000046
Processing case:  C23-000017
Processing case:  SL23-000053
Processing case:  SL23-000054
Processing case:  SL23-000065
Processing case:  SL23-000069
Processing case:  SL23-000073
Processing case:  SL23-000074
Processing case:  SL23-000075
Processing case:  SL23-000076
Processing case:  SL23-000078
Processing case:  SL23-000079
Processing case:  SL23-000083
Processing case:  SL23-000084
Processing case:  SL23-000085
Processing case:  SL23-000105
Processing case:  SL23-000088
Processing case:  SL23-000090
Processing case:  SL23-000091
Processing case:  SL23-000104
Processing c

In [60]:
# Sample clinical note used to try out the extended label set on one record.
notes = "retroperitoneal mass, inappetence for  3 days.\r\n  Retroperitoneal mass-effect on AUS. R/O neoplasia, benign mass, hemorrhage, other Anemia.\r\n  Female\r\n  Labrador Retriever\r\n  Canine\r\n  Spayed\r\n \r\n Sample_1 LeftOrRight : Right\r\n Sample_1 SampleType : Excisional (removal or debulking of the lesion)\r\n Sample_1 LocationSizeAppearance : Adrenal Mass\r\n Sample_1 NumberOfSpecimens : 1\r\n Sample_2 LeftOrRight : Right\r\n Sample_2 SampleType : Excisional (removal or debulking of the lesion)\r\n Sample_2 LocationSizeAppearance : retroperitoneal mass, inappetence for  3 days.\r\n Sample_2 NumberOfSpecimens : 1"

# Labels handed to the gliner_spacy component.
label_list = [
    "cancer",
    "gene",
    "mutation",
    "medication",
    "organization",
    "quantity",
    "body part",
    "gender",
    "breed",
    "Spaying",
    "condition",
]

# Load en_core_web_sm and add the gliner_spacy component with those labels.
nlp = spacy.load("en_core_web_sm")
ner = nlp.get_pipe("ner")
nlp.add_pipe("gliner_spacy", config={"labels": label_list})

# Print every label that was found together with its extracted texts.
entity_dict = ner_df(nlp, notes)
for label, texts in entity_dict.items():
    print(label, texts)

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


condition ['retroperitoneal mass', 'inappetence', 'benign mass', 'hemorrhage', 'Anemia', 'retroperitoneal mass', 'inappetence']
gender ['Female']
breed ['Labrador Retriever']
body part ['Adrenal Mass']
quantity ['1', 'Sample_2', 'NumberOfSpecimens', '1']
