In [1]:
import pandas as pd
import numpy as np
import random
import os
from IPython.display import clear_output
from google.cloud import bigquery # SQL table interface on Arcus
import json
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import colors
from tqdm import tqdm

# Show every row and every column when a DataFrame is displayed
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, None)

# BigQuery client shared by the query cells in this notebook
client = bigquery.Client()

# Names of the grader and project tables
grader_table_name = "lab.grader_table_with_metadata_project_independent"
project_table_name = "lab.proc_ord_projects"

## Check that the latest NLP model delivery covers all previously missing proc_ord_ids

In [2]:
# Verify that every proc_ord_id with a narrative is covered by an NLP delivery
q_get_proc_ord_ids = "SELECT proc_ord_id FROM arcus.procedure_order_narrative"
df_proc_ord_ids = client.query(q_get_proc_ord_ids).to_dataframe()

# Grades from the newly delivered NLP results
q_get_new_nlp_grades = '''
select
    proc_ord_id,
    majority_vote
  from
    lab.maryam_tmp_results_feb28_2025 nlp_predict'''

df_new_nlp = client.query(q_get_new_nlp_grades).to_dataframe()

# Grades already stored in the combined NLP table
q_get_old_nlp_grades = '''
select
    proc_ord_id,
    majority_vote
  from
    lab.nlp_combined nlp_predict'''

df_old_nlp = client.query(q_get_old_nlp_grades).to_dataframe()

print(df_proc_ord_ids.shape)
print(df_new_nlp.shape)

# Boolean masks over the narrative ids: True where the id is absent from a delivery
all_ids = df_proc_ord_ids["proc_ord_id"]
absent_from_new = ~all_ids.isin(df_new_nlp["proc_ord_id"])
absent_from_old = ~all_ids.isin(df_old_nlp["proc_ord_id"])

print("Proc Ord IDs NOT in newly delivered NLP grades:")
print(absent_from_new.sum())
print("Proc Ord IDs NOT in previously delivered NLP grades:")
print(absent_from_old.sum())
print("Proc Ord IDs NOT in any delivered NLP grades:")
print((absent_from_old & absent_from_new).sum())

(353387, 1)
(21832, 2)
Proc Ord IDs NOT in newly delivered NLP grades:
331555
Proc Ord IDs NOT in previously delivered NLP grades:
21832
Proc Ord IDs NOT in any delivered NLP grades:
0


## Update NLP Table

In [None]:
# Make backup of NLP grades before lab.nlp_combined is modified
q_create_backup = '''
create table lab.bak_2025_03_03_nlp_combined as 
select * 
from lab.nlp_combined
'''

j_backup = client.query(q_create_backup)
# Wait for the job to finish so that a failed backup raises here,
# before any later cell inserts into lab.nlp_combined.
j_backup.result()

In [3]:
# Confirm that backup exists and is the same as original
def _load_sorted(query):
    """Run `query` and return its rows in a deterministic order.

    Rows are sorted by proc_ord_id and timestamp first, then by every
    remaining column, so rows that tie on the first two keys still end up
    in the same order in both tables and `DataFrame.equals` compares
    like with like.
    """
    df = client.query(query).to_dataframe()
    primary_keys = ["proc_ord_id", "timestamp"]
    tie_breakers = [col for col in df.columns if col not in primary_keys]
    return df.sort_values(by = primary_keys + tie_breakers).reset_index(drop = True)

q_check_backup = '''
select * 
from lab.bak_2025_03_03_nlp_combined
'''

df_backup = _load_sorted(q_check_backup)

q_check_orig = '''
select * 
from lab.nlp_combined
'''

df_orig = _load_sorted(q_check_orig)

print(df_backup.head())
print(df_backup.shape)

print(df_orig.head())
print(df_orig.shape)

# True only if both tables hold exactly the same rows
df_backup.equals(df_orig)

    proc_ord_id  bert  biobert  clinbert  radbert  majority_vote  \
0  100001191224     0        0         0        0              0   
1   10000195927     0        0         0        0              0   
2  100002565131     0        0         0        0              0   
3  100004113878     2        2         2        2              2   
4  100004769424     2        2         2        2              2   

                   timestamp       exp_type  
0 2024-11-05 23:00:02.795774          train  
1 2024-11-06 01:46:09.271110  out_of_sample  
2 2024-11-05 23:00:02.795774          train  
3 2024-11-06 01:46:09.271110  out_of_sample  
4 2024-11-06 01:46:09.271110  out_of_sample  
(331598, 8)
    proc_ord_id  bert  biobert  clinbert  radbert  majority_vote  \
0  100001191224     0        0         0        0              0   
1   10000195927     0        0         0        0              0   
2  100002565131     0        0         0        0              0   
3  100004113878     2        2 

True

In [8]:
# Merge new grades with lab.nlp_combined.
# The NOT EXISTS guard makes this cell safe to re-run: rows whose proc_ord_id
# is already present in lab.nlp_combined are skipped instead of being
# inserted a second time.
q_insert = '''insert into lab.nlp_combined
    select
      incoming.*,
      "out_of_sample" as exp_type
    from
      lab.maryam_tmp_results_feb28_2025 incoming
    where
      not exists (
        select 1
        from lab.nlp_combined already_loaded
        where already_loaded.proc_ord_id = incoming.proc_ord_id
      );'''

j_insert = client.query(q_insert)
# Block until the insert finishes so errors surface in this cell
j_insert.result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x7f90105df400>

In [9]:
# Re-read lab.nlp_combined and report its size
q_get_all_nlp_grades = '''
select
    proc_ord_id,
    majority_vote
  from
    lab.nlp_combined nlp_predict'''

j_get_all_nlp_grades = client.query(q_get_all_nlp_grades)
df_all_nlp = j_get_all_nlp_grades.to_dataframe()

n_rows, n_cols = df_all_nlp.shape
print((n_rows, n_cols))

(353430, 2)


## Incorporate NLP Grades

In [10]:
# Select reports where all 4 models (bert, biobert, clinbert, radbert) agree on grade 2,
# joined to the procedure order and patient tables for metadata.
# Only orders whose description contains BRAIN or NEURO are kept; descriptions containing
# SPECTROSCOPY, OUTSIDE, FUNCTL, METABOLIC or AUTOPSY are excluded. Newest orders first.
q_nlp =  '''
with agreement as (
  select
    proc_ord_id,
    majority_vote
  from
    lab.nlp_combined nlp_predict
  where
    nlp_predict.bert = nlp_predict.biobert
    and nlp_predict.bert = nlp_predict.clinbert
    and nlp_predict.bert = nlp_predict.radbert
    and nlp_predict.bert = 2
)
select
  agreement.proc_ord_id,
  agreement.majority_vote,
  pat.sex,
  pat.race,
  pat.dob_year,
  proc_ord.proc_ord_year,
  proc_ord.start_datetime,
  proc_ord.proc_ord_age,
  proc_ord.proc_ord_desc
from
  arcus.procedure_order proc_ord
  join agreement on agreement.proc_ord_id = proc_ord.proc_ord_id
  join arcus.patient pat on pat.pat_id = proc_ord.pat_id
where
  proc_ord.proc_ord_desc not like "%SPECTROSCOPY%"
  and proc_ord.proc_ord_desc not like "%OUTSIDE%"
  and proc_ord.proc_ord_desc not like "%FUNCTL%"
  and proc_ord.proc_ord_desc not like "%METABOLIC%"
  and proc_ord.proc_ord_desc not like "%AUTOPSY%"
  and (
    proc_ord.proc_ord_desc like "%BRAIN%"
    or proc_ord.proc_ord_desc like "%NEURO%"
  )
order by
  start_datetime desc
'''

In [16]:
def add_reports_for_nlp(nlp_query, grader_name="NLP Models 2025-03-03", project_id="", dry_run = False, grade_date="2025-03-03"):
    """Insert NLP-graded reports into the grader table, skipping ones already present.

    Runs ``nlp_query``, removes every proc_ord_id that already has a row in
    ``grader_table_name`` whose grader_name starts with "NLP Models", and
    inserts the remaining ids in chunks of 100. The metadata columns of each
    inserted row are selected from arcus.procedure_order.

    Args:
        nlp_query: SQL text whose result contains a ``proc_ord_id`` column.
        grader_name: Value written to the grader_name column. Must start with
            "NLP Models" so that later runs recognise these rows as existing.
        project_id: Unused; kept so existing callers keep working.
        dry_run: If True, print the first insert statement and execute no
            insert. The two SELECT queries are still run.
        grade_date: Value written to the grade_date column.

    Raises:
        ValueError: If ``grader_name`` does not start with "NLP Models".
    """
    if not grader_name.startswith("NLP Models"):
        raise ValueError('grader_name must start with "NLP Models". This is essential to ensure that NLP reports graded on different sessions still match (e.g., "NLP Models 2024-12-05" and "NLP Models 2025-02-20"')

    client = bigquery.Client()

    # Target columns, listed in the same order as the select list of the
    # insert below. Spelling them out avoids a table scan just to read the
    # column names and keeps the insert correct if the table's column order
    # ever changes.
    target_columns = (
        "proc_ord_id", "grader_name", "grade", "grade_category", "pat_id",
        "age_in_days", "proc_ord_year", "proc_name", "report_origin_table",
        "grade_date", "grade_criteria",
    )
    cols_str = " ("+", ".join(target_columns)+") "

    # Run the nlp query
    df_nlp = client.query(nlp_query).to_dataframe()
    print(list(df_nlp))

    # Ids that already have an NLP grade in the grader table (only the id column is needed)
    q_get_existing_nlp_grades = 'select distinct proc_ord_id from '+grader_table_name+' where grader_name LIKE "NLP Models%";'
    df_existing = client.query(q_get_existing_nlp_grades).to_dataframe()

    # Get rid of rows in df_nlp if already in grader table. Duplicate ids are
    # dropped too: the same id landing in two different chunks would
    # otherwise be inserted twice.
    print(df_nlp.shape[0], "NLP grades exist")
    df_nlp = df_nlp[~df_nlp['proc_ord_id'].isin(df_existing['proc_ord_id'])]
    df_nlp = df_nlp.drop_duplicates(subset="proc_ord_id")
    print(df_nlp.shape[0], "NLP grades to be added")

    # Divide NLP grades into 100 report chunks. Stepping through the id list
    # never yields an empty chunk, so no insert is issued when there is
    # nothing to add or the count is an exact multiple of the chunk size.
    chunk_size = 100
    ids_to_add = df_nlp["proc_ord_id"].astype(str).tolist()

    # NOTE(review): every inserted row gets grade 2 regardless of the
    # majority_vote returned by nlp_query - confirm callers only pass
    # queries restricted to grade 2.
    grades = "2"

    for start_i in tqdm(range(0, len(ids_to_add), chunk_size)):
        proc_ord_ids = ids_to_add[start_i:start_i + chunk_size]
        # Set up the query.
        # NOTE: the statement is built by string concatenation, so
        # grader_name, grade_date and the ids must not contain double quotes.
        q_insert = '''insert into '''+grader_table_name+cols_str+'''
            select
              distinct 
              proc_ord.proc_ord_id, "'''+grader_name+'''" as grader_name,
              '''+grades+''' as grade,
              "Unique" as grade_category,
              proc_ord.pat_id,
              proc_ord.proc_ord_age as age_in_days,
              proc_ord.proc_ord_year,
              proc_ord.proc_ord_desc as proc_name,
              "arcus.procedure_order" as report_origin_table, 
              "'''+grade_date+'''" as grade_date, 
              "SLIP" as grade_criteria 
            from
              arcus.procedure_order proc_ord
              join arcus.patient pat on proc_ord.pat_id = pat.pat_id
            where
              proc_ord.proc_ord_id IN ("'''+'", "'.join(proc_ord_ids)+'''")
            order by 
              proc_ord.proc_ord_year desc;'''
        if dry_run:
            # Show only the first statement; nothing is written
            print(q_insert)
            break
        else:
            j_insert = client.query(q_insert)
            j_insert.result()

    if dry_run:
        print(len(df_nlp), "reports for", grader_name, "would be added to", grader_table_name, "(dry run, nothing inserted)")
    else:
        print(len(df_nlp), "reports for", grader_name, "added to", grader_table_name)

In [17]:
# Dry run: prints the first insert statement instead of executing the inserts
add_reports_for_nlp(nlp_query=q_nlp, dry_run=True)

['proc_ord_id', 'majority_vote', 'sex', 'race', 'dob_year', 'proc_ord_year', 'start_datetime', 'proc_ord_age', 'proc_ord_desc']
99724 NLP grades exist
4752 NLP grades to be added


  0%|          | 0/48 [00:00<?, ?it/s]

insert into lab.grader_table_with_metadata_project_independent (proc_ord_id, grader_name, grade, grade_category, pat_id, age_in_days, proc_ord_year, proc_name, report_origin_table, grade_date, grade_criteria) 
            select
              distinct 
              proc_ord.proc_ord_id, "NLP Models 2025-03-03" as grader_name,
              2 as grade,
              "Unique" as grade_category,
              proc_ord.pat_id,
              proc_ord.proc_ord_age as age_in_days,
              proc_ord.proc_ord_year,
              proc_ord.proc_ord_desc as proc_name,
              "arcus.procedure_order" as report_origin_table, 
              "2025-03-03" as grade_date, 
              "SLIP" as grade_criteria 
            from
              arcus.procedure_order proc_ord
              join arcus.patient pat on proc_ord.pat_id = pat.pat_id
            where
              proc_ord.proc_ord_id IN ("509219910069", "509261222676", "508837804635", "508763013887", "508818377861", "509000766386", "




In [18]:
# Real run: executes the chunked inserts into the grader table
add_reports_for_nlp(nlp_query=q_nlp, dry_run=False)

['proc_ord_id', 'majority_vote', 'sex', 'race', 'dob_year', 'proc_ord_year', 'start_datetime', 'proc_ord_age', 'proc_ord_desc']
99724 NLP grades exist
4752 NLP grades to be added


100%|██████████| 48/48 [02:13<00:00,  2.78s/it]

4752 reports for NLP Models 2025-03-03 added to lab.grader_table_with_metadata_project_independent



