In [1]:
import pandas as pd
import numpy as np
import random
import os
from IPython.display import clear_output
from google.cloud import bigquery # SQL table interface on Arcus
import json
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import colors

# Never truncate rows or columns when a DataFrame is displayed
pd.options.display.max_rows = None
pd.options.display.max_columns = None

# Table identifiers (dataset.table) used by this notebook
grader_table_name = "lab.grader_table_with_metadata_project_independent"
project_table_name = "lab.proc_ord_projects"

# Initialize the client service
client = bigquery.Client()

In [2]:
# Query for the reports whose NLP predictions are unanimous.
#
# The `agreement` CTE keeps rows of lab.maryam_nlp_combined where all four
# model columns (bert, biobert, clinbert, radbert) hold the same value AND that
# value is 2 -- so this is "all 4 models agree on 2", not agreement on any value.
# NOTE(review): confirm that restricting to the value 2 is intended.
#
# The outer select joins those ids to arcus.procedure_order and arcus.patient,
# drops orders whose description contains SPECTROSCOPY, OUTSIDE, FUNCTL,
# METABOLIC or AUTOPSY, keeps only descriptions containing BRAIN or NEURO, and
# returns the rows ordered by start_datetime, newest first.
q_nlp =  '''
with agreement as (
  select
    proc_ord_id,
    majority_vote
  from
    lab.maryam_nlp_combined nlp_predict
  where
    nlp_predict.bert = nlp_predict.biobert
    and nlp_predict.bert = nlp_predict.clinbert
    and nlp_predict.bert = nlp_predict.radbert
    and nlp_predict.bert = 2
)
select
  agreement.proc_ord_id,
  agreement.majority_vote,
  pat.sex,
  pat.race,
  pat.dob_year,
  proc_ord.proc_ord_year,
  proc_ord.start_datetime,
  proc_ord.proc_ord_age,
  proc_ord.proc_ord_desc
from
  arcus.procedure_order proc_ord
  join agreement on agreement.proc_ord_id = proc_ord.proc_ord_id
  join arcus.patient pat on pat.pat_id = proc_ord.pat_id
where
  proc_ord.proc_ord_desc not like "%SPECTROSCOPY%"
  and proc_ord.proc_ord_desc not like "%OUTSIDE%"
  and proc_ord.proc_ord_desc not like "%FUNCTL%"
  and proc_ord.proc_ord_desc not like "%METABOLIC%"
  and proc_ord.proc_ord_desc not like "%AUTOPSY%"
  and (
    proc_ord.proc_ord_desc like "%BRAIN%"
    or proc_ord.proc_ord_desc like "%NEURO%"
  )
order by
  start_datetime desc
'''

In [3]:
def add_reports_for_nlp(nlp_query, grader_name="NLP Models 2024-11-12", project_id="", batch_size=5000):
    """Insert NLP-derived grades into the grader table for reports not yet graded.

    Runs `nlp_query`, drops every report that `grader_name` has already graded
    in the table named by the module-level `grader_table_name`, and inserts one
    row per remaining report. The inserted rows are built from
    arcus.procedure_order (inner-joined to arcus.patient), with the report's
    `majority_vote` as the grade, "Unique" as the grade category and the fixed
    grade date "2024-12-04".

    Rows are inserted in batches: each batch is a single parameterized
    INSERT ... SELECT job instead of one job per report, and the grader name,
    report ids and grades travel as query parameters rather than being spliced
    into the SQL text.

    Args:
        nlp_query: SQL text whose result has a `proc_ord_id` column and a
            `majority_vote` column (other columns are ignored).
        grader_name: Value written to the grader_name column; also used to
            look up which reports are already graded.
        project_id: Unused; kept so existing callers keep working.
        batch_size: Maximum number of reports inserted per BigQuery job.

    Raises:
        ValueError: If `batch_size` is smaller than 1, or if any report that
            would be inserted has a missing `majority_vote` (checked before
            anything is written, so no partial insert is left behind).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    client = bigquery.Client()

    # Get the column names from the table. The insert below relies on the
    # table's column order lining up with the ten values it selects.
    q_get_cols = "select * from " + grader_table_name + " limit 1;"
    df_get_cols = client.query(q_get_cols).to_dataframe()
    cols_str = " (" + ", ".join(list(df_get_cols)) + ") "

    # Run the nlp query
    df_nlp = client.query(nlp_query).to_dataframe()
    print(list(df_nlp))

    # Ids this grader has already graded; only the id column is needed.
    q_get_existing_nlp_grades = (
        "select distinct proc_ord_id from " + grader_table_name
        + " where grader_name = @grader_name;"
    )
    existing_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("grader_name", "STRING", grader_name),
        ]
    )
    df_existing = client.query(
        q_get_existing_nlp_grades, job_config=existing_config
    ).to_dataframe()

    # Keep only reports not yet in the grader table. Duplicate ids returned by
    # the NLP query are collapsed so that no report is graded twice.
    print(df_nlp.shape[0], "NLP grades exist")
    already_graded = set(df_existing["proc_ord_id"])
    df_nlp = df_nlp[~df_nlp["proc_ord_id"].isin(already_graded)]
    df_nlp = df_nlp.drop_duplicates(subset="proc_ord_id")
    print(df_nlp.shape[0], "NLP grades to be added")

    # Fail before writing anything rather than part-way through the inserts.
    if df_nlp["majority_vote"].isna().any():
        raise ValueError("nlp_query returned rows with a missing majority_vote")

    proc_ord_ids = df_nlp["proc_ord_id"].astype(str).tolist()
    grades = [int(grade) for grade in df_nlp["majority_vote"]]

    # The two array parameters are zipped back together by their offsets, so
    # every id is paired with its own grade.
    q_insert = "insert into " + grader_table_name + cols_str + '''
        select
          distinct
          proc_ord.proc_ord_id,
          @grader_name as grader_name,
          new_grade as grade,
          "Unique" as grade_category,
          proc_ord.pat_id,
          proc_ord.proc_ord_age as age_in_days,
          proc_ord.proc_ord_year,
          proc_ord.proc_ord_desc as proc_name,
          "arcus.procedure_order" as report_origin_table,
          "2024-12-04" as grade_date
        from
          unnest(@proc_ord_ids) as new_proc_ord_id with offset as id_pos
          join unnest(@grades) as new_grade with offset as grade_pos
            on id_pos = grade_pos
          join arcus.procedure_order proc_ord
            on proc_ord.proc_ord_id = new_proc_ord_id
          join arcus.patient pat on proc_ord.pat_id = pat.pat_id;'''

    # One DML job per batch instead of one per report.
    for start in range(0, len(proc_ord_ids), batch_size):
        stop = start + batch_size
        batch_config = bigquery.QueryJobConfig(
            query_parameters=[
                bigquery.ScalarQueryParameter("grader_name", "STRING", grader_name),
                bigquery.ArrayQueryParameter("proc_ord_ids", "STRING", proc_ord_ids[start:stop]),
                bigquery.ArrayQueryParameter("grades", "INT64", grades[start:stop]),
            ]
        )
        client.query(q_insert, job_config=batch_config).result()

    print(len(df_nlp), "reports for", grader_name, "added to", grader_table_name)

In [None]:
# Write grades for the reports selected by q_nlp, using the default grader name.
# This cell inserts rows into the grader table.
add_reports_for_nlp(nlp_query=q_nlp)

['proc_ord_id', 'majority_vote', 'sex', 'race', 'dob_year', 'proc_ord_year', 'start_datetime', 'proc_ord_age', 'proc_ord_desc']
101942 NLP grades exist
52429 NLP grades to be added
