In [None]:
import os
import sys

# Get the current working directory
current_dir = os.getcwd()

# Add the code directory to sys.path
# The directory appended is <parent of the working directory>/code, so this
# only resolves correctly when the kernel's working directory is a sibling of
# `code`. Presumably that is where the star-imported project modules below
# live — verify if those imports fail.
sys.path.append(os.path.join(os.path.dirname(current_dir), 'code'))

import pandas as pd
import numpy as np
import random
from IPython.display import clear_output
from google.cloud import bigquery # SQL table interface on Arcus
from dxFilterLibraryPreGrading import *
from reportMarkingFunctions import *
from projectTableFunctions import * 
import json
import matplotlib.pyplot as plt

# Lift pandas' display limits so DataFrames are shown without row/column truncation.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Initialize the client service
client = bigquery.Client()
# NOTE(review): backup_grader_table is not defined in this notebook; it is
# presumably provided by one of the star-imported project modules and
# presumably snapshots the grader table before it is modified below — confirm
# what it backs up and where.
backup_grader_table()

# BigQuery table names (dataset.table). grader_table_name is read as a
# module-level name inside add_reports_for_nlp further down.
grader_table_name = "lab.grader_table_with_metadata_project_independent"
project_table_name = "lab.proc_ord_projects"

## Update Phecodes

### Create a table containing all patient dx as phecodes

In [None]:
## Create the table for patient phecodes
table_all_patients = "arcus.patient"
df_patient_phecodes = map_proc_req_to_phecodes(table_all_patients)

# A notebook cell only echoes its final expression, so the dtypes have to be
# printed explicitly; the shape is still echoed as the cell's result.
print(df_patient_phecodes.dtypes)
df_patient_phecodes.shape

In [None]:
# Remove the dx_source column before upload. errors="ignore" keeps this cell
# re-runnable: once the column has been dropped, executing the cell a second
# time would otherwise raise KeyError.
df_patient_phecodes = df_patient_phecodes.drop(columns="dx_source", errors="ignore")
df_patient_phecodes.head()

In [None]:
table_id = "lab.patient_phecode_dx"

# Partial schema for the load job: each column named here is declared as a
# BigQuery STRING explicitly instead of leaving its type to auto-detection.
string_columns = [
    "pat_id",
    "encounter_id",
    "icd10cm",
    "icd10cm_str",
    "phecode_str",
]

job_config = bigquery.LoadJobConfig(
    schema=[
        bigquery.SchemaField(column_name, bigquery.enums.SqlTypeNames.STRING)
        for column_name in string_columns
    ]
)

In [None]:
# Submit the load job that writes df_patient_phecodes to table_id, then block
# until BigQuery reports completion (result() raises if the job failed).
# NOTE(review): job_config sets no write_disposition, so whether a re-run
# appends to or replaces an existing table is left to the library default —
# confirm before executing this cell more than once.
job = client.load_table_from_dataframe(
    dataframe=df_patient_phecodes,
    destination=table_id,
    job_config=job_config,
)
job.result()

## Update Projects to Include New Data

In [None]:
# Read the cohort configuration file; its top-level keys are used as the list
# of cohorts. The following cell calls add_reports_to_project for each of them
# so that every cohort's reports are indexed in the project table.
# Warning: that loop will take time to run, do not panic.
cfg = "../queries/config.json"
with open(cfg) as config_file:
    cohort_lookup = json.load(config_file)

cohort_list = [*cohort_lookup.keys()]
print(cohort_list)

In [None]:
# Call add_reports_to_project once per cohort, printing the cohort name first
# so progress is visible while the loop runs.
# NOTE(review): add_reports_to_project is not defined in this notebook;
# presumably it comes from the star-imported projectTableFunctions — confirm.
for cohort in cohort_list:
    print(cohort)
    add_reports_to_project(cohort)

## Incorporate NLP Grades

### Check that the latest NLP Models have all missing proc_ord_ids

In [None]:
# Check that there are no missing proc_ord_ids from new NLP delivery
q_get_proc_ord_ids = "SELECT proc_ord_id FROM arcus.procedure_order_narrative"
df_proc_ord_ids = client.query(q_get_proc_ord_ids).to_dataframe()

q_get_new_nlp_grades = '''
select
    proc_ord_id,
    majority_vote
  from
    lab.maryam_tmp_results_feb28_2025 nlp_predict'''

df_new_nlp = client.query(q_get_new_nlp_grades).to_dataframe()

q_get_old_nlp_grades = '''
select
    proc_ord_id,
    majority_vote
  from
    lab.nlp_combined nlp_predict'''

df_old_nlp = client.query(q_get_old_nlp_grades).to_dataframe()

print(df_proc_ord_ids.shape)
print(df_new_nlp.shape)

# Build each membership mask once and count with the vectorised Series.sum();
# the built-in sum() would iterate the Series element by element in Python.
in_new_delivery = df_proc_ord_ids.proc_ord_id.isin(df_new_nlp['proc_ord_id'])
in_old_delivery = df_proc_ord_ids.proc_ord_id.isin(df_old_nlp['proc_ord_id'])

print("Proc Ord IDs NOT in newly delivered NLP grades:")
print((~in_new_delivery).sum())
print("Proc Ord IDs NOT in previously delivered NLP grades:")
print((~in_old_delivery).sum())
print("Proc Ord IDs NOT in any delivered NLP grades:")
print((~in_old_delivery & ~in_new_delivery).sum())

### Update NLP Table

In [None]:
# Make backup of NLP grades
q_create_backup = '''
create table lab.bak_2025_03_03_nlp_combined as 
select * 
from lab.nlp_combined
'''

j_backup = client.query(q_create_backup)
# client.query() only submits the job. Wait for it here so that the backup
# table is guaranteed to exist before it is verified, and so that a failed
# job (e.g. the backup table already exists) raises in this cell instead of
# going unnoticed.
j_backup.result()

In [None]:
# Confirm that backup exists and is the same as original
q_check_backup = '''
select * 
from lab.bak_2025_03_03_nlp_combined
'''

q_check_orig = '''
select * 
from lab.nlp_combined
'''

# Both frames are sorted on the same keys and re-indexed so that the final
# equals() comparison does not depend on the order the rows were returned in.
sort_keys = ["proc_ord_id", "timestamp"]

df_backup = client.query(q_check_backup).to_dataframe()
df_backup = df_backup.sort_values(by=sort_keys).reset_index(drop=True)

df_orig = client.query(q_check_orig).to_dataframe()
df_orig = df_orig.sort_values(by=sort_keys).reset_index(drop=True)

# Preview and size of the backup first, then of the original.
for frame in (df_backup, df_orig):
    print(frame.head())
    print(frame.shape)

# Echoed as the cell result: True only when the two frames are identical.
df_backup.equals(df_orig)

In [None]:
# Merge new grades with lab.nlp_combined
# Every row of lab.maryam_tmp_results_feb28_2025 is appended to
# lab.nlp_combined with an extra exp_type value of "out_of_sample".
# NOTE(review): this is a plain INSERT with no de-duplication, so executing
# the cell again appends the same rows again — run it once per delivery.
# NOTE(review): the INSERT names no target columns, so the delivery table's
# columns (followed by exp_type) presumably have to line up with
# lab.nlp_combined's column order — confirm before running.
q_insert = '''insert into lab.nlp_combined
    select
      *,
      "out_of_sample" as exp_type
    from
      lab.maryam_tmp_results_feb28_2025;'''

j_insert = client.query(q_insert)
# Block until the insert has finished; raises if the job failed.
j_insert.result()

In [None]:
# Re-read proc_ord_id and majority_vote from lab.nlp_combined and print the
# resulting shape, so the row count can be compared with the shapes printed
# in the earlier cells.
q_get_all_nlp_grades = '''
select
    proc_ord_id,
    majority_vote
  from
    lab.nlp_combined nlp_predict'''

df_all_nlp = client.query(q_get_all_nlp_grades).to_dataframe()

print(df_all_nlp.shape)

### Add Unanimous NLP Grades to the Grader Table

In [None]:
# Get only reports where all 4 models agree 100%
# The `agreement` CTE keeps rows of lab.nlp_combined whose bert, biobert,
# clinbert and radbert values are all equal to each other AND equal to 2, so
# only unanimous grade-2 predictions are selected.
# Those rows are joined to arcus.procedure_order and arcus.patient, restricted
# to procedure descriptions containing BRAIN or NEURO, and descriptions
# containing SPECTROSCOPY, OUTSIDE, FUNCTL, METABOLIC or AUTOPSY are excluded.
# Results are ordered by start_datetime, newest first.
q_nlp =  '''
with agreement as (
  select
    proc_ord_id,
    majority_vote
  from
    lab.nlp_combined nlp_predict
  where
    nlp_predict.bert = nlp_predict.biobert
    and nlp_predict.bert = nlp_predict.clinbert
    and nlp_predict.bert = nlp_predict.radbert
    and nlp_predict.bert = 2
)
select
  agreement.proc_ord_id,
  agreement.majority_vote,
  pat.sex,
  pat.race,
  pat.dob_year,
  proc_ord.proc_ord_year,
  proc_ord.start_datetime,
  proc_ord.proc_ord_age,
  proc_ord.proc_ord_desc
from
  arcus.procedure_order proc_ord
  join agreement on agreement.proc_ord_id = proc_ord.proc_ord_id
  join arcus.patient pat on pat.pat_id = proc_ord.pat_id
where
  proc_ord.proc_ord_desc not like "%SPECTROSCOPY%"
  and proc_ord.proc_ord_desc not like "%OUTSIDE%"
  and proc_ord.proc_ord_desc not like "%FUNCTL%"
  and proc_ord.proc_ord_desc not like "%METABOLIC%"
  and proc_ord.proc_ord_desc not like "%AUTOPSY%"
  and (
    proc_ord.proc_ord_desc like "%BRAIN%"
    or proc_ord.proc_ord_desc like "%NEURO%"
  )
order by
  start_datetime desc
'''

In [None]:
def add_reports_for_nlp(nlp_query, grader_name="NLP Models 2025-03-03", project_id="", dry_run = False,
                        grade_date="2025-03-03", chunk_size=100):
    """Insert NLP-derived grades into the grader table, one chunk of reports at a time.

    Runs ``nlp_query``, discards every report whose ``proc_ord_id`` already has
    a row in ``grader_table_name`` from a grader whose name starts with
    "NLP Models", and inserts one row per remaining report with a grade of 2.

    Parameters
    ----------
    nlp_query : str
        SQL whose result contains a ``proc_ord_id`` column identifying the
        reports to grade.
    grader_name : str
        Value written to the ``grader_name`` column. Must start with
        "NLP Models" so that rows from different NLP sessions are all matched
        by the ``LIKE "NLP Models%"`` de-duplication query.
    project_id : str
        Not used by this function; kept so existing calls keep working.
    dry_run : bool
        When True, print the first INSERT statement and execute nothing.
    grade_date : str
        Value written to the ``grade_date`` column. The default is the value
        that used to be hard-coded.
    chunk_size : int
        Maximum number of reports per INSERT statement.

    Raises
    ------
    ValueError
        If ``grader_name`` does not start with "NLP Models" or ``chunk_size``
        is smaller than 1.

    Notes
    -----
    ``grader_table_name`` is read from module scope. ``tqdm`` is not imported
    in this notebook's visible import cell; it is presumably supplied by one
    of the star imports — TODO confirm.
    """
    if not grader_name.startswith("NLP Models"):
        raise ValueError('grader_name must start with "NLP Models". This is essential to ensure that NLP reports graded on different sessions still match (e.g., "NLP Models 2024-12-05" and "NLP Models 2025-02-20")')
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    client = bigquery.Client()

    # Get the column names from the table
    # NOTE(review): the INSERT below lists the table's columns in table order
    # while the SELECT supplies values in a fixed order, so the two orders
    # presumably have to match — verify against the table schema.
    q_get_cols = "select * from "+grader_table_name+" limit 1;"
    df_get_cols = client.query(q_get_cols).to_dataframe()
    cols_str = " ("+", ".join(list(df_get_cols))+") "

    # Run the nlp query
    df_nlp = client.query(nlp_query).to_dataframe()
    print(list(df_nlp))

    # Get the dataframe of reports already in the grader table
    q_get_existing_nlp_grades = 'select * from '+grader_table_name+' where grader_name LIKE "NLP Models%";'
    df_existing = client.query(q_get_existing_nlp_grades).to_dataframe()

    # Get rid of rows in df_nlp if already in grader table
    print(df_nlp.shape[0], "NLP grades exist")
    already_graded = df_nlp['proc_ord_id'].isin(df_existing['proc_ord_id'])
    # Also collapse repeated proc_ord_ids: if the same id landed in two
    # different chunks, the report would be inserted twice.
    df_nlp = df_nlp[~already_graded].drop_duplicates(subset="proc_ord_id")
    num_reports = df_nlp.shape[0]
    print(num_reports, "NLP grades to be added")

    if num_reports == 0:
        # Nothing to insert; avoid issuing an INSERT with an empty IN list.
        print("No new reports for", grader_name, "to add to", grader_table_name)
        return

    grades = "2"

    # Stepping by chunk_size yields exactly ceil(num_reports / chunk_size)
    # chunks, so no empty trailing chunk is produced when num_reports is a
    # multiple of chunk_size.
    for start_i in tqdm(range(0, num_reports, chunk_size)):
        proc_ord_ids = df_nlp.proc_ord_id.iloc[start_i:start_i + chunk_size]
        id_list = '", "'.join(proc_ord_ids)
        # Set up the query
        q_insert = f'''insert into {grader_table_name} {cols_str}
            select
              distinct 
              proc_ord.proc_ord_id, "{grader_name}" as grader_name,
              {grades} as grade,
              "Unique" as grade_category,
              proc_ord.pat_id,
              proc_ord.proc_ord_age as age_in_days,
              proc_ord.proc_ord_year,
              proc_ord.proc_ord_desc as proc_name,
              "arcus.procedure_order" as report_origin_table, 
              "{grade_date}" as grade_date, 
              "SLIP" as grade_criteria 
            from
              arcus.procedure_order proc_ord
              join arcus.patient pat on proc_ord.pat_id = pat.pat_id
            where
              proc_ord.proc_ord_id IN ("{id_list}")
            order by 
              proc_ord.proc_ord_year desc;'''
        if dry_run:
            # Show only the first statement; nothing is sent to BigQuery.
            print(q_insert)
            break
        j_insert = client.query(q_insert)
        j_insert.result()  # wait so a failed chunk raises before the next one starts

    if dry_run:
        print(num_reports, "reports for", grader_name, "would be added to", grader_table_name, "(dry run, nothing written)")
    else:
        print(num_reports, "reports for", grader_name, "added to", grader_table_name)

In [None]:
# Dry run: prints the first INSERT statement without executing it, so the
# generated SQL can be inspected before anything is written.
add_reports_for_nlp(q_nlp, dry_run = True)

In [None]:
# Real run: executes the INSERT statements against the grader table.
add_reports_for_nlp(q_nlp, dry_run = False)