In [15]:
# CaisisImport. A utility to prepare a Caisis-to-Excel export for import into Oncoscape (via cBioPortal format).

# Steps:
# . Limit by disease
# . ExportAsTSVs
# . ZeroDates
# . 
import pandas as pd
import os
import errno
import sys
from typing import List

dataset_folder = '../Caisis_NonPublicData/Brain_nov2021'
diseaseChosen = 'Brain'
#knownTables = ['Demographics', 'Encounters', 'ClinicalStage', 'LabTests', 'PathTest', 'SocialHistory', 'LabTestGenetics', 'PathStageGrade', 'RadiationTherapy',  ]
# table Status is treated separately, first.
knownTables = ['Clinical Stages', 'Demographics', 'Encounters', 'Medical Therapy', 'Pathology', 'Procedures', 'Radiation', 'Social History']
patients_first_dx = pd.DataFrame()   # just PatientId and DiagnosisDate
data_clinical_patient = pd.DataFrame() # Will become the data_clinical_patient table.
loaded_tables_dict = {}

days_before_dx_to_include = 0  # if 30, can include events up to 30 days before the diagnosis date.


In [2]:
def save_file_and_folder(filename, obj):
    if not os.path.exists(os.path.dirname(filename)):
        try:
            os.makedirs(os.path.dirname(filename))
        except OSError as exc: # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise

    with open(filename, "w") as f:
        f.write(obj)

def patients_and_descriptive_header(df:pd.DataFrame, header):
    # TBD: note header in separate place
    str_ids = df.to_string(index=False)
    a =  str_ids  # header + "\n" + str_ids
    return a

In [55]:
def find_patients_first_dx():
    global patients_first_dx, data_clinical_patient
    print("\nSTEP 1: Find patients' first diagnosis date.")
    print(dataset_folder)
    status_fullpath = dataset_folder +'/raw_csv/Status.csv'
    if not os.path.exists(dataset_folder +'/raw_csv'):
        sys.exit('ERROR: Cannot find folder "raw_csv".')

    if not os.path.exists(status_fullpath):
        sys.exit('ERROR: Status.csv file not found at ' + status_fullpath)

    tbl_status = pd.read_csv(status_fullpath) #'Demographics.csv')
    tbl_status['Date'] =pd.to_datetime(tbl_status.StatusDate)
    tbl_status['PatientId'] =  tbl_status['PatientId'].astype(str)

    dxRows = tbl_status[tbl_status.Status.eq('Diagnosis Date')]
    diseaseRows = dxRows[dxRows.StatusDisease.eq(diseaseChosen)].sort_values('Date')

    dxNanRows = diseaseRows[diseaseRows.Date.isna()][['PatientId']]
    if(dxNanRows.size > 0):
        print('CHECK NoDxDate? REPORT NoDxDate.txt has *' +str(dxNanRows.size)+'* diagnoses of '+diseaseChosen+' without diagnosis dates.')
        report_body = patients_and_descriptive_header(dxNanRows, 'Patients with "'+diseaseChosen+" but no DiagnosisDate:")
    #    save_file_and_folder('reports/NoDxDate.txt', report_body)
        save_file_and_folder(dataset_folder+'/reports/NoDxDate.txt', report_body)
    else:
        print('CHECK NoDxDate? OK')
        print('TBD: delete existing NoDxDate.txt report.')
        if os.path.exists(dataset_folder+'/reports/NoDxDate.txt'):
            os.remove(dataset_folder+'/reports/NoDxDate.txt')
            print('REMOVED NoDxDate.txt')
            
    diseaseDatedRows = diseaseRows[diseaseRows.Date.isna()==False]
    patientid_date_dict = {}
    for index, row in diseaseDatedRows.iterrows():
        pid = str(row['PatientId'])
        if((pid in patientid_date_dict) == False):
            patientid_date_dict[pid] = row['Date']
        else:
            pass
    print('Resulting patient IDs = ' + str(len(patientid_date_dict)))   

    data = []
    for key in patientid_date_dict.keys():
        new_row = [key, patientid_date_dict[key]]
        data.append(new_row)
    patients_first_dx = pd.DataFrame(data, columns=["PatientId", "DiagnosisDate"])
    data_clinical_patient = patients_first_dx.copy()

    report_body = patients_and_descriptive_header(patients_first_dx, 'Patients First Diagnosis Date')
    
    save_file_and_folder(dataset_folder+'/reports/PatientsFirstDx.txt', report_body)
    #patients_first_dx


In [54]:
def load_all_tables():
    global loaded_tables_dict
    print("\nSTEP 2: Load all CSV tables.")
    for tableName in knownTables:
        full_path = dataset_folder+'/raw_csv/'+tableName+'.csv'
        if not os.path.exists(full_path):
            print("WARN -- missing table "+tableName)
        else:
            print('Reading ' + tableName+'.csv...')
            df = pd.read_csv(full_path)
            df = df.astype({"PatientId": str})
            #df.set_index('PatientId', inplace=True)
            #print(df.head(2))
            loaded_tables_dict[tableName] = df

In [25]:
class ImportField:
    source_name:str
    final_name:str
    numeric:bool = False
    conversion_function = None

    def __init__(self, source_name, final_name, numeric=False, conversion_function=None):
        self.source_name = source_name
        self.final_name = final_name
        self.numeric = numeric
        self.conversion_function = conversion_function

In [75]:
current_lookup_field = 'notPtGender'
noval_list = []

# imports: a dictionary of column name from the tname table, where value is a function to convert data
def import_to_patient_table(tname, import_fields:List[ImportField]):
    global patients_first_dx, data_clinical_patient, loaded_tables_dict, current_lookup_field, noval_list
    if (tname in loaded_tables_dict) == False:
        print("WARN -- Could not process table "+tname)
    else:
        current_table = loaded_tables_dict[tname]
        for ifield in import_fields:

            if ifield.source_name in current_table.columns:
                data_clinical_patient.insert(1, ifield.final_name, None)
               # global current_lookup_field
                current_lookup_field= ifield.source_name
                print("Looking for field " + tname+"."+current_lookup_field)
                noval_list.clear()

                def getFieldValue(pid):
                    global current_lookup_field, noval_list
                    gg = current_table.loc[current_table['PatientId'] == str(pid)]
                    hh = gg[current_lookup_field]

                    try:
                        return hh.iloc[0] 
                    except:
                        noval_list.append(pid)
                        return None 

                new_values = data_clinical_patient['PatientId'].apply(getFieldValue)
                if len(noval_list) > 0 :
                    percent_str = "{0:.0%}".format(len(noval_list) / new_values.shape[0])
                    print("- Field " + tname+"."+current_lookup_field + " had " + str(len(noval_list)) + " empty entries. ("+percent_str+" empty)")
                    #print(str( len(noval_list) / new_values.shape[0]))


                data_clinical_patient[ifield.final_name] = new_values
                #print(data_clinical_patient.head(12))


In [76]:
find_patients_first_dx()
print(patients_first_dx.shape)
load_all_tables()

print("\nSTEP 3: Import columns to patient table.")
import_fields = [
    ImportField('PtGender', final_name='Sex') ,   #, conversion_function=None, numeric=False),
    ImportField('PtBirthDate', final_name='BirthDate' )
]
import_to_patient_table('Demographics', import_fields)

import_fields = [
    ImportField('SocHxTobaccoType', final_name='Tobacco_Use'),
    ImportField('SocHxTobaccoYears', final_name='Tobacco_Years' ), #SocHxAlcohol
    ImportField('SocHxAlcohol', final_name='Alcohol_Use')
]
import_to_patient_table('Social History', import_fields)


print("DONE")
# print(data_clinical_patient.to_string())



STEP 1: Find patients' first diagnosis date.
../Caisis_NonPublicData/Brain_nov2021
CHECK NoDxDate? REPORT NoDxDate.txt has *6* diagnoses of Brain without diagnosis dates.
Resulting patient IDs = 2164
(2164, 2)

STEP 2: Load all CSV tables.
Reading Clinical Stages.csv...
Reading Demographics.csv...
Reading Encounters.csv...
Reading Medical Therapy.csv...
Reading Pathology.csv...
Reading Procedures.csv...
Reading Radiation.csv...
WARN -- missing table Social History

STEP 3: Import columns to patient table.
Looking for field Demographics.PtGender
Looking for field Demographics.PtBirthDate
Looking for field Social History.SocHxTobaccoType
- Field Social History.SocHxTobaccoType had 2150 empty entries. (99% empty)
Looking for field Social History.SocHxTobaccoYears
- Field Social History.SocHxTobaccoYears had 2150 empty entries. (99% empty)
Looking for field Social History.SocHxAlcohol
- Field Social History.SocHxAlcohol had 2150 empty entries. (99% empty)
DONE
