In [29]:
# CaisisImport. A utility to prepare a Caisis-to-Excel export for import into Oncoscape (via cBioPortal format).

# Steps:
# 1. Limit by disease
# 2. ExportAsTSVs
# 3. ZeroDates
# 4. (further steps TBD)
import pandas as pd
import os
import errno
import sys

# --- Configuration ----------------------------------------------------------
diseaseChosen = 'Brain'                            # compared against StatusDisease in Status.csv
dataset_folder = '../Caisis_NonPublicData/Brain'  # contains raw_csv/ (inputs) and reports/ (outputs)

# Alternative table list, currently disabled:
#knownTables = ['Demographics', 'Encounters', 'ClinicalStage', 'LabTests', 'PathTest', 'SocialHistory', 'LabTestGenetics', 'PathStageGrade', 'RadiationTherapy',  ]
# Each entry is read from raw_csv/<name>.csv by load_all_tables().
knownTables = [
    'Demographics',
    'Encounters',
    'Procedures',
    'Path Stage Grade',
    'Social History',
    'Radiation Therapy',
]

# If 30, can include events up to 30 days before the diagnosis date.
days_before_dx_to_include = 0

# --- Module-level state, filled in by the functions defined below -----------
patients_first_dx = pd.DataFrame()      # just PatientId and DiagnosisDate
data_clinical_patient = pd.DataFrame()  # will become the data_clinical_patient table
loaded_tables_dict = dict()             # table name -> DataFrame


In [24]:
def save_file_and_folder(filename, obj):
    """Write the text ``obj`` to ``filename``, creating parent folders first.

    Args:
        filename: Destination path. Missing parent directories are created;
            a bare file name (no directory part) is written to the current
            working directory.
        obj: String to write. An existing file is overwritten.

    Raises:
        OSError: if a directory cannot be created or the file cannot be
            opened for writing.
    """
    folder = os.path.dirname(filename)
    # A bare file name has an empty directory part, and os.makedirs('')
    # raises, so only create folders when there is something to create.
    if folder:
        # exist_ok=True tolerates the folder already existing, including
        # when another process creates it between a check and the call.
        os.makedirs(folder, exist_ok=True)

    with open(filename, "w") as f:
        f.write(obj)

def patients_and_descriptive_header(df: pd.DataFrame, header):
    """Render ``df`` as plain text, omitting the index column.

    Args:
        df: Frame to render.
        header: Descriptive title for the report. Accepted but not yet part
            of the returned text.

    Returns:
        The string produced by ``df.to_string(index=False)``.
    """
    # TBD: note header in separate place (it is intentionally not prepended).
    return df.to_string(index=False)

In [25]:
def find_patients_first_dx():
    """Find each patient's earliest dated diagnosis of ``diseaseChosen``.

    Reads ``<dataset_folder>/raw_csv/Status.csv``, keeps the rows whose
    ``Status`` is 'Diagnosis Date' and whose ``StatusDisease`` equals
    ``diseaseChosen``, and stores one row per patient (the earliest
    ``StatusDate``) in the module-level ``patients_first_dx`` frame, with
    columns ``PatientId`` and ``DiagnosisDate``. ``data_clinical_patient``
    is reset to a copy of that frame.

    Side effects:
        * Writes ``reports/PatientsFirstDx.txt`` under ``dataset_folder``.
        * Writes ``reports/NoDxDate.txt`` listing the patients whose
          diagnosis row has no date, or removes an existing copy of that
          report when there are none.

    Raises:
        SystemExit: if the ``raw_csv`` folder or ``Status.csv`` is missing.
    """
    global patients_first_dx, data_clinical_patient

    raw_csv_folder = dataset_folder + '/raw_csv'
    status_fullpath = raw_csv_folder + '/Status.csv'
    no_dx_report_path = dataset_folder + '/reports/NoDxDate.txt'

    if not os.path.exists(raw_csv_folder):
        sys.exit('ERROR: Cannot find folder "raw_csv".')

    if not os.path.exists(status_fullpath):
        sys.exit('ERROR: Status.csv file not found at ' + status_fullpath)

    tbl_status = pd.read_csv(status_fullpath)
    tbl_status['Date'] = pd.to_datetime(tbl_status.StatusDate)
    tbl_status['PatientId'] = tbl_status['PatientId'].astype(str)

    # Diagnosis rows for the chosen disease, earliest first (undated rows
    # sort to the end).
    is_diagnosis = tbl_status.Status.eq('Diagnosis Date')
    is_disease = tbl_status.StatusDisease.eq(diseaseChosen)
    diseaseRows = tbl_status[is_diagnosis & is_disease].sort_values('Date')

    # Report diagnoses that carry no date; they cannot anchor a timeline.
    dxNanRows = diseaseRows.loc[diseaseRows.Date.isna(), ['PatientId']]
    if len(dxNanRows) > 0:
        print('CHECK NoDxDate? REPORT NoDxDate.txt has *' + str(len(dxNanRows)) + '* diagnoses of ' + diseaseChosen + ' without diagnosis dates.')
        report_body = patients_and_descriptive_header(dxNanRows, 'Patients with "' + diseaseChosen + '" but no DiagnosisDate:')
        save_file_and_folder(no_dx_report_path, report_body)
    else:
        print('CHECK NoDxDate? OK')
        # Remove a report left over from an earlier run so it cannot be
        # mistaken for a current finding.
        if os.path.exists(no_dx_report_path):
            os.remove(no_dx_report_path)
            print('REMOVED NoDxDate.txt')

    print('----')
    # Rows are already sorted by date, so keeping the first occurrence of
    # each PatientId yields that patient's earliest diagnosis.
    patients_first_dx = (
        diseaseRows[diseaseRows.Date.notna()]
        .drop_duplicates(subset='PatientId', keep='first')
        .loc[:, ['PatientId', 'Date']]
        .rename(columns={'Date': 'DiagnosisDate'})
        .reset_index(drop=True)
    )
    print('Resulting patient IDs = ' + str(len(patients_first_dx)))
    data_clinical_patient = patients_first_dx.copy()

    report_body = patients_and_descriptive_header(patients_first_dx, 'Patients First Diagnosis Date')
    save_file_and_folder(dataset_folder + '/reports/PatientsFirstDx.txt', report_body)


In [120]:
def load_all_tables():
    """Load every table listed in ``knownTables`` into ``loaded_tables_dict``.

    Each table is read from ``<dataset_folder>/raw_csv/<name>.csv`` and its
    ``PatientId`` column is cast to ``str`` before the frame is stored under
    the table name. Entries from an earlier call are overwritten.
    """
    # Only item assignment is used on loaded_tables_dict, so no ``global``
    # declaration is required to update the module-level dict.
    for name in knownTables:
        csv_name = '{}.csv'.format(name)
        print('attempting to read {}...'.format(csv_name))
        csv_path = '{}/raw_csv/{}'.format(dataset_folder, csv_name)
        loaded_tables_dict[name] = pd.read_csv(csv_path).astype({"PatientId": str})

In [164]:
def process_demographics():
    """Add each patient's gender to ``data_clinical_patient``.

    Looks up ``PtGender`` in the loaded 'Demographics' table by
    ``PatientId`` and stores it in a ``PtGender`` column of
    ``data_clinical_patient`` (inserted as the second column when it does
    not exist yet). If Demographics has no ``PtGender`` column, nothing is
    changed.

    When a patient has several Demographics rows, the first one is used.
    A patient with no Demographics row gets a missing value. Running the
    function again replaces the column's values.

    Raises:
        KeyError: if the 'Demographics' table has not been loaded.
    """
    global data_clinical_patient, loaded_tables_dict
    current_table = loaded_tables_dict['Demographics']
    print("current columns...")
    if 'PtGender' not in current_table.columns:
        return

    print("================gender===")

    # Build a PatientId -> PtGender lookup once instead of scanning the
    # whole Demographics table for every patient.
    gender_by_patient = (
        current_table
        .assign(PatientId=current_table['PatientId'].astype(str))
        .drop_duplicates(subset='PatientId', keep='first')
        .set_index('PatientId')['PtGender']
    )
    # .map leaves NaN for ids absent from Demographics rather than raising.
    gender_results = data_clinical_patient['PatientId'].astype(str).map(gender_by_patient)
    print("results......")

    if 'PtGender' in data_clinical_patient.columns:
        # Column already present (function re-run): overwrite in place,
        # because DataFrame.insert refuses duplicate column names.
        data_clinical_patient['PtGender'] = gender_results
    else:
        data_clinical_patient.insert(1, 'PtGender', gender_results)


In [165]:
# Run the pipeline. The order matters: each step reads module-level state
# that the previous step filled in.
find_patients_first_dx()  # sets patients_first_dx and data_clinical_patient
print(patients_first_dx.shape)
load_all_tables()  # fills loaded_tables_dict from raw_csv/<table>.csv

process_demographics()  # adds PtGender to data_clinical_patient when Demographics has that column

print("DONE")
#print(data_clinical_patient)


CHECK NoDxDate? REPORT NoDxDate.txt has *3* diagnoses of Brain without diagnosis dates.
----
Resulting patient IDs = 1775
(1775, 2)
attempting to read Demographics.csv...
attempting to read Encounters.csv...
attempting to read Procedures.csv...
attempting to read Path Stage Grade.csv...
attempting to read Social History.csv...
attempting to read Radiation Therapy.csv...
current columns...
results......
DONE
