In [181]:
# CaisisImport. A utility to prepare a Caisis-to-Excel export for import into Oncoscape (via cBioPortal format).

# Steps:
# - Limit by disease
# - ExportAsTSVs
# - ZeroDates
# - (TBD)


import pandas as pd
import os
import errno
import sys
#import datetime
from typing import List
from datetime import datetime
from datetime import date

dataset_folder = '../Caisis_NonPublicData/Brain_nov2021'
diseaseChosen = 'Brain'
## knownTables = ['Demographics', 'Encounters', 'ClinicalStage', 'LabTests', 'PathTest', 'SocialHistory', 'LabTestGenetics', 'PathStageGrade', 'RadiationTherapy',  ]
# table Status is treated separately, first.
knownTables = ['Clinical Stages', 'Demographics', 'Encounters', 'Medical Therapy', 'Pathology', 'Procedures', 'Radiation', 'Social History']
# strptime pattern for date strings in the raw tables.
# %m is the month. %M is the *minute*: with %M, '1933-07-02' parses without
# error as January 2nd 1933 at 00:07, i.e. the month is silently lost.
date_format = '%Y-%m-%d'  # '%d-%b-%y'   #'%b %d %Y %I:%M%p'
#days_before_dx_to_include = 0  # if 30, can include events up to 30 days before the diagnosis date.

## -- internal --
patients_first_dx = pd.DataFrame()   # just PatientId and DiagnosisDate
# Will become the data_clinical_patient table. Stays None until
# find_patients_first_dx() has populated it.
data_clinical_patient = None
loaded_tables_dict = {}   # table name -> DataFrame read from raw_csv/<table name>.csv
datafiles_fields = {
    'patient': []   #not including PATIENT_ID
}

In [35]:
class ImportField:
    """One column to copy from a source table into an output table.

    Attributes:
        source_name: column name to read in the source table.
        final_name: column name to use in the output table.
        type: 'STRING', 'NUMBER', or 'DATE' (DATE gets turned into STRING in
            the final TSV files).
        conversion_function: optional converter; this class only stores it.
    """
    source_name: str
    final_name: str
    type: str = 'STRING'
    conversion_function = None

    def __init__(self, source_name, final_name, type="STRING", conversion_function=None):
        self.source_name, self.final_name = source_name, final_name
        self.type = type
        self.conversion_function = conversion_function

    def __str__(self):
        # Rendered as "<source>-><final>, type=<type>."
        pieces = [self.source_name, "->", self.final_name, ", type=", self.type, "."]
        return "".join(pieces)

In [3]:
def save_file_and_folder(filename, obj):
    """Write the string `obj` to `filename`, creating parent folders as needed.

    Args:
        filename: path of the file to (over)write.
        obj: text to write.

    Raises:
        OSError: if a folder cannot be created or the file cannot be opened.
    """
    folder = os.path.dirname(filename)
    # dirname is '' for a bare filename and os.makedirs('') raises, so only
    # create a folder when the path actually names one. exist_ok=True covers
    # both "already there" and the create-after-check race.
    if folder:
        os.makedirs(folder, exist_ok=True)

    with open(filename, "w") as f:
        f.write(obj)

def patients_and_descriptive_header(df:pd.DataFrame, header):
    """Return `df` rendered as plain text, without the index column.

    `header` is accepted but not emitted yet (TBD: note header in separate
    place); the result is only the table text.
    """
    # Once the header placement is decided this would become: header + "\n" + body
    return df.to_string(index=False)

In [36]:
def find_patients_first_dx():
    """STEP 1: find each patient's earliest dated diagnosis of `diseaseChosen`.

    Reads raw_csv/Status.csv under `dataset_folder`, keeps the rows whose
    Status is 'Diagnosis Date' and whose StatusDisease equals `diseaseChosen`,
    and for every patient keeps the earliest row that has a date.

    Side effects:
        - sets the globals `patients_first_dx` (columns PatientId,
          DiagnosisDate) and `data_clinical_patient` (a copy of it);
        - writes reports/PatientsFirstDx.txt;
        - writes reports/NoDxDate.txt when some matching diagnoses have no
          date, otherwise removes a leftover copy of that report;
        - calls sys.exit() when the raw_csv folder or Status.csv is missing.
    """
    global patients_first_dx, data_clinical_patient
    print("\nSTEP 1: Find patients' first diagnosis date.")
    print(dataset_folder)
    raw_folder = dataset_folder + '/raw_csv'
    status_fullpath = raw_folder + '/Status.csv'
    if not os.path.exists(raw_folder):
        sys.exit('ERROR: Cannot find folder "raw_csv".')

    if not os.path.exists(status_fullpath):
        sys.exit('ERROR: Status.csv file not found at ' + status_fullpath)

    tbl_status = pd.read_csv(status_fullpath)
    tbl_status['Date'] = pd.to_datetime(tbl_status.StatusDate)
    tbl_status['PatientId'] = tbl_status['PatientId'].astype(str)

    is_chosen_dx = tbl_status.Status.eq('Diagnosis Date') & tbl_status.StatusDisease.eq(diseaseChosen)
    # Sorted ascending by date, so the first row seen per patient is the earliest.
    diseaseRows = tbl_status[is_chosen_dx].sort_values('Date')

    no_date_report = dataset_folder + '/reports/NoDxDate.txt'
    dxNanRows = diseaseRows[diseaseRows.Date.isna()][['PatientId']]
    if len(dxNanRows) > 0:
        print('CHECK NoDxDate? REPORT NoDxDate.txt has *' + str(len(dxNanRows)) + '* diagnoses of ' + diseaseChosen + ' without diagnosis dates.')
        report_body = patients_and_descriptive_header(dxNanRows, 'Patients with "' + diseaseChosen + '" but no DiagnosisDate:')
        save_file_and_folder(no_date_report, report_body)
    else:
        print('CHECK NoDxDate? OK')
        # A report left over from an earlier run would be misleading now.
        if os.path.exists(no_date_report):
            os.remove(no_date_report)
            print('REMOVED NoDxDate.txt')

    # Keep only the first (earliest) dated diagnosis row per patient.
    first_dx = (
        diseaseRows[diseaseRows.Date.notna()]
        .drop_duplicates(subset='PatientId', keep='first')
        [['PatientId', 'Date']]
        .rename(columns={'Date': 'DiagnosisDate'})
        .reset_index(drop=True)
    )
    print('Resulting patient IDs = ' + str(len(first_dx)))

    patients_first_dx = first_dx
    data_clinical_patient = patients_first_dx.copy()

    report_body = patients_and_descriptive_header(patients_first_dx, 'Patients First Diagnosis Date')
    save_file_and_folder(dataset_folder + '/reports/PatientsFirstDx.txt', report_body)


In [37]:
def load_all_tables():
    """STEP 2: read every table named in `knownTables` from raw_csv/.

    Each <table>.csv found under `dataset_folder`/raw_csv/ is loaded, its
    PatientId column is cast to str, and the frame is stored in
    `loaded_tables_dict` under the table name. Missing files only produce a
    warning.
    """
    global loaded_tables_dict
    print("\nSTEP 2: Load all CSV tables.")
    raw_folder = dataset_folder+'/raw_csv/'
    for tableName in knownTables:
        csv_name = tableName+'.csv'
        if os.path.exists(raw_folder + csv_name):
            print('Reading ' + csv_name + '...')
            table = pd.read_csv(raw_folder + csv_name)
            # Patient IDs are handled as strings throughout.
            loaded_tables_dict[tableName] = table.astype({"PatientId": str})
        else:
            print("WARN -- missing table "+tableName)

In [158]:
current_lookup_field = 'notPtGender'  # placeholder.
noval_list = []   # List of patient IDs with no associated value in one or more of the import_fields. Use this for reporting. 


def _convert_field_value(raw_value, ifield):
    """Turn one raw cell into the value stored in the patient table.

    Returns None when the cell is missing (None/NaN) or, for DATE fields, when
    it is not a string that matches `date_format`.
    """
    if raw_value is None or pd.isna(raw_value):
        return None
    if ifield.type == 'DATE':
        if not isinstance(raw_value, str):
            return None
        try:
            return datetime.strptime(raw_value, date_format)
        except ValueError:
            return None
    # NUMBER values are passed through unchanged (no conversion implemented yet).
    return raw_value


def import_to_patient_table(tname, import_fields:List[ImportField]):
    """Copy columns from the loaded table `tname` into `data_clinical_patient`.

    For every field whose `source_name` is a column of the table, the value
    from the patient's first row in that table is stored under `final_name`.
    New columns are inserted at position 1 (right after PatientId); a column
    that already exists is overwritten, so the step can be re-run.

    Args:
        tname: key into `loaded_tables_dict`.
        import_fields: ImportField objects describing the columns to copy.

    Side effects:
        Updates the globals `current_lookup_field` and `noval_list`; after
        each field, `noval_list` holds the patient IDs that got no usable
        value for that field. Prints a warning when the table or a column is
        missing and a summary line when a field has missing entries.

    Raises:
        RuntimeError: if `data_clinical_patient` has not been created yet.
    """
    global current_lookup_field
    if tname not in loaded_tables_dict:
        print("WARN -- Could not process table "+tname)
        return
    if data_clinical_patient is None:
        raise RuntimeError("data_clinical_patient is not initialised; run find_patients_first_dx() first.")

    current_table = loaded_tables_dict[tname]
    # Only the first row per patient is consulted, so reduce the table once
    # instead of scanning it again for every patient.
    first_rows = current_table.drop_duplicates(subset='PatientId', keep='first')

    for ifield in import_fields:
        if ifield.source_name not in current_table.columns:
            print("WARN -- Table "+tname+" has no column "+ifield.source_name)
            continue

        current_lookup_field = ifield.source_name
        print("Looking for field " + tname+"."+current_lookup_field)
        noval_list.clear()
        raw_by_patient = dict(zip(first_rows['PatientId'], first_rows[current_lookup_field]))

        new_values = []
        for pid in data_clinical_patient['PatientId']:
            val = _convert_field_value(raw_by_patient.get(str(pid)), ifield)
            if val is None:
                noval_list.append(pid)
            new_values.append(val)

        if len(noval_list) > 0:
            percent_str = "{0:.0%}".format(len(noval_list) / len(new_values))
            print("- Field " + tname+"."+current_lookup_field + " had " + str(len(noval_list)) + " missing entries. ("+percent_str+" empty)")

        # dtype=object keeps None as None (rather than NaN/NaT) and keeps
        # datetime objects as they are.
        column = pd.Series(new_values, index=data_clinical_patient.index, dtype=object)
        if ifield.final_name in data_clinical_patient.columns:
            data_clinical_patient[ifield.final_name] = column
        else:
            data_clinical_patient.insert(1, ifield.final_name, column)


In [180]:
# Scratch check of the date pattern: %m is the month, %M would be the minute
# (and would leave the parsed month at January).
val = datetime.strptime('1933-07-02', '%Y-%m-%d')   # '%d-%b-%y'   #'%b %d %Y %I:%M%p'
datetime.strftime(val, '%Y-%m-%d')

'1933-07-02'

In [39]:
def import_fields_to_patient_table():
    """STEP 3: import the selected columns into the patient table.

    This is the core of specifying which fields we want to import. It may
    change between datasets, so consider moving it to an external file.

    Side effects:
        Calls import_to_patient_table() once per source table and rebuilds
        datafiles_fields['patient'] with the fields that ended up as columns
        of `data_clinical_patient`.
    """
    print("\nSTEP 3: Import columns to patient table.")
    fields_by_table = [
        ('Demographics', [
            ImportField('PtGender', final_name='Sex'),
            ImportField('PtBirthDate', final_name='BirthDate', type="DATE"),
        ]),
        ('Social History', [
            ImportField('SocHxTobaccoType', final_name='Tobacco_Use'),
            ImportField('SocHxTobaccoYears', final_name='Tobacco_Years'),
            ImportField('SocHxAlcohol', final_name='Alcohol_Use'),
        ]),
    ]

    # Start from an empty list so that re-running this step does not register
    # every field a second time.
    datafiles_fields['patient'] = []
    for tname, import_fields in fields_by_table:
        import_to_patient_table(tname, import_fields)
        # Register only fields that really became columns (a missing table or
        # column is skipped by the import), so the list of column types stays
        # in step with the table's columns.
        imported = [f for f in import_fields if f.final_name in data_clinical_patient.columns]
        datafiles_fields['patient'].extend(imported)


In [102]:
def get_col_types(filename):
    """Return the list of column type names for the output file `filename`.

    The types of the fields registered in datafiles_fields[filename] are
    listed in reverse registration order. For 'patient' the list is wrapped
    with 'STRING' (for PatientID) in front and 'DATE' (for DiagnosisDate) at
    the end.
    """
    field_types = [field.type for field in datafiles_fields[filename]]
    field_types.reverse()

    is_patient_file = filename == 'patient'
    col_types = ['STRING'] if is_patient_file else []   # 'STRING' is for 'PatientID'
    col_types += field_types
    if is_patient_file:
        col_types.append('DATE')   # for 'DiagnosisDate'
    return col_types

In [201]:
def _format_cbioportal_cell(item):
    """Render one table cell as text: "NA" for missing values, ISO date for dates."""
    # Missing values arrive as None, NaN or NaT. This check comes first
    # because NaT would otherwise be treated as a date below.
    if item is None or (pd.api.types.is_scalar(item) and pd.isna(item)):
        return "NA"
    # Covers datetime.datetime, datetime.date and pandas Timestamp.
    if isinstance(item, (datetime, date)):
        return item.strftime('%Y-%m-%d')   # %m = month (%M would be the minute)
    return str(item)


def write_file_as_cbioportal(df:pd.DataFrame, filename):
    """Write `df` to 01_with_headers/data_clinical_<filename>.txt as a TSV.

    The file starts with four '#'-prefixed header rows (names, names again as
    description, column types from get_col_types(), and a row of '1'), then a
    plain row of upper-cased column names, then one tab-separated line per
    row of `df`.

    Args:
        df: table to write; its own columns define the header rows.
        filename: short name of the data file, e.g. 'patient'.
    """
    output_filename = 'data_clinical_'+filename+'.txt'
    full_filename = dataset_folder + "/01_with_headers/" + output_filename
    os.makedirs(os.path.dirname(full_filename), exist_ok=True)

    # Take the names from the frame being written, not from a global table.
    col_names = [str(x).upper() for x in df.columns]
    col_types = get_col_types(filename)
    if len(col_types) != len(col_names):
        print("WARN -- " + output_filename + " has " + str(len(col_names)) + " columns but " + str(len(col_types)) + " column types.")

    with open(full_filename, "w") as f:
        f.write('#' + '\t'.join(col_names) + "\n") # header 1, internal name
        f.write('#' + '\t'.join(col_names) + "\n")  # header 2, description
        f.write('#' + '\t'.join(col_types) + "\n")  # header 3, type (STRING or NUMBER)
        f.write('#' + '\t'.join(['1'] * len(col_names)) + "\n")  # header 4, position
        f.write('\t'.join(col_names) + "\n")  # header 5, readable name

        # itertuples keeps each column's own value type (iterrows would
        # coerce a row to one common dtype).
        for row in df.itertuples(index=False, name=None):
            f.write('\t'.join(_format_cbioportal_cell(item) for item in row) + "\n")


In [166]:
def write_files_as_cbioportal():
    """Empty the 01_with_headers output folder and write the patient file.

    Creates the folder under `dataset_folder` when it does not exist;
    otherwise deletes every file found beneath it (sub-folders are walked
    too) before writing.
    """
    # clear out the folder, if it has files
    out_dir = dataset_folder + "/01_with_headers/"
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
        print("Created Directory : ", out_dir)
    else:
        print("Directory already existed : ", out_dir)
        for root, _subdirs, files in os.walk(out_dir):
            for stale_file in files:
                os.unlink(os.path.join(root, stale_file))

    # loop through all table files (only the patient table so far)
    write_file_as_cbioportal(data_clinical_patient, 'patient')

In [None]:
# Run the pipeline in order: STEP 1 builds patients_first_dx and
# data_clinical_patient, STEP 2 loads the raw tables, and STEP 3 copies the
# selected columns into data_clinical_patient. Each step relies on the
# globals set by the previous one.
find_patients_first_dx()
print(patients_first_dx.shape)
load_all_tables()
import_fields_to_patient_table()


In [202]:
# Write data_clinical_patient to <dataset_folder>/01_with_headers/; files already in that folder are deleted first.
write_files_as_cbioportal()

Directory already existed :  ../Caisis_NonPublicData/Brain_nov2021/01_with_headers/


In [None]:
# Inspect the patient table: its column names, then the imported BirthDate column.
print(list(data_clinical_patient.columns.values))
print(data_clinical_patient['BirthDate'])

In [None]:
# zero_dates is a module outside this notebook; its behavior is not visible here.
from zero_dates import square_plus, zero_dates

# NOTE(review): presumably a smoke test that the import works — confirm, and remove if so.
square_plus(3)

# NOTE(review): this path is a different dataset from dataset_folder
# ('../Caisis_NonPublicData/Brain_nov2021') — confirm that is intended.
zero_dates('../Caisis_NonPublicData/Prostate_TAN')

In [None]:

# Rebuild the patient table from scratch: reset it to the bare
# PatientId/DiagnosisDate frame, then import the fields again.
# NOTE(review): presumably the reset is needed because the import step inserts
# its columns into data_clinical_patient — confirm before removing it.
data_clinical_patient = patients_first_dx.copy()
import_fields_to_patient_table()
print(data_clinical_patient)