In [None]:
# CaisisImport. A utility to prepare a Caisis-to-Excel export for import into Oncoscape (via cBioPortal format).
#
# Configuration for the active dataset. The commented-out groups further down are
# alternative datasets; uncomment one group (and comment this one) to switch.

# Root folder of the dataset. Input CSVs are read from '<dataset_folder>/raw_csv/' and
# reports / output files are written underneath this folder.
dataset_folder = '../Caisis_NonPublicData/Brain_nov2021'
# Only 'Diagnosis Date' rows of Status.csv whose StatusDisease equals this value are used.
diseaseChosen = 'Brain'  
# Prefix of the optional '<study_identifier>_custom_caisis_prep.py' file, which is copied
# next to the notebook as custom_caisis_prep.py; also written to the meta files.
study_identifier = 'brain_abc'  #used for copying brain_abc_custom_caisis_prep.py to this folder as custom_caisis_prep.py.
type_of_cancer = "misc"  # Use cbioportal term

# dataset_folder = '../Caisis_NonPublicData/Sarcoma_nov2021'   
# diseaseChosen = 'Sarcoma'  #'Brain'
# study_identifier = 'sarcoma_caisis'  
# type_of_cancer = "misc"  # Use cbioportal term

# dataset_folder = '../Caisis_NonPublicData/Liver_nov2021'   
# diseaseChosen = 'Liver Cancer'   
# study_identifier = 'liver_caisis' 
# type_of_cancer = "misc"  # Use cbioportal term

# dataset_folder = '../Caisis_NonPublicData/Pancreas_nov2021'   
# diseaseChosen = 'Pancreas Cancer'  #'Brain'
# study_identifier = 'pancreas_caisis'  
# type_of_cancer = "misc"  # Use cbioportal term




In [None]:

# Steps:
# . Limit by disease
# . Read in CSVs
# . Export as TSVs
# . ZeroDates
# . 

# oncoscape_bar_override: {"version": "1.0", "style": "Symbols", "shape": "circle", "subtypeColors": {"rp": "#FF0000", "xrt": "#00FF00"}}

import pandas as pd
import os
import errno
import sys
import math
import shutil
import pipes
import re
from typing import List
from datetime import datetime
from datetime import date
from os.path import exists 

from zero_dates import   zero_dates

# NOTE: a `global` statement at module level has no effect; custom_prep_filename is
# simply a module-level name assigned further down.
global custom_prep_filename

#===============================================================
custom_prep_fullpath = os.path.join(".", study_identifier +"_custom_caisis_prep.py") # Can override this to any location you like.
study_description = study_identifier + ' study'

## knownTables = ['Demographics', 'Encounters', 'ClinicalStage', 'LabTests', 'PathTest', 'SocialHistory', 'LabTestGenetics', 'PathStageGrade', 'RadiationTherapy',  ]
# table Status is treated separately, first.
# Each name here is looked up as '<dataset_folder>/raw_csv/<name>.csv' by load_all_tables().
knownTables = ['Clinical Stages', 'Demographics', 'Encounters', 'Medical Therapy', 'Pathology', 'Procedures', 'Radiation', 'Social History', 'Status']
# Format used both to parse DATE fields (process_val) and to write dates to the output files.
date_format = '%Y-%m-%d'  # e.g. 2021-01-23

#days_before_dx_to_include = 0  # if 30, can include events up to 30 days before the zero date. Default is 30.
# NOTE: module-level `global` is a no-op here as well.
global missing_date_str
missing_date_str = '2222-02-02'  # use instead of empty/None to indicate missing data

# datetime form of the sentinel above; import_to_event_table uses it as the default
# start/stop date when no date column name is given.
missing_date = datetime.strptime(missing_date_str, date_format)  #datetime(2222, 2,2)  # TBD: 


## -- internal initialization --
patients_first_dx = pd.DataFrame()   # just PatientId and DiagnosisDate
data_clinical_patient = pd.DataFrame() # Will become the data_clinical_patient table.
loaded_tables_dict = {}  # table name -> DataFrame, filled by load_all_tables()
# NOTE: this overrides the empty DataFrame assigned two lines above;
# find_patients_first_dx() assigns the real table.
data_clinical_patient = None
data_clinical_timeline_dfs = {}   # dictionary of dataframes, keyed off of "timeline-foo" names.
data_clinical_timeline_graph_markers = {}  # dictionary of graph_marker_type values, to define arcs, bars, squares, diamonds, circles, triangles.
datafiles_fields = {}  # output file key -> list of ImportField; read by get_col_types()
current_lookup_field = None  # placeholder.
noval_list = []   # List of patient IDs with no associated value in one or more of the import_fields. Use this for reporting. 

foldername_with_headers = '01_with_headers'  # sub-folder of dataset_folder that receives the data_clinical_*.txt files
foldername_zero_dates = '02_zero_dates'  # presumably the output folder of the zero_dates step; not used in this part of the notebook
custom_prep_filename = "custom_caisis_prep.py"
has_custom_prep_file = False  # set to True below when a study-specific prep file was found and imported
# src_path = dataset_folder
# dst = "."

# If a study-specific prep file exists, copy it to the fixed module name and import it;
# otherwise delete any stale copy left over from a previous study.
file_path = custom_prep_fullpath 
if  exists(file_path):
    print("== custom prep exists.")
    shutil.copy(file_path, os.path.join(".", custom_prep_filename))
    has_custom_prep_file = True
    # Drop an already-imported copy so the import below re-executes the freshly copied file.
    if 'custom_caisis_prep' in sys.modules.keys():
        print("====================prepmod exists   ")
        del sys.modules['custom_caisis_prep']
    import custom_caisis_prep
    print("testing...")
    # The prep module is expected to define `version` (and, for import_to_event_table,
    # `subtype_rewrites` and `subtype_groupings`).
    print(custom_caisis_prep.version)
else:
    print("== no custom prep exists.")
    if  exists("./"+custom_prep_filename):
        os.remove("./"+custom_prep_filename)



In [None]:
# Marker styles that can be passed as `graph_marker` to import_to_event_table(), which
# stores the chosen entry per event table in data_clinical_timeline_graph_markers.
# "mtype" is the marker family and "shape" the symbol shape (None for non-symbol
# families) — presumably matching the "style"/"shape" keys of the Oncoscape override
# JSON shown in the notes above; confirm against the code that writes the meta files.
graph_marker_types = {
    "arcs"    : { "mtype": "Arcs", "shape":None},
    "bars"    : { "mtype": "Bars", "shape":None},
    "circles" : { "mtype": "Symbols", "shape":"circle"},
    "squares" : { "mtype": "Symbols", "shape":"square"},
    "triangles": { "mtype": "Symbols", "shape":"triangle"},
    "diamonds" : { "mtype": "Symbols", "shape":"diamond"},
    "stars"    : { "mtype": "Symbols", "shape":"star"},  # Not yet implemented
}


In [None]:
class ImportField:
    """Describes one column to import: where it comes from and how it is emitted.

    Attributes:
        source_name: Column name in the source table.
        final_name: Column name used in the output table.
        type: 'STRING', 'NUMBER', or 'DATE' (DATE gets turned into STRING in the
            final TSV files).
        conversion_function: Optional callable supplied by the caller; it is only
            stored here, this class never invokes it.
    """
    source_name: str
    final_name: str
    type: str = 'STRING'
    conversion_function = None

    def __init__(self, source_name, final_name, type="STRING", conversion_function=None):
        self.source_name = source_name
        self.final_name = final_name
        self.type = type
        self.conversion_function = conversion_function

    def __str__(self):
        return f"{self.source_name}->{self.final_name}, type={self.type}."

    def __repr__(self):
        # Without this, printing a list of fields shows only object addresses.
        return (
            f"ImportField({self.source_name!r}, final_name={self.final_name!r}, "
            f"type={self.type!r})"
        )

In [None]:
# Fields shared by every timeline/event table. import_to_event_table() indexes this
# list positionally (0 = EVENT_TYPE, 1 = START_DATE, 2 = STOP_DATE), so keep the order.
core_timeline_fields = [
    ImportField('EVENT_TYPE', final_name='EVENT_TYPE') , 
    ImportField('START_DATE', final_name='START_DATE', type='DATE') , 
    ImportField('STOP_DATE', final_name='STOP_DATE', type='DATE') , 
]

In [None]:
def save_file_and_folder(filename, obj):
    """Write the text ``obj`` to ``filename``, creating parent folders if needed.

    Args:
        filename: Destination path. May be a bare file name with no folder part.
        obj: String to write; an existing file is overwritten.
    """
    parent = os.path.dirname(filename)
    # dirname is '' for a bare file name, and os.makedirs('') raises FileNotFoundError.
    if parent:
        # exist_ok makes the call safe when another process creates the folder first.
        os.makedirs(parent, exist_ok=True)

    with open(filename, "w") as f:
        f.write(obj)

def patients_and_descriptive_header(df:pd.DataFrame, header):
    """Render ``df`` as plain text without its index column.

    ``header`` is accepted but currently not included in the returned text.
    """
    # TBD: note header in separate place
    return df.to_string(index=False)

In [None]:
def find_patients_first_dx():
    """STEP 1: find each patient's earliest diagnosis date for ``diseaseChosen``.

    Reads '<dataset_folder>/raw_csv/Status.csv', keeps rows whose Status is
    'Diagnosis Date' and whose StatusDisease equals ``diseaseChosen``, and takes
    the earliest dated row per patient.

    Side effects:
        * Sets the globals ``patients_first_dx`` (columns PatientId, DiagnosisDate)
          and ``data_clinical_patient`` (a copy of it).
        * Writes reports/PatientsFirstDx.txt under ``dataset_folder``.
        * Writes reports/NoDxDate.txt when some diagnoses have no date, and removes
          a stale copy of that report otherwise.

    Raises:
        SystemExit: if the raw_csv folder or Status.csv is missing.
    """
    global patients_first_dx, data_clinical_patient, dataset_folder
    print("\nSTEP 1: Find patients' first diagnosis date.")
    print(dataset_folder)
    raw_csv_folder = dataset_folder + '/raw_csv'
    status_fullpath = raw_csv_folder + '/Status.csv'
    if not os.path.exists(raw_csv_folder):
        sys.exit('ERROR: Cannot find folder "raw_csv".')

    if not os.path.exists(status_fullpath):
        sys.exit('ERROR: Status.csv file not found at ' + status_fullpath)

    tbl_status = pd.read_csv(status_fullpath)
    tbl_status['Date'] = pd.to_datetime(tbl_status.StatusDate)
    tbl_status['PatientId'] = tbl_status['PatientId'].astype(str)

    is_chosen_dx = tbl_status.Status.eq('Diagnosis Date') & tbl_status.StatusDisease.eq(diseaseChosen)
    # Sorting by date puts each patient's earliest diagnosis first (undated rows last).
    diseaseRows = tbl_status[is_chosen_dx].sort_values('Date')

    no_dx_report_path = dataset_folder + '/reports/NoDxDate.txt'
    dxNanRows = diseaseRows.loc[diseaseRows.Date.isna(), ['PatientId']]
    if len(dxNanRows) > 0:
        print('CHECK NoDxDate? REPORT NoDxDate.txt has *' + str(len(dxNanRows)) + '* diagnoses of ' + diseaseChosen + ' without diagnosis dates.')
        report_body = patients_and_descriptive_header(dxNanRows, 'Patients with "' + diseaseChosen + '" but no DiagnosisDate:')
        save_file_and_folder(no_dx_report_path, report_body)
    else:
        print('CHECK NoDxDate? OK')
        # Remove a report left over from an earlier run so it cannot be mistaken for current.
        if os.path.exists(no_dx_report_path):
            os.remove(no_dx_report_path)
            print('REMOVED NoDxDate.txt')

    # First dated row per patient in date order == that patient's first diagnosis.
    diseaseDatedRows = diseaseRows[diseaseRows.Date.notna()]
    patients_first_dx = (
        diseaseDatedRows
        .drop_duplicates(subset='PatientId', keep='first')[['PatientId', 'Date']]
        .rename(columns={'Date': 'DiagnosisDate'})
        .reset_index(drop=True)
    )
    print('Resulting patient IDs = ' + str(len(patients_first_dx)))

    data_clinical_patient = patients_first_dx.copy()

    report_body = patients_and_descriptive_header(patients_first_dx, 'Patients First Diagnosis Date')
    save_file_and_folder(dataset_folder + '/reports/PatientsFirstDx.txt', report_body)


In [None]:
def load_all_tables():
    """STEP 2: read every CSV named in ``knownTables`` into ``loaded_tables_dict``.

    Each table is read from '<dataset_folder>/raw_csv/<table>.csv' and its PatientId
    column is converted to str. Missing files only produce a warning.
    """
    global loaded_tables_dict
    print("\nSTEP 2: Load all CSV tables.")
    for tableName in knownTables:
        csv_path = f"{dataset_folder}/raw_csv/{tableName}.csv"
        if not os.path.exists(csv_path):
            print("WARN -- missing table "+tableName)
            continue

        print('Reading ' + tableName+'.csv...')
        # PatientId is compared as a string everywhere else, so normalise it on load.
        loaded_tables_dict[tableName] = pd.read_csv(csv_path).astype({"PatientId": str})
        print(tableName +" read.")

In [None]:
def process_val(val, ifield: "ImportField"):
    """Convert one raw cell value according to ``ifield.type``.

    * 'NUMBER': returns ``int(val)`` (integers only for now). Anything ``int()``
      cannot convert, including NaN and None, becomes "".
    * 'DATE': a string is parsed with the module-level ``date_format`` and returned
      as a datetime. A non-string value, or a string that does not match the format,
      becomes "" and a warning is printed.
    * any other type: ``val`` is returned unchanged.

    Args:
        val: Raw value taken from a loaded table.
        ifield: Field description; only ``type`` and ``source_name`` are read.
    """
    if ifield.type == 'NUMBER':  # for now, this means just integers
        try:
            return int(val)
        except (TypeError, ValueError, OverflowError):
            # NaN, None, infinities and non-numeric text all count as "no value".
            return ""

    if ifield.type == 'DATE':
        if not isinstance(val, str):
            # TBD: error reporting
            print("WARN: expected string for date in " +ifield.source_name+", but isn't a string.")
            return ""
        try:
            return datetime.strptime(val, date_format)
        except ValueError:
            # strptime raises on malformed text (it never yields NaT); report it
            # instead of letting one bad cell abort the whole import.
            print("WARN: could not parse date '" + val + "' in " + ifield.source_name + " using format " + date_format + ".")
            return ""

    return val


# import_fields: list of ImportField objects naming the columns to copy from the `tname`
# table into the patient table (one value per patient, taken from the patient's first row).
def import_to_patient_table(tname, import_fields: "List[ImportField]"):
    """Add one column per import field to the global ``data_clinical_patient``.

    For every field whose ``source_name`` is a column of the loaded table ``tname``,
    each patient's value is taken from that patient's first row in the table and
    converted with ``process_val``. New columns are inserted at position 1, so the
    most recently added column sits right after PatientId; a column that already
    exists (e.g. when the cell is re-run) is overwritten in place.

    Patients with no row in the table, or whose value is empty, are collected in
    ``noval_list`` and summarised in a printed message.

    Args:
        tname: Key of the source table in ``loaded_tables_dict``.
        import_fields: Fields to import; fields whose source column is missing are
            reported and skipped.
    """
    global patients_first_dx, data_clinical_patient, loaded_tables_dict, current_lookup_field, noval_list
    if tname not in loaded_tables_dict:
        print("WARN -- Could not process table "+tname)
        return

    current_table = loaded_tables_dict[tname]
    for ifield in import_fields:
        if ifield.source_name not in current_table.columns:
            print("==== ERROR: There is no column '"+ifield.source_name+"' in '"+tname+"' table. ====")
            continue

        current_lookup_field = ifield.source_name
        print("Looking for field " + tname+"."+current_lookup_field)
        noval_list.clear()

        # One pass over the table: first row per patient -> raw value.
        first_rows = current_table.drop_duplicates(subset='PatientId', keep='first')
        first_values = dict(zip(first_rows['PatientId'], first_rows[ifield.source_name]))

        def get_field_value(pid, ifield=ifield, first_values=first_values):
            key = str(pid)
            if key not in first_values:
                noval_list.append(pid)
                return None
            raw = first_values[key]
            if pd.isna(raw):
                noval_list.append(pid)
            try:
                return process_val(raw, ifield)
            except Exception as exc:
                # Best effort: one bad cell must not abort the import, but say so.
                print("WARN: could not convert " + tname + "." + ifield.source_name + " for patient " + key + ": " + str(exc))
                return None

        new_values = data_clinical_patient['PatientId'].apply(get_field_value)
        if len(noval_list) > 0:
            percent_str = "{0:.0%}".format(len(noval_list) / new_values.shape[0])
            print("- Field " + tname+"."+current_lookup_field + " had " + str(len(noval_list)) + " missing entries. ("+percent_str+" empty)")

        if ifield.final_name in data_clinical_patient.columns:
            data_clinical_patient[ifield.final_name] = new_values
        else:
            data_clinical_patient.insert(1, ifield.final_name, new_values)

In [None]:
def _resolve_event_subtype(raw_value, event_type, rewrites, groupings, safelist):
    """Return the cleaned subtype text for one row, or None if the row must be skipped.

    Order of operations: strip -> rewrite rules (typos/cleanup, keyed by lower-case
    text) -> safelist filter -> grouping rules (regex -> legend bucket) -> MISSING.
    """
    val = str(raw_value).strip()
    if rewrites:
        key = val.lower()
        if key in rewrites:
            val = rewrites[key]

    if safelist is not None and val not in safelist:
        return None

    if groupings:
        lower_val = val.lower()
        for regex in groupings:
            if re.search(regex, lower_val, re.IGNORECASE) is not None:
                val = groupings[regex]
                break
        else:
            # NOTE(review): an empty/'nan' subtype that matches no rule also lands
            # here, so the MISSING check below never sees it — confirm that is intended.
            val = "Other " + event_type

    if (val.strip() == "") or (val.lower() == "nan"):
        val = "MISSING"  # TBD: add reporting of error.
        print("WARNING: missing subtype value.")
    return val


def import_to_event_table(event_table_name, tname, import_fields: "List[ImportField]", event_type="EVENT", start_date_col_name=None, stop_date_col_name=None, subtype_source=None, graph_marker=None, safelist=None):
    """Build one timeline/event table from a loaded source table.

    Every row of the source table becomes one event row with the columns
    PatientId, EVENT_TYPE, START_DATE, STOP_DATE, then one column per import field
    present in the source table, then (if ``subtype_source`` is given) a subtype
    column named after ``event_type``. The result is stored in
    ``data_clinical_timeline_dfs[event_table_name]`` and ``graph_marker`` in
    ``data_clinical_timeline_graph_markers[event_table_name]``.

    Args:
        event_table_name: Name of the table as it will appear in the
            "data_clinical_<timeline-foo>.txt" filename; for events, "timeline-foo".
        tname: Name of the table loaded from CSV that contains the events.
        import_fields: Extra fields to copy; fields whose source column is absent
            from the table are skipped.
        event_type: Text used as EVENT_TYPE (unless the row has a column with this
            name, whose value is then used) and the name of the subtype column,
            e.g. "RADIATION".
        start_date_col_name: Source column with the start date; when None the
            ``missing_date`` sentinel is used.
        stop_date_col_name: Source column with the stop date; same default.
        subtype_source: Source column holding the subtype text, e.g. "RadTxType".
        graph_marker: Marker style to remember for this event table.
        safelist: If not None, the *only* subtype values that are kept; rows with
            any other subtype are dropped.
    """
    global patients_first_dx, data_clinical_patient, loaded_tables_dict, current_lookup_field, noval_list, has_custom_prep_file
    print("INFO starting to import for event_type of....")
    print(str(event_type))
    print("... .")
    if tname not in loaded_tables_dict:
        print("WARN -- Could not process table "+tname)
        return

    current_table = loaded_tables_dict[tname]

    these_subtype_rewrites = None  # for typos and cleanup
    these_subtype_groupings = None # for bucketing for legend

    noval_list.clear()
    present_fields = [f for f in import_fields if f.source_name in current_table.columns]
    column_names = ["PatientId", "EVENT_TYPE", "START_DATE", "STOP_DATE"]
    column_names.extend(f.final_name for f in present_fields)
    if subtype_source is not None:
        column_names.append(event_type)  # e.g., "RADIATION"
        if has_custom_prep_file:
            if event_type not in custom_caisis_prep.subtype_rewrites:
                print("INFO: Event type '"+event_type+"' has no subtype rewrite rules.")
            else:
                these_subtype_rewrites = custom_caisis_prep.subtype_rewrites[event_type]
                print("INFO: Event type '"+event_type+"' has "+ str(len(these_subtype_rewrites)) + " subtype rewrite rules.")

            if event_type not in custom_caisis_prep.subtype_groupings:
                print("INFO: Event type '"+event_type+"' has no subtype grouping rules.")
            else:
                these_subtype_groupings = custom_caisis_prep.subtype_groupings[event_type]
                print("INFO: Event type '"+event_type+"' has "+ str(len(these_subtype_groupings)) + " subtype grouping rules.")
        else:
            print("In import_to_event_Table, has_custom_prep_file FALSE")
    print(">>>>>>>>>>>")
    print('For ' + event_type+", rewrite keys:")
    print(str(these_subtype_rewrites))
    print('>>>end')

    # Loop through rows in current_table.
    # Pull out each field from import_fields, and add event type, and start and stop dates.
    rows_to_save = []
    for row in current_table.to_dict('records'):
        # Take the event type from a column of that name if there is one, else use the text.
        etype = row[event_type] if event_type in row else event_type

        # common to all events: pid, type, start, and stop.
        pid = row['PatientId']
        start_date = row[start_date_col_name] if start_date_col_name is not None else missing_date
        stop_date = row[stop_date_col_name] if stop_date_col_name is not None else missing_date

        new_row = [
            pid,
            process_val(etype, core_timeline_fields[0]),
            process_val(start_date, core_timeline_fields[1]),
            process_val(stop_date, core_timeline_fields[2]),
        ]

        # now append all import fields
        for ifield in present_fields:
            fieldval = row[ifield.source_name]
            val = None
            try:
                val = process_val(fieldval, ifield)
            except Exception as exc:
                # Name the field being processed here; the global current_lookup_field
                # may still be None and would break the message itself.
                print("ERROR " + ifield.source_name + ", " + str(pid) + "   " + str(fieldval) + ": " + str(exc))
            new_row.append(val)

        if subtype_source is not None:
            subtype = _resolve_event_subtype(row[subtype_source], event_type, these_subtype_rewrites, these_subtype_groupings, safelist)
            if subtype is None:
                continue  # filtered out by the safelist
            new_row.append(subtype)

        rows_to_save.append(new_row)

    print('column_names....')
    print(type(column_names))
    print(column_names)
    print('---- end of column_names ---')

    added_df = pd.DataFrame(rows_to_save, columns=column_names)
    data_clinical_timeline_dfs[event_table_name] = added_df

    data_clinical_timeline_graph_markers[event_table_name] = graph_marker
        
        # if len(noval_list) > 0 :
        #     percent_str = "{0:.0%}".format(len(noval_list) / new_values.shape[0])
        #     print("- Field " + tname+"."+current_lookup_field + " had " + str(len(noval_list)) + " missing entries. ("+percent_str+" empty)")
        #     #print(str( len(noval_list) / new_values.shape[0]))


In [None]:
# Import LAST_DATE of given field in another table.
# given table T, dataval field F, and dateval field D. Like largest value in dateval_field "StatusDate", for matching_val "Alive" in import_field.source_name "Status". (Then, Transform)
global_gg = []  # debugging leftover; not used by the function below

def import_lastdate_to_patient_table(tname, import_fields: "List[ImportField]", dateval_field, dataval_field, matching_val):
    """Add to the patient table the latest date on which a field had a given value.

    For each patient, the rows of table ``tname`` whose ``dataval_field`` equals
    ``matching_val`` are selected and the chronologically latest ``dateval_field``
    value is converted with ``process_val`` and stored in the column
    ``ifield.final_name`` of the global ``data_clinical_patient``. Patients without
    any matching dated row get None and are collected in ``noval_list``.

    Args:
        tname: Key of the source table in ``loaded_tables_dict``.
        import_fields: Fields to create (really only one is expected); a field is
            skipped when its ``source_name`` is not a column of the table.
        dateval_field: Column holding the date text, e.g. "StatusDate".
        dataval_field: Column holding the value to match, e.g. "Status".
        matching_val: Value to match, e.g. "Alive".
    """
    global patients_first_dx, data_clinical_patient, loaded_tables_dict, current_lookup_field, noval_list
    if tname not in loaded_tables_dict:
        print("WARN -- Could not process lastdate for table "+tname)
        return

    current_table = loaded_tables_dict[tname]
    for ifield in import_fields: # really only one field here.
        if ifield.source_name not in current_table.columns:
            print("==== ERROR: There is no column '"+ifield.source_name+"' in '"+tname+"' table. Attempting lastdate. ====")
            continue

        current_lookup_field = ifield.source_name
        print("Looking for matching_val ["+str(matching_val)+"] tbl " + tname+"."+current_lookup_field)
        noval_list.clear()

        # The value filter does not depend on the patient, so apply it once.
        matching_rows = current_table.loc[
            current_table[dataval_field] == str(matching_val),
            ['PatientId', dateval_field],
        ]

        def get_field_value(pid, ifield=ifield, matching_rows=matching_rows):
            dates = matching_rows.loc[matching_rows['PatientId'] == str(pid), dateval_field].dropna()
            if dates.empty:
                # e.g. a patient who never has a row with the matching value.
                noval_list.append(pid)
                return None

            # Rows may be out of order, so pick the latest date rather than the last row.
            parsed = pd.to_datetime(dates, errors='coerce').reset_index(drop=True)
            if parsed.notna().any():
                last_date_str = dates.iloc[parsed.idxmax()]
            else:
                last_date_str = dates.iloc[-1]  # nothing parseable: fall back to the last row

            try:
                return process_val(last_date_str, ifield)
            except Exception as exc:
                print("ERROR "+ifield.source_name+", "+str(pid)+"   "+str(last_date_str)+": "+str(exc))
                return None

        new_values = data_clinical_patient['PatientId'].apply(get_field_value)
        if len(noval_list) > 0:
            percent_str = "{0:.0%}".format(len(noval_list) / new_values.shape[0])
            print("- Field " + tname+"."+current_lookup_field + " had " + str(len(noval_list)) + " missing entries. ("+percent_str+" empty)")

        # Overwrite in place when the column already exists so the step can be re-run.
        if ifield.final_name in data_clinical_patient.columns:
            data_clinical_patient[ifield.final_name] = new_values
        else:
            data_clinical_patient.insert(1, ifield.final_name, new_values)

In [None]:
# Debug cell: a notebook cell only displays its last expression, so this shows
# global_gg; the bare data_clinical_patient expression above it produces no output.
data_clinical_patient
global_gg


In [None]:
def import_fields_to_patient_table():
    """STEP 3: choose and import the columns of the patient table.

    This is the core of specifying which fields we want to import. May change
    between datasets, so consider moving to an external file.

    After each import, the fields that actually became columns of
    ``data_clinical_patient`` are appended to ``datafiles_fields['patient']``, which
    get_col_types() later uses to write the column-type header row.
    """
    print("\nSTEP 3: Import columns to patient table.")

    def register(import_fields):
        # A missing table or source column means no column was added; registering
        # its type anyway would shift the type header against the real columns.
        existing_columns = getattr(data_clinical_patient, 'columns', [])
        imported = [f for f in import_fields if f.final_name in existing_columns]
        datafiles_fields.setdefault('patient', []).extend(imported)

    import_fields = [
        ImportField('PtGender', final_name='Sex'),
        ImportField('PtBirthDate', final_name='BirthDate', type="DATE"),
        ImportField('PtDeathDate', final_name='DeathDate', type="DATE"),
    ]
    import_to_patient_table('Demographics', import_fields)
    register(import_fields)

    import_fields = [
        ImportField('Status', final_name='LastAliveDate', type="DATE"),
    ]
    import_lastdate_to_patient_table('Status', import_fields, dataval_field="Status", dateval_field="StatusDate", matching_val="Alive")
    register(import_fields)

    import_fields = [
        ImportField('SocHxTobaccoType', final_name='Tobacco_Use'),
        ImportField('SocHxTobaccoYears', final_name='Tobacco_Years'),
        ImportField('SocHxAlcohol', final_name='Alcohol_Use'),
    ]
    import_to_patient_table('Social History', import_fields)
    register(import_fields)


In [None]:
def import_fields_to_event_tables():
    """STEP 4: choose and import the timeline/event tables.

    This is the core of specifying which fields we want to import. May change
    between datasets, so consider moving to an external file.

    For every event table, ``datafiles_fields[<event table name>]`` is set to the
    core timeline fields, followed by the import fields that exist in the source
    table, followed by the subtype field (if any) — the same column order that
    import_to_event_table() produces.
    """
    print("\nSTEP 4: Import to events tables.")

    def register(event_table_name, tname, import_fields, subtype_name=None):
        # import_to_event_table() skips fields whose source column is absent, so the
        # type list must skip them too or the type header no longer lines up.
        table = loaded_tables_dict.get(tname)
        present = [] if table is None else [f for f in import_fields if f.source_name in table.columns]
        fields = core_timeline_fields.copy()
        fields.extend(present)
        if subtype_name is not None:
            fields.append(ImportField(subtype_name, final_name=subtype_name))
        datafiles_fields[event_table_name] = fields

    import_fields = [
        ImportField('EncECOG_Score', final_name='ECOG', type='NUMBER'),
        ImportField('EncKPS', final_name='KPS', type='NUMBER'),
    ]
    import_to_event_table('timeline-encounters', 'Encounters', import_fields, event_type='ENCOUNTERS', start_date_col_name='EncDate', stop_date_col_name='EncDate', graph_marker=graph_marker_types["diamonds"] )
    register('timeline-encounters', 'Encounters', import_fields)

    print('datafiles_fields[timeline-encounters]  is....')
    print(datafiles_fields['timeline-encounters'] )

    import_fields = [
        ImportField('RadTxType', final_name='RadTxType'),
        ImportField('RadTxTarget', final_name='RadTxTarget'),
        ImportField('RadTxTotalDose', final_name='RadTxTotalDose'),
    ]
    import_to_event_table('timeline-radiation', 'Radiation', import_fields, event_type='RADIATION', start_date_col_name='RadTxDate', stop_date_col_name='RadTxStopDate', subtype_source="RadTxType", graph_marker=graph_marker_types["arcs"] )
    register('timeline-radiation', 'Radiation', import_fields, subtype_name='RADIATION')

    # NOTE(review): these are the radiation column names again; they look copy-pasted
    # and are presumably absent from the 'Medical Therapy' table, in which case they
    # are skipped. Replace with the intended MedTx* columns once confirmed.
    import_fields = [
        ImportField('RadTxType', final_name='RadTxType'),
        ImportField('RadTxTarget', final_name='RadTxTarget'),
        ImportField('RadTxTotalDose', final_name='RadTxTotalDose'),
    ]
    import_to_event_table('timeline-medicaltherapy', 'Medical Therapy', import_fields, event_type='MEDICALTHERAPY', start_date_col_name='MedTxDate', stop_date_col_name='MedTxStopDate', subtype_source="MedTxAgent" )
    register('timeline-medicaltherapy', 'Medical Therapy', import_fields, subtype_name='MEDICALTHERAPY')

    import_fields = [
        # ImportField('Status', final_name='Status' ) ,
    ]
    # Only these Status values become timeline events; every other row is dropped.
    status_safelist = [
        '1st Progression',
        '2nd Progression',
        '3rd Progression',
        'Last Status Check',
        'Recurrence',
        'Local Recurrence',
        'Locoregional',
        'Metastatic Disease',
        'New Diagnosis',
        'Newly Diagnosed',
        'No Evidence of Disease',
    ]
    import_to_event_table('timeline-status', 'Status', import_fields, event_type='STATUS', start_date_col_name='StatusDate', stop_date_col_name='StatusDate', subtype_source="Status", safelist=status_safelist)
    register('timeline-status', 'Status', import_fields, subtype_name='STATUS')
    



In [None]:
def get_col_types(filename):
    """Return the column-type header entries for one output file.

    The list starts with 'STRING' for the patient-id column. For a "timeline-..."
    file the registered field types follow in registration order; for any other
    file they follow in reverse registration order. The 'patient' file gets a final
    'DATE' entry for its DiagnosisDate column.

    Args:
        filename: Key into ``datafiles_fields`` ('patient' or a "timeline-..." name).
    """
    field_types = [field.type for field in datafiles_fields[filename]]

    col_types = ['STRING']  # for 'PatientID'
    if 'timeline-' in filename:
        print("in getcoltypes for "+filename+",...")
        for field_type in field_types:
            print( "> "+str(field_type))
        col_types += field_types
    else:
        # just patient table  (????? TBD: reversed?)
        col_types += field_types[::-1]

    if filename=='patient':
        col_types.append('DATE') # for 'DiagnosisDate'

    print("end getcoltypes,")
    print(col_types)
    return col_types

In [None]:
def _format_cbioportal_cell(item):
    """Return the text written for one cell: "" for missing values, formatted dates, else str()."""
    # Covers None, float NaN and pandas NaT/NA alike.
    if pd.api.types.is_scalar(item) and pd.isna(item):
        return ""
    if isinstance(item, (datetime, date)):  # includes pandas Timestamp
        return item.strftime(date_format)
    text = str(item)
    # Missing values that were already turned into text (e.g. by astype(str)).
    return "" if text == "nan" else text


def write_file_as_cbioportal(df:pd.DataFrame, filename):
    """Write ``df`` as '<dataset_folder>/<foldername_with_headers>/data_clinical_<filename>.txt'.

    The file is tab-separated with CRLF line endings and starts with five header
    lines: the upper-cased column names twice (each prefixed with '#'), the column
    types from get_col_types(filename), a row of '1' values, and the plain column
    names. The first column is always named PATIENT_ID. Missing values are written
    as empty cells and dates are formatted with ``date_format``.

    Args:
        df: Table to write; its first column must be the patient id.
        filename: 'patient' or a "timeline-..." name; also the key passed to
            get_col_types().
    """
    output_filename = 'data_clinical_'+filename+'.txt'
    full_filename = dataset_folder + "/"+foldername_with_headers+"/" + output_filename
    os.makedirs(os.path.dirname(full_filename), exist_ok=True)

    col_names_raw = list(df.columns.values)
    col_names_raw[0] = "PATIENT_ID" # Hack to insert underscore
    print('col_names_raw...')
    print(col_names_raw)

    col_names = [x.upper() for x in col_names_raw]
    col_types = get_col_types(filename)
    if len(col_types) != len(col_names):
        print("WARN: " + output_filename + " has " + str(len(col_names)) + " columns but " + str(len(col_types)) + " column types; the type header will be misaligned.")

    with open(full_filename, 'w', newline='\r\n') as f:
        f.write('#' + '\t'.join(col_names) + "\n") # header 1, internal name
        f.write('#' + '\t'.join(col_names) + "\n")  # header 2, description
        f.write('#' + '\t'.join(col_types) + "\n")  # header 3, type (STRING or NUMBER)
        f.write('#' + '\t'.join(['1'] * len(col_names)) + "\n")  # header 4, position
        f.write('\t'.join(col_names) + "\n")  # header 5, readable name

        for _, row in df.iterrows():
            f.write('\t'.join(_format_cbioportal_cell(item) for item in row) + "\n")


In [None]:
def create_and_empty_folder(dir):
    """Make sure ``dir`` exists and contains no files.

    A missing folder is created. In an existing folder every file is deleted,
    including files in sub-folders; the sub-folders themselves are kept.
    """
    if os.path.exists(dir):
        print("Directory already existed : ", dir)
        for parent, _subfolders, filenames in os.walk(dir):
            for name in filenames:
                os.unlink(os.path.join(parent, name))
        return

    os.makedirs(dir)
    print("Created Directory : ", dir)


In [None]:
def write_files_as_cbioportal():
    """Empty the output folder, then write the patient file and every timeline file."""
    # clear out the folder, if it has files
    output_dir = dataset_folder + "/"+foldername_with_headers+"/" 
    create_and_empty_folder(output_dir)

    print('Write patient meta file...')
    write_file_as_cbioportal(data_clinical_patient, 'patient')

    # one output file per event table
    for df_key, timeline_df in data_clinical_timeline_dfs.items():
        print('Attempt to write '+df_key+'.')
        write_file_as_cbioportal(timeline_df, df_key)


In [None]:
# Run the pipeline: STEP 1 (first diagnosis dates), STEP 2 (load the CSV tables),
# then STEP 3 and STEP 4 (fill the patient table and the timeline/event tables).
find_patients_first_dx()
print(patients_first_dx.shape)
load_all_tables()
# Reset the per-output-file field registry before the import steps repopulate it;
# import_fields_to_patient_table() appends to the 'patient' list.
datafiles_fields = {
    'patient': []   #not including PATIENT_ID
}
import_fields_to_patient_table()
import_fields_to_event_tables()


In [None]:
# Write data_clinical_patient.txt plus one data_clinical_<timeline-name>.txt per event table.
write_files_as_cbioportal()

In [None]:
def _clinical_meta_text(datatype, data_filename):
    """Return the four-line body shared by every clinical meta file.

    Reads the module-level ``study_identifier``.
    """
    return "".join([
        "cancer_study_identifier: " + study_identifier + "\n",
        "genetic_alteration_type: CLINICAL\n",
        "datatype: " + datatype + "\n",
        "data_filename: " + data_filename + "\n",
    ])


def _write_crlf_text(path, text):
    """Create or overwrite ``path`` with ``text``, writing every LF as CRLF."""
    with open(path, 'w', newline='\r\n') as f:
        f.write(text)


def write_meta_files(dir):
    """Write the study/clinical meta files (and a placeholder sample file) into ``dir``.

    Files written, all with CRLF line endings:
      * meta_study.txt
      * meta_clinical_patient.txt
      * meta_clinical_specimen_placeholder.txt
      * data_clinical_specimen_placeholder.txt
      * meta_clinical_<name>.txt for every key of data_clinical_timeline_dfs

    Reads the module-level type_of_cancer, study_identifier, study_description,
    data_clinical_patient, data_clinical_timeline_dfs and
    data_clinical_timeline_graph_markers.

    Args:
        dir: Existing output folder (see create_and_empty_folder()).

    Raises:
        KeyError: if a timeline table has no entry in
            data_clinical_timeline_graph_markers, or a non-None marker lacks
            the "shape" or "mtype" key.
    """
    _write_crlf_text(
        os.path.join(dir, "meta_study.txt"),
        "".join([
            "type_of_cancer: " + type_of_cancer + "\n",
            "cancer_study_identifier: " + study_identifier + "\n",
            "name: " + study_identifier + "\n",
            "short_name: " + study_identifier + "\n",
            "description: " + study_description + "\n",
            "add_global_case_list: true" + "\n",
        ]),
    )

    _write_crlf_text(
        os.path.join(dir, "meta_clinical_patient.txt"),
        _clinical_meta_text("PATIENT_ATTRIBUTES", "data_clinical_patient.txt"),
    )

    # ==== START SAMPLE FILE =====
    # Oncoscape expects a sample/specimen file. Currently we don't support importing them,
    # so for now make a dummy file with a one-to-one mapping of patientid with "sample-<patientid>".
    _write_crlf_text(
        os.path.join(dir, "meta_clinical_specimen_placeholder.txt"),
        _clinical_meta_text("SAMPLE_ATTRIBUTES", "data_clinical_specimen_placeholder.txt"),
    )

    specimen_lines = [
        "#Unique_patient_identifier\tSPECIMEN_ID\n",
        "#STRING\tSTRING\n",
        "#1\t1\n",
        "PATIENT_ID\tSPECIMEN_ID\n",
    ]
    # Iterate the column itself rather than iterrows(): iterrows() can upcast
    # integer ids to float, and str() keeps numeric ids from raising TypeError
    # when concatenated with text.
    for patient_id in data_clinical_patient['PatientId']:
        patient_id_text = str(patient_id)
        specimen_lines.append(patient_id_text + "\tsample-" + patient_id_text + "\n")
    _write_crlf_text(
        os.path.join(dir, "data_clinical_specimen_placeholder.txt"),
        "".join(specimen_lines),
    )
    # ==== END SAMPLE FILE =====

    for event_table_name in data_clinical_timeline_dfs.keys():
        meta_text = _clinical_meta_text(
            "TIMELINE", "data_clinical_" + event_table_name + ".txt"
        )
        graph_marker = data_clinical_timeline_graph_markers[event_table_name]
        if graph_marker is not None:
            # The label is the table name without its 'timeline-' prefix.
            label = event_table_name
            if label.startswith('timeline-'):
                label = label[len('timeline-'):]

            print("===========  graph_marker_key ===========")
            print(str(graph_marker))

            shape_extender = ""
            if graph_marker["shape"] is not None:
                shape_extender = ', "shape": "'+graph_marker["shape"]+'" '

            # Appended as the final line of the meta file, without a trailing newline.
            meta_text += (
                'oncoscape_bar_override: {"version": "1.0", "label": "'+label+'" '
                + ', "style": "'+graph_marker["mtype"]+'" '
                + shape_extender
                + ' }'
            )

        _write_crlf_text(
            os.path.join(dir, "meta_clinical_" + event_table_name + ".txt"),
            meta_text,
        )


In [None]:
# Create (or empty) the zero-dates output folder, then run zero_dates() on the files
# in the with-headers folder, writing its results into that output folder.
# NOTE: `dir` shadows the builtin dir() and is read again by the later
# write_meta_files(dir) cell, so it must remain a module-level name.
dir = dataset_folder + "/"+foldername_zero_dates+"/" 
create_and_empty_folder(dir)
# NOTE(review): presumably shifts dates relative to each patient's DIAGNOSISDATE
# taken from data_clinical_patient.txt -- confirm in zero_dates.py.
zero_dates(dataset_folder+'/'+foldername_with_headers, patient_info_filename = "data_clinical_patient.txt", zero_day_column_name =  "DIAGNOSISDATE", output_folder=dir )


In [None]:
# Write the meta files into the folder held in the module-level `dir`
# (set by an earlier cell; it must already exist).
write_meta_files(dir)