Here, we'll process the clinical data for breast cancer patients that we've downloaded from https://portal.gdc.cancer.gov/

In [4]:
import pandas as pd

pd.set_option('future.no_silent_downcasting', True)

In [5]:
# Load clinical data to DataFrame
clinical = pd.read_json("clinical_cart.json")

In [6]:
clinical.head()

Unnamed: 0,exposures,case_id,project,submitter_id,diagnoses,demographic
0,"[{'alcohol_history': 'Not Reported', 'updated_...",001cef41-ff86-4d3f-a140-a647ac4b10a1,{'project_id': 'TCGA-BRCA'},TCGA-E2-A1IU,"[{'synchronous_malignancy': 'No', 'ajcc_pathol...",{'demographic_id': 'd14426b2-e0a0-519a-bea6-4f...
1,"[{'alcohol_history': 'Not Reported', 'updated_...",00a2d166-78c9-4687-a195-3d6315c27574,{'project_id': 'TCGA-BRCA'},TCGA-AN-A0AM,"[{'synchronous_malignancy': 'No', 'ajcc_pathol...",{'demographic_id': '7366952a-e8e7-56ec-9867-23...
2,"[{'alcohol_history': 'Not Reported', 'updated_...",00b11ca8-8540-4a3d-b602-ec754b00230b,{'project_id': 'TCGA-BRCA'},TCGA-LL-A440,"[{'synchronous_malignancy': 'Not Reported', 'a...",{'demographic_id': 'f5229922-62e2-51d5-ba4e-94...
3,"[{'alcohol_history': 'Not Reported', 'updated_...",011b9b2d-ebe5-42bf-9662-d922faccc7a1,{'project_id': 'TCGA-BRCA'},TCGA-A7-A26E,"[{'synchronous_malignancy': 'Not Reported', 'a...",{'demographic_id': '87f2d711-4b09-5f94-863a-3f...
4,"[{'alcohol_history': 'Not Reported', 'updated_...",01263518-5f7c-49dc-8d7e-84b0c03a6a63,{'project_id': 'TCGA-BRCA'},TCGA-A8-A07W,"[{'synchronous_malignancy': 'No', 'ajcc_pathol...",{'demographic_id': '401df5b3-f3c8-5816-a5ed-c0...


In [7]:
# In clinical DataFrame, 'exposures', 'diagnoses', and 'demographic' are each a series of dictionaries. 
# We'll unpack these and make them DataFrames

exposures = pd.json_normalize([i[0] for i in clinical.exposures])
demographic = pd.json_normalize([i for i in clinical.demographic])
diagnoses = pd.json_normalize([i[0] for i in clinical.diagnoses])

In [8]:
# Within 'diagnoses', each element of the 'treatment' series is itself a list of exactly two dictionaries--
# one for pharmaceutical treatment, the other for radiation therapy.
# For example:
diagnoses['treatments'][0]

[{'updated_datetime': '2019-07-31T21:13:49.301117-05:00',
  'submitter_id': 'TCGA-E2-A1IU_treatment',
  'treatment_id': '27868bc3-23c8-5e85-a0e2-314e6cdf9b2a',
  'treatment_type': 'Radiation Therapy, NOS',
  'state': 'released',
  'treatment_or_therapy': 'no'},
 {'updated_datetime': '2019-07-31T21:13:49.301117-05:00',
  'submitter_id': 'TCGA-E2-A1IU_treatment_1',
  'treatment_id': '5dd4fd8f-53e1-53cf-8729-4cf6034cbb98',
  'treatment_type': 'Pharmaceutical Therapy, NOS',
  'state': 'released',
  'treatment_or_therapy': 'yes',
  'created_datetime': '2019-04-28T13:29:03.550445-05:00'}]

In [9]:
# The only useful data here is whether radiation and/or pharmaceutical therapy was given. 
# I'll record this info into DataFrames called 'radiation' and 'pharm', which we'll combine
# into a DataFrame called 'treatment'
radiation = []
pharm = []

for i in diagnoses.index:
    treat = pd.json_normalize(diagnoses['treatments'][i])
    for j in treat.index:
        #Populates 'radiation' with Boolean based on whether radiation therapy is 'yes' or 'no'
        if 'radiation' in treat['treatment_type'][j].lower(): 
            radiation.append(treat.loc[j]['treatment_or_therapy']=='yes') 
        #Populates 'pharm' with Boolean based on whether pharm therapy is 'yes' or 'no'
        elif 'pharm' in treat['treatment_type'][j].lower():
            pharm.append(treat.loc[j]['treatment_or_therapy']=='yes')

# Convert lists to DataFrames, reset to original indices
radiation = pd.DataFrame(radiation)
radiation = radiation.reset_index(drop=True) 
radiation = radiation.rename(columns={0:'radiation'}).replace({True:1, False:0})

pharm = pd.DataFrame(pharm)
pharm = pharm.reset_index(drop=True)
pharm = pharm.rename(columns={0:'pharm_treatment'}).replace({True:1, False:0})

# Combine into a DataFrame called 'treatment'
treatment = pd.concat([pharm,radiation],axis=1)
treatment.head()

Unnamed: 0,pharm_treatment,radiation
0,1,0
1,0,0
2,1,0
3,1,1
4,1,0


In [10]:
# Now we merge all the DataFrames we've obtained from 'clinical' into a single DataFrame

clinical_df = pd.concat([clinical['case_id'], clinical['submitter_id'], diagnoses.drop(['treatments'], axis=1), treatment, demographic], axis = 1)

In [11]:
clinical_df.head()

Unnamed: 0,case_id,submitter_id,synchronous_malignancy,ajcc_pathologic_stage,days_to_diagnosis,last_known_disease_status,tissue_or_organ_of_origin,days_to_last_follow_up,age_at_diagnosis,primary_diagnosis,...,race,vital_status,updated_datetime,age_at_index,submitter_id.1,days_to_birth,state,year_of_birth,days_to_death,year_of_death
0,001cef41-ff86-4d3f-a140-a647ac4b10a1,TCGA-E2-A1IU,No,Stage IA,0,not reported,"Breast, NOS",337.0,22279.0,"Infiltrating duct carcinoma, NOS",...,white,Alive,2019-07-31T21:13:49.301117-05:00,60,TCGA-E2-A1IU_demographic,-22279.0,released,1950.0,,
1,00a2d166-78c9-4687-a195-3d6315c27574,TCGA-AN-A0AM,No,Stage IIA,0,not reported,"Breast, NOS",5.0,20713.0,"Infiltrating duct carcinoma, NOS",...,white,Alive,2019-07-31T21:13:58.038172-05:00,56,TCGA-AN-A0AM_demographic,-20713.0,released,1954.0,,
2,00b11ca8-8540-4a3d-b602-ec754b00230b,TCGA-LL-A440,Not Reported,Stage IA,0,not reported,"Breast, NOS",759.0,22497.0,"Lobular carcinoma, NOS",...,white,Alive,2019-07-31T21:29:45.266135-05:00,61,TCGA-LL-A440_demographic,-22497.0,released,1951.0,,
3,011b9b2d-ebe5-42bf-9662-d922faccc7a1,TCGA-A7-A26E,Not Reported,Stage IIIA,0,not reported,"Breast, NOS",954.0,26274.0,"Lobular carcinoma, NOS",...,white,Alive,2019-07-31T15:26:11.511879-05:00,71,TCGA-A7-A26E_demographic,-26274.0,released,1939.0,,
4,01263518-5f7c-49dc-8d7e-84b0c03a6a63,TCGA-A8-A07W,No,Stage IV,0,not reported,"Breast, NOS",304.0,27942.0,"Infiltrating duct carcinoma, NOS",...,not reported,Alive,2019-07-31T21:14:01.028332-05:00,76,TCGA-A8-A07W_demographic,-27942.0,released,1933.0,,


In [12]:
clinical_df['year_of_birth'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 1006 entries, 0 to 1005
Series name: year_of_birth
Non-Null Count  Dtype  
--------------  -----  
1004 non-null   float64
dtypes: float64(1)
memory usage: 8.0 KB


In [13]:
# Let's drop columns that are not very useful 
# (these columns either have the same value for all/almost all entries, are irrelevant, or are redundant with other cols)

drop_cols = ['days_to_diagnosis','gender','site_of_resection_or_biopsy','last_known_disease_status','morphology','synchronous_malignancy','tissue_or_organ_of_origin','state','submitter_id','classification_of_tumor','diagnosis_id','icd_10_code','tumor_grade','progression_or_recurrence','demographic_id','updated_datetime','age_at_diagnosis','days_to_birth','year_of_death']

In [14]:
clinical_df.drop(drop_cols,inplace=True,axis=1)

In [15]:
# We can take a look at what's left. 
# Some of these columns (case_id, ajcc_staging_system_edition, etc.) we may still exclude from the model, 
# but are left as reference

clinical_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1006 entries, 0 to 1005
Data columns (total 19 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   case_id                      1006 non-null   object 
 1   ajcc_pathologic_stage        995 non-null    object 
 2   days_to_last_follow_up       909 non-null    float64
 3   primary_diagnosis            1006 non-null   object 
 4   prior_malignancy             1006 non-null   object 
 5   year_of_diagnosis            1004 non-null   float64
 6   prior_treatment              1006 non-null   object 
 7   ajcc_staging_system_edition  870 non-null    object 
 8   ajcc_pathologic_t            1006 non-null   object 
 9   ajcc_pathologic_n            1006 non-null   object 
 10  ajcc_pathologic_m            1006 non-null   object 
 11  pharm_treatment              1006 non-null   object 
 12  radiation                    1006 non-null   object 
 13  ethnicity         

### Now we'll clean it up. First, we deal with 'days_to_death'. These entries are only non-null if the patient's vital status is 'Dead'.

In [16]:
# We'll make 'days_to_last_follow_up' for patients with vital status 'Dead' equal to their 
# 'days_to_death', then remove the 'days_to_death' column

# Isolate days to death rows
iso_days_death = pd.DataFrame(clinical_df[clinical_df['vital_status']=='Dead']['days_to_death'])

# Re-label the single column in iso_days_death as 'days_to_last_follow_up', so we can use .update 
iso_days_death = iso_days_death.rename(columns={"days_to_death": "days_to_last_follow_up"})

# Update 'days_to_last_follow_up' with 'days_to_death' using .update
clinical_df.update(iso_days_death)

clinical_df.drop(['days_to_death'],inplace=True,axis=1)

In [17]:
# There's an entry with days_to_last_follow_up < 0; we'll replace that with 0
clinical_df.loc[clinical_df['days_to_last_follow_up'] < 0, ['days_to_last_follow_up']] = 0

### Next, we clean up 'ajcc_pathologic_stage'. We address unknown data, then assign each stage an ordinal.

In [18]:
clinical_df['ajcc_pathologic_stage'].value_counts()

ajcc_pathologic_stage
Stage IIA     330
Stage IIB     239
Stage IIIA    141
Stage IA       80
Stage I        80
Stage IIIC     60
Stage IIIB     23
Stage IV       18
Stage X        12
Stage IB        5
Stage II        5
Stage III       2
Name: count, dtype: int64

In [19]:
# First we'll deal with 23 entries which have unknown 'ajcc_pathologic_stage' ('NaN' or 'Stage X').
# For 8 of these, there's enough (TNM) information to infer the pathologic stage
# according to: https://www.cancer.net/cancer-types/breast-cancer/stages. 

clinical_df.loc[[267, 999],['ajcc_pathologic_stage']] = 'Stage IIA'
clinical_df.loc[[279],['ajcc_pathologic_stage']] = 'Stage IIB'
clinical_df.loc[[350],['ajcc_pathologic_stage']] = 'Stage IIIA'
clinical_df.loc[[618, 851],['ajcc_pathologic_stage']] = 'Stage IIIB'
clinical_df.loc[[94, 793],['ajcc_pathologic_stage']] = 'Stage IV'

# For the rest, the pathologic stage is unknown and can't be inferred. We drop these 15:

clinical_df.drop(clinical_df[(clinical_df['ajcc_pathologic_stage'].isnull()) | (clinical_df['ajcc_pathologic_stage']=='Stage X')].index,inplace=True)

In [20]:
# Some entries don't provide a sub-stage (i.e. 'Stage I' rather than 'IA' or 'IB'). 
# We again use the TNM criteria to classify these into the appropriate sub-stage, when possible.

clinical_df.loc[clinical_df.loc[clinical_df['ajcc_pathologic_stage']=='Stage I'].index,['ajcc_pathologic_stage']] = 'Stage IA'

clinical_df.loc[[52, 358, 916],['ajcc_pathologic_stage']] = 'Stage IIA'
clinical_df.loc[[37, 813],['ajcc_pathologic_stage']] = 'Stage IIB'

clinical_df.loc[[111],['ajcc_pathologic_stage']] = 'Stage IIIA'
clinical_df.loc[[221],['ajcc_pathologic_stage']] = 'Stage IIIC'

# For simplicity, we remove 3 elements in 'Stage I' that can neither be classified into 'IA' or 'IB'
clinical_df.drop(clinical_df[(clinical_df['ajcc_pathologic_stage']=='Stage I') & (clinical_df['ajcc_pathologic_n']=='NX')].index,inplace=True)

In [21]:
# Replace pathologic stage with ordinal values

clinical_df = clinical_df.replace(['Stage IA','Stage IB','Stage IIA', 'Stage IIB', 'Stage IIIA', 'Stage IIIB', 'Stage IIIC', 'Stage IV'],[1, 2, 3, 4, 5, 6, 7, 8])

In [22]:
clinical_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 991 entries, 0 to 1005
Data columns (total 18 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   case_id                      991 non-null    object 
 1   ajcc_pathologic_stage        991 non-null    object 
 2   days_to_last_follow_up       991 non-null    float64
 3   primary_diagnosis            991 non-null    object 
 4   prior_malignancy             991 non-null    object 
 5   year_of_diagnosis            989 non-null    float64
 6   prior_treatment              991 non-null    object 
 7   ajcc_staging_system_edition  857 non-null    object 
 8   ajcc_pathologic_t            991 non-null    object 
 9   ajcc_pathologic_n            991 non-null    object 
 10  ajcc_pathologic_m            991 non-null    object 
 11  pharm_treatment              991 non-null    object 
 12  radiation                    991 non-null    object 
 13  ethnicity               

### Further preprocessing performed to simplify the levels in some variables are:

Reclassifictaion of  'primary_diagnosis' into two categories: 'Infiltrating duct carcinoma, NOS'-(n=745), 'Lobular carcinoma, NOS'-(n=191), and others **--done**\

Replacing missing values of two columns (year_of_diagnosis, year_of_birth) with their mean

In [23]:
clinical_df.primary_diagnosis = pd.DataFrame([i if i in ['Infiltrating duct carcinoma, NOS','Lobular carcinoma, NOS'] else 'others' for i in clinical_df.primary_diagnosis],
                                              index=clinical_df.index)

# Replace missing values with the mean
clinical_df['year_of_diagnosis'] = clinical_df['year_of_diagnosis'].fillna(clinical_df['year_of_diagnosis'].mean())
clinical_df['year_of_birth'] = clinical_df['year_of_birth'].fillna(clinical_df['year_of_birth'].mean())

In [24]:
clinical_df.drop(columns=['ajcc_staging_system_edition', 'ajcc_pathologic_t', 'ajcc_pathologic_m', 'ajcc_pathologic_n', 'days_to_last_follow_up'], inplace=True)

In [25]:
clinical_df

Unnamed: 0,case_id,ajcc_pathologic_stage,primary_diagnosis,prior_malignancy,year_of_diagnosis,prior_treatment,pharm_treatment,radiation,ethnicity,race,vital_status,age_at_index,year_of_birth
0,001cef41-ff86-4d3f-a140-a647ac4b10a1,1,"Infiltrating duct carcinoma, NOS",no,2010.0,No,1,0,not hispanic or latino,white,Alive,60,1950.0
1,00a2d166-78c9-4687-a195-3d6315c27574,3,"Infiltrating duct carcinoma, NOS",no,2010.0,No,0,0,not hispanic or latino,white,Alive,56,1954.0
2,00b11ca8-8540-4a3d-b602-ec754b00230b,1,"Lobular carcinoma, NOS",yes,2012.0,No,1,0,not hispanic or latino,white,Alive,61,1951.0
3,011b9b2d-ebe5-42bf-9662-d922faccc7a1,5,"Lobular carcinoma, NOS",yes,2010.0,No,1,1,not hispanic or latino,white,Alive,71,1939.0
4,01263518-5f7c-49dc-8d7e-84b0c03a6a63,8,"Infiltrating duct carcinoma, NOS",no,2009.0,No,1,0,not reported,not reported,Alive,76,1933.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1001,fe15c48b-116d-461e-9b40-68514730fd18,1,"Infiltrating duct carcinoma, NOS",no,2003.0,No,1,1,not hispanic or latino,white,Alive,60,1943.0
1002,fe2cd610-aa52-4789-ac62-7683281bb22f,8,"Infiltrating duct carcinoma, NOS",no,2013.0,No,0,0,not hispanic or latino,black or african american,Alive,35,1978.0
1003,fe7f74b8-20f4-4471-91dc-4cca8c68e5c0,1,"Infiltrating duct carcinoma, NOS",no,2009.0,No,1,0,not hispanic or latino,black or african american,Alive,46,1963.0
1004,fec0da58-1047-44d2-b6d1-c18cceed43dc,3,"Lobular carcinoma, NOS",no,2011.0,No,1,1,not hispanic or latino,white,Alive,71,1940.0


In [26]:
# Finally, let's save the processed data to csv

clinical_df.to_csv('clinical_data.csv', index=False)

That's all the preprocessing for now.