In [3]:
import numpy as np
import pandas as pd


In [8]:
## file locations (every file name below is relative to data_directory)
data_directory = 'data/'
genera_survivail_information_path = 'TCGA-BRCA.survival.tsv'
detail_survivail_information_path = 'survival-BRCA_survival.txt'
detail_phenotype_information_path = 'TCGA-BRCA.GDC_phenotype.tsv'  # defined here, not loaded in this cell

## read the two tab-separated survival tables; the first line of each file is the header
general_survival_information = pd.read_csv(
    f'{data_directory}{genera_survivail_information_path}',
    sep='\t',
    header=0,
)
detail_survivail_information = pd.read_csv(
    f'{data_directory}{detail_survivail_information_path}',
    sep='\t',
    header=0,
)

In [9]:
## display the general survival table (the rendered output below is truncated by pandas)
general_survival_information

Unnamed: 0,sample,OS,_PATIENT,OS.time
0,TCGA-C8-A275-01A,0,TCGA-C8-A275,1
1,TCGA-BH-A1F8-11B,1,TCGA-BH-A1F8,1
2,TCGA-BH-A1F8-01A,1,TCGA-BH-A1F8,1
3,TCGA-AC-A7VC-01A,0,TCGA-AC-A7VC,1
4,TCGA-AN-A0AM-01A,0,TCGA-AN-A0AM,5
...,...,...,...,...
1255,TCGA-B6-A0RE-01A,0,TCGA-B6-A0RE,7777
1256,TCGA-B6-A0RN-01A,0,TCGA-B6-A0RN,8008
1257,TCGA-B6-A0IA-01A,0,TCGA-B6-A0IA,8391
1258,TCGA-B6-A0I5-01A,0,TCGA-B6-A0I5,8556


In [11]:
## number of distinct patient IDs in the detailed survival table
len(set(detail_survivail_information['_PATIENT']))

1097

In [28]:
## check correctness of the duplicated information
def find_inconsistent_patients(survival_information, time_column='OS.time'):
    """Return the patients whose duplicated rows disagree on ``time_column``.

    Parameters
    ----------
    survival_information : pd.DataFrame
        Table with a '_PATIENT' column and the column named by ``time_column``.
    time_column : str
        Column that must hold a single value per patient.

    Returns
    -------
    list
        Sorted patient IDs that appear more than once and carry more than one
        distinct value in ``time_column``. Empty when everything is consistent.
    """
    # keep=False marks every occurrence of a repeated patient, not only the later ones
    is_repeated = survival_information['_PATIENT'].duplicated(keep=False)
    repeated_rows = survival_information[is_repeated]
    # dropna=False counts NaN as one value, so a patient whose rows are all NaN
    # is consistent, while a NaN next to a real value is still a conflict
    distinct_values = repeated_rows.groupby('_PATIENT')[time_column].nunique(dropna=False)
    return sorted(distinct_values[distinct_values > 1].index)


def report_duplicate_consistency(label, survival_information, time_column='OS.time'):
    """Print ``label`` and the outcome of the duplicate check; return the offending patients."""
    print(label)
    # the list is computed fresh on every call, so results of one table never leak into another
    inconsistent_patients = find_inconsistent_patients(survival_information, time_column)
    if inconsistent_patients:
        print('Error! Patient list:')
        print(inconsistent_patients)
    else:
        print('Correct!')
    return inconsistent_patients


## check on the general one
general_error_patient_list = report_duplicate_consistency('General cases:', general_survival_information)

## check on the detailed one
detail_error_patient_list = report_duplicate_consistency('Detailed cases:', detail_survivail_information)

General cases:
Correct!
Detailed cases:
Correct!


In [36]:
## obtain only unique patient information
detail_survivail_information = detail_survivail_information.drop_duplicates(subset=['_PATIENT'])
detail_survivail_information.index = range(detail_survivail_information.shape[0])

In [37]:
## summary statistics for each survival-time column
## NOTE(review): OS.time requests "skew" where the other columns request "mean" - confirm this is intended
time_column_statistics = {
    "OS.time": ["min", "max", "median", "skew"],
    "DSS.time": ["min", "max", "median", "mean"],
    "DFI.time": ["min", "max", "median", "mean"],
    "PFI.time": ["min", "max", "median", "mean"],
}
detail_survivail_information.agg(time_column_statistics)

Unnamed: 0,OS.time,DSS.time,DFI.time,PFI.time
min,0.0,0.0,0.0,0.0
max,8605.0,8605.0,8556.0,8556.0
median,843.0,843.0,763.5,773.0
skew,2.212804,,,
mean,,1245.64781,1159.317227,1157.697993


In [39]:
## print, for each survival-time column in turn, the rows where that column is missing
for time_column in ['OS.time', 'DSS.time', 'DFI.time', 'PFI.time']:
    has_missing_time = detail_survivail_information[time_column].isna()
    print(detail_survivail_information[has_missing_time])

               sample      _PATIENT  OS  OS.time  DSS  DSS.time  DFI  \
1058  TCGA-OL-A66H-01  TCGA-OL-A66H   0      NaN  0.0       NaN  0.0   

      DFI.time  PFI  PFI.time Redaction  
1058       NaN    0       NaN       NaN  
               sample      _PATIENT  OS  OS.time  DSS  DSS.time  DFI  \
1058  TCGA-OL-A66H-01  TCGA-OL-A66H   0      NaN  0.0       NaN  0.0   

      DFI.time  PFI  PFI.time Redaction  
1058       NaN    0       NaN       NaN  
               sample      _PATIENT  OS  OS.time  DSS  DSS.time  DFI  \
3     TCGA-3C-AALK-01  TCGA-3C-AALK   0   1448.0  0.0    1448.0  NaN   
5     TCGA-5L-AAT0-01  TCGA-5L-AAT0   0   1477.0  0.0    1477.0  NaN   
6     TCGA-5L-AAT1-01  TCGA-5L-AAT1   0   1471.0  0.0    1471.0  NaN   
7     TCGA-5T-A9QA-01  TCGA-5T-A9QA   0    303.0  0.0     303.0  NaN   
16    TCGA-A1-A0SK-01  TCGA-A1-A0SK   1    967.0  1.0     967.0  NaN   
...               ...           ...  ..      ...  ...       ...  ...   
1075  TCGA-PL-A8LY-01  TCGA-PL-A8LY   

OS.time, DSS.time and PFI.time each have only one entry with an NA value, and it is the same patient, 'TCGA-OL-A66H'. DFI.time has 145 entries with NA values.

In [46]:
## drop the patient(s) with NA OS, DSS or PFI time.
## Selecting by missing value instead of by a fixed row label keeps this cell
## idempotent: after the index is renumbered a fixed label would point at a
## different patient, so re-running the cell would silently drop valid rows.
required_time_columns = ['OS.time', 'DSS.time', 'PFI.time']
detail_survivail_information = (
    detail_survivail_information
    .dropna(subset=required_time_columns)
    # sort_values returns a new frame; it has to stay in the chain to take effect
    .sort_values(by=['_PATIENT'])
    .reset_index(drop=True)
)

In [48]:
## check if all OS and DSS (event status and time) are equal.
## Columns are addressed by label: integer positions on a label-indexed row
## (row[3], row[5], ...) are deprecated in pandas and break if columns move.
same_time = detail_survivail_information['OS.time'] == detail_survivail_information['DSS.time']
same_status = detail_survivail_information['OS'] == detail_survivail_information['DSS']
os_matches_dss = same_time & same_status

## print every row where either the status or the time differs
for idx, row in detail_survivail_information[~os_matches_dss].iterrows():
    print(row)

## number of rows where both status and time agree
counter = int(os_matches_dss.sum())
print(counter)

sample       TCGA-A2-A0CO-01
_PATIENT        TCGA-A2-A0CO
OS                         1
OS.time               3492.0
DSS                      0.0
DSS.time              3492.0
DFI                      NaN
DFI.time                 NaN
PFI                        0
PFI.time              3492.0
Redaction                NaN
Name: 35, dtype: object
sample       TCGA-A2-A0CS-01
_PATIENT        TCGA-A2-A0CS
OS                         1
OS.time               2348.0
DSS                      0.0
DSS.time              2348.0
DFI                      NaN
DFI.time                 NaN
PFI                        0
PFI.time              2348.0
Redaction                NaN
Name: 39, dtype: object
sample       TCGA-A2-A0CU-01
_PATIENT        TCGA-A2-A0CU
OS                         1
OS.time                158.0
DSS                      0.0
DSS.time               158.0
DFI                      NaN
DFI.time                 NaN
PFI                        0
PFI.time               158.0
Redaction               

As all OS and DSS times are equal (the rows printed above differ in event status, OS vs. DSS, not in time), we can use either time column. We choose DSS.

In [None]:
## show the table without the OS.time column
## (the result is only displayed; detail_survivail_information itself is not modified)
detail_survivail_information.drop(columns=['OS.time'])