In [3]:
import numpy as np
import pandas as pd


In [8]:
## location of the input files (relative to the notebook) and their names
data_directory = 'data/'
genera_survivail_information_path = 'TCGA-BRCA.survival.tsv'
detail_survivail_information_path = 'survival-BRCA_survival.txt'
detail_phenotype_information_path = 'TCGA-BRCA.GDC_phenotype.tsv'

## read both survival tables; each file is tab-separated with a header row
general_survival_information, detail_survivail_information = (
    pd.read_csv(data_directory + file_name, sep='\t', header=0)
    for file_name in (
        genera_survivail_information_path,
        detail_survivail_information_path,
    )
)

In [9]:
## preview the table read from 'TCGA-BRCA.survival.tsv'; the rendered output
## below is truncated by pandas to the first and last rows
general_survival_information

Unnamed: 0,sample,OS,_PATIENT,OS.time
0,TCGA-C8-A275-01A,0,TCGA-C8-A275,1
1,TCGA-BH-A1F8-11B,1,TCGA-BH-A1F8,1
2,TCGA-BH-A1F8-01A,1,TCGA-BH-A1F8,1
3,TCGA-AC-A7VC-01A,0,TCGA-AC-A7VC,1
4,TCGA-AN-A0AM-01A,0,TCGA-AN-A0AM,5
...,...,...,...,...
1255,TCGA-B6-A0RE-01A,0,TCGA-B6-A0RE,7777
1256,TCGA-B6-A0RN-01A,0,TCGA-B6-A0RN,8008
1257,TCGA-B6-A0IA-01A,0,TCGA-B6-A0IA,8391
1258,TCGA-B6-A0I5-01A,0,TCGA-B6-A0I5,8556


In [11]:
## number of distinct patient IDs in the detailed survival table
distinct_patient_ids = set(detail_survivail_information['_PATIENT'])
len(distinct_patient_ids)

1097

In [28]:
## check correctness of the duplicated information
def find_inconsistent_patients(survival_information, id_column='_PATIENT', value_columns=('OS.time',)):
    """Return the patients whose repeated rows disagree on ``value_columns``.

    Parameters
    ----------
    survival_information : pandas.DataFrame
        Table containing ``id_column`` and every column in ``value_columns``.
    id_column : str
        Column that identifies a patient.
    value_columns : sequence of str
        Columns that must be identical across all rows of one patient.

    Returns
    -------
    list
        Patient IDs that carry more than one distinct combination of values,
        in order of first appearance. Missing values compare equal to each
        other, so a patient whose values are missing on every row is not
        reported as inconsistent.
    """
    # Keep one row per distinct (patient, values) combination; a patient that
    # still occurs more than once afterwards has conflicting values.
    distinct_rows = survival_information[[id_column, *value_columns]].drop_duplicates()
    has_conflict = distinct_rows[id_column].duplicated()
    return distinct_rows.loc[has_conflict, id_column].unique().tolist()


def report_duplicated_patients(title, survival_information):
    """Print ``title`` followed by the outcome of the consistency check.

    Prints 'Correct!' when no patient has conflicting 'OS.time' values,
    otherwise prints the list of offending patients.

    Returns
    -------
    list
        The inconsistent patient IDs found in ``survival_information`` only,
        so results of different tables are never mixed.
    """
    print(title)
    inconsistent_patients = find_inconsistent_patients(survival_information)
    if inconsistent_patients:
        print('Error! Patient list:')
        print(inconsistent_patients)
    else:
        print('Correct!')
    return inconsistent_patients


## check on the general one
general_error_patient_list = report_duplicated_patients('General cases:', general_survival_information)

## check on the detailed one
detail_error_patient_list = report_duplicated_patients('Detailed cases:', detail_survivail_information)

## combined list, kept under the original name for any code that reads it
error_patient_list = general_error_patient_list + detail_error_patient_list

General cases:
Correct!
Detailed cases:
Correct!


In [29]:
## keep a single row per patient: the first occurrence of each '_PATIENT'
is_first_row_of_patient = ~detail_survivail_information.duplicated(subset=['_PATIENT'], keep='first')
detail_survivail_information = detail_survivail_information[is_first_row_of_patient]


In [30]:
## summary statistics of the four follow-up time columns; OS.time reports the
## skewness as its fourth statistic, the other three columns report the mean
base_statistics = ["min", "max", "median"]
time_statistics = {"OS.time": base_statistics + ["skew"]}
for time_column in ("DSS.time", "DFI.time", "PFI.time"):
    time_statistics[time_column] = base_statistics + ["mean"]
detail_survivail_information.agg(time_statistics)

Unnamed: 0,OS.time,DSS.time,DFI.time,PFI.time
min,0.0,0.0,0.0,0.0
max,8605.0,8605.0,8556.0,8556.0
median,843.0,843.0,763.5,773.0
skew,2.212804,,,
mean,,1245.64781,1159.317227,1157.697993


In [32]:
## preview the detailed survival table; the rendered output below is
## truncated by pandas to the first and last rows
detail_survivail_information

Unnamed: 0,sample,_PATIENT,OS,OS.time,DSS,DSS.time,DFI,DFI.time,PFI,PFI.time,Redaction
0,TCGA-3C-AAAU-01,TCGA-3C-AAAU,0,4047.0,0.0,4047.0,1.0,1808.0,1,1808.0,
1,TCGA-3C-AALI-01,TCGA-3C-AALI,0,4005.0,0.0,4005.0,0.0,4005.0,0,4005.0,
2,TCGA-3C-AALJ-01,TCGA-3C-AALJ,0,1474.0,0.0,1474.0,0.0,1474.0,0,1474.0,
3,TCGA-3C-AALK-01,TCGA-3C-AALK,0,1448.0,0.0,1448.0,,,0,1448.0,
4,TCGA-4H-AAAK-01,TCGA-4H-AAAK,0,348.0,0.0,348.0,0.0,348.0,0,348.0,
...,...,...,...,...,...,...,...,...,...,...,...
1231,TCGA-WT-AB44-01,TCGA-WT-AB44,0,883.0,0.0,883.0,0.0,883.0,0,883.0,
1232,TCGA-XX-A899-01,TCGA-XX-A899,0,467.0,0.0,467.0,0.0,467.0,0,467.0,
1233,TCGA-XX-A89A-01,TCGA-XX-A89A,0,488.0,0.0,488.0,0.0,488.0,0,488.0,
1234,TCGA-Z7-A8R5-01,TCGA-Z7-A8R5,0,3287.0,0.0,3287.0,,,1,181.0,
