# Compile TCGA

We first define the needed modules, functions and directories.

In [1]:
from IPython.display import Markdown as md
import sys, os
import json
import gzip
import numpy as np
import pandas as pd
import subprocess
#import rpy2.robjects as ro
#from rpy2.robjects import pandas2ri, numpy2ri
#from rpy2.robjects.packages import importr


In [2]:
def fileExist(file):
    """
    Tell whether `file` points to an existing regular file.

    Returns True only when the path exists and is a file; directories and
    missing paths yield False.
    """
    # os.path.isfile is already False for paths that do not exist.
    return os.path.isfile(file)

def create_directory(directory):
    """
    Create `directory` if nothing exists at that path yet.

    Missing parent directories are created as well. If the path already
    exists (as a directory or as anything else) the function does nothing,
    so it is safe to call repeatedly.

    Returns None. Errors raised while creating the directory (e.g.
    PermissionError) propagate to the caller instead of being swallowed.
    """
    if not os.path.exists(directory):
        # exist_ok guards against another process creating it in between.
        os.makedirs(directory, exist_ok=True)
    return


In [4]:
# Main directories of the GDC download: raw inputs, per-file data folders, outputs
#gdc_dir = '/home/j.aguirreplans/Databases/TCGA/2022-03-28-Dataset/dummy'
gdc_dir = '/home/j.aguirreplans/Databases/TCGA/2022-03-28-Dataset/TCGA'
input_dir = os.path.join(gdc_dir, 'raw')
output_dir = os.path.join(gdc_dir, 'out')
data_dir = os.path.join(gdc_dir, 'raw', 'data')
# Every entry of the data directory (the parsing cell keeps only the folders)
folder_list = os.listdir(data_dir)

## Parse mappings of files and cases

In GDC data, the RNA-seq files are named using **file IDs** (e.g., a82ab02e-4894-43f3-adf3-8a9faebee4e3.htseq.counts.gz), whereas the patient metadata is indexed by **case IDs** (e.g., a614255d-9f1b-4e9a-8a50-435d4d123b3f). The mapping between file IDs and case IDs is found in a JSON file named `files.YYYY-MM-DD.json`.

In [8]:
# JSON file holding one record per downloaded file, with its associated cases
mappings_file = os.path.join(input_dir, 'additional/files.2022-03-28.json')
with open(mappings_file) as mappings_handle:
    mappings = json.load(mappings_handle)


In [10]:
# File name -> case ID. Only the first case of each record is used
# (all records are composed of only 1 case id).
names_corresp = {
    record["file_name"]: record["cases"][0]["case_id"]
    for record in mappings
}


In [14]:
# First line: number of distinct file names (dictionary keys are unique by construction).
# Second line: number of distinct case IDs they map to.
print(len(names_corresp), len(set(names_corresp.values())), sep='\n')


11093
10237


## Parse RNA-seq data

We parse the folders inside the input directory, which store the files containing the expression.

We then read the expression files inside each folder and combine them into a single dataframe, with one column per file (labelled with its case ID).

In [9]:
# Compile every expression file into one dataframe (rows: features, columns:
# case IDs). The result is cached as a CSV file and simply re-loaded when the
# file already exists.
output_file = os.path.join(output_dir, 'dummy.csv')
#output_file = os.path.join(output_dir, 'TCGA.csv')
if not fileExist(output_file):
    # Parse patients data: collect one single-column dataframe per file and
    # concatenate once at the end (concatenating inside the loop is quadratic).
    sample_frames = []
    for patient_folder in folder_list:
        patient_folder_path = os.path.join(data_dir, os.fsdecode(patient_folder))
        if not os.path.isdir(patient_folder_path):
            continue
        for file_id in os.listdir(patient_folder_path):
            # Only files listed in the mappings can be assigned to a case
            if file_id not in names_corresp:
                continue
            extension = file_id.split(".")[-1]
            if extension not in ("gz", "counts"):
                continue
            case_id = names_corresp[file_id] # Get case ID
            file_path = os.path.join(patient_folder_path, os.fsdecode(file_id))
            # Let pandas uncompress (if necessary) so that no file handle is left open
            aux_data = pd.read_csv(
                file_path,
                sep='\t',
                index_col=0,
                header=None,
                names=[case_id],
                compression='gzip' if extension == "gz" else None,
            )
            sample_frames.append(aux_data)
            if len(sample_frames) % 1000 == 0:
                print('{} files processed'.format(len(sample_frames)))
    if not sample_frames:
        raise FileNotFoundError(
            'No expression file listed in the mappings was found in {}'.format(data_dir)
        )
    # NOTE(review): the mappings contain more file names than case IDs, so
    # several columns may share the same case ID — confirm this is intended.
    main_data = pd.concat(sample_frames, axis=1)
    # Write output file (making sure that its directory exists)
    os.makedirs(output_dir, exist_ok=True)
    main_data.to_csv(output_file, index=True, sep=',')
else:
    main_data = pd.read_csv(output_file, sep=',', index_col=0)

md("The final dataframe contains **{} rows (genes)** and **{} columns (samples)**".format(len(main_data), len(main_data.columns)))

The final dataframe contains **60488 rows (genes)** and **57 columns (samples)**

In [10]:
# Preview the compiled expression dataframe (one column per case ID).
main_data

Unnamed: 0,305eaef4-4644-46e3-a696-d2e4a972f691,7b47489f-c3cc-4388-b5d7-c7c02790a5f6,faf4c71c-c4f3-4513-8c44-9114a5d3729a,04d7a52f-a89b-4114-a608-a6254b4d604f,4d71dd15-cd01-4dae-ad70-6dc325140207,913f21a9-f727-4983-ab3a-61b5b3fc7f5e,53c94a86-1b4d-4833-a169-d990c460b301,c0d47e0f-4918-4e06-9db6-201671b1259c,e9f0f030-d9aa-47c8-9f4d-a9ce58888c41,3a3a8fe1-e35c-45d0-aa0b-4fefa9ee9183,...,5cbf0aea-ebb4-4005-bdd5-14ef2dc6826c,285d8e2c-f183-49d5-bc1c-fd47c6670acc,128fd280-9a04-4038-9269-9a945b9a4dc9,3ef779cf-772e-445e-93c8-e1094ea8b878,4cf1e489-dd24-4227-b306-4bc5266a224c,787c3e2f-a708-4985-a5b1-b2383c7bfc52,4244912d-8527-410c-a084-a56c55eb1f82,5d66191d-da49-4d7e-89a4-a382db338340,dc7ec60b-5ea5-48a4-8908-55caf9214272,70f34c5c-3671-44c8-9469-99f9786efec1
ENSG00000000003.13,11489,4855,5963,3958,9832,2425,1622,2660,1813,1049,...,5490,3011,1969,2447,0,618,3053,5040,2102,3884
ENSG00000000005.5,35,15,0,14,161,1,0,4,0,1,...,9,1,1,0,0,4,0,40,4,109
ENSG00000000419.11,2355,1011,1665,974,2594,1197,1173,2341,609,1036,...,2163,1937,1764,2000,436,390,1392,1461,961,3337
ENSG00000000457.12,1078,207,994,415,869,655,702,1067,292,595,...,1183,1174,551,996,380,393,885,796,481,4953
ENSG00000000460.15,307,155,848,205,1153,307,186,710,279,175,...,1592,1296,622,1031,431,172,636,757,74,2284
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
__no_feature,4974025,848397,2460195,1430952,8227844,3338475,2735180,4809545,1360149,1909771,...,3326229,3868348,2581708,10362704,25321982,3587016,3174372,2442435,2765636,6446108
__ambiguous,4077621,772567,1540857,1559762,3129187,2924738,2176210,2597394,2720306,2124112,...,2683646,2745813,1868480,2083487,977124,1739229,2375390,1980619,2156458,3580088
__too_low_aQual,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
__not_aligned,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Process metadata

There are two folders of metadata in GDC:
- Biospecimen: Information about the samples, and how they were extracted.
    - `aliquot.tsv`
    - `analyte.tsv`
    - `portion.tsv`
    - `sample.tsv`
    - `slide.tsv`
- Clinical
    - `clinical.tsv`: Age, if there is treatment, and which type of treatment.
    - `exposure.tsv`: Exposure to substances such as alcohol or tobacco.
    - `family_history.tsv`: Empty.


Let's analyze the files in the **Biospecimen** folder:

In [11]:
# Paths of the five biospecimen tables
biospecimen_dir = os.path.join(input_dir, 'biospecimen')
aliquot_file, analyte_file, portion_file, sample_file, slide_file = (
    os.path.join(biospecimen_dir, table_name)
    for table_name in ('aliquot.tsv', 'analyte.tsv', 'portion.tsv', 'sample.tsv', 'slide.tsv')
)


In [12]:
# Load each tab-separated biospecimen table into its own dataframe
aliquot_df, analyte_df, portion_df, sample_df, slide_df = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (aliquot_file, analyte_file, portion_file, sample_file, slide_file)
)


Let's analyze the files in the **Clinical** folder:

In [13]:
# Paths of the three clinical tables
clinical_dir = os.path.join(input_dir, 'clinical')
clinical_file, exposure_file, family_history_file = (
    os.path.join(clinical_dir, table_name)
    for table_name in ('clinical.tsv', 'exposure.tsv', 'family_history.tsv')
)


In [14]:
# Load each tab-separated clinical table into its own dataframe
clinical_df, exposure_df, family_history_df = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (clinical_file, exposure_file, family_history_file)
)


In [15]:
# First rows of the clinical table.
clinical_df.head()

Unnamed: 0,case_id,case_submitter_id,project_id,age_at_index,age_is_obfuscated,cause_of_death,cause_of_death_source,country_of_residence_at_enrollment,days_to_birth,days_to_death,...,treatment_arm,treatment_dose,treatment_dose_units,treatment_effect,treatment_effect_indicator,treatment_frequency,treatment_intent_type,treatment_or_therapy,treatment_outcome,treatment_type
0,febe2ce5-737b-43b8-bc70-4194fe3ed5fb,TCGA-FG-A4MY,TCGA-LGG,44,'--,'--,'--,'--,-16310,'--,...,'--,'--,'--,'--,'--,'--,'--,no,'--,"Pharmaceutical Therapy, NOS"
1,febe2ce5-737b-43b8-bc70-4194fe3ed5fb,TCGA-FG-A4MY,TCGA-LGG,44,'--,'--,'--,'--,-16310,'--,...,'--,'--,'--,'--,'--,'--,'--,yes,'--,"Radiation Therapy, NOS"
2,b2bcc97f-03a6-409a-8b91-78221cb10d7f,TCGA-YL-A9WK,TCGA-PRAD,63,'--,'--,'--,'--,-23272,'--,...,'--,'--,'--,'--,'--,'--,'--,yes,'--,"Pharmaceutical Therapy, NOS"
3,b2bcc97f-03a6-409a-8b91-78221cb10d7f,TCGA-YL-A9WK,TCGA-PRAD,63,'--,'--,'--,'--,-23272,'--,...,'--,'--,'--,'--,'--,'--,'--,no,'--,"Radiation Therapy, NOS"
4,03c3ae62-d0aa-412e-bd3c-4577fc9f919c,TCGA-BB-A6UM,TCGA-HNSC,52,'--,'--,'--,'--,-19194,'--,...,'--,'--,'--,'--,'--,'--,'--,yes,'--,"Radiation Therapy, NOS"


In [16]:
# (rows, columns) of the clinical table.
clinical_df.shape

(20396, 154)

In [17]:
# First rows of the exposure table.
exposure_df.head()

Unnamed: 0,case_id,case_submitter_id,project_id,age_at_onset,alcohol_days_per_week,alcohol_drinks_per_day,alcohol_history,alcohol_intensity,alcohol_type,asbestos_exposure,...,smoking_frequency,time_between_waking_and_first_smoke,tobacco_smoking_onset_year,tobacco_smoking_quit_year,tobacco_smoking_status,tobacco_use_per_day,type_of_smoke_exposure,type_of_tobacco_used,weight,years_smoked
0,febe2ce5-737b-43b8-bc70-4194fe3ed5fb,TCGA-FG-A4MY,TCGA-LGG,'--,'--,'--,Not Reported,'--,'--,'--,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,'--
1,b2bcc97f-03a6-409a-8b91-78221cb10d7f,TCGA-YL-A9WK,TCGA-PRAD,'--,'--,'--,Not Reported,'--,'--,'--,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,'--
2,03c3ae62-d0aa-412e-bd3c-4577fc9f919c,TCGA-BB-A6UM,TCGA-HNSC,'--,'--,'--,No,'--,'--,'--,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,35.0
3,588a48f7-947c-463e-8991-9a4ee82683aa,TCGA-D3-A8GS,TCGA-SKCM,'--,'--,'--,Not Reported,'--,'--,'--,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,'--
4,fd3315da-c870-4ad0-9d2a-50b1647d3e46,TCGA-J8-A4HW,TCGA-THCA,'--,'--,'--,Not Reported,'--,'--,'--,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,'--


In [18]:
# (rows, columns) of the exposure table.
exposure_df.shape

(10198, 33)

## Combine data and metadata

Mapping the data to the metadata can be done using the JSON file with the mappings:

In [19]:
# Names of all the fields available in the clinical table
clinical_df.columns.tolist()

['case_id',
 'case_submitter_id',
 'project_id',
 'age_at_index',
 'age_is_obfuscated',
 'cause_of_death',
 'cause_of_death_source',
 'country_of_residence_at_enrollment',
 'days_to_birth',
 'days_to_death',
 'ethnicity',
 'gender',
 'occupation_duration_years',
 'premature_at_birth',
 'race',
 'vital_status',
 'weeks_gestation_at_birth',
 'year_of_birth',
 'year_of_death',
 'age_at_diagnosis',
 'ajcc_clinical_m',
 'ajcc_clinical_n',
 'ajcc_clinical_stage',
 'ajcc_clinical_t',
 'ajcc_pathologic_m',
 'ajcc_pathologic_n',
 'ajcc_pathologic_stage',
 'ajcc_pathologic_t',
 'ajcc_staging_system_edition',
 'anaplasia_present',
 'anaplasia_present_type',
 'ann_arbor_b_symptoms',
 'ann_arbor_clinical_stage',
 'ann_arbor_extranodal_involvement',
 'ann_arbor_pathologic_stage',
 'best_overall_response',
 'breslow_thickness',
 'burkitt_lymphoma_clinical_variant',
 'child_pugh_classification',
 'circumferential_resection_margin',
 'classification_of_tumor',
 'cog_liver_stage',
 'cog_neuroblastoma_ri

In [23]:
# Keep only the clinical rows whose case ID is a column of the expression dataframe
has_expression = clinical_df["case_id"].isin(main_data.columns)
metadata = clinical_df.loc[has_expression]
metadata.shape

(112, 154)

In [24]:
# Restrict the metadata to the identifier and treatment-related fields
metadata_fields = [
    "case_id",
    "case_submitter_id",
    "project_id",
    "cause_of_death",
    "treatment_or_therapy",
    "treatment_outcome",
    "treatment_type",
]
metadata = metadata.loc[:, metadata_fields]

In [26]:
# Show the filtered metadata table.
metadata

Unnamed: 0,case_id,case_submitter_id,project_id,cause_of_death,treatment_or_therapy,treatment_outcome,treatment_type
186,2fabeb98-05e3-4f55-97f5-fbc675e25a3d,TCGA-62-8398,TCGA-LUAD,'--,no,'--,"Radiation Therapy, NOS"
187,2fabeb98-05e3-4f55-97f5-fbc675e25a3d,TCGA-62-8398,TCGA-LUAD,'--,yes,'--,"Pharmaceutical Therapy, NOS"
544,5d66191d-da49-4d7e-89a4-a382db338340,TCGA-D5-6931,TCGA-COAD,'--,no,'--,"Pharmaceutical Therapy, NOS"
545,5d66191d-da49-4d7e-89a4-a382db338340,TCGA-D5-6931,TCGA-COAD,'--,no,'--,"Radiation Therapy, NOS"
666,c92696ba-f549-480d-8d4a-5f23b27aa336,TCGA-C5-A907,TCGA-CESC,'--,yes,'--,"Pharmaceutical Therapy, NOS"
...,...,...,...,...,...,...,...
19465,128fd280-9a04-4038-9269-9a945b9a4dc9,TCGA-FU-A770,TCGA-CESC,'--,not reported,'--,"Radiation Therapy, NOS"
19856,305eaef4-4644-46e3-a696-d2e4a972f691,TCGA-CZ-4865,TCGA-KIRC,'--,no,'--,"Radiation Therapy, NOS"
19857,305eaef4-4644-46e3-a696-d2e4a972f691,TCGA-CZ-4865,TCGA-KIRC,'--,no,'--,"Pharmaceutical Therapy, NOS"
19914,ef974c39-8bac-486b-a395-9d1283e08028,TCGA-ZG-A9LB,TCGA-PRAD,'--,no,'--,"Radiation Therapy, NOS"


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Stand-alone script that compiles GDC expression files per location (the
# suffix of the project ID) and saves, for each location, the counts, the
# diagnosis, the stage and the histology as pickles, plus survival arrays.
#
# NOTE(review): this cell re-assigns names already used earlier in the
# notebook (`names_corresp`, `metadata`, `folder_list`, `case_id`, `i`);
# running it changes what those names refer to.

import pandas as pd
import numpy as np
import sys
import os
import subprocess
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri, numpy2ri
from rpy2.robjects.packages import importr
import gzip
import json

ro.pandas2ri.activate()

# The biomaRt connection is only needed by the disabled annotation step below.
ro.r('library(biomaRt)')
ro.r('human = useMart("ensembl", dataset = "hsapiens_gene_ensembl",  host="www.ensembl.org")')

# __file__ only exists when the code runs as a script; inside a notebook
# kernel fall back to the current working directory.
try:
    file_folder = os.path.abspath(os.path.dirname(__file__))
except NameError:
    file_folder = os.getcwd()
gdc_folder = os.path.dirname(file_folder) +"/gdc/"
folder = file_folder + '/results/'
# Make sure that the directory receiving the pickles and arrays exists
os.makedirs(folder, exist_ok=True)
with open(gdc_folder + "metadata.cart.2018-10-15.json") as f:
    metadata = json.load(f)

# File name -> case ID of the first associated entity of the file
names_corresp = {}
for file in metadata:
    entity = file["associated_entities"][0]
    names_corresp[file["file_name"]] = entity["case_id"]
clinical = pd.read_csv(gdc_folder +"clinical.tsv", delimiter='\t')
case_id = clinical.loc[:,"case_id"]
clinical.index = case_id

# Clinical fields, all indexed by case ID
diagnosis = clinical.loc[:,"primary_diagnosis"]
stage = clinical.loc[:,"tumor_stage"]
grade = clinical.loc[:,"tumor_grade"]
last_followup = clinical.loc[:,"days_to_last_follow_up"]
# Encode the vital status as 1 (dead) / 0 (anything else)
clinical.loc[:,"vital_status"] = [1  if stat == "dead" else 0 for stat in clinical.loc[:,"vital_status"]]
status = clinical.loc[:,"vital_status"]
days_to_death = clinical.loc[:,"days_to_death"]
age = clinical.loc[:,"age_at_diagnosis"]
recurr = clinical.loc[:,"days_to_recurrence"]
# Copies are taken before re-indexing so that the series extracted above
# cannot be affected (presumably pandas may hand back a shared column
# object — TODO confirm for the pandas version in use).
names_common = clinical.loc[:,"primary_diagnosis"].copy()
names_common.index = clinical.loc[:,"submitter_id"]
submitters = clinical.loc[:,"submitter_id"].copy()
locations = clinical.loc[:,"project_id"]
locations = [loc.split("-")[-1] for loc in locations]
# Case ID -> location, pairing both sequences by position
loc_corresp = dict(zip(case_id, locations))
submitters.index = clinical.loc[:,"case_id"]
i = 0
old_i=0
folder_list =os.listdir(gdc_folder)
for location in set(locations):
    print(location)
    folder_num =[]
    data_list = []
    filenames_ordered = []
    for patient_folder in folder_list:
        patient_folder_path = gdc_folder + os.fsdecode(patient_folder)
        if not os.path.isdir(patient_folder_path):
            continue
        for patient in os.listdir(patient_folder_path):
            extension = patient.split(".")[-1]
            if extension not in ("gz", "counts"):
                continue
            # Files missing from the mapping (or whose case is missing from
            # the clinical table) are skipped instead of raising a KeyError.
            patient_case = names_corresp.get(patient)
            if loc_corresp.get(patient_case) != location:
                continue
            filename = patient_folder_path + '/' +os.fsdecode(patient)
            if extension == "gz":
                # The context manager closes the gzip handle after reading
                with gzip.open(filename) as compressed:
                    aux_data = pd.read_csv(compressed,sep='\t',index_col=0,header=None, names=[patient])
            else:
                aux_data = pd.read_csv(filename,sep='\t',index_col=0,header=None, names=[patient])
            filenames_ordered += [patient_case]
            data_list.append(aux_data)
            i += 1
            folder_num += [patient_folder]
    # Folders already assigned to a location are not visited again
    folder_list = [fold for fold in folder_list if fold not in folder_num]
    print(location)
    print(len(folder_num))
    print(len(folder_list))
    print(i-old_i)
    if i == old_i:
        # No expression file for this location: there is nothing to save
        continue
    if(i - old_i >1):
        data_list = pd.concat(data_list,axis=1)
    else:
        data_list = data_list[0]
    old_i = i
    # Disabled gene annotation step, kept as an inert string literal
    """aux = [name for name in data_list.index if "ENSG" in name] 
    data_list = data_list.loc[aux,:]
    aux = np.array([name for name in data_list.index if "ENSG0" in name])
    aux2 = ["ENSG0" + name.split('R')[1] for name in data_list.index if "ENSGR" in name]
    aux2 = np.array([name.split(".")[0] for name in aux2])
    ro.globalenv["ind"] =aux
    ro.r("ind=as.vector(ind)")
    
    ro.r('ann <- getBM(attributes=c("hgnc_symbol","description","chromosome_name","band","strand","start_position","end_position","ensembl_gene_id_version"),  filters="ensembl_gene_id_version", values=ind, mart=human,uniqueRows=False)') 
    aux = [str(ind) for ind in ro.r('ann$hgnc_symbol')]
    print(ro.r('ann'))
    ro.globalenv["ind"] =aux2
    ro.r("ind=as.vector(ind)")
    ro.r('ann <- getBM(attributes=c("hgnc_symbol","description","chromosome_name","band","strand","start_position","end_position","ensembl_gene_id_version"),  filters="ensembl_gene_id_version", values=ind, mart=human,uniqueRows=False)') 
    aux2 = [str(ind) +'_2' for ind in ro.r('ann$hgnc_symbol')]
    print(ro.r('ann'))
    print(len(aux+aux2))
    data_list.index = aux + aux2
    print(data_list.index)
    data_list = data_list.T"""
    # Survival time: days to death for dead patients, otherwise days to the
    # last follow-up. Samples with neither value are left out of `survival`
    # and their positions are recorded in `not_found`.
    # NOTE(review): the clinical table displayed earlier in this notebook
    # marks missing values as '-- (with a leading apostrophe), which would
    # not match the "--" tested here — confirm the marker used in this file.
    survival = []
    for j in filenames_ordered:
        if (status[j]==1 and days_to_death[j]!="--" ):
            survival += [int(days_to_death[j])]
        elif last_followup[j]!="--":
            survival += [int(last_followup[j])]
    binary = [status[k] for k in filenames_ordered]
    not_found = [k  for k in range(len(filenames_ordered)) if days_to_death[filenames_ordered[k]]=="--" and last_followup[filenames_ordered[k]]=="--"]
    data_list.to_pickle(folder + "genes_counts_" + location +"_gdc")
    diagnosis[filenames_ordered].to_pickle(folder + "diagnosis_" + location +"_gdc")
    stage[filenames_ordered].to_pickle(folder + "stage_" + location +"_gdc")
    names_common[submitters[filenames_ordered]].to_pickle(folder + "histo_" + location +"_gdc")

    # NOTE(review): these three files have the same name for every location,
    # so each iteration overwrites the previous one — confirm this is intended.
    np.save(file_folder+"/results/gdc_survival",survival)
    np.save(file_folder+"/results/gdc_survival_binary",binary)

    np.save(file_folder+"/results/gdc_survival_not_found_samples",not_found)
print(i)