# Compile TCGA

We first define the needed modules, functions and directories.

In [1]:
import gzip
import json
import shutil
import subprocess
import sys, os

import numpy as np
import pandas as pd
from IPython.display import Markdown as md
#import rpy2.robjects as ro
#from rpy2.robjects import pandas2ri, numpy2ri
#from rpy2.robjects.packages import importr


In [2]:
def fileExist(file):
    """
    Tells whether `file` points to an existing regular file.

    Returns False for missing paths and for paths that exist but are not
    files (e.g. directories).
    """
    # isfile() is already False for paths that do not exist
    is_regular_file = os.path.isfile(file)
    return is_regular_file

def create_directory(directory):
    """
    Creates `directory` (and any missing parent directory) if it does not exist.

    Calling it on an existing directory is a no-op.

    Raises:
        FileExistsError: if `directory` exists but is not a directory.
        OSError: if the directory cannot be created (e.g. missing permissions).
    """
    # exist_ok avoids the check-then-create race of stat() followed by mkdir(),
    # and real failures are no longer hidden behind a bare `except`.
    os.makedirs(directory, exist_ok=True)
    return


In [3]:
# Define main directories
# Exactly one `gdc_dir` assignment is active; the commented alternatives point to the
# 2022-03-28 / 2022-07-27 downloads and to their `dummy` or `TCGA` folders.
#gdc_dir = '/work/ccnr/j.aguirreplans/Databases/TCGA/2022-03-28-Dataset/dummy'
#gdc_dir = '/work/ccnr/j.aguirreplans/Databases/TCGA/2022-03-28-Dataset/TCGA'
gdc_dir = '/work/ccnr/j.aguirreplans/Databases/TCGA/2022-07-27-Dataset/dummy'
#gdc_dir = '/work/ccnr/j.aguirreplans/Databases/TCGA/2022-07-27-Dataset/TCGA'
# <gdc_dir>/raw holds the inputs read by this notebook (data, metadata, biospecimen, clinical)
input_dir = os.path.join(gdc_dir, 'raw')
data_dir = os.path.join(input_dir, 'data')
# <gdc_dir>/out receives the compiled expression table
output_dir = os.path.join(gdc_dir, 'out')
# Snapshot of the entries of data_dir taken when this cell runs; it is not refreshed
# automatically if the folder content changes afterwards.
folder_list = os.listdir(data_dir)

In [4]:
# Create a dummy directory to run tests: when the dummy folder is empty, copy up to
# `n_max` entries of the full TCGA download into it.
dummy_data_dir = '/work/ccnr/j.aguirreplans/Databases/TCGA/2022-07-27-Dataset/dummy/raw/data'
tcga_data_dir = '/work/ccnr/j.aguirreplans/Databases/TCGA/2022-07-27-Dataset/TCGA/raw/data'
dummy_files = os.listdir(dummy_data_dir)
n_max = 400  # maximum number of entries copied into the dummy folder
n = 0        # number of entries copied by this run
if len(dummy_files) == 0:
    # os.listdir returns entries in arbitrary order; sorting makes the selected subset reproducible
    for file_id in sorted(os.listdir(tcga_data_dir))[:n_max]:
        source_path = os.path.join(tcga_data_dir, file_id)
        target_path = os.path.join(dummy_data_dir, file_id)
        # shutil instead of a shell `cp -r` string: no quoting problems with unusual
        # paths, and a failed copy raises instead of being silently ignored.
        if os.path.isdir(source_path):
            shutil.copytree(source_path, target_path)
        else:
            shutil.copy(source_path, target_path)
        n += 1
    # folder_list was computed before the copy; refresh it when it describes the dummy
    # folder, otherwise the parsing step would iterate over an empty list.
    if os.path.abspath(data_dir) == os.path.abspath(dummy_data_dir):
        folder_list = os.listdir(data_dir)


## Parse mappings of files and cases

In GDC data, the RNA-seq files are identified by **file IDs** (e.g., a82ab02e-4894-43f3-adf3-8a9faebee4e3.htseq.counts.gz), whereas the patient metadata is indexed by **case IDs** (e.g., a614255d-9f1b-4e9a-8a50-435d4d123b3f). The mapping between file IDs and case IDs is found in the file `metadata.txt`.

In [5]:
# For data from 2022-03-28
#mappings_file = os.path.join(input_dir, 'additional/files.2022-03-28.json')
#with open(mappings_file) as f:
#    mappings = json.load(f)

#names_corresp = {}
#for file in mappings:
#    file_id = file["file_name"]
#    entity = file["cases"][0] # All cases are composed of only 1 case id
#    case_id = entity["case_id"]
#    names_corresp[file_id] = case_id


In [6]:
# For data from 2022-07-27
metadata_file = os.path.join(input_dir, 'metadata/metadata.txt')
metadata_df = pd.read_csv(metadata_file, sep=',', index_col=False, header=0)
# Dictionary keyed by the `file_name` column, with the `file_id` column as value
names_corresp = {
    file_name: file_id
    for file_name, file_id in zip(metadata_df["file_name"], metadata_df["file_id"])
}

In [7]:
# Sanity checks on the mapping: number of distinct keys, number of distinct values,
# and how many entries of folder_list appear among the values.
mapped_ids = set(names_corresp.values())
print(len(names_corresp))  # dictionary keys are unique by construction
print(len(mapped_ids))
print(sum(1 for file_id in folder_list if file_id in mapped_ids))

11123
11123
400


## Parse RNA-seq data

We parse the folders inside the input directory, which store the files containing the expression.

We then read the expression file inside each folder and combine all of them into a single dataframe (genes as rows, one column per file).

In [74]:
def _read_expression_file(file_path, file_id, type_gene_expression):
    """
    Reads one expression file into a one-column dataframe named after `file_id`.

    Recognised formats (decided from the dot-separated parts of the file name):
      - `*.htseq.counts.gz`: gzip-compressed, tab-separated, no header
      - `*.htseq.counts`: tab-separated, no header
      - `*.augmented_star_gene_counts.<ext>`: tab-separated table from which the
        column `type_gene_expression` is kept and the `N_*` summary rows are dropped

    Returns None when the name does not look like an expression file. If the name
    looks like one but matches no known format, prints a message and exits with
    code 10.
    """
    parts = file_id.split(".")
    is_star = len(parts) > 1 and parts[-2] == "augmented_star_gene_counts"
    if not (len(parts) > 1 and (parts[-1] == "gz" or parts[-1] == "counts" or is_star)):
        return None
    if parts[-1] == "gz" and len(parts) > 2 and parts[-3] == "htseq":
        # Context manager so the gzip handle is closed once the file is read
        with gzip.open(file_path) as f:
            return pd.read_csv(f, sep='\t', index_col=0, header=None, names=[file_id])
    if parts[-1] == "counts" and parts[-2] == "htseq":
        return pd.read_csv(file_path, sep='\t', index_col=0, header=None, names=[file_id])
    if is_star:
        # skiprows=1 drops the first line of the file so that the second one is the header
        star_data = pd.read_csv(file_path, sep='\t', index_col=0, skiprows=1, header=0)
        star_data = star_data[~star_data.index.isin(["N_unmapped", "N_multimapping", "N_noFeature", "N_ambiguous"])]
        return star_data[[type_gene_expression]].rename({type_gene_expression: file_id}, axis='columns')
    print("Unknown file format: {}".format(file_id))
    sys.exit(10)


output_file = os.path.join(output_dir, 'dummy.csv')
#output_file = os.path.join(output_dir, 'TCGA.csv')
type_gene_expression = "tpm_unstranded"
i=1
if not fileExist(output_file):
    # Parse patients data: one single-column dataframe per expression file
    sample_frames = []
    for patient_folder in folder_list:
        patient_folder_path = os.path.join(data_dir, os.fsdecode(patient_folder))
        if not os.path.isdir(patient_folder_path):
            continue
        for file_id in os.listdir(patient_folder_path):
            file_path = os.path.join(patient_folder_path, os.fsdecode(file_id))
            aux_data = _read_expression_file(file_path, file_id, type_gene_expression)
            if aux_data is None:
                continue
            sample_frames.append(aux_data)
            if i%1000 == 0:
                print('{} files processed'.format(i))
            i+=1
    if len(sample_frames) == 0:
        # Without this check the failure would be an AttributeError on None further down
        raise ValueError("No expression files found in {}".format(data_dir))
    # Join all samples once, keeping the genes shared by every file (inner join on the
    # index). Merging inside the loop re-copied the growing table for every file.
    main_data = pd.concat(sample_frames, axis=1, join='inner')
    # Write output file
    create_directory(output_dir)
    main_data.to_csv(output_file, index=True, sep=',')
else:
    # The compiled table already exists: reuse it instead of parsing again
    main_data = pd.read_csv(output_file, sep=',', index_col=0)

md("The final dataframe contains **{} rows (genes)** and **{} columns (samples)**".format(len(main_data), len(main_data.columns)))

The final dataframe contains **60660 rows (genes)** and **400 columns (samples)**

In [76]:
# Display the compiled expression table (genes as rows, one column per expression file)
main_data

Unnamed: 0_level_0,88215dd0-5841-44f1-9393-eefd8238cbb3.rna_seq.augmented_star_gene_counts.tsv,e74f321c-217f-4bdc-ad17-f132501b5157.rna_seq.augmented_star_gene_counts.tsv,09755ce8-ed89-411a-a42f-b3edc4e41eeb.rna_seq.augmented_star_gene_counts.tsv,1e100c4d-13eb-4e5a-9117-ead385902710.rna_seq.augmented_star_gene_counts.tsv,84e40c7e-2f60-4461-b14c-0859697295a4.rna_seq.augmented_star_gene_counts.tsv,ba295155-272e-43eb-9d6a-e4c9c392e68b.rna_seq.augmented_star_gene_counts.tsv,8d1641ea-7552-4d23-9298-094e0056386a.rna_seq.augmented_star_gene_counts.tsv,523c76c5-ac4d-4a69-88b8-0857af45de7b.rna_seq.augmented_star_gene_counts.tsv,5abd235a-829b-4457-8f47-ecd1adab30ca.rna_seq.augmented_star_gene_counts.tsv,49a0c23d-e0e7-4b67-b9f0-527c14105345.rna_seq.augmented_star_gene_counts.tsv,...,e2ae0ec8-6198-4a6f-8a06-197a8ec81384.rna_seq.augmented_star_gene_counts.tsv,087c2c93-d071-41e1-a2e8-a1961ba54a86.rna_seq.augmented_star_gene_counts.tsv,12e291b4-8090-40f0-975c-4ffebb4f695a.rna_seq.augmented_star_gene_counts.tsv,3a770603-105c-45ef-9b26-49918dd7bc8e.rna_seq.augmented_star_gene_counts.tsv,d3a67b19-5e2e-48ec-a887-a2d605861717.rna_seq.augmented_star_gene_counts.tsv,e121b871-b177-47ef-a4f1-de73cb338519.rna_seq.augmented_star_gene_counts.tsv,868bc289-3c43-4a0f-bbb1-703f0e9fa33a.rna_seq.augmented_star_gene_counts.tsv,26fdaa75-cea5-4355-b3b1-6b5c2c75e542.rna_seq.augmented_star_gene_counts.tsv,c8d14d90-63a8-4576-ae80-ef8582792d16.rna_seq.augmented_star_gene_counts.tsv,ec79ad9f-dac4-49c6-8794-5168fb8fabf8.rna_seq.augmented_star_gene_counts.tsv
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003.15,54.5637,27.3874,50.3093,27.8715,37.8239,56.2216,28.5350,50.5159,138.9149,59.9764,...,3.3343,38.9980,47.3929,28.3164,25.3789,36.9583,24.4795,56.5504,41.6066,0.2762
ENSG00000000005.6,0.0966,0.0482,1.8636,0.0333,0.3392,0.2768,5.1690,0.0318,1.2867,0.0520,...,0.0000,0.0000,0.0914,0.0453,0.8806,0.0551,0.0319,3.9432,5.5485,0.0000
ENSG00000000419.13,93.9366,113.4229,85.7241,49.5446,51.3701,126.9161,101.9253,93.4030,77.2900,138.3704,...,66.3315,125.9330,168.6618,43.8181,71.2627,81.3964,70.7812,72.9181,46.3479,33.9519
ENSG00000000457.14,13.0767,12.6289,9.2839,1.8261,4.6496,25.4778,11.2845,8.1861,3.8629,6.4018,...,4.2738,7.5792,8.5848,2.6325,4.0725,8.4650,5.6966,28.2136,3.7234,9.2297
ENSG00000000460.17,2.4590,12.1395,3.2253,0.4112,1.7095,15.4251,3.6297,5.0929,1.7255,2.3896,...,0.9529,5.1320,6.0787,0.4368,1.3253,3.4165,2.5940,16.5258,0.5460,9.1274
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000288669.1,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0118,0.0000,0.0000,0.0449
ENSG00000288670.1,6.0340,10.3222,23.9462,4.6733,15.1532,24.1890,6.3918,8.1920,13.6630,5.1359,...,12.1709,12.0578,8.2870,9.1025,37.7176,10.4781,5.7829,30.7163,18.7374,13.0228
ENSG00000288671.1,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
ENSG00000288674.1,0.0519,0.0888,0.0550,0.0102,0.0360,0.0910,0.0000,0.0293,0.0120,0.0239,...,0.0433,0.0484,0.0421,0.0139,0.0529,0.0169,0.0440,0.1180,0.0212,0.2232


## Process metadata

There are two folders of metadata in GDC:
- Biospecimen: Information about the samples, and how they were extracted.
    - `aliquot.tsv`
    - `analyte.tsv`
    - `portion.tsv`
    - `sample.tsv`
    - `slide.tsv`
- Clinical
    - `clinical.tsv`: Age, if there is treatment, and which type of treatment.
    - `exposure.tsv`: Exposure to substances such as alcohol or tobacco.
    - `family_history.tsv`: Empty.


Let's analyze the files in the **Biospecimen** folder:

In [11]:
# Paths of the five biospecimen tables
biospecimen_dir = os.path.join(input_dir, 'biospecimen')
aliquot_file, analyte_file, portion_file, sample_file, slide_file = (
    os.path.join(biospecimen_dir, table_name)
    for table_name in ('aliquot.tsv', 'analyte.tsv', 'portion.tsv', 'sample.tsv', 'slide.tsv')
)


In [12]:
# Load every biospecimen table (tab-separated) into its own dataframe
aliquot_df, analyte_df, portion_df, sample_df, slide_df = [
    pd.read_csv(table_path, sep='\t')
    for table_path in (aliquot_file, analyte_file, portion_file, sample_file, slide_file)
]


Let's analyze the files in the **Clinical** folder:

In [13]:
# Paths of the three clinical tables
clinical_dir = os.path.join(input_dir, 'clinical')
clinical_file, exposure_file, family_history_file = (
    os.path.join(clinical_dir, table_name)
    for table_name in ('clinical.tsv', 'exposure.tsv', 'family_history.tsv')
)


In [14]:
# Load every clinical table (tab-separated) into its own dataframe
clinical_df, exposure_df, family_history_df = [
    pd.read_csv(table_path, sep='\t')
    for table_path in (clinical_file, exposure_file, family_history_file)
]


In [15]:
# Preview the first rows of the clinical table
clinical_df.head()

Unnamed: 0,case_id,case_submitter_id,project_id,age_at_index,age_is_obfuscated,cause_of_death,cause_of_death_source,country_of_residence_at_enrollment,days_to_birth,days_to_death,...,treatment_arm,treatment_dose,treatment_dose_units,treatment_effect,treatment_effect_indicator,treatment_frequency,treatment_intent_type,treatment_or_therapy,treatment_outcome,treatment_type
0,febe2ce5-737b-43b8-bc70-4194fe3ed5fb,TCGA-FG-A4MY,TCGA-LGG,44,'--,'--,'--,'--,-16310,'--,...,'--,'--,'--,'--,'--,'--,'--,no,'--,"Pharmaceutical Therapy, NOS"
1,febe2ce5-737b-43b8-bc70-4194fe3ed5fb,TCGA-FG-A4MY,TCGA-LGG,44,'--,'--,'--,'--,-16310,'--,...,'--,'--,'--,'--,'--,'--,'--,yes,'--,"Radiation Therapy, NOS"
2,b2bcc97f-03a6-409a-8b91-78221cb10d7f,TCGA-YL-A9WK,TCGA-PRAD,63,'--,'--,'--,'--,-23272,'--,...,'--,'--,'--,'--,'--,'--,'--,yes,'--,"Pharmaceutical Therapy, NOS"
3,b2bcc97f-03a6-409a-8b91-78221cb10d7f,TCGA-YL-A9WK,TCGA-PRAD,63,'--,'--,'--,'--,-23272,'--,...,'--,'--,'--,'--,'--,'--,'--,no,'--,"Radiation Therapy, NOS"
4,03c3ae62-d0aa-412e-bd3c-4577fc9f919c,TCGA-BB-A6UM,TCGA-HNSC,52,'--,'--,'--,'--,-19194,'--,...,'--,'--,'--,'--,'--,'--,'--,yes,'--,"Radiation Therapy, NOS"


In [16]:
# (number of rows, number of columns) of the clinical table
clinical_df.shape

(20396, 154)

In [17]:
# Preview the first rows of the exposure table
exposure_df.head()

Unnamed: 0,case_id,case_submitter_id,project_id,age_at_onset,alcohol_days_per_week,alcohol_drinks_per_day,alcohol_history,alcohol_intensity,alcohol_type,asbestos_exposure,...,smoking_frequency,time_between_waking_and_first_smoke,tobacco_smoking_onset_year,tobacco_smoking_quit_year,tobacco_smoking_status,tobacco_use_per_day,type_of_smoke_exposure,type_of_tobacco_used,weight,years_smoked
0,febe2ce5-737b-43b8-bc70-4194fe3ed5fb,TCGA-FG-A4MY,TCGA-LGG,'--,'--,'--,Not Reported,'--,'--,'--,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,'--
1,b2bcc97f-03a6-409a-8b91-78221cb10d7f,TCGA-YL-A9WK,TCGA-PRAD,'--,'--,'--,Not Reported,'--,'--,'--,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,'--
2,03c3ae62-d0aa-412e-bd3c-4577fc9f919c,TCGA-BB-A6UM,TCGA-HNSC,'--,'--,'--,No,'--,'--,'--,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,35.0
3,588a48f7-947c-463e-8991-9a4ee82683aa,TCGA-D3-A8GS,TCGA-SKCM,'--,'--,'--,Not Reported,'--,'--,'--,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,'--
4,fd3315da-c870-4ad0-9d2a-50b1647d3e46,TCGA-J8-A4HW,TCGA-THCA,'--,'--,'--,Not Reported,'--,'--,'--,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,'--


In [18]:
# (number of rows, number of columns) of the exposure table
exposure_df.shape

(10198, 33)

## Combine data and metadata

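The data and the metadata can be mapped to each other using the file with the mappings between files and cases (the JSON file for the 2022-03-28 dataset, `metadata.txt` for the 2022-07-27 dataset):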

In [19]:
# List the names of all the columns of the clinical table
list(clinical_df.columns)

['case_id',
 'case_submitter_id',
 'project_id',
 'age_at_index',
 'age_is_obfuscated',
 'cause_of_death',
 'cause_of_death_source',
 'country_of_residence_at_enrollment',
 'days_to_birth',
 'days_to_death',
 'ethnicity',
 'gender',
 'occupation_duration_years',
 'premature_at_birth',
 'race',
 'vital_status',
 'weeks_gestation_at_birth',
 'year_of_birth',
 'year_of_death',
 'age_at_diagnosis',
 'ajcc_clinical_m',
 'ajcc_clinical_n',
 'ajcc_clinical_stage',
 'ajcc_clinical_t',
 'ajcc_pathologic_m',
 'ajcc_pathologic_n',
 'ajcc_pathologic_stage',
 'ajcc_pathologic_t',
 'ajcc_staging_system_edition',
 'anaplasia_present',
 'anaplasia_present_type',
 'ann_arbor_b_symptoms',
 'ann_arbor_clinical_stage',
 'ann_arbor_extranodal_involvement',
 'ann_arbor_pathologic_stage',
 'best_overall_response',
 'breslow_thickness',
 'burkitt_lymphoma_clinical_variant',
 'child_pugh_classification',
 'circumferential_resection_margin',
 'classification_of_tumor',
 'cog_liver_stage',
 'cog_neuroblastoma_ri

In [23]:
# Keep the clinical rows whose case_id appears among the column names of main_data.
# NOTE(review): the parsing cell names the columns of main_data after the expression
# file names, not after case IDs, so this filter presumably needs the columns to be
# mapped to case IDs first. The recorded output below comes from execution In [23],
# while the recorded parsing cell is In [74] — confirm before relying on it.
metadata = clinical_df[clinical_df["case_id"].isin(main_data.columns)]
metadata.shape

(112, 154)

In [24]:
# Keep only the identifier, project, cause-of-death and treatment columns
metadata = metadata.loc[
    :,
    [
        "case_id",
        "case_submitter_id",
        "project_id",
        "cause_of_death",
        "treatment_or_therapy",
        "treatment_outcome",
        "treatment_type",
    ],
]

In [26]:
# Display the selected clinical columns
metadata

Unnamed: 0,case_id,case_submitter_id,project_id,cause_of_death,treatment_or_therapy,treatment_outcome,treatment_type
186,2fabeb98-05e3-4f55-97f5-fbc675e25a3d,TCGA-62-8398,TCGA-LUAD,'--,no,'--,"Radiation Therapy, NOS"
187,2fabeb98-05e3-4f55-97f5-fbc675e25a3d,TCGA-62-8398,TCGA-LUAD,'--,yes,'--,"Pharmaceutical Therapy, NOS"
544,5d66191d-da49-4d7e-89a4-a382db338340,TCGA-D5-6931,TCGA-COAD,'--,no,'--,"Pharmaceutical Therapy, NOS"
545,5d66191d-da49-4d7e-89a4-a382db338340,TCGA-D5-6931,TCGA-COAD,'--,no,'--,"Radiation Therapy, NOS"
666,c92696ba-f549-480d-8d4a-5f23b27aa336,TCGA-C5-A907,TCGA-CESC,'--,yes,'--,"Pharmaceutical Therapy, NOS"
...,...,...,...,...,...,...,...
19465,128fd280-9a04-4038-9269-9a945b9a4dc9,TCGA-FU-A770,TCGA-CESC,'--,not reported,'--,"Radiation Therapy, NOS"
19856,305eaef4-4644-46e3-a696-d2e4a972f691,TCGA-CZ-4865,TCGA-KIRC,'--,no,'--,"Radiation Therapy, NOS"
19857,305eaef4-4644-46e3-a696-d2e4a972f691,TCGA-CZ-4865,TCGA-KIRC,'--,no,'--,"Pharmaceutical Therapy, NOS"
19914,ef974c39-8bac-486b-a395-9d1283e08028,TCGA-ZG-A9LB,TCGA-PRAD,'--,no,'--,"Radiation Therapy, NOS"


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Legacy stand-alone script written for a 2018 GDC download, kept in an unexecuted cell.
# For every project found in clinical.tsv it gathers the count files of that project and
# pickles the expression, diagnosis, stage and histology tables, plus survival arrays.
# It requires rpy2 and the R package biomaRt.
# NOTE(review): running it inside this notebook rebinds `metadata`, `names_corresp`,
# `folder_list` and `i`, which the cells above also define.


import pandas as pd
import numpy as np
import sys
import os
import subprocess
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri, numpy2ri
from rpy2.robjects.packages import importr
import gzip
import json



ro.pandas2ri.activate()

ro.r('library(biomaRt)')
ro.r('human = useMart("ensembl", dataset = "hsapiens_gene_ensembl",  host="www.ensembl.org")')

# `__file__` only exists when this code runs as a script; inside a notebook kernel
# fall back to the current working directory.
if "__file__" in globals():
    file_folder = os.path.abspath(os.path.dirname(__file__))
else:
    file_folder = os.getcwd()
gdc_folder = os.path.dirname(file_folder) +"/gdc/"
folder = file_folder + '/results/'
# Every output below is written into the results folder, so make sure it exists
os.makedirs(folder, exist_ok=True)
with open(gdc_folder + "metadata.cart.2018-10-15.json") as f:
    metadata = json.load(f)

# Mapping from expression file name to the case_id of its first associated entity
ids = []
filenames = []
for file in metadata:
    filenames += [file["file_name"]]
    entity = file["associated_entities"][0]
    ids += [entity["case_id"]]
names_corresp = {filenames[i] : ids[i] for i in range(len(ids))}
clinical = pd.read_csv(gdc_folder +"clinical.tsv", delimiter='\t') 
case_id = clinical.loc[:,"case_id"]
clinical.index = case_id

diagnosis = clinical.loc[:,"primary_diagnosis"]
stage = clinical.loc[:,"tumor_stage"]
grade = clinical.loc[:,"tumor_grade"]
last_followup = clinical.loc[:,"days_to_last_follow_up"]
# Encode vital status as 1 (dead) / 0 (anything else)
clinical.loc[:,"vital_status"] = [1  if stat == "dead" else 0 for stat in clinical.loc[:,"vital_status"]]
status = clinical.loc[:,"vital_status"]
days_to_death = clinical.loc[:,"days_to_death"]
age = clinical.loc[:,"age_at_diagnosis"] 
recurr = clinical.loc[:,"days_to_recurrence"]
names_common = clinical.loc[:,"primary_diagnosis"]
names_common.index = clinical.loc[:,"submitter_id"]
submitters = clinical.loc[:,"submitter_id"]
locations = clinical.loc[:,"project_id"]
# Keep the last dash-separated part of the project id (e.g. "TCGA-LGG" -> "LGG")
locations = [loc.split("-")[-1] for loc in locations]
loc_corresp = {case_id[i] : locations[i] for i in range(len(case_id))}
submitters.index = clinical.loc[:,"case_id"]
i = 0
old_i=0
folder_list =os.listdir(gdc_folder)
for location in set(locations):
    print(location)
    folder_num =[]
    folder_count = 0
    data_list = []
    filenames_ordered = []
    for patient_folder in folder_list:
        patient_folder_path = gdc_folder + os.fsdecode(patient_folder)
        if(os.path.isdir(patient_folder_path)):
            for patient in os.listdir(patient_folder_path):
                filename = patient_folder_path + '/' +os.fsdecode(patient)
                
                
                if((patient.split(".")[-1] == "gz" or patient.split(".")[-1] == "counts") and loc_corresp[names_corresp[patient]] == location):
                    if(patient.split(".")[-1] == "gz"):
                        # Context manager so the gzip handle is closed after reading
                        with gzip.open(filename) as f:
                            aux_data = pd.read_csv(f,sep='\t',index_col=0,header=None, names=[patient])
                    else:
                        aux_data = pd.read_csv(filename,sep='\t',index_col=0,header=None, names=[patient])
                    filenames_ordered += [names_corresp[patient]]
                    data_list.append(aux_data)
                    i += 1
                    folder_num += [patient_folder]
    # Folders already assigned to a project are not scanned again for the next ones
    folder_list = [fold for fold in folder_list if fold not in folder_num]
    print(location)
    print(len(folder_num))
    print(len(folder_list))
    print(i-old_i)
    if i - old_i == 0:
        # No count file belongs to this project: nothing to save. Without this guard
        # `data_list` stays an empty list and `.to_pickle` below raises AttributeError.
        continue
    if(i - old_i >1):
        data_list = pd.concat(data_list,axis=1,copy=False)
    elif i-old_i ==1:
        data_list = data_list[0]
    old_i = i
    # Disabled block (kept as a string literal): gene annotation through biomaRt
    """aux = [name for name in data_list.index if "ENSG" in name] 
    data_list = data_list.loc[aux,:]
    aux = np.array([name for name in data_list.index if "ENSG0" in name])
    aux2 = ["ENSG0" + name.split('R')[1] for name in data_list.index if "ENSGR" in name]
    aux2 = np.array([name.split(".")[0] for name in aux2])
    ro.globalenv["ind"] =aux
    ro.r("ind=as.vector(ind)")
    
    ro.r('ann <- getBM(attributes=c("hgnc_symbol","description","chromosome_name","band","strand","start_position","end_position","ensembl_gene_id_version"),  filters="ensembl_gene_id_version", values=ind, mart=human,uniqueRows=False)') 
    aux = [str(ind) for ind in ro.r('ann$hgnc_symbol')]
    print(ro.r('ann'))
    ro.globalenv["ind"] =aux2
    ro.r("ind=as.vector(ind)")
    ro.r('ann <- getBM(attributes=c("hgnc_symbol","description","chromosome_name","band","strand","start_position","end_position","ensembl_gene_id_version"),  filters="ensembl_gene_id_version", values=ind, mart=human,uniqueRows=False)') 
    aux2 = [str(ind) +'_2' for ind in ro.r('ann$hgnc_symbol')]
    print(ro.r('ann'))
    print(len(aux+aux2))
    data_list.index = aux + aux2
    print(data_list.index)
    data_list = data_list.T"""
    # Survival time: days to death for dead patients, otherwise days to last follow-up.
    # NOTE(review): cases with neither value are skipped, so `survival` can be shorter
    # than `binary`; their positions are stored in `not_found` — confirm this is intended.
    survival = [] 
    for j in filenames_ordered:
        if (status[j]==1 and days_to_death[j]!="--" ):
            survival += [int(days_to_death[j])]
        elif last_followup[j]!="--":
            survival += [int(last_followup[j])]
    binary = [status[k] for k in filenames_ordered]
    not_found = [k  for k in range(len(filenames_ordered)) if days_to_death[filenames_ordered[k]]=="--" and last_followup[filenames_ordered[k]]=="--"]
    data_list.to_pickle(folder + "genes_counts_" + location +"_gdc")
    diagnosis[filenames_ordered].to_pickle(folder + "diagnosis_" + location +"_gdc")
    stage[filenames_ordered].to_pickle(folder + "stage_" + location +"_gdc")
    names_common[submitters[filenames_ordered]].to_pickle(folder + "histo_" + location +"_gdc")
    
    # NOTE(review): unlike the pickles above, these file names do not include the
    # project, so every iteration overwrites the arrays of the previous one — confirm
    # whether they should be saved per project.
    np.save(file_folder+"/results/gdc_survival",survival)
    np.save(file_folder+"/results/gdc_survival_binary",binary)
    
    np.save(file_folder+"/results/gdc_survival_not_found_samples",not_found)    
print(i)