# Download RNA and DNA methylation data from TCGA

Reference of the TCGA portal:

https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline/

```
Introduction 
The GDC mRNA quantification analysis pipeline measures gene level expression with   STAR as raw read counts. Subsequently the counts are augmented with several transformations including Fragments per Kilobase of transcript per Million mapped reads (FPKM), upper quartile normalized FPKM (FPKM-UQ), and Transcripts per Million (TPM). These values are additionally annotated with the gene symbol and gene bio-type. These data are generated through this pipeline by first aligning reads to the GRCh38   reference genome and then by quantifying the mapped reads. To facilitate harmonization across samples, all RNA-Seq reads are treated as unstranded during analyses.
```


In [1]:
# Read in the clinical files
import os

# Now we need to download the data from TCGA for the RNAseq
from scidat.api import API, APIException
from sciutil import SciUtil
import pandas as pd

u = SciUtil()
save_fig = False

annotation_file = f'hsapiens_gene_ensembl-GRCh38.p13.csv'

gene_name = 'external_gene_name'
gdc_client = f'./gdc-client'
sample_file = f'gdc_sample_sheet.2023-07-17.tsv'
manifest_file = f'gdc_manifest_20230717_100907.txt'
clinical_file = f'clinical.tsv' # Copied out from the clinical file

api = API(manifest_file, gdc_client, clinical_file, sample_file, 'downloads/', 'downloads/', annotation_file,
                            max_cnt=500, clin_cols=['primary_diagnosis', 'age_at_diagnosis', 'gender', 'race', 'vital_status',
                         'ajcc_pathologic_stage', 'days_to_death'],
          requires_lst=['counts', 'methylation_array'])

"""
If you haven't downloaded the data already you'll need to do this step!
""" 

download_rnaseq = False
if download_rnaseq:
    api.download_data_from_manifest()

  self.annotation_df = pd.read_csv(self.annotation_file)


# Build datasets

In [2]:
api.build_annotation()

[93m--------------------------------------------------------------------------------[0m
 Run: annotate.set_case_submitter_id() to setup. 
Continuing with automatic selection.	[0m
[93m--------------------------------------------------------------------------------[0m
[94m--------------------------------------------------------------------------------[0m
[94m                    Submitter ID set as: 	case_submitter_id	                    [0m
[94m--------------------------------------------------------------------------------[0m
[94m--------------------------------------------------------------------------------[0m
[94m                              Clinical dataframe	                               [0m
[94m--------------------------------------------------------------------------------[0m
[94m--------------------------------------------------------------------------------[0m
[94m  case_submitter_id project_id age_at_index  gender   race vital_status  \
0         C3N-0117

In [3]:
data_dir = 'downloads/'
rna_df = api.build_star_rna_df(data_dir)
meth_df = api.build_sesame_meth_df(data_dir)
api.rna_df.to_csv(f'RNA_all.csv', index=False)
api.meth_df.to_csv(f'Methylation_all.csv', index=False)

In [4]:
rna_df = api.rna_df

In [5]:
rna_df.shape

(60664, 2137)

In [6]:
rna_df.shape

(60664, 2137)

In [7]:
meth_df.shape

(865918, 1767)

# Now we want to save out each of the cancers based on our clinical info for the different patients

In [8]:
clin_tcga_df = pd.read_csv(clinical_file, sep='\t')
clin_tcga_df

Unnamed: 0,case_id,case_submitter_id,project_id,age_at_index,age_is_obfuscated,cause_of_death,cause_of_death_source,country_of_residence_at_enrollment,days_to_birth,days_to_death,...,treatment_arm,treatment_dose,treatment_dose_units,treatment_effect,treatment_effect_indicator,treatment_frequency,treatment_intent_type,treatment_or_therapy,treatment_outcome,treatment_type
0,000ead0d-abf5-4606-be04-1ea31b999840,C3N-01179,CPTAC-3,'--,'--,'--,'--,'--,-26483,'--,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,'--
1,003f4f85-3244-4132-8c9d-c29f09382269,C3N-02672,CPTAC-3,'--,'--,'--,'--,'--,-22056,'--,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,'--
2,005d0639-c923-470f-a179-02a4dbb5cdf2,C3L-01929,CPTAC-3,'--,'--,'--,'--,'--,-19698,'--,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,'--
3,006931bb-f5b1-4aa4-b0a8-af517a912db0,C3L-01838,CPTAC-3,'--,'--,'--,'--,'--,-25663,'--,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,'--
4,0103db96-3e58-485c-bb9f-37b8a1f966e1,C3L-01146,CPTAC-3,'--,'--,Cancer Related,'--,'--,-21566,509,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,'--
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1145,fdd6c700-c04d-4857-a522-8f83f67f5f95,C3N-00297,CPTAC-3,'--,'--,'--,'--,'--,-25254,'--,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,'--
1146,fed33fff-0310-42ad-a7dc-d563792f18cc,C3N-03853,CPTAC-3,'--,'--,Unknown,'--,'--,-17976,393,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,'--
1147,fed70927-708c-479e-942c-139414370672,C3L-02856,CPTAC-3,'--,'--,'--,'--,'--,-32731,'--,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,'--
1148,ff527151-6797-414e-8760-edcb73306a28,C3L-03984,CPTAC-3,'--,'--,'--,'--,'--,-21834,'--,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,'--


In [11]:
clin_tcga_df['primary_diagnosis'].value_counts()

primary_diagnosis
Endometrioid adenocarcinoma, NOS    239
Adenocarcinoma, NOS                 223
Renal cell carcinoma, NOS           218
Squamous cell carcinoma, NOS        215
Infiltrating duct carcinoma, NOS    156
Glioblastoma                         99
Name: count, dtype: int64

In [12]:
# This basically means that alot of the datasets need more annoations so we'll have to use those from CPTAC files
import os
# Step through the files getting out all the clinical info
clin_df = pd.DataFrame()
bio_df = pd.DataFrame()
clin_dir = 'CPTAC_clinical'
clin_files = os.listdir(clin_dir)
for f in clin_files:
    if 'clinical' in f:
        clin_df = pd.concat([clin_df, pd.read_csv(os.path.join(clin_dir, f), sep='\t')])
    elif 'biospeci' in f:
        bio_df = pd.concat([bio_df, pd.read_csv(os.path.join(clin_dir, f), sep='\t')])
bio_df

Unnamed: 0,Aliquot ID,Aliquot Submitter ID,Sample ID,Sample Submitter ID,Case ID,Case Submitter ID,Project Name,Sample Type,Primary Site,Disease Type,...,Sample Ordinal,Shortest Dimension,Time Between Clamping And Freezing,Time Between Excision and Freezing,Tissue Collection Type,Tissue Type,Tumor Code,Tumor Code ID,Tumor Descriptor,Program Name
0,db299c0d-8891-4392-a4c9-85b1f51972b6,NCI7-1.0_pro,0d5e281f-4f2e-4567-9c66-b866f6b89a0a,NCI7-1.0_pro,54559a36-d9bc-4373-86f5-98d50a203d76,NCI-7,CPTAC3-Other,Cell Lines,Not Reported,Other,...,,,,,,,,,,Clinical Proteomic Tumor Analysis Consortium
1,20f536b7-a3be-4ae9-ac2f-706702208c19,NCI7-0.1_pro,c3176869-7ee5-484f-b816-7af6d8b98e3d,NCI7-0.1_pro,54559a36-d9bc-4373-86f5-98d50a203d76,NCI-7,CPTAC3-Other,Cell Lines,Not Reported,Other,...,,,,,,,,,,Clinical Proteomic Tumor Analysis Consortium
2,209334d7-a63d-4dba-9d2a-0ed158594c08,NCI7-1.0_pep,e798c551-1164-4690-b646-837b05b97029,NCI7-1.0_pep,54559a36-d9bc-4373-86f5-98d50a203d76,NCI-7,CPTAC3-Other,Cell Lines,Not Reported,Other,...,,,,,,,,,,Clinical Proteomic Tumor Analysis Consortium
3,a42d5403-25f1-4868-b56e-a4a5d8dc2c76,NCI7-0.5_pep,4aed607e-110d-448b-8527-6ed2816c438b,NCI7-0.5_pep,54559a36-d9bc-4373-86f5-98d50a203d76,NCI-7,CPTAC3-Other,Cell Lines,Not Reported,Other,...,,,,,,,,,,Clinical Proteomic Tumor Analysis Consortium
4,3f04b11c-ff67-476f-9ba8-1fe7b1df010f,NCI7-0.2_pro,c55a87d2-05c6-44a6-8078-4bffa1497a43,NCI7-0.2_pro,54559a36-d9bc-4373-86f5-98d50a203d76,NCI-7,CPTAC3-Other,Cell Lines,Not Reported,Other,...,,,,,,,,,,Clinical Proteomic Tumor Analysis Consortium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,1cfd38e5-0295-4ca4-9bd4-e1e96edeffde,CPT0236360004,778f5ab4-651a-4f9b-83a7-96f9243b236b,C3N-03069-03,d21fa2fa-9a2d-4baa-b7d8-601b906a16c7,C3N-03069,CPTAC3 Discovery and Confirmatory,Solid Tissue Normal,Pancreas,Pancreatic Ductal Adenocarcinoma,...,,,19.0,14.0,,Normal,,,,Clinical Proteomic Tumor Analysis Consortium
246,7328da7b-0f55-49c3-8e74-5114fcb4fc18,CPT0218220004,d962522e-e61d-446a-9393-784867a459b5,C3L-01124-04,5a3fd9f7-cd11-4ed9-b122-3096ba0e93a5,C3L-01124,CPTAC3 Discovery and Confirmatory,Solid Tissue Normal,Pancreas,Pancreatic Ductal Adenocarcinoma,...,,,,19.0,,Normal,,,,Clinical Proteomic Tumor Analysis Consortium
247,c59055c9-e724-49cb-bfc3-9f0d072eea48,CPT0094940004,dc68bdaf-21b9-43e7-9dc0-31fbaddd164c,"C3L-01703-02, C3L-01703-03",344f4536-d2cf-4162-99d8-dcc40972c80b,C3L-01703,CPTAC3 Discovery and Confirmatory,Primary Tumor,Pancreas,Pancreatic Ductal Adenocarcinoma,...,,,,,,Tumor,,,,Clinical Proteomic Tumor Analysis Consortium
248,b0efe015-7be2-4faf-a3f6-a5962316dc83,CPT0078000003,a2913672-6c34-44df-ad13-f3cf5144b481,C3N-00518-01,0f06151e-53a2-4573-84c3-76e1b5c0ce34,C3N-00518,CPTAC3 Discovery and Confirmatory,Primary Tumor,Pancreas,Pancreatic Ductal Adenocarcinoma,...,,,,6.0,,Tumor,,,,Clinical Proteomic Tumor Analysis Consortium


# Combine the two on case_submitter_id and 'Cases Submitter ID' so that we can actually find out what each patient has!


In [59]:
# Select those that actually have both RNA and also Protein
df = pd.merge(clin_df, clin_tcga_df, left_on='Cases Submitter ID', right_on='case_submitter_id', how='outer')

In [60]:
df['Disease Type'].value_counts()

Disease Type
Pancreatic Ductal Adenocarcinoma         140
Lung Adenocarcinoma                      111
Head and Neck Squamous Cell Carcinoma    110
Lung Squamous Cell Carcinoma             110
Uterine Corpus Endometrial Carcinoma     104
Clear Cell Renal Cell Carcinoma          103
Other                                     91
Breast Invasive Carcinoma                 65
Acute Myeloid Leukemia                    39
Non-Clear Cell Renal Cell Carcinoma        7
Name: count, dtype: int64

In [61]:
df['Disease Type'].value_counts()

Disease Type
Pancreatic Ductal Adenocarcinoma         140
Lung Adenocarcinoma                      111
Head and Neck Squamous Cell Carcinoma    110
Lung Squamous Cell Carcinoma             110
Uterine Corpus Endometrial Carcinoma     104
Clear Cell Renal Cell Carcinoma          103
Other                                     91
Breast Invasive Carcinoma                 65
Acute Myeloid Leukemia                    39
Non-Clear Cell Renal Cell Carcinoma        7
Name: count, dtype: int64

In [62]:
df['Race'].value_counts()

Race
White                               488
Other                               129
Asian                               126
Not Reported                         95
Black or African American            25
Unknown                              16
American Indian or Alaska Native      1
Name: count, dtype: int64

In [46]:
df['Gender'].value_counts()

Gender
Male            441
Female          354
Not Reported     85
Name: count, dtype: int64

In [47]:
df['AJCC Pathologic Stage'].value_counts()

AJCC Pathologic Stage
Stage I         140
                134
Stage III       128
Stage IIB        92
Stage IB         72
Not Reported     58
Stage IIA        51
Stage II         48
Stage IA         41
Stage IIIA       38
Stage IVA        37
Stage IV         27
Stage IVB         6
Unknown           3
Stage IIIB        2
Stage IA3         1
Name: count, dtype: int64

# For each of the Disease Types for each case in that disease type see if it exists in the RNA file and if so add that case

1. Basically go through and filter out each of the files for the cases that exist in the RNA and methylation datasets to create a dataset for each cancer.

In [48]:
set(df['Disease Type'].values)

{'Acute Myeloid Leukemia',
 'Breast Invasive Carcinoma',
 'Clear Cell Renal Cell Carcinoma',
 'Head and Neck Squamous Cell Carcinoma',
 'Lung Adenocarcinoma',
 'Lung Squamous Cell Carcinoma',
 'Non-Clear Cell Renal Cell Carcinoma',
 'Other',
 'Pancreatic Ductal Adenocarcinoma',
 'Uterine Corpus Endometrial Carcinoma',
 nan}

In [49]:
output_dir = '../../output_data/'
diseases = ['Uterine Corpus Endometrial Carcinoma']

case_to_column = []
multi_cases = []
for disease in diseases:
    cases = list(df[df['Disease Type'] == disease]['case_submitter_id'].values)
    # Save both to files
    disease = disease.replace(' ', '')
    d_rna_df = rna_df[['gene_id', 'gene_name']]
    d_meth_df = meth_df[['id']]
    # Now go through the RNA file
    for col in rna_df.columns:
        c = col.split('_')
        if len(c) > 3:
            case_id = c[-2] # Make safe for R # ALso 
            if ',' in case_id:
                multi_cases.append(case_id)
                case_id = case_id.split(',')[0]

            if case_id in cases:
                case_id = case_id.replace('-', '.')
                sample_type = 'Tumor' if 'SolidTissueNormal' not in c else 'Normal' 
                label = f'{case_id}_{sample_type}_RNA_{disease}_{c[-1].replace("-", ".")}'
                d_rna_df[label] = rna_df[col].values
                case_to_column.append([col, label])
    for col in meth_df.columns:
        c = col.split('_')
        if len(c) > 3:
            case_id = c[-2] # Make safe for R # ALso 
            if ',' in case_id:
                multi_cases.append(case_id)
                case_id = case_id.split(',')[0]

            if case_id in cases:
                case_id = case_id.replace('-', '.')
                sample_type = 'Tumor' if 'SolidTissueNormal' not in c else 'Normal' 
                label = f'{case_id}_{sample_type}_CpG_{disease}_{c[-1].replace("-", ".")}'
                d_meth_df[label] = meth_df[col].values
                case_to_column.append([col, label])

    print("DONE", disease)
    #d_meth_df.to_csv(f'{output_dir}DNAMethylation.csv', index=False)
    #d_rna_df.to_csv(f'{output_dir}RNA.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_rna_df[label] = rna_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_rna_df[label] = rna_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_rna_df[label] = rna_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

DONE UterineCorpusEndometrialCarcinoma


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

In [54]:
cases = list(df[df['Disease Type'] == disease]['case_submitter_id'].values)


In [63]:
cases = list(clin_tcga_df['case_submitter_id'].values)

In [67]:
clin_tcga_df[clin_tcga_df['case_submitter_id'] == 'C3N-01878']['Disease Type']

KeyError: 'Disease Type'

In [64]:
[c for c in cases if 'C3N-01878' in c]

['C3N-01878']

In [167]:
output_dir = '../../output_data/'
diseases = [
 'Clear Cell Renal Cell Carcinoma',
 'Head and Neck Squamous Cell Carcinoma',
 'Lung Adenocarcinoma',
 'Lung Squamous Cell Carcinoma',
 'Pancreatic Ductal Adenocarcinoma',
 'Uterine Corpus Endometrial Carcinoma']

case_to_column = []
multi_cases = []
for disease in diseases:
    cases = list(df[df['Disease Type'] == disease]['case_submitter_id'].values)
    # Save both to files
    disease = disease.replace(' ', '')
    d_rna_df = rna_df[['gene_id', 'gene_name']]
    d_meth_df = meth_df[['id']]
    # Now go through the RNA file
    for col in rna_df.columns:
        c = col.split('_')
        if len(c) > 3:
            case_id = c[-2] # Make safe for R # ALso 
            if ',' in case_id:
                multi_cases.append(case_id)
                case_id = case_id.split(',')[0]

            if case_id in cases:
                case_id = case_id.replace('-', '.')
                sample_type = 'Tumor' if 'SolidTissueNormal' not in c else 'Normal' 
                label = f'{case_id}_{sample_type}_RNA_{disease}_{c[-1].replace("-", ".")}'
                d_rna_df[label] = rna_df[col].values
                case_to_column.append([col, label])
    for col in meth_df.columns:
        c = col.split('_')
        if len(c) > 3:
            case_id = c[-2] # Make safe for R # ALso 
            if ',' in case_id:
                multi_cases.append(case_id)
                case_id = case_id.split(',')[0]

            if case_id in cases:
                case_id = case_id.replace('-', '.')
                sample_type = 'Tumor' if 'SolidTissueNormal' not in c else 'Normal' 
                label = f'{case_id}_{sample_type}_CpG_{disease}_{c[-1].replace("-", ".")}'
                d_meth_df[label] = meth_df[col].values
                case_to_column.append([col, label])

    print("DONE", disease)
    #d_meth_df.to_csv(f'{output_dir}DNAMethylation.csv', index=False)
    #d_rna_df.to_csv(f'{output_dir}RNA.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_rna_df[label] = rna_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_rna_df[label] = rna_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_rna_df[label] = rna_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_meth_df[label] = meth_df[col].values
  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_meth_df[label] = meth_df[col].values
  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_me

DONE ClearCellRenalCellCarcinoma


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_rna_df[label] = rna_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_rna_df[label] = rna_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_rna_df[label] = rna_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

DONE HeadandNeckSquamousCellCarcinoma


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_rna_df[label] = rna_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_rna_df[label] = rna_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_rna_df[label] = rna_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_meth_df[label] = meth_df[col].values
  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_meth_df[label] = meth_df[col].values
  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_me

DONE LungAdenocarcinoma


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_rna_df[label] = rna_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_rna_df[label] = rna_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_rna_df[label] = rna_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_meth_df[label] = meth_df[col].values
  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_meth_df[label] = meth_df[col].values
  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_me

DONE LungSquamousCellCarcinoma


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_rna_df[label] = rna_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_rna_df[label] = rna_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_rna_df[label] = rna_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_meth_df[label] = meth_df[col].values
  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_meth_df[label] = meth_df[col].values
  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_me

DONE PancreaticDuctalAdenocarcinoma


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_rna_df[label] = rna_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_rna_df[label] = rna_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_rna_df[label] = rna_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_meth_df[label] = meth_df[col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_meth_df[label] = meth_df[col].values


DONE UterineCorpusEndometrialCarcinoma


# Make a clean patient sample dataset for each of the cancers 

In [168]:
# Get all the cases that were included
cases = [c[1].split('_')[0] for c in case_to_column]
df['SafeCases'] = [c.split(',')[0].replace('-', '.') for c in df['case_submitter_id'].values]
case_df = df[df['SafeCases'].isin(cases)]
u.dp([len(case_df), len(df)])

[94m--------------------------------------------------------------------------------[0m
[94m                                    660	667	                                    [0m
[94m--------------------------------------------------------------------------------[0m


In [169]:
not_cases = df[~df['SafeCases'].isin(cases)]['Cases Submitter ID'].values
for c in not_cases:
    print(c)

C3N-00832
C3N-01175
C3N-01180
C3L-00359
C3N-00435
C3N-00313
C3N-00492


In [170]:
df[~df['SafeCases'].isin(cases)]['Disease Type'].value_counts()

Disease Type
Non-Clear Cell Renal Cell Carcinoma    7
Name: count, dtype: int64

## Let's add some demographics we want to keep 

1. Case ID: `Cases Submitter ID` i.e. patient ID
2. Gender: `Gender`
3. Race: `Race`
4. Ethnicity: `Ethnicity`
5. Dead/alive: `Days to Death`
6. Tumour stage: `Tumor Stage`
7. Disease type: `Disease Type`
8. Primary Site: `Primary Site`
9. Age: `Days to Birth` --> convert to years

In [171]:
features = ['Cases Submitter ID', 'Gender', 'Race', 'Ethnicity', 'Primary Site', 'Disease Type', 
            'AJCC Pathologic Stage', 'Days to Death', 'Days to Birth']
for f in features:
    u.dp([f])
    print(case_df[f].value_counts())

[94m--------------------------------------------------------------------------------[0m
[94m                              Cases Submitter ID	                               [0m
[94m--------------------------------------------------------------------------------[0m
Cases Submitter ID
C3N-01946    1
C3L-00401    1
C3L-03628    1
C3N-03430    1
C3L-03395    1
            ..
C3N-04127    1
C3N-03886    1
C3N-01025    1
C3L-00445    1
C3N-02729    1
Name: count, Length: 660, dtype: int64
[94m--------------------------------------------------------------------------------[0m
[94m                                    Gender	                                     [0m
[94m--------------------------------------------------------------------------------[0m
Gender
Male      399
Female    261
Name: count, dtype: int64
[94m--------------------------------------------------------------------------------[0m
[94m                                     Race	                                      

In [172]:
# Convert stage (i.e. AJCC Pathologic Stage to Stage I, Stage II, Stage III, Stage IV) and also early & late
# Convert age to years
stage_simple = []
stage_early_late = []
age_born_yrs = []
ages = case_df['Days to Birth'].values
stages = case_df['AJCC Pathologic Stage'].values
for i, stage in enumerate(stages):
    if stage == 'Stage I' or stage == 'Stage IA3' or stage == 'Stage IA' or stage == 'Stage IB':
        stage_simple.append('Stage I')
        stage_early_late.append('Early')
    elif stage == 'Stage IIB' or stage == 'Stage II' or stage == 'Stage IIA':
        stage_simple.append('Stage II')
        stage_early_late.append('Early')
    elif stage == 'Stage III' or stage == 'Stage IIIA' or stage == 'Stage IIIB':
        stage_simple.append('Stage III')
        stage_early_late.append('Late')
    elif stage == 'Stage IV' or stage == 'Stage IVA' or stage == 'Stage IVB':
        stage_simple.append('Stage IV')
        stage_early_late.append('Late')
    else:
        print(stage)
        stage_early_late.append(None)
        stage_simple.append(None)
    try:
        age_born_yrs.append(int(int(ages[i])/365))
    except:
        age_born_yrs.append(None)
        
case_df['TumorStage'] = stage_simple
case_df['Stage'] = stage_early_late
case_df['AgeYears'] = age_born_yrs

features = ['Cases Submitter ID', 'Gender', 'Race', 'Ethnicity', 'Primary Site', 'Disease Type', 
            'AgeYears', 'Stage', 'TumorStage',
            'AJCC Pathologic Stage', 'Days to Death', 'Days to Birth']
for f in features:
    u.dp([f])
    print(case_df[f].value_counts())

Unknown
Unknown
Unknown
Not Reported
Not Reported
Not Reported
Not Reported
Not Reported
[94m--------------------------------------------------------------------------------[0m
[94m                              Cases Submitter ID	                               [0m
[94m--------------------------------------------------------------------------------[0m
Cases Submitter ID
C3N-01946    1
C3L-00401    1
C3L-03628    1
C3N-03430    1
C3L-03395    1
            ..
C3N-04127    1
C3N-03886    1
C3N-01025    1
C3L-00445    1
C3N-02729    1
Name: count, Length: 660, dtype: int64
[94m--------------------------------------------------------------------------------[0m
[94m                                    Gender	                                     [0m
[94m--------------------------------------------------------------------------------[0m
Gender
Male      399
Female    261
Name: count, dtype: int64
[94m--------------------------------------------------------------------------------[

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  case_df['TumorStage'] = stage_simple
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  case_df['Stage'] = stage_early_late
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  case_df['AgeYears'] = age_born_yrs


## Again need to check for wierd cases

In [173]:
# Fix the ordering to make it easier and save to CSV and also add in the files that were associated with each case
case_to_files = {}
for case_value in case_to_column:
    case = case_value[1].split('_')[0]
    if case_to_files.get(case):
        case_to_files[case].append(case_value[1])
    else:
        case_to_files[case] = []
        case_to_files[case].append(case_value[1])

In [174]:
case_files = []
case_file_count = []
safe_cases = []
for case in case_df['Cases Submitter ID'].values:
    try:
        case = case.replace('-', '.')
        case = case.split(',')[0]
        safe_cases.append(case)
        if not case_to_files.get(case):
            case_file_count.append(None)
            case_files.append(None)
            print(case)
        else:
            case_file_count.append(len(case_to_files.get(case)))
            case_files.append('|'.join(case_to_files.get(case)))
    except:
        print(case)
        
case_df['SafeCases'] = safe_cases
case_df['CaseFiles'] = case_files
case_df['CaseFileCounts'] = case_file_count
case_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  case_df['SafeCases'] = safe_cases
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  case_df['CaseFiles'] = case_files
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  case_df['CaseFileCounts'] = case_file_count


Unnamed: 0,Case ID,Cases Submitter ID,Related Entities,Annotation,Genomic and Imaging Data Resource,Ethnicity,Gender,Race,Morphology,Primary Diagnosis,...,treatment_intent_type,treatment_or_therapy,treatment_outcome,treatment_type,SafeCases,TumorStage,Stage,AgeYears,CaseFiles,CaseFileCounts
0,df4ed85e-8f98-11ea-b1fd-0aad30af8a83,C3N-01946,,,GDC: https://portal.gdc.cancer.gov/cases/14b0b...,Not Reported,Male,White,8070/3,"Squamous cell carcinoma, NOS",...,'--,'--,'--,'--,C3N.01946,Stage II,Early,-64.0,C3N.01946_Normal_RNA_HeadandNeckSquamousCellCa...,3
1,df4ecd30-8f98-11ea-b1fd-0aad30af8a83,C3N-01754,,,GDC: https://portal.gdc.cancer.gov/cases/a18e0...,Not Reported,Male,White,8070/3,"Squamous cell carcinoma, NOS",...,'--,'--,'--,'--,C3N.01754,Stage III,Late,-64.0,C3N.01754_Normal_RNA_HeadandNeckSquamousCellCa...,5
2,df4e9d3d-8f98-11ea-b1fd-0aad30af8a83,C3L-01138,,,GDC: https://portal.gdc.cancer.gov/cases/93160...,Not Reported,Male,Other,8070/3,"Squamous cell carcinoma, NOS",...,'--,'--,'--,'--,C3L.01138,Stage IV,Late,-62.0,C3L.01138_Tumor_RNA_HeadandNeckSquamousCellCar...,5
3,df4f1689-8f98-11ea-b1fd-0aad30af8a83,C3N-03888,,,GDC: https://portal.gdc.cancer.gov/cases/0a1de...,Not Reported,Male,Other,8070/3,"Squamous cell carcinoma, NOS",...,'--,'--,'--,'--,C3N.03888,Stage III,Late,-58.0,C3N.03888_Tumor_RNA_HeadandNeckSquamousCellCar...,5
4,df4f238b-8f98-11ea-b1fd-0aad30af8a83,C3N-04280,,,GDC: https://portal.gdc.cancer.gov/cases/1f370...,Not Reported,Male,White,8070/3,"Squamous cell carcinoma, NOS",...,'--,'--,'--,'--,C3N.04280,Stage II,Early,-66.0,C3N.04280_Tumor_RNA_HeadandNeckSquamousCellCar...,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
662,f1ee4435-cf1e-11e9-9a07-0a80fada099c,C3N-02582,,,GDC: https://portal.gdc.cancer.gov/cases/dd186...,Not Reported,Male,Asian,8140/3,"Adenocarcinoma, NOS",...,'--,'--,'--,'--,C3N.02582,Stage II,Early,-77.0,C3N.02582_Tumor_RNA_LungAdenocarcinoma_51b7ec1...,4
663,f1ee455a-cf1e-11e9-9a07-0a80fada099c,C3N-02586,,,GDC: https://portal.gdc.cancer.gov/cases/89190...,Not Reported,Male,Asian,8140/3,"Adenocarcinoma, NOS",...,'--,'--,'--,'--,C3N.02586,Stage II,Early,-74.0,C3N.02586_Tumor_RNA_LungAdenocarcinoma_91540fe...,4
664,f1ee4684-cf1e-11e9-9a07-0a80fada099c,C3N-02587,,,GDC: https://portal.gdc.cancer.gov/cases/cd6ed...,Not Reported,Female,Asian,8140/3,"Adenocarcinoma, NOS",...,'--,'--,'--,'--,C3N.02587,Stage I,Early,-59.0,C3N.02587_Normal_RNA_LungAdenocarcinoma_8b59c6...,4
665,f1ee47a8-cf1e-11e9-9a07-0a80fada099c,C3N-02588,,,GDC: https://portal.gdc.cancer.gov/cases/86058...,Not Reported,Male,Asian,8140/3,"Adenocarcinoma, NOS",...,'--,'--,'--,'--,C3N.02588,Stage II,Early,-69.0,C3N.02588_Tumor_RNA_LungAdenocarcinoma_07045ec...,4


In [175]:
case_df['CaseFileCounts'].value_counts()

CaseFileCounts
4     273
2     160
3     152
5      30
8      23
10      9
6       4
1       3
7       3
9       2
12      1
Name: count, dtype: int64

In [176]:
case_df[case_df['CaseFileCounts'] > 8]

Unnamed: 0,Case ID,Cases Submitter ID,Related Entities,Annotation,Genomic and Imaging Data Resource,Ethnicity,Gender,Race,Morphology,Primary Diagnosis,...,treatment_intent_type,treatment_or_therapy,treatment_outcome,treatment_type,SafeCases,TumorStage,Stage,AgeYears,CaseFiles,CaseFileCounts
344,6386852c-1fb9-11e9-b7f8-0a80fada099c,C3N-00150,,,GDC: https://portal.gdc.cancer.gov/cases/6ff70...,Not Reported,Female,White,8312/3,"Renal cell carcinoma, NOS",...,'--,'--,'--,'--,C3N.00150,Stage IV,Late,-55.0,C3N.00150_Tumor_RNA_ClearCellRenalCellCarcinom...,10
361,675d0c38-1fb9-11e9-b7f8-0a80fada099c,C3N-00168,,,GDC: https://portal.gdc.cancer.gov/cases/2d1ff...,Not Reported,Male,Asian,8312/3,"Renal cell carcinoma, NOS",...,'--,'--,'--,'--,C3N.00168,Stage I,Early,-47.0,C3N.00168_Normal_RNA_ClearCellRenalCellCarcino...,10
370,8cbd7a01-1fb9-11e9-b7f8-0a80fada099c,C3N-00573,,,GDC: https://portal.gdc.cancer.gov/cases/a0d5a...,Not Reported,Male,Asian,8312/3,"Renal cell carcinoma, NOS",...,'--,'--,'--,'--,C3N.00573,Stage II,Early,-61.0,C3N.00573_Normal_RNA_ClearCellRenalCellCarcino...,10
378,8e59b323-1fb9-11e9-b7f8-0a80fada099c,C3N-00577,,,GDC: https://portal.gdc.cancer.gov/cases/262b9...,Not Reported,Male,Asian,8312/3,"Renal cell carcinoma, NOS",...,'--,'--,'--,'--,C3N.00577,Stage IV,Late,-72.0,C3N.00577_Normal_RNA_ClearCellRenalCellCarcino...,10
380,901a5db5-1fb9-11e9-b7f8-0a80fada099c,C3N-00646,,,GDC: https://portal.gdc.cancer.gov/cases/68daa...,Not Reported,Female,White,8312/3,"Renal cell carcinoma, NOS",...,'--,'--,'--,'--,C3N.00646,Stage I,Early,-57.0,C3N.00646_Tumor_RNA_ClearCellRenalCellCarcinom...,10
384,78a71616-1fb9-11e9-b7f8-0a80fada099c,C3N-00314,,,GDC: https://portal.gdc.cancer.gov/cases/3fe41...,Not Reported,Male,White,8312/3,"Renal cell carcinoma, NOS",...,'--,'--,'--,'--,C3N.00314,Stage I,Early,-79.0,C3N.00314_Normal_RNA_ClearCellRenalCellCarcino...,10
385,73b649eb-1fb9-11e9-b7f8-0a80fada099c,C3N-00310,,,GDC: https://portal.gdc.cancer.gov/cases/2f344...,Not Reported,Male,White,8312/3,"Renal cell carcinoma, NOS",...,'--,'--,'--,'--,C3N.00310,Stage III,Late,-84.0,C3N.00310_Tumor_RNA_ClearCellRenalCellCarcinom...,10
395,6039aa8e-1fb9-11e9-b7f8-0a80fada099c,C3N-00148,,,GDC: https://portal.gdc.cancer.gov/cases/0625c...,Not Reported,Male,White,8312/3,"Renal cell carcinoma, NOS",...,'--,'--,'--,'--,C3N.00148,Stage I,Early,-52.0,C3N.00148_Tumor_RNA_ClearCellRenalCellCarcinom...,10
400,810c2c81-1fb9-11e9-b7f8-0a80fada099c,C3N-00390,,,GDC: https://portal.gdc.cancer.gov/cases/a7b5e...,Not Reported,Male,Other,8312/3,"Renal cell carcinoma, NOS",...,'--,'--,'--,'--,C3N.00390,Stage IV,Late,-58.0,C3N.00390_Tumor_RNA_ClearCellRenalCellCarcinom...,10
414,7d95243f-1fb9-11e9-b7f8-0a80fada099c,C3N-00320,,,GDC: https://portal.gdc.cancer.gov/cases/ceae0...,Not Reported,Male,White,8312/3,"Renal cell carcinoma, NOS",...,'--,'--,'--,'--,C3N.00320,Stage III,Late,-67.0,C3N.00320_Tumor_RNA_ClearCellRenalCellCarcinom...,9


In [177]:
col_order = ['SafeCases', 'TumorStage', 'Stage', 'AgeYears', 'CaseFiles', 'CaseFileCounts']
case_df = case_df[col_order + [c for c in case_df.columns if c not in col_order]]
case_df.to_csv('PatientInfo.csv', index=False)

# Filter the cases and make sure for each cancer we don't have patient outliers!

Given in my previous experience there are often outliers in the patients let's remove those from both datasets.

It's also important to check teh biospecimen data - make sure patients don't have multiple submissions (since it seems like there are sometimes multiple RNAseq files for the same patient. i.e. if there are multiple submissions let's try and picl the one which is Solid Tissue not Peripheral Blood Componenets.

In [178]:
bio_df = pd.read_csv('biospecimen.cart.2023-07-17/sample.tsv', sep='\t')
bio_df['composition'].value_counts()

composition
Solid Tissue                       3219
Peripheral Blood Components NOS    1687
Buffy Coat                          102
'--                                 100
Name: count, dtype: int64

In [179]:
len(set(bio_df['sample_id'].values))
bio_sample_map = dict(zip(bio_df['sample_submitter_id'], bio_df['composition']))
sample_df = pd.read_csv(sample_file, sep='\t')
sample_map = dict(zip(sample_df['File ID'], sample_df['Sample ID']))

In [180]:
sample_map

{'8d16fe63-865d-4345-af83-8a2893a64450': 'C3L-02544-06',
 '506d7859-0365-4a18-9da1-7c45d45cc243': 'C3N-00518-01',
 'ef18d754-47c2-4b5c-b6ad-457e3d40ace4': 'C3L-00770-11',
 'ae33e6d4-c8f4-433f-a67f-dc1ba4f7888f': 'C3L-00586-03',
 '5dff30b4-5d48-45ca-aafd-3b11a76d2914': '93e30fd5-e57e-4503-a175-863c7d',
 'ee33d292-5c62-4374-aa8f-e8526a8004d9': '93e30fd5-e57e-4503-a175-863c7d',
 '2fd0ea1f-2820-4523-9c54-aa9bc26ae9ff': 'C3L-02544-06',
 '9438c797-47ec-484e-b77d-40bb3f58522e': 'C3L-00770-01',
 'c5223d57-a5a8-4da2-9753-268561694671': 'C3L-00586-13',
 '4dcc7963-52c6-42d4-b604-4c2061fbdd5e': 'C3N-01179-05',
 '91a53855-280c-4cee-8674-52aa5fc500c1': 'C3N-01179-05',
 '7dd1e40d-bc90-43e3-8afc-3c8179e71a1b': 'C3N-01179-01',
 'a1a06be4-47f1-406b-8bdd-5610049d3b2c': 'C3N-01179-01',
 '35fdab2d-e6bc-4c27-a41f-6d0c8203ab5d': 'C3L-01033-03',
 '72e851d6-01ff-4647-8d77-db52ac5243dc': 'C3L-00583-06',
 'a7100c65-02eb-4e3c-b840-f3e21e78d318': 'C3L-01033-05',
 '7a0ed3cd-c398-4115-9e80-b6f09290ec6c': 'C3N-03490-

# Build a RNA and a CpG sample DF

In [181]:
len(set(bio_df['sample_id'].values))
bio_sample_map = dict(zip(bio_df['sample_submitter_id'], bio_df['composition']))
sample_df = pd.read_csv(sample_file, sep='\t')
sample_map = dict(zip(sample_df['File ID'], sample_df['Sample ID']))

rna_sample_df = pd.DataFrame()
cpg_sample_df = pd.DataFrame()

rna_tumour_count = []
rna_normal_count = []
cpg_tumour_count = []
cpg_normal_count = []

rna_case_ids = []
rna_samples = []

cpg_case_ids = []
cpg_samples = []

for i, case_id in enumerate(case_df['SafeCases'].values):
    files = case_to_files.get(case_id)
    rna_files = [c for c in files if 'RNA' in c]
    cpg_files = [c for c in files if 'CpG' in c]
    rna_tumour_count.append(len([c for c in rna_files if 'Tumor' in c]))
    rna_normal_count.append(len([c for c in rna_files if 'Normal' in c]))
    cpg_tumour_count.append(len([c for c in cpg_files if 'Tumor' in c]))
    cpg_normal_count.append(len([c for c in cpg_files if 'Normal' in c]))
    # For each of these let's have a add the case to the case Ids
    for r in rna_files:
        s = sample_map.get(r.split('_')[-1].replace('.', '-'))
        if bio_sample_map.get(s) == 'Solid Tissue':
            rna_case_ids.append(case_id)
            rna_samples.append(r)
    for r in cpg_files:
        s = sample_map.get(r.split('_')[-1].replace('.', '-'))
        if bio_sample_map.get(s) == 'Solid Tissue':
            cpg_case_ids.append(case_id)
            cpg_samples.append(r)
rna_sample_df['SafeCases'] = rna_case_ids
rna_sample_df['Sample'] = rna_samples
cpg_sample_df['SafeCases'] = cpg_case_ids
cpg_sample_df['Sample'] = cpg_samples
cpg_sample_df

Unnamed: 0,SafeCases,Sample
0,C3N.01946,C3N.01946_Tumor_CpG_HeadandNeckSquamousCellCar...
1,C3N.01754,C3N.01754_Normal_CpG_HeadandNeckSquamousCellCa...
2,C3L.01138,C3L.01138_Tumor_CpG_HeadandNeckSquamousCellCar...
3,C3L.01138,C3L.01138_Normal_CpG_HeadandNeckSquamousCellCa...
4,C3N.03888,C3N.03888_Tumor_CpG_HeadandNeckSquamousCellCar...
...,...,...
942,C3N.02582,C3N.02582_Normal_CpG_LungAdenocarcinoma_02b912...
943,C3N.02586,C3N.02586_Normal_CpG_LungAdenocarcinoma_d4ed07...
944,C3N.02587,C3N.02587_Normal_CpG_LungAdenocarcinoma_36dec5...
945,C3N.02588,C3N.02588_Normal_CpG_LungAdenocarcinoma_496143...


In [182]:
# Now we also want to get the different sample things like sample type, disease, stage & age
cpg_sample_df = pd.merge(cpg_sample_df, case_df, how='left', on='SafeCases')
cpg_sample_df['CondID'] = [1 if 'Tumor' in c else 0 for c in cpg_sample_df['Sample'].values]
cpg_sample_df['Disease'] = [c.split('_')[3] for c in cpg_sample_df['Sample'].values]


rna_sample_df = pd.merge(rna_sample_df, case_df, how='left', on='SafeCases')
rna_sample_df['CondID'] = [1 if 'Tumor' in c else 0 for c in rna_sample_df['Sample'].values]
rna_sample_df['Disease'] = [c.split('_')[3] for c in rna_sample_df['Sample'].values]

rna_sample_df.to_csv('rna_sample_df.csv', index=False)
cpg_sample_df.to_csv('cpg_sample_df.csv', index=False)


In [183]:
cpg_sample_df['Disease'].value_counts()

Disease
ClearCellRenalCellCarcinoma          257
LungSquamousCellCarcinoma            173
LungAdenocarcinoma                   171
PancreaticDuctalAdenocarcinoma       135
HeadandNeckSquamousCellCarcinoma     120
UterineCorpusEndometrialCarcinoma     91
Name: count, dtype: int64

In [185]:

# Just want to save each of the sample dataframes to the output folder

for disease in list(set(cpg_sample_df['Disease'].values)):
    samples = list(cpg_sample_df[cpg_sample_df['Disease'] == disease]['Sample'].values)
    cpg_sample_df[cpg_sample_df['Sample'].isin(samples)].to_csv(f'{output_dir}{disease}/{disease}_samples_CpG.csv', index=False)
    rna_samples = list(rna_sample_df[rna_sample_df['Disease'] == disease]['Sample'].values)
    
    u.dp([disease, 'Samples RNA:', len(rna_samples), 'CpG:', len(samples), 'Overlap in cases:', 
          len(set(list(cpg_sample_df[cpg_sample_df['Disease'] == disease]['SafeCases'].values)) & set(list(rna_sample_df[rna_sample_df['Disease'] == disease]['SafeCases'].values)))])
    rna_sample_df[rna_sample_df['Sample'].isin(rna_samples)].to_csv(f'{output_dir}{disease}/{disease}_samples_RNA.csv', index=False)
    

[94m--------------------------------------------------------------------------------[0m
[94mUterineCorpusEndometrialCarcinoma	Samples RNA:	173	CpG:	91	Overlap in cases:	90	[0m
[94m--------------------------------------------------------------------------------[0m
[94m--------------------------------------------------------------------------------[0m
[94mHeadandNeckSquamousCellCarcinoma	Samples RNA:	203	CpG:	120	Overlap in cases:	87	[0m
[94m--------------------------------------------------------------------------------[0m
[94m--------------------------------------------------------------------------------[0m
[94m PancreaticDuctalAdenocarcinoma	Samples RNA:	115	CpG:	135	Overlap in cases:	89	 [0m
[94m--------------------------------------------------------------------------------[0m
[94m--------------------------------------------------------------------------------[0m
[94m      LungAdenocarcinoma	Samples RNA:	173	CpG:	171	Overlap in cases:	105	       [0m
[94m-----

## Lastly, save the caseDF information to a csv as well

In [193]:
for disease in set(case_df['Disease Type'].values):
    disease_case_df = case_df[case_df['Disease Type'] == disease]
    disease = disease.replace(' ', '')
    disease_case_df['Disease'] = disease
    disease_case_df.to_csv(os.path.join(output_dir, f'{disease}/{disease}_case.csv'), index=False)
    u.dp([disease])

[94m--------------------------------------------------------------------------------[0m
[94m                       UterineCorpusEndometrialCarcinoma	                       [0m
[94m--------------------------------------------------------------------------------[0m
[94m--------------------------------------------------------------------------------[0m
[94m                        PancreaticDuctalAdenocarcinoma	                         [0m
[94m--------------------------------------------------------------------------------[0m
[94m--------------------------------------------------------------------------------[0m
[94m                          ClearCellRenalCellCarcinoma	                          [0m
[94m--------------------------------------------------------------------------------[0m
[94m--------------------------------------------------------------------------------[0m
[94m                              LungAdenocarcinoma	                               [0m
[94m-----

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  disease_case_df['Disease'] = disease
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  disease_case_df['Disease'] = disease
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  disease_case_df['Disease'] = disease
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,

In [189]:
output_dir

'../../output_data/'

In [191]:
set(case_df['Disease Type'].values)

{'Clear Cell Renal Cell Carcinoma',
 'Head and Neck Squamous Cell Carcinoma',
 'Lung Adenocarcinoma',
 'Lung Squamous Cell Carcinoma',
 'Pancreatic Ductal Adenocarcinoma',
 'Uterine Corpus Endometrial Carcinoma'}