In [1]:
import pandas as pd
import numpy as np
import glob
import time
import datetime

## Reading All CNV Data

In [2]:
#Getting Filelist
filelist_cnv = glob.glob("../../outputs/*/*copy*.tsv") #Getting path to all files, using the glob module
filelist_cnv = [(x,x.split('/')[3]) for x in filelist_cnv] #creates a tuple (x,y) x = path to file, y = name of file for accession
len(filelist_cnv)

1359

In [3]:
def read_cnv_df(filepath,filename):
    '''
    Function that reads a cnv file and the copy number of the gene with the sample name
    '''
    df1 = pd.read_csv(filepath,header=0, sep= '\t',index_col=1)
    df1 = df1[['copy_number']]
    df1 = df1.T
    df1.index = [filename]
    df1 = df1.reset_index()
    df1.rename(columns={'index':'file_name'}, inplace=True) 
    return df1

In [4]:
df_list = []

for x in filelist_cnv:
    tmp_df = read_cnv_df(x[0],x[1])
    df_list.append(tmp_df)

In [5]:
master_cnv_df = pd.concat(df_list)
master_cnv_df.to_csv("../../data/master_cnv_df_allsamples.csv", index=False)

## Reading Gene Data

In [6]:
filelist_gene = glob.glob("../../outputs/*/*rna*.tsv") #Getting path to all files, using the glob module
filelist_gene = [(x,x.split('/')[3]) for x in filelist_gene] #creates a tuple (x,y) x = path to file, y = name of file for accession
len(filelist_gene)

1755

In [7]:
def read_gene_df(filepath,filename):
    '''
    Function that reads a gene file and of the gene with the sample name
    '''
    with open(filepath) as f:
        f.readline()
        cols = f.readline().split()

    df1 = pd.read_csv(filepath,header=None, sep= '\t',skiprows=6)
    df1.columns = cols
    df1.index = df1['gene_name']
    df1 = df1[['tpm_unstranded']]
    df1 = df1.T
    df1.index = [filename]
    df1 = df1.reset_index()
    df1.rename(columns={'index':'file_name'}, inplace=True)
    df1
    return df1

In [8]:
df_list = []

for x in filelist_gene:
    tmp_df = read_gene_df(x[0],x[1])
    df_list.append(tmp_df)

In [9]:
master_gene_df = pd.concat(df_list)
master_gene_df.to_csv("../../data/master_gene_df_allsamples.csv", index=False)
master_gene_df.head()

gene_name,file_name,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,...,AL451106.1,AC092910.4,AC073611.1,AC136977.1,AC078856.1,AC008763.4,AL592295.6,AC006486.3,AL391628.1,AP006621.6
0,756a1731-8c8d-4d96-8e22-3f382275a48b,15.3403,0.6521,72.8801,31.7828,15.1216,2.2197,77.8267,18.7234,2.3005,...,0.0,49.7256,0.9288,0.0,86.7766,0.0,25.6719,0.0,0.1,0.6302
0,5faf8a12-a2aa-44f1-b099-02106766ec94,22.4565,0.0524,139.1817,8.2929,5.5191,75.1033,21.3909,51.1154,12.1788,...,0.0,0.0,0.5836,0.0,0.0,0.0194,9.3738,0.0,0.0563,0.4604
0,4fb72adf-dc84-4945-850a-2d2e585bdf24,27.8195,0.4071,59.9077,17.766,6.3244,20.9362,64.8969,29.5979,6.4962,...,0.0,26.7068,0.4393,0.0,88.6577,0.0,10.5305,0.0,0.2186,0.2385
0,6855a406-c085-45c7-b789-981786f0c775,63.6498,0.0,95.5001,3.1316,4.183,24.6321,25.0933,67.504,15.6196,...,0.0,0.0,0.0681,0.0,0.0,0.0,3.0949,0.0,0.0161,0.393
0,af5b21cf-20d9-4fd6-8b56-0636480d4079,16.0823,1.0132,67.0469,12.6023,4.4674,83.5035,85.2619,35.7134,9.4192,...,0.0,22.1562,0.4228,0.0,72.1893,0.0,9.7937,0.0,0.1036,0.3462


## Reading Clinical and Sample Data Now

In [10]:
sample_data = pd.read_csv("../../gdc_sample_sheet.2022-11-25.tsv", delimiter='\t') #Reading Sample Data
sample_data['filename_short'] = sample_data['File Name'].apply(lambda x: x.split('.')[0]) #Stripping File name (Removing FPKM.txt.gz)
sample_data = sample_data[~sample_data['Sample Type'].isin(['Solid Tissue Normal','Solid Tissue Normal, Solid Tissue Normal','Solid Tissue Normal, Solid Tissue Normal, Solid Tissue Normal'])].reset_index(drop = True)
sample_data.head()

Unnamed: 0,File ID,File Name,Data Category,Data Type,Project ID,Case ID,Sample ID,Sample Type,filename_short
0,1861fc01-237a-4601-9ef6-d39f59a864eb,5e388d08-dbe1-47bb-9adc-c0f77da29ecc.wgs.ASCAT...,Copy Number Variation,Gene Level Copy Number,CPTAC-3,"C3L-02891, C3L-02891","C3L-02891-31, C3L-02891-03","Blood Derived Normal, Primary Tumor",5e388d08-dbe1-47bb-9adc-c0f77da29ecc
1,fea943e7-0d4c-471d-8212-5ce0394e74ce,af72c249-e75f-479c-8d38-a712c1e5f8e6.wgs.ASCAT...,Copy Number Variation,Gene Level Copy Number,CPTAC-3,"C3N-01408, C3N-01408, C3N-01408","C3N-01408-02, C3N-01408-71, C3N-01408-31","Primary Tumor, Blood Derived Normal, Blood Der...",af72c249-e75f-479c-8d38-a712c1e5f8e6
2,818cd8d3-cb35-4bcc-a2cd-424bc56f43dd,1aaa3fb7-26da-4a10-88e7-43070eb38c44.wgs.ASCAT...,Copy Number Variation,Gene Level Copy Number,CPTAC-3,"C3L-00445, C3L-00445","C3L-00445-31, C3L-00445-01","Blood Derived Normal, Primary Tumor",1aaa3fb7-26da-4a10-88e7-43070eb38c44
3,abd77d95-b511-4f29-b0ef-0aa9c4d802b0,9449a9b6-b3f5-46a1-a199-57fa97606e26.wgs.ASCAT...,Copy Number Variation,Gene Level Copy Number,CPTAC-3,"C3N-02237, C3N-02237","C3N-02237-03, C3N-02237-71","Primary Tumor, Blood Derived Normal",9449a9b6-b3f5-46a1-a199-57fa97606e26
4,232e8263-5736-4382-af93-e75368c8ab09,3ba9a0ec-797f-4196-9fe4-f68089aceff1.wgs.ASCAT...,Copy Number Variation,Gene Level Copy Number,CPTAC-3,"C3N-04155, C3N-04155","C3N-04155-71, C3N-04155-01","Blood Derived Normal, Primary Tumor",3ba9a0ec-797f-4196-9fe4-f68089aceff1


In [11]:
#Removing Multiple Case IDs
sample_data['Case_ID_Final'] = sample_data.apply(lambda x: x['Case ID'].split(',')[0], axis = 1)
sample_data.head()

Unnamed: 0,File ID,File Name,Data Category,Data Type,Project ID,Case ID,Sample ID,Sample Type,filename_short,Case_ID_Final
0,1861fc01-237a-4601-9ef6-d39f59a864eb,5e388d08-dbe1-47bb-9adc-c0f77da29ecc.wgs.ASCAT...,Copy Number Variation,Gene Level Copy Number,CPTAC-3,"C3L-02891, C3L-02891","C3L-02891-31, C3L-02891-03","Blood Derived Normal, Primary Tumor",5e388d08-dbe1-47bb-9adc-c0f77da29ecc,C3L-02891
1,fea943e7-0d4c-471d-8212-5ce0394e74ce,af72c249-e75f-479c-8d38-a712c1e5f8e6.wgs.ASCAT...,Copy Number Variation,Gene Level Copy Number,CPTAC-3,"C3N-01408, C3N-01408, C3N-01408","C3N-01408-02, C3N-01408-71, C3N-01408-31","Primary Tumor, Blood Derived Normal, Blood Der...",af72c249-e75f-479c-8d38-a712c1e5f8e6,C3N-01408
2,818cd8d3-cb35-4bcc-a2cd-424bc56f43dd,1aaa3fb7-26da-4a10-88e7-43070eb38c44.wgs.ASCAT...,Copy Number Variation,Gene Level Copy Number,CPTAC-3,"C3L-00445, C3L-00445","C3L-00445-31, C3L-00445-01","Blood Derived Normal, Primary Tumor",1aaa3fb7-26da-4a10-88e7-43070eb38c44,C3L-00445
3,abd77d95-b511-4f29-b0ef-0aa9c4d802b0,9449a9b6-b3f5-46a1-a199-57fa97606e26.wgs.ASCAT...,Copy Number Variation,Gene Level Copy Number,CPTAC-3,"C3N-02237, C3N-02237","C3N-02237-03, C3N-02237-71","Primary Tumor, Blood Derived Normal",9449a9b6-b3f5-46a1-a199-57fa97606e26,C3N-02237
4,232e8263-5736-4382-af93-e75368c8ab09,3ba9a0ec-797f-4196-9fe4-f68089aceff1.wgs.ASCAT...,Copy Number Variation,Gene Level Copy Number,CPTAC-3,"C3N-04155, C3N-04155","C3N-04155-71, C3N-04155-01","Blood Derived Normal, Primary Tumor",3ba9a0ec-797f-4196-9fe4-f68089aceff1,C3N-04155


## Getting in created time of sample

In [12]:
time_df = pd.read_json("../../metadata.cart.2022-11-25.json")
time_df['created_date'] = time_df.apply(lambda x: x['analysis']['created_datetime'].split('T')[0], axis = 1)
time_df = time_df[['file_name','created_date']]
time_df.columns = ['File Name', 'created_date']
time_df.head()

Unnamed: 0,File Name,created_date
0,5e388d08-dbe1-47bb-9adc-c0f77da29ecc.wgs.ASCAT...,2020-09-11
1,af72c249-e75f-479c-8d38-a712c1e5f8e6.wgs.ASCAT...,2020-09-11
2,e64ff57d-0034-49e6-b0fd-bc70425aa047.rna_seq.a...,2022-04-21
3,d9c920d7-47c1-48ca-bc66-8115ea58acfd.rna_seq.a...,2019-07-01
4,3bd0dffc-5a25-4ff8-8dbb-aa96e6070291.rna_seq.a...,2019-03-29


### Adding timestamp to sample data

In [13]:
sample_data = pd.merge(left=sample_data, right=time_df, how='inner', on='File Name')
sample_data.head()

Unnamed: 0,File ID,File Name,Data Category,Data Type,Project ID,Case ID,Sample ID,Sample Type,filename_short,Case_ID_Final,created_date
0,1861fc01-237a-4601-9ef6-d39f59a864eb,5e388d08-dbe1-47bb-9adc-c0f77da29ecc.wgs.ASCAT...,Copy Number Variation,Gene Level Copy Number,CPTAC-3,"C3L-02891, C3L-02891","C3L-02891-31, C3L-02891-03","Blood Derived Normal, Primary Tumor",5e388d08-dbe1-47bb-9adc-c0f77da29ecc,C3L-02891,2020-09-11
1,fea943e7-0d4c-471d-8212-5ce0394e74ce,af72c249-e75f-479c-8d38-a712c1e5f8e6.wgs.ASCAT...,Copy Number Variation,Gene Level Copy Number,CPTAC-3,"C3N-01408, C3N-01408, C3N-01408","C3N-01408-02, C3N-01408-71, C3N-01408-31","Primary Tumor, Blood Derived Normal, Blood Der...",af72c249-e75f-479c-8d38-a712c1e5f8e6,C3N-01408,2020-09-11
2,818cd8d3-cb35-4bcc-a2cd-424bc56f43dd,1aaa3fb7-26da-4a10-88e7-43070eb38c44.wgs.ASCAT...,Copy Number Variation,Gene Level Copy Number,CPTAC-3,"C3L-00445, C3L-00445","C3L-00445-31, C3L-00445-01","Blood Derived Normal, Primary Tumor",1aaa3fb7-26da-4a10-88e7-43070eb38c44,C3L-00445,2020-09-11
3,abd77d95-b511-4f29-b0ef-0aa9c4d802b0,9449a9b6-b3f5-46a1-a199-57fa97606e26.wgs.ASCAT...,Copy Number Variation,Gene Level Copy Number,CPTAC-3,"C3N-02237, C3N-02237","C3N-02237-03, C3N-02237-71","Primary Tumor, Blood Derived Normal",9449a9b6-b3f5-46a1-a199-57fa97606e26,C3N-02237,2020-09-11
4,232e8263-5736-4382-af93-e75368c8ab09,3ba9a0ec-797f-4196-9fe4-f68089aceff1.wgs.ASCAT...,Copy Number Variation,Gene Level Copy Number,CPTAC-3,"C3N-04155, C3N-04155","C3N-04155-71, C3N-04155-01","Blood Derived Normal, Primary Tumor",3ba9a0ec-797f-4196-9fe4-f68089aceff1,C3N-04155,2020-09-11


## Adding Labels to Data

In [14]:
clinical_data = pd.read_csv("../../clinical.cases_selection.2022-10-25/clinical.tsv", delimiter='\t')
clinical_data = clinical_data[['case_submitter_id','primary_diagnosis']]
clinical_data

u = clinical_data.groupby("case_submitter_id").agg(list).reset_index() #Grouping all samples and their diagnosis to a list
u['len'] = u['primary_diagnosis'].apply(lambda x: len(x)) #Getting length of each list to check how many times samples have their clinical data
u['all_equal'] = u['primary_diagnosis'].apply(lambda x: len(set(x))) #Checking if every time all the entries added in primary tumor are the same
u['final_label'] = u['primary_diagnosis'].apply(lambda x: x[0]) #creating the final label for the dataset

sample_data = sample_data.merge(u[['case_submitter_id','final_label']], how='left', left_on='Case_ID_Final', right_on='case_submitter_id')
sample_data.head()

Unnamed: 0,File ID,File Name,Data Category,Data Type,Project ID,Case ID,Sample ID,Sample Type,filename_short,Case_ID_Final,created_date,case_submitter_id,final_label
0,1861fc01-237a-4601-9ef6-d39f59a864eb,5e388d08-dbe1-47bb-9adc-c0f77da29ecc.wgs.ASCAT...,Copy Number Variation,Gene Level Copy Number,CPTAC-3,"C3L-02891, C3L-02891","C3L-02891-31, C3L-02891-03","Blood Derived Normal, Primary Tumor",5e388d08-dbe1-47bb-9adc-c0f77da29ecc,C3L-02891,2020-09-11,C3L-02891,"Squamous cell carcinoma, NOS"
1,fea943e7-0d4c-471d-8212-5ce0394e74ce,af72c249-e75f-479c-8d38-a712c1e5f8e6.wgs.ASCAT...,Copy Number Variation,Gene Level Copy Number,CPTAC-3,"C3N-01408, C3N-01408, C3N-01408","C3N-01408-02, C3N-01408-71, C3N-01408-31","Primary Tumor, Blood Derived Normal, Blood Der...",af72c249-e75f-479c-8d38-a712c1e5f8e6,C3N-01408,2020-09-11,C3N-01408,"Adenocarcinoma, NOS"
2,818cd8d3-cb35-4bcc-a2cd-424bc56f43dd,1aaa3fb7-26da-4a10-88e7-43070eb38c44.wgs.ASCAT...,Copy Number Variation,Gene Level Copy Number,CPTAC-3,"C3L-00445, C3L-00445","C3L-00445-31, C3L-00445-01","Blood Derived Normal, Primary Tumor",1aaa3fb7-26da-4a10-88e7-43070eb38c44,C3L-00445,2020-09-11,C3L-00445,"Squamous cell carcinoma, NOS"
3,abd77d95-b511-4f29-b0ef-0aa9c4d802b0,9449a9b6-b3f5-46a1-a199-57fa97606e26.wgs.ASCAT...,Copy Number Variation,Gene Level Copy Number,CPTAC-3,"C3N-02237, C3N-02237","C3N-02237-03, C3N-02237-71","Primary Tumor, Blood Derived Normal",9449a9b6-b3f5-46a1-a199-57fa97606e26,C3N-02237,2020-09-11,C3N-02237,"Adenocarcinoma, NOS"
4,232e8263-5736-4382-af93-e75368c8ab09,3ba9a0ec-797f-4196-9fe4-f68089aceff1.wgs.ASCAT...,Copy Number Variation,Gene Level Copy Number,CPTAC-3,"C3N-04155, C3N-04155","C3N-04155-71, C3N-04155-01","Blood Derived Normal, Primary Tumor",3ba9a0ec-797f-4196-9fe4-f68089aceff1,C3N-04155,2020-09-11,C3N-04155,"Squamous cell carcinoma, NOS"


## Selecting most recent gene and cnv file for a case ID

In [15]:
sample_data_latest = sample_data.sort_values(by=['Case_ID_Final','Data Category', 'created_date'])
sample_data_latest = sample_data_latest.drop_duplicates(subset=['Case_ID_Final','Data Category'], keep='last').reset_index(drop = True)
sample_data_latest

Unnamed: 0,File ID,File Name,Data Category,Data Type,Project ID,Case ID,Sample ID,Sample Type,filename_short,Case_ID_Final,created_date,case_submitter_id,final_label
0,441aef96-e1eb-43e9-ac91-5e90d1eb4129,7793713b-560d-4c1d-945d-863a7e03e566.wgs.ASCAT...,Copy Number Variation,Gene Level Copy Number,CPTAC-3,"11LU013, 11LU013","9f905736-f662-41d6-b3ac-16758d, c7788b82-8190-...","Primary Tumor, Blood Derived Normal",7793713b-560d-4c1d-945d-863a7e03e566,11LU013,2020-09-11,11LU013,"Adenocarcinoma, NOS"
1,58b434ee-4ef7-45ca-a100-84f43a514697,e50a994e-ffc1-4059-b103-34c6aa5a6c67.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,CPTAC-3,11LU013,9f905736-f662-41d6-b3ac-16758d,Primary Tumor,e50a994e-ffc1-4059-b103-34c6aa5a6c67,11LU013,2019-03-29,11LU013,"Adenocarcinoma, NOS"
2,159a8fbe-53fe-490f-ab79-9640563f107b,8718ef3b-f92d-4761-8da9-eff8222ee7f2.wgs.ASCAT...,Copy Number Variation,Gene Level Copy Number,CPTAC-3,"11LU016, 11LU016","93e30fd5-e57e-4503-a175-863c7d, 953d4247-a33a-...","Primary Tumor, Blood Derived Normal",8718ef3b-f92d-4761-8da9-eff8222ee7f2,11LU016,2020-09-11,11LU016,"Adenocarcinoma, NOS"
3,5dff30b4-5d48-45ca-aafd-3b11a76d2914,17488f1e-ff21-486e-bc32-5aa1379672ae.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,CPTAC-3,11LU016,93e30fd5-e57e-4503-a175-863c7d,Primary Tumor,17488f1e-ff21-486e-bc32-5aa1379672ae,11LU016,2019-03-29,11LU016,"Adenocarcinoma, NOS"
4,3bd39abd-a912-47ef-a65e-27df8aab9b81,1e8cf46a-1b64-45c4-b200-41c766916fba.wgs.ASCAT...,Copy Number Variation,Gene Level Copy Number,CPTAC-3,"11LU022, 11LU022","a55e011d-f91f-4dbd-98e1-328e05, 5a84eae1-197e-...","Blood Derived Normal, Primary Tumor",1e8cf46a-1b64-45c4-b200-41c766916fba,11LU022,2020-09-11,11LU022,"Adenocarcinoma, NOS"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2614,17ddff0c-a5c2-4f82-b638-fe13c20de1d3,385d5ffe-02ae-4fed-8590-b332d08597bb.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-LUSC,TCGA-O2-A5IB,TCGA-O2-A5IB-01A,Primary Tumor,385d5ffe-02ae-4fed-8590-b332d08597bb,TCGA-O2-A5IB,2021-12-13,TCGA-O2-A5IB,"Squamous cell carcinoma, NOS"
2615,fd55e7da-8204-495a-943b-391c2227159e,TCGA-LUAD.198a4ba5-8395-404a-8e9c-25e6bcf008af...,Copy Number Variation,Gene Level Copy Number,TCGA-LUAD,"TCGA-S2-AA1A, TCGA-S2-AA1A","TCGA-S2-AA1A-01A, TCGA-S2-AA1A-10A","Primary Tumor, Blood Derived Normal",TCGA-LUAD,TCGA-S2-AA1A,2020-03-11,TCGA-S2-AA1A,"Bronchio-alveolar carcinoma, mucinous"
2616,deb2d529-5ea7-4a41-be07-f9239d7ac759,5d081953-cf3a-40cb-95bf-f418ed3879ac.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-LUAD,TCGA-S2-AA1A,TCGA-S2-AA1A-01A,Primary Tumor,5d081953-cf3a-40cb-95bf-f418ed3879ac,TCGA-S2-AA1A,2021-12-13,TCGA-S2-AA1A,"Bronchio-alveolar carcinoma, mucinous"
2617,9e5ffe15-cdbf-4593-915e-a655d1da1c91,TCGA-LUSC.2f00e27e-99a1-4334-85f3-730cc2c84c27...,Copy Number Variation,Gene Level Copy Number,TCGA-LUSC,"TCGA-XC-AA0X, TCGA-XC-AA0X","TCGA-XC-AA0X-01A, TCGA-XC-AA0X-10A","Primary Tumor, Blood Derived Normal",TCGA-LUSC,TCGA-XC-AA0X,2020-03-11,TCGA-XC-AA0X,"Squamous cell carcinoma, NOS"


### Keeping only those patients that have both CNV and Gene Expression data

In [16]:
test = sample_data_latest.groupby('Case_ID_Final').agg({"File Name":'count'})
removed_list = set(test[test['File Name']<2].reset_index()['Case_ID_Final'])

sample_data_latest = sample_data_latest[~sample_data_latest['Case_ID_Final'].isin(removed_list)].reset_index(drop = True)
sample_data_latest

Unnamed: 0,File ID,File Name,Data Category,Data Type,Project ID,Case ID,Sample ID,Sample Type,filename_short,Case_ID_Final,created_date,case_submitter_id,final_label
0,441aef96-e1eb-43e9-ac91-5e90d1eb4129,7793713b-560d-4c1d-945d-863a7e03e566.wgs.ASCAT...,Copy Number Variation,Gene Level Copy Number,CPTAC-3,"11LU013, 11LU013","9f905736-f662-41d6-b3ac-16758d, c7788b82-8190-...","Primary Tumor, Blood Derived Normal",7793713b-560d-4c1d-945d-863a7e03e566,11LU013,2020-09-11,11LU013,"Adenocarcinoma, NOS"
1,58b434ee-4ef7-45ca-a100-84f43a514697,e50a994e-ffc1-4059-b103-34c6aa5a6c67.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,CPTAC-3,11LU013,9f905736-f662-41d6-b3ac-16758d,Primary Tumor,e50a994e-ffc1-4059-b103-34c6aa5a6c67,11LU013,2019-03-29,11LU013,"Adenocarcinoma, NOS"
2,159a8fbe-53fe-490f-ab79-9640563f107b,8718ef3b-f92d-4761-8da9-eff8222ee7f2.wgs.ASCAT...,Copy Number Variation,Gene Level Copy Number,CPTAC-3,"11LU016, 11LU016","93e30fd5-e57e-4503-a175-863c7d, 953d4247-a33a-...","Primary Tumor, Blood Derived Normal",8718ef3b-f92d-4761-8da9-eff8222ee7f2,11LU016,2020-09-11,11LU016,"Adenocarcinoma, NOS"
3,5dff30b4-5d48-45ca-aafd-3b11a76d2914,17488f1e-ff21-486e-bc32-5aa1379672ae.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,CPTAC-3,11LU016,93e30fd5-e57e-4503-a175-863c7d,Primary Tumor,17488f1e-ff21-486e-bc32-5aa1379672ae,11LU016,2019-03-29,11LU016,"Adenocarcinoma, NOS"
4,3bd39abd-a912-47ef-a65e-27df8aab9b81,1e8cf46a-1b64-45c4-b200-41c766916fba.wgs.ASCAT...,Copy Number Variation,Gene Level Copy Number,CPTAC-3,"11LU022, 11LU022","a55e011d-f91f-4dbd-98e1-328e05, 5a84eae1-197e-...","Blood Derived Normal, Primary Tumor",1e8cf46a-1b64-45c4-b200-41c766916fba,11LU022,2020-09-11,11LU022,"Adenocarcinoma, NOS"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2587,17ddff0c-a5c2-4f82-b638-fe13c20de1d3,385d5ffe-02ae-4fed-8590-b332d08597bb.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-LUSC,TCGA-O2-A5IB,TCGA-O2-A5IB-01A,Primary Tumor,385d5ffe-02ae-4fed-8590-b332d08597bb,TCGA-O2-A5IB,2021-12-13,TCGA-O2-A5IB,"Squamous cell carcinoma, NOS"
2588,fd55e7da-8204-495a-943b-391c2227159e,TCGA-LUAD.198a4ba5-8395-404a-8e9c-25e6bcf008af...,Copy Number Variation,Gene Level Copy Number,TCGA-LUAD,"TCGA-S2-AA1A, TCGA-S2-AA1A","TCGA-S2-AA1A-01A, TCGA-S2-AA1A-10A","Primary Tumor, Blood Derived Normal",TCGA-LUAD,TCGA-S2-AA1A,2020-03-11,TCGA-S2-AA1A,"Bronchio-alveolar carcinoma, mucinous"
2589,deb2d529-5ea7-4a41-be07-f9239d7ac759,5d081953-cf3a-40cb-95bf-f418ed3879ac.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-LUAD,TCGA-S2-AA1A,TCGA-S2-AA1A-01A,Primary Tumor,5d081953-cf3a-40cb-95bf-f418ed3879ac,TCGA-S2-AA1A,2021-12-13,TCGA-S2-AA1A,"Bronchio-alveolar carcinoma, mucinous"
2590,9e5ffe15-cdbf-4593-915e-a655d1da1c91,TCGA-LUSC.2f00e27e-99a1-4334-85f3-730cc2c84c27...,Copy Number Variation,Gene Level Copy Number,TCGA-LUSC,"TCGA-XC-AA0X, TCGA-XC-AA0X","TCGA-XC-AA0X-01A, TCGA-XC-AA0X-10A","Primary Tumor, Blood Derived Normal",TCGA-LUSC,TCGA-XC-AA0X,2020-03-11,TCGA-XC-AA0X,"Squamous cell carcinoma, NOS"


## Bucketing into ALC and SCLC and removing Solid Tumours which can be mixed

In [17]:
sample_data_latest = sample_data_latest[sample_data_latest['final_label']!='Solid carcinoma, NOS'].reset_index(drop = True)

In [18]:
def mapping(x):
    ALC = ['Adenocarcinoma with mixed subtypes','Adenocarcinoma, NOS','Bronchio-alveolar carcinoma, mucinous','Bronchiolo-alveolar adenocarcinoma, NOS',
        'Bronchiolo-alveolar carcinoma, non-mucinous','Clear cell adenocarcinoma, NOS','Micropapillary carcinoma, NOS','Papillary adenocarcinoma, NOS']
    SCLC = ['Basaloid squamous cell carcinoma','Papillary squamous cell carcinoma','Squamous cell carcinoma, NOS','Squamous cell carcinoma, keratinizing, NOS',
        'Squamous cell carcinoma, large cell, nonkeratinizing, NOS','Squamous cell carcinoma, small cell, nonkeratinizing']
    
    if x in ALC:
        return 'ALC'
    else:
        return 'SCLC'

In [19]:
sample_data_latest['LABEL'] = sample_data_latest.apply(lambda x: mapping(x['final_label']), axis = 1)
sample_data_latest.head()

Unnamed: 0,File ID,File Name,Data Category,Data Type,Project ID,Case ID,Sample ID,Sample Type,filename_short,Case_ID_Final,created_date,case_submitter_id,final_label,LABEL
0,441aef96-e1eb-43e9-ac91-5e90d1eb4129,7793713b-560d-4c1d-945d-863a7e03e566.wgs.ASCAT...,Copy Number Variation,Gene Level Copy Number,CPTAC-3,"11LU013, 11LU013","9f905736-f662-41d6-b3ac-16758d, c7788b82-8190-...","Primary Tumor, Blood Derived Normal",7793713b-560d-4c1d-945d-863a7e03e566,11LU013,2020-09-11,11LU013,"Adenocarcinoma, NOS",ALC
1,58b434ee-4ef7-45ca-a100-84f43a514697,e50a994e-ffc1-4059-b103-34c6aa5a6c67.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,CPTAC-3,11LU013,9f905736-f662-41d6-b3ac-16758d,Primary Tumor,e50a994e-ffc1-4059-b103-34c6aa5a6c67,11LU013,2019-03-29,11LU013,"Adenocarcinoma, NOS",ALC
2,159a8fbe-53fe-490f-ab79-9640563f107b,8718ef3b-f92d-4761-8da9-eff8222ee7f2.wgs.ASCAT...,Copy Number Variation,Gene Level Copy Number,CPTAC-3,"11LU016, 11LU016","93e30fd5-e57e-4503-a175-863c7d, 953d4247-a33a-...","Primary Tumor, Blood Derived Normal",8718ef3b-f92d-4761-8da9-eff8222ee7f2,11LU016,2020-09-11,11LU016,"Adenocarcinoma, NOS",ALC
3,5dff30b4-5d48-45ca-aafd-3b11a76d2914,17488f1e-ff21-486e-bc32-5aa1379672ae.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,CPTAC-3,11LU016,93e30fd5-e57e-4503-a175-863c7d,Primary Tumor,17488f1e-ff21-486e-bc32-5aa1379672ae,11LU016,2019-03-29,11LU016,"Adenocarcinoma, NOS",ALC
4,3bd39abd-a912-47ef-a65e-27df8aab9b81,1e8cf46a-1b64-45c4-b200-41c766916fba.wgs.ASCAT...,Copy Number Variation,Gene Level Copy Number,CPTAC-3,"11LU022, 11LU022","a55e011d-f91f-4dbd-98e1-328e05, 5a84eae1-197e-...","Blood Derived Normal, Primary Tumor",1e8cf46a-1b64-45c4-b200-41c766916fba,11LU022,2020-09-11,11LU022,"Adenocarcinoma, NOS",ALC


In [20]:
final_labels = sample_data_latest[['File ID','File Name','Case_ID_Final','LABEL','final_label','Project ID']]
final_labels = final_labels[final_labels['Project ID'] != 'CPTAC-3'].reset_index(drop = True)
final_labels.to_csv("../../data/final_labels.csv",index = False)
final_labels.head()

Unnamed: 0,File ID,File Name,Case_ID_Final,LABEL,final_label,Project ID
0,c757c1b8-044a-4780-8189-aae4092e77a9,TCGA-LUAD.e29ae073-4f2d-4159-b64b-8ab0400ce788...,TCGA-05-4244,ALC,"Adenocarcinoma, NOS",TCGA-LUAD
1,ef337612-6a73-4c29-a8b0-85557cbeaff4,e0e055b6-6800-40e7-bde5-718823408f0c.rna_seq.a...,TCGA-05-4244,ALC,"Adenocarcinoma, NOS",TCGA-LUAD
2,6f64ce44-af1b-4444-a236-4c745f1a8025,TCGA-LUAD.65e93af7-f914-4fe6-bc55-1bd5fcfb5eb1...,TCGA-05-4249,ALC,"Adenocarcinoma, NOS",TCGA-LUAD
3,320660cd-100a-4f5a-a604-4de3ced7f042,258b0b5e-2b09-4378-9606-83955ca19d7c.rna_seq.a...,TCGA-05-4249,ALC,"Adenocarcinoma, NOS",TCGA-LUAD
4,9747f770-3192-4d3b-8a95-65c5cae8fb2b,TCGA-LUAD.47ed8f8d-e76a-41e8-8bd7-580f3e0a589e...,TCGA-05-4250,ALC,"Adenocarcinoma, NOS",TCGA-LUAD


## Filtering CNV_master_df

#### Using a reference for encoding copy number values, I referenced this paper to encode the copy number of a gene

In [21]:
from gtfparse import read_gtf
gtf = read_gtf("../../data/Homo_sapiens.GRCh38.108.chr.gtf")
gtf = gtf[gtf['transcript_biotype']=='protein_coding'].reset_index(drop = True)
gtf[['gene_name','transcript_id','transcript_version']]

#Subsetting all protein coding genes
protein_coding = set(gtf['gene_name'])
a = master_cnv_df.iloc[:,1:].columns
protein_coding_selected = list(a.intersection(protein_coding))
protein_coding_selected.insert(0,'file_name')

#protein_coding_selected

INFO:root:Extracted GTF attributes: ['gene_id', 'gene_version', 'gene_name', 'gene_source', 'gene_biotype', 'transcript_id', 'transcript_version', 'transcript_name', 'transcript_source', 'transcript_biotype', 'tag', 'ccds_id', 'exon_number', 'exon_id', 'exon_version', 'protein_id', 'protein_version', 'transcript_support_level']


In [22]:
#Subsetting only those file ids that are required
master_cnv_df_sub = master_cnv_df[master_cnv_df['file_name'].isin(final_labels['File ID'])].reset_index(drop = True)

#getting all protein coding genes only
master_cnv_df_sub = master_cnv_df_sub[protein_coding_selected]

#Removing all Nans
master_cnv_df_sub.dropna(axis=1, how='all',inplace = True)

#Adding Zero Value for al NA
master_cnv_df_sub.fillna(0, inplace = True)

#Decribing the remaining
master_cnv_df_sub.head()


gene_name,file_name,OR4F5,OR4F29,OR4F16,SAMD11,NOC2L,KLHL17,PLEKHN1,PERM1,HES4,...,PRY,BPY2,DAZ1,DAZ2,CDY1B,BPY2B,DAZ3,DAZ4,BPY2C,CDY1
0,41fc6849-724a-4bac-8775-5e703fe74184,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,0f6b347c-30da-495b-b103-36af30df77d7,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,6568ed2b-0018-4750-9b3b-6c414dba60ae,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
3,e18b7869-0049-4be7-9611-d08db28df33d,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,361cc367-f85d-402a-a19d-999d33f7667a,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
master_cnv_df_sub.to_csv("../../data/master_cnv_df.csv",index = False)

### Subsetting protein coding genes for Gene Expression dataset

In [24]:
master_gene_df.head()

gene_name,file_name,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,...,AL451106.1,AC092910.4,AC073611.1,AC136977.1,AC078856.1,AC008763.4,AL592295.6,AC006486.3,AL391628.1,AP006621.6
0,756a1731-8c8d-4d96-8e22-3f382275a48b,15.3403,0.6521,72.8801,31.7828,15.1216,2.2197,77.8267,18.7234,2.3005,...,0.0,49.7256,0.9288,0.0,86.7766,0.0,25.6719,0.0,0.1,0.6302
0,5faf8a12-a2aa-44f1-b099-02106766ec94,22.4565,0.0524,139.1817,8.2929,5.5191,75.1033,21.3909,51.1154,12.1788,...,0.0,0.0,0.5836,0.0,0.0,0.0194,9.3738,0.0,0.0563,0.4604
0,4fb72adf-dc84-4945-850a-2d2e585bdf24,27.8195,0.4071,59.9077,17.766,6.3244,20.9362,64.8969,29.5979,6.4962,...,0.0,26.7068,0.4393,0.0,88.6577,0.0,10.5305,0.0,0.2186,0.2385
0,6855a406-c085-45c7-b789-981786f0c775,63.6498,0.0,95.5001,3.1316,4.183,24.6321,25.0933,67.504,15.6196,...,0.0,0.0,0.0681,0.0,0.0,0.0,3.0949,0.0,0.0161,0.393
0,af5b21cf-20d9-4fd6-8b56-0636480d4079,16.0823,1.0132,67.0469,12.6023,4.4674,83.5035,85.2619,35.7134,9.4192,...,0.0,22.1562,0.4228,0.0,72.1893,0.0,9.7937,0.0,0.1036,0.3462


In [25]:
all_genes = set(master_gene_df.iloc[:,1:].columns)
selected_genes = list(all_genes.intersection(protein_coding))
selected_genes.insert(0,'file_name')
len(selected_genes)

19160

In [26]:
master_gene_df_sub = master_gene_df[master_gene_df['file_name'].isin(final_labels['File ID'])].reset_index(drop = True)

#Selecting ony selected genes
master_gene_df_sub = master_gene_df_sub[selected_genes]

#Removing genes with all zero values
master_gene_df_sub.dropna(axis=1, how='all',inplace = True)

master_gene_df_sub

master_gene_df_sub.to_csv("../../data/master_gene_df.csv",index=False)

In [29]:
master_gene_df

gene_name,file_name,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,...,AL451106.1,AC092910.4,AC073611.1,AC136977.1,AC078856.1,AC008763.4,AL592295.6,AC006486.3,AL391628.1,AP006621.6
0,756a1731-8c8d-4d96-8e22-3f382275a48b,15.3403,0.6521,72.8801,31.7828,15.1216,2.2197,77.8267,18.7234,2.3005,...,0.0,49.7256,0.9288,0.0,86.7766,0.0000,25.6719,0.0,0.1000,0.6302
0,5faf8a12-a2aa-44f1-b099-02106766ec94,22.4565,0.0524,139.1817,8.2929,5.5191,75.1033,21.3909,51.1154,12.1788,...,0.0,0.0000,0.5836,0.0,0.0000,0.0194,9.3738,0.0,0.0563,0.4604
0,4fb72adf-dc84-4945-850a-2d2e585bdf24,27.8195,0.4071,59.9077,17.7660,6.3244,20.9362,64.8969,29.5979,6.4962,...,0.0,26.7068,0.4393,0.0,88.6577,0.0000,10.5305,0.0,0.2186,0.2385
0,6855a406-c085-45c7-b789-981786f0c775,63.6498,0.0000,95.5001,3.1316,4.1830,24.6321,25.0933,67.5040,15.6196,...,0.0,0.0000,0.0681,0.0,0.0000,0.0000,3.0949,0.0,0.0161,0.3930
0,af5b21cf-20d9-4fd6-8b56-0636480d4079,16.0823,1.0132,67.0469,12.6023,4.4674,83.5035,85.2619,35.7134,9.4192,...,0.0,22.1562,0.4228,0.0,72.1893,0.0000,9.7937,0.0,0.1036,0.3462
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,d1f98c65-fd27-4893-a5c5-bc0bfe16ac5f,15.1731,0.0649,78.0779,6.3348,1.1705,144.4064,24.4287,48.4555,7.4697,...,0.0,0.7977,0.3191,0.0,0.0000,0.0120,12.1575,0.0,0.0597,0.3419
0,f9cd1c16-3be3-415c-8d7b-9249c3b1c7fa,57.0216,0.0388,146.1812,8.7416,10.4812,6.0600,15.7757,69.9076,345.9039,...,0.0,0.0000,0.2913,0.0,0.0000,0.0000,5.3225,0.0,0.0238,0.7156
0,218eb2f2-24b1-4e41-9071-26ed97a2edc1,38.5318,0.0000,112.0636,9.6324,2.6744,4.0178,217.8734,30.7166,1.5045,...,0.0,0.0000,0.1888,0.0,0.0000,0.0000,11.4991,0.0,0.0471,1.7188
0,978939fa-a27f-46ff-9120-7a43e3588b28,16.5019,1.7162,40.1894,16.0273,9.8014,7.6397,58.2959,21.4980,21.0895,...,0.0,10.5545,0.5334,0.0,41.5260,0.0000,23.2702,0.0,0.0395,0.0754


In [39]:
final_labels.groupby('LABEL').agg({'Case_ID_Final':'nunique'})

Unnamed: 0_level_0,Case_ID_Final
LABEL,Unnamed: 1_level_1
ALC,463
SCLC,491
