# Analysis of the Cancer dataset

### The task is to infer the relationship between types of cancer and the features of the dataset.

### The main challenge is to use only relevant and useful features.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import plotly.express as px

### Loading the data

In [2]:
df_donor_info = pd.read_excel('pcawg_donor_clinical_August2016_v9.xlsx')

In [3]:
df_rel = pd.read_csv('pcawg_sample_sheet.tsv', sep='\t')

In [4]:
df_transcription = pd.read_csv('pcawg_rnaseq_gene_expr_tpm.tsv.gz', sep='\t', compression='gzip', header='infer')

In [5]:
df_driver_mutations = pd.read_csv('TableS3_panorama_driver_mutations_ICGC_samples.public.tsv.gz', sep='\t', compression='gzip', header='infer')

In [6]:
df_signature = pd.read_csv('SignatureAnalyzer_COMPOSITE.SBS.txt.gz', sep='\t', compression='gzip', header='infer')

### Take a look at the data

In [7]:
df_signature

Unnamed: 0,signature,feature,Biliary_AdenoCA__SP117655,Biliary_AdenoCA__SP117556,Biliary_AdenoCA__SP117627,Biliary_AdenoCA__SP117775,Biliary_AdenoCA__SP117332,Biliary_AdenoCA__SP117712,Biliary_AdenoCA__SP117017,Biliary_AdenoCA__SP117031,...,Skin_Melanoma__SP104056,Skin_Melanoma__SP83083,Skin_Melanoma__SP82433,Skin_Melanoma__SP82780,Skin_Melanoma__SP83019,Skin_Melanoma__SP83099,Skin_Melanoma__SP83146,Skin_Melanoma__SP103866,Skin_Melanoma__SP83844,Skin_Melanoma__SP83027
0,BI_COMPOSITE_SNV_SBS1_P,T>G_at_TTT,0.023508,0.058562,0.098343,0.054648,0.066848,0.014144,0.098530,0.055715,...,0.000860,0.004672,0.017219,0.006761,0.003202,0.012528,0.001080,0.003147,0.002654,0.036375
1,BI_COMPOSITE_SNV_SBS1_P,T>G_at_GTT,0.004766,0.016927,0.031589,0.013261,0.018877,0.003355,0.036790,0.011522,...,0.000149,0.000932,0.001993,0.000872,0.000482,0.002394,0.000175,0.000545,0.000454,0.009805
2,BI_COMPOSITE_SNV_SBS1_P,T>G_at_CTT,0.000024,0.000088,0.000164,0.000076,0.000140,0.000017,0.000195,0.000058,...,0.000005,0.000024,0.000053,0.000032,0.000012,0.000044,0.000005,0.000012,0.000011,0.000082
3,BI_COMPOSITE_SNV_SBS1_P,T>G_at_ATT,0.024124,0.067120,0.099988,0.057556,0.067954,0.014029,0.126438,0.062543,...,0.001364,0.008305,0.013511,0.006454,0.004417,0.017503,0.001853,0.004373,0.003429,0.044664
4,BI_COMPOSITE_SNV_SBS1_P,T>G_at_TTG,0.017716,0.040233,0.082326,0.031971,0.029459,0.007126,0.063224,0.040027,...,0.000902,0.004680,0.005584,0.003231,0.003003,0.011053,0.001184,0.002245,0.002447,0.014844
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5755,BI_COMPOSITE_SNV_SBS83_P,C>T_at_GCT,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5756,BI_COMPOSITE_SNV_SBS83_P,C>T_at_TCA,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5757,BI_COMPOSITE_SNV_SBS83_P,C>T_at_TCC,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5758,BI_COMPOSITE_SNV_SBS83_P,C>T_at_TCG,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [8]:
df_driver_mutations

Unnamed: 0,sample_id,ttype,chr,pos,ref,alt,gene,driver,driver_statement,category,top_category,biallelic
0,03c3c692-8a86-4843-85ae-e045f0fa6f88,Panc-AdenoCA,x,x,x,x,TSC1,known,known,coding_tsg_breakpoint,SV,no
1,09497b9b-6fca-48cb-af97-161a3e434a51,Eso-AdenoCa,x,x,x,x,NOTCH1,known,known,coding_tsg_breakpoint,SV,no
2,0bfd1043-8170-e3e4-e050-11ac0c4860c5,Prost-AdenoCA,x,x,x,x,CBLB,known,known,coding_tsg_breakpoint,SV,no
3,0bfd1043-8183-e3e4-e050-11ac0c4860c5,Prost-AdenoCA,x,x,x,x,PTEN,known,known,coding_tsg_breakpoint,SV,no
4,0e872e0d-4711-4364-a5d0-6beeb6fd3ff2,Eso-AdenoCa,x,x,x,x,ASXL1,known,known,coding_tsg_breakpoint,SV,no
...,...,...,...,...,...,...,...,...,...,...,...,...
8464,fca07e09-1fd2-9ef5-e040-11ac0d485bbd,Breast-AdenoCa,x,x,x,x,DEPDC1B,known,known,coding_tsg_breakpoint,SV,SDeletion/SSV
8465,fca07e09-1fd2-9ef5-e040-11ac0d485bbd,Breast-AdenoCa,x,x,x,x,FBXW7,known,known,coding_tsg_breakpoint,SV,SDeletion/SSV
8466,fca07e09-1fd2-9ef5-e040-11ac0d485bbd,Breast-AdenoCa,x,x,x,x,PTEN,known,known,coding_tsg_breakpoint,SV,SDeletion/SSV
8467,fca07e09-1fd2-9ef5-e040-11ac0d485bbd,Breast-AdenoCa,x,x,x,x,RAD51B,known,known,coding_tsg_breakpoint,SV,SDeletion/SSV


In [9]:
df_transcription

Unnamed: 0,Symbol,Gene,SA136091,SA138738,SA141272,SA144576,SA146105,SA151709,SA155940,SA164579,...,SA512386,SA442206,SA440047,SA441129,SA513126,SA511031,SA511841,SA443538,SA443538.1,SA551062
0,A1BG,ENSG00000121410,13.28,6.02,6.51,2.25,11.39,6.82,1.53,6.13,...,0.52,0.27,2.37,5.67,1.88,0.43,0.89,0.96,0.72,10.50
1,A1CF,ENSG00000148584,0.02,0.06,0.01,0.02,0.02,0.07,0.06,0.05,...,5.76,0.75,0.49,20.99,0.45,3.85,6.66,3.76,0.03,0.04
2,A2M,ENSG00000175899,214.83,934.31,447.41,2113.66,173.22,1178.25,328.41,224.15,...,101.22,56.02,668.34,509.76,257.15,41.01,170.62,306.56,57.40,110.72
3,A2ML1,ENSG00000166535,0.38,0.74,1.08,5.66,2.33,1.35,0.87,1.05,...,0.59,0.80,0.65,0.69,0.76,0.64,1.03,91.97,1538.59,0.25
4,A3GALT2,ENSG00000184389,0.29,0.19,0.18,0.00,0.06,0.13,0.04,0.03,...,0.09,0.23,0.21,0.09,0.13,0.00,0.25,0.00,0.11,0.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19616,ZYG11B,ENSG00000162378,13.94,8.59,9.35,15.32,9.67,12.02,13.68,13.24,...,9.02,10.19,13.38,15.89,13.12,9.72,11.24,9.60,7.99,9.68
19617,ZYX,ENSG00000159840,725.65,600.51,206.38,244.78,183.42,292.85,152.48,281.71,...,165.15,172.15,227.18,400.57,191.56,121.46,233.90,254.49,114.33,125.33
19618,ZZEF1,ENSG00000074755,8.29,7.76,6.76,8.69,4.06,20.99,7.87,7.53,...,19.09,12.34,24.18,12.50,26.81,10.94,12.43,13.35,10.17,6.67
19619,ZZZ3,ENSG00000036549,17.75,15.04,12.53,29.33,14.00,14.66,13.44,13.65,...,18.51,13.52,16.40,27.72,25.66,17.40,20.71,24.63,8.14,12.03


In [10]:
df_rel

Unnamed: 0,donor_unique_id,donor_wgs_exclusion_white_gray,submitter_donor_id,icgc_donor_id,dcc_project_code,aliquot_id,submitter_specimen_id,icgc_specimen_id,submitter_sample_id,icgc_sample_id,dcc_specimen_type,library_strategy
0,BLCA-US::096b4f32-10c1-4737-a0dd-cae04c54ee33,Whitelist,096b4f32-10c1-4737-a0dd-cae04c54ee33,DO804,BLCA-US,e0fccaf5-925a-41f9-b87c-cd5ee4aecb59,27461a27-26eb-4c2c-9c54-e16fbd32c615,SP1682,e0fccaf5-925a-41f9-b87c-cd5ee4aecb59,SA5237,Normal - solid tissue,WGS
1,BLCA-US::096b4f32-10c1-4737-a0dd-cae04c54ee33,Whitelist,096b4f32-10c1-4737-a0dd-cae04c54ee33,DO804,BLCA-US,301d6ce3-4099-4c1d-8e50-c04b7ce91450,52f538ef-b05d-4c76-9976-ce6d49158016,SP1677,301d6ce3-4099-4c1d-8e50-c04b7ce91450,SA5195,Primary tumour - solid tissue,WGS
2,BLCA-US::096b4f32-10c1-4737-a0dd-cae04c54ee33,Whitelist,096b4f32-10c1-4737-a0dd-cae04c54ee33,DO804,BLCA-US,22e154de-0e3b-443b-8420-48d68d6c1ce4,52f538ef-b05d-4c76-9976-ce6d49158016,SP1677,22e154de-0e3b-443b-8420-48d68d6c1ce4,SA5213,Primary tumour - solid tissue,RNA-Seq
3,BLCA-US::178b28cd-99c3-48dc-8d09-1ef71b4cee80,Whitelist,178b28cd-99c3-48dc-8d09-1ef71b4cee80,DO555,BLCA-US,c1da8eed-4919-4ba5-a735-3fba476c18a7,cd3cfb26-e66f-408e-81f6-3b61c247c976,SP1135,c1da8eed-4919-4ba5-a735-3fba476c18a7,SA1598,Normal - blood derived,WGS
4,BLCA-US::178b28cd-99c3-48dc-8d09-1ef71b4cee80,Whitelist,178b28cd-99c3-48dc-8d09-1ef71b4cee80,DO555,BLCA-US,4838b5a9-968c-4178-bffb-3fafe1f6dc09,59d6683f-5eb7-493d-8e8e-78b88be2cd70,SP1132,4838b5a9-968c-4178-bffb-3fafe1f6dc09,SA1556,Primary tumour - solid tissue,WGS
...,...,...,...,...,...,...,...,...,...,...,...,...
7250,UCEC-US::fba80122-d8b2-4d8d-a032-9767e8160f9f,Whitelist,fba80122-d8b2-4d8d-a032-9767e8160f9f,DO42544,UCEC-US,e54b7e44-82a3-4016-bc32-129799097b4c,ddd2f9e0-0aa3-425b-9d74-74fcc638cb08,SP92947,e54b7e44-82a3-4016-bc32-129799097b4c,SA462448,Primary tumour - solid tissue,RNA-Seq
7251,UCEC-US::fba80122-d8b2-4d8d-a032-9767e8160f9f,Whitelist,fba80122-d8b2-4d8d-a032-9767e8160f9f,DO42544,UCEC-US,ce5b0ba0-2777-4c92-ac50-483174cc5dca,cad6c89d-f722-470b-93bc-e8b24c033f0f,SP92955,ce5b0ba0-2777-4c92-ac50-483174cc5dca,SA462509,Normal - tissue adjacent to primary,RNA-Seq
7252,UCEC-US::ffaa98a0-2b69-46dc-aee5-c5c3f2abbc38,Whitelist,ffaa98a0-2b69-46dc-aee5-c5c3f2abbc38,DO42432,UCEC-US,47f826a1-96ed-4f4d-94e0-49f4460ef44f,2729ed97-f971-4d98-8baa-f99404dd2b9f,SP92731,47f826a1-96ed-4f4d-94e0-49f4460ef44f,SA461078,Normal - blood derived,WGS
7253,UCEC-US::ffaa98a0-2b69-46dc-aee5-c5c3f2abbc38,Whitelist,ffaa98a0-2b69-46dc-aee5-c5c3f2abbc38,DO42432,UCEC-US,712ba532-fb1a-43fa-a356-b446b509ceb7,8bb3a057-8958-4f62-af81-976da2e92df7,SP92723,712ba532-fb1a-43fa-a356-b446b509ceb7,SA461016,Primary tumour - solid tissue,WGS


In [11]:
df_donor_info

Unnamed: 0,# donor_unique_id,project_code,icgc_donor_id,submitted_donor_id,tcga_donor_uuid,donor_sex,donor_vital_status,donor_diagnosis_icd10,first_therapy_type,first_therapy_response,donor_age_at_diagnosis,donor_survival_time,donor_interval_of_last_followup,tobacco_smoking_history_indicator,tobacco_smoking_intensity,alcohol_history,alcohol_history_intensity,donor_wgs_included_excluded
0,BRCA-UK::CGP_donor_1114930,BRCA-UK,DO1000,CGP_donor_1114930,,female,alive,,other therapy,,61.0,,,Smoking history not documented,,Don't know/Not sure,Not Documented,Included
1,BRCA-UK::CGP_donor_1069291,BRCA-UK,DO1001,CGP_donor_1069291,,female,,,other therapy,,41.0,,,Smoking history not documented,,Don't know/Not sure,Not Documented,Included
2,BRCA-UK::CGP_donor_1114881,BRCA-UK,DO1002,CGP_donor_1114881,,female,alive,,other therapy,unknown,39.0,,,Smoking history not documented,,Don't know/Not sure,Not Documented,Included
3,BRCA-UK::CGP_donor_1114929,BRCA-UK,DO1003,CGP_donor_1114929,,female,alive,C50.4,chemotherapy,unknown,34.0,,,Smoking history not documented,,Don't know/Not sure,Not Documented,Included
4,BRCA-UK::CGP_donor_1167078,BRCA-UK,DO1004,CGP_donor_1167078,,female,deceased,,other therapy,,59.0,,0.0,Smoking history not documented,,Don't know/Not sure,Not Documented,Included
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2829,COAD-US::b08b5f49-9434-4653-9772-097ec29b2ca3,COAD-US,DO9708,TCGA-D5-6540,b08b5f49-9434-4653-9772-097ec29b2ca3,male,alive,C18.0,,,66.0,,186.0,,,,,GrayList
2830,COAD-US::bfb07784-693b-4c25-874e-4ad6e04a5d46,COAD-US,DO9732,TCGA-AA-3529,bfb07784-693b-4c25-874e-4ad6e04a5d46,female,deceased,C18.7,,,78.0,0.0,,,,,,Included
2831,COAD-US::7d8eab0a-e6c8-4449-9ebf-50c41db94a06,COAD-US,DO9788,TCGA-A6-2681,7d8eab0a-e6c8-4449-9ebf-50c41db94a06,female,alive,C18.9,,,73.0,,552.0,,,,,Included
2832,COAD-US::e457344d-76fb-46bf-b362-61a6e811d131,COAD-US,DO9876,TCGA-AA-A00N,e457344d-76fb-46bf-b362-61a6e811d131,male,deceased,C18.0,,,75.0,122.0,,,,,,Included


### After basic exploration of the data, I concluded that the most efficient way to analise it is PCA of transcription levels.

#### The ```df_transcription``` dataframe contains the transcription levels of around 20,000 genes for each sample. PCA will allow us to observe the relation between different types of cancer.

In [15]:
df_transcription_rest = df_transcription.drop(columns=['Symbol', 'Gene'])

In [13]:
df_transcription_array = df_transcription_rest.to_numpy().transpose()   

In [16]:
normed_matrix = StandardScaler().fit_transform(df_transcription_array)

In [17]:
pca = PCA(n_components=3)
pca_features = pca.fit_transform(normed_matrix)

pca_df = pd.DataFrame(data = pca_features, columns = ['PC1', 'PC2', 'PC3'])

### Connecting PCA of transcription levels with the types of cancer.

In [18]:
transaction_dict = dict(zip(df_rel['icgc_sample_id'], df_rel['dcc_project_code']))

In [19]:
pca_df['samples'] = df_transcription.columns[2:]
pca_df['cancer_type'] = pca_df['samples'].map(transaction_dict)

In [20]:
pca_df

Unnamed: 0,PC1,PC2,PC3,samples,cancer_type
0,29.093405,-64.995288,-36.259665,SA136091,GBM-US
1,7.918782,-34.429618,-14.369309,SA138738,GBM-US
2,11.502220,-46.810274,-30.764988,SA141272,GBM-US
3,21.022601,-54.168199,-11.752567,SA144576,GBM-US
4,-7.712671,-33.636717,-10.936530,SA146105,GBM-US
...,...,...,...,...,...
1354,0.180999,-13.531088,-2.575378,SA511031,STAD-US
1355,23.838257,-22.776924,-11.049217,SA511841,STAD-US
1356,-1.690245,-9.127438,-1.160509,SA443538,STAD-US
1357,-36.553516,1.510105,5.093141,SA443538.1,


### Plotting the PCA of transcription levels

In [21]:
fig = px.scatter_3d(pca_df, x='PC1', y='PC2', z='PC3', color='cancer_type', hover_data=['samples'])
fig.show()

#### The plot shows that the most types of cancer are not clearly separated, except for the couple of them. This means that the PCA of the dimentionality of transcription levels of the genes are not the best way to distinguish between the types of cancer. It can be used to distinguish the limited number of types of cancer, but not all of them.