# Welcome to Polly Python3 Notebook.

### to install polly python

In [2]:
!sudo pip3 install polly-python --quiet 

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


### importing modules

In [1]:
import os
from polly.auth import Polly
from polly.omixatlas import OmixAtlas
from polly.cohort import Cohort
from polly.workspaces import Workspaces
import pandas as pd
import urllib.parse as urlparse
from urllib.parse import parse_qs
import numpy as np
import cmapPy

### Authentication

In [2]:
AUTH_TOKEN=(os.environ['POLLY_REFRESH_TOKEN'])
Polly.auth(AUTH_TOKEN)
omixatlas = OmixAtlas()
cohort1 = Cohort()
cohort2 = Cohort()
workspaces = Workspaces()

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
pd.set_option('display.width', 500)

### querying metadata

In [66]:
query = """SELECT src_dataset_id,  protein_position, amino_acids, sequencer, impact, variant_class, consequence, name
        FROM tcga.features AS features
        
        JOIN (
        SELECT dataset_id AS dataset_id FROM tcga.datasets WHERE data_type LIKE 'Mutation'
         ) AS datasets
         ON features.src_dataset_id = datasets.dataset_id
         
         WHERE hugo_symbol IN ('TP53','PIK3CA','CDH1','GATA3') AND features.src_dataset_id LIKE '%BRCA%'
    ORDER BY features.src_dataset_id"""
results=omixatlas.query_metadata(query, query_api_version="v2")
results

Query execution succeeded (time taken: 37.20 seconds, data scanned: 12.440 MB)
Fetched 652 rows


Unnamed: 0,src_dataset_id,protein_position,amino_acids,sequencer,impact,variant_class,consequence,name
0,BRCA_Mutation_TCGA-3C-AALI-01A-11D-A41F-09,183/393,S/*,Illumina HiSeq 2000,HIGH,SNV,stop_gained,TP53_c.548C>A
1,BRCA_Mutation_TCGA-3C-AALK-01A-11D-A41F-09,542/1068,E/K,Illumina HiSeq 2000,MODERATE,SNV,missense_variant,PIK3CA_c.1624G>A
2,BRCA_Mutation_TCGA-3C-AALK-01A-11D-A41F-09,1004/1068,M/I,Illumina HiSeq 2000,MODERATE,SNV,missense_variant,PIK3CA_c.3012G>T
3,BRCA_Mutation_TCGA-4H-AAAK-01A-12D-A41F-09,571/882,,Illumina HiSeq 2000,HIGH,SNV,splice_donor_variant,CDH1_c.1711+1G>T
4,BRCA_Mutation_TCGA-5L-AAT0-01A-12D-A41F-09,1047/1068,H/L,Illumina HiSeq 2000,MODERATE,SNV,missense_variant,PIK3CA_c.3140A>T
...,...,...,...,...,...,...,...,...
647,BRCA_Mutation_TCGA-XX-A899-01A-11D-A36J-09,83/1068,F/S,Illumina HiSeq 2000,MODERATE,SNV,missense_variant,PIK3CA_c.248T>C
648,BRCA_Mutation_TCGA-XX-A899-01A-11D-A36J-09,511/882,Q/*,Illumina HiSeq 2000,HIGH,SNV,stop_gained,CDH1_c.1531C>T
649,BRCA_Mutation_TCGA-XX-A89A-01A-11D-A36J-09,542/1068,E/K,Illumina HiSeq 2000,MODERATE,SNV,missense_variant,PIK3CA_c.1624G>A
650,BRCA_Mutation_TCGA-XX-A89A-01A-11D-A36J-09,129/882,Q/*,Illumina HiSeq 2000,HIGH,SNV,stop_gained;splice_region_variant,CDH1_c.385C>T


### making list of dataset

In [4]:
moderate_data = results.loc[results['impact'] == 'MODERATE', 'src_dataset_id'].tolist()
high_data = results.loc[results['impact'] == 'HIGH', 'src_dataset_id'].tolist()

In [5]:
high_data

['BRCA_Mutation_TCGA-3C-AALI-01A-11D-A41F-09',
 'BRCA_Mutation_TCGA-4H-AAAK-01A-12D-A41F-09',
 'BRCA_Mutation_TCGA-5L-AAT0-01A-12D-A41F-09',
 'BRCA_Mutation_TCGA-A1-A0SP-01A-11D-A099-09',
 'BRCA_Mutation_TCGA-A2-A0CK-01A-11D-A228-09',
 'BRCA_Mutation_TCGA-A2-A0CW-01A-21D-A10Y-09',
 'BRCA_Mutation_TCGA-A2-A0D0-01A-11W-A019-09',
 'BRCA_Mutation_TCGA-A2-A0D1-01A-11W-A050-09',
 'BRCA_Mutation_TCGA-A2-A0EW-01A-21D-A10Y-09',
 'BRCA_Mutation_TCGA-A2-A0EX-01A-21W-A050-09',
 'BRCA_Mutation_TCGA-A2-A0YG-01A-21D-A10G-09',
 'BRCA_Mutation_TCGA-A2-A0YJ-01A-11D-A10G-09',
 'BRCA_Mutation_TCGA-A2-A1G1-01A-21D-A13L-09',
 'BRCA_Mutation_TCGA-A2-A3Y0-01A-11D-A23C-09',
 'BRCA_Mutation_TCGA-A2-A4S2-01A-12D-A25Q-09',
 'BRCA_Mutation_TCGA-A7-A13D-01A-13D-A272-09',
 'BRCA_Mutation_TCGA-A7-A4SE-01A-11D-A25Q-09',
 'BRCA_Mutation_TCGA-A7-A6VV-01A-22D-A33E-09',
 'BRCA_Mutation_TCGA-A8-A075-01A-11D-A099-09',
 'BRCA_Mutation_TCGA-A8-A07O-01A-11W-A019-09',
 'BRCA_Mutation_TCGA-A8-A09Q-01A-11W-A019-09',
 'BRCA_Mutati

In [6]:
moderate_data

['BRCA_Mutation_TCGA-3C-AALK-01A-11D-A41F-09',
 'BRCA_Mutation_TCGA-3C-AALK-01A-11D-A41F-09',
 'BRCA_Mutation_TCGA-5L-AAT0-01A-12D-A41F-09',
 'BRCA_Mutation_TCGA-5L-AAT1-01A-12D-A41F-09',
 'BRCA_Mutation_TCGA-5L-AAT1-01A-12D-A41F-09',
 'BRCA_Mutation_TCGA-5L-AAT1-01A-12D-A41F-09',
 'BRCA_Mutation_TCGA-A1-A0SE-01A-11D-A099-09',
 'BRCA_Mutation_TCGA-A1-A0SI-01A-11D-A142-09',
 'BRCA_Mutation_TCGA-A1-A0SI-01A-11D-A142-09',
 'BRCA_Mutation_TCGA-A1-A0SI-01A-11D-A142-09',
 'BRCA_Mutation_TCGA-A1-A0SK-01A-12D-A099-09',
 'BRCA_Mutation_TCGA-A1-A0SO-01A-22D-A099-09',
 'BRCA_Mutation_TCGA-A2-A04N-01A-11D-A10Y-09',
 'BRCA_Mutation_TCGA-A2-A04P-01A-31D-A128-09',
 'BRCA_Mutation_TCGA-A2-A04T-01A-21W-A050-09',
 'BRCA_Mutation_TCGA-A2-A04T-01A-21W-A050-09',
 'BRCA_Mutation_TCGA-A2-A04T-01A-21W-A050-09',
 'BRCA_Mutation_TCGA-A2-A04V-01A-21W-A050-09',
 'BRCA_Mutation_TCGA-A2-A04W-01A-31D-A10Y-09',
 'BRCA_Mutation_TCGA-A2-A04W-01A-31D-A10Y-09',
 'BRCA_Mutation_TCGA-A2-A0CL-01A-11D-A10Y-09',
 'BRCA_Mutati

### create_cohort
### Input:
local_path(str): local path to instantiate the cohort.<br>
cohort_name(str): identifier name for the cohort.<br>
description(str): description about the cohort.<br>
repo_key(str): Optional argument: repo_key(repo_name/repo_id) for the omixatlas to be added.<br>
entity_id(list): Optional argument: list of sample_id or dataset_id to be added to the cohort.<br>
### Output:
A confirmation message on creation of cohort.

In [38]:
cohort1.create_cohort("/import/tcga_cohort","moderate","tcga_tp53_brca_mutation_moderate","tcga",moderate_data[:10])

### summarize_cohort
### Input:
None
### Output:
A tuple with the first value as cohort metadata information (name, description and number of dataset(s) or sample(s) in the cohort) and the second value as dataframe containing the source, dataset_id or sample_id and data type available in the cohort.

In [39]:
cohort1.summarize_cohort()

({'cohort_name': 'moderate',
  'number_of_samples': 5,
  'description': 'tcga_tp53_brca_mutation_moderate'},
   source_omixatlas  datatype                                  dataset_id
 0             tcga  Mutation  BRCA_Mutation_TCGA-3C-AALK-01A-11D-A41F-09
 1             tcga  Mutation  BRCA_Mutation_TCGA-5L-AAT0-01A-12D-A41F-09
 2             tcga  Mutation  BRCA_Mutation_TCGA-A1-A0SE-01A-11D-A099-09
 3             tcga  Mutation  BRCA_Mutation_TCGA-5L-AAT1-01A-12D-A41F-09
 4             tcga  Mutation  BRCA_Mutation_TCGA-A1-A0SI-01A-11D-A142-09)

In [41]:
cohort2.create_cohort("/import/tcga_cohort","higher","tcga_tp53_brca_mutation_high")

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
FileExistsError: [Errno 17] File exists: '/import/tcga_cohort/higher.pco'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
AttributeError: 'FileExistsError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
AssertionError


FileExistsError: [Errno 17] File exists: '/import/tcga_cohort/higher.pco'

### add_to_cohort
### Args:
repo_key(str): repo_key(repo_name/repo_id) for the omixatlas to be added.<br>
entity_id(list): list of entity_ids to be added to the cohort.<br>
### Returns:
A confirmation message for number of dataset(s) or sample(s) which are added to the cohort.

In [42]:
cohort2.add_to_cohort("tcga",high_data)

In [43]:
metadata, cohort_details = cohort2.summarize_cohort()

In [44]:
metadata

{'cohort_name': 'higher',
 'number_of_samples': 98,
 'description': 'tcga_tp53_brca_mutation_high'}

In [46]:
cohort_details

Unnamed: 0,source_omixatlas,datatype,dataset_id
0,tcga,Mutation,BRCA_Mutation_TCGA-E2-A15D-01A-11D-A10Y-09
1,tcga,Mutation,BRCA_Mutation_TCGA-BH-A18V-01A-11D-A12B-09
2,tcga,Mutation,BRCA_Mutation_TCGA-OL-A5RV-01A-12D-A28B-09
3,tcga,Mutation,BRCA_Mutation_TCGA-AC-A6IV-01A-12D-A33E-09
4,tcga,Mutation,BRCA_Mutation_TCGA-A8-A09Q-01A-11W-A019-09
5,tcga,Mutation,BRCA_Mutation_TCGA-A2-A3Y0-01A-11D-A23C-09
6,tcga,Mutation,BRCA_Mutation_TCGA-AN-A03Y-01A-21W-A019-09
7,tcga,Mutation,BRCA_Mutation_TCGA-EW-A1OY-01A-11D-A142-09
8,tcga,Mutation,BRCA_Mutation_TCGA-EW-A1OZ-01A-11D-A142-09
9,tcga,Mutation,BRCA_Mutation_TCGA-LL-A9Q3-01A-11D-A41F-09


### merge_data
### Input:
data_level(str): identifier to specify the data to be merged - sample, dataset,feature or data_matrix.
### Output:
A tuple of pandas dataframe containing the merged data for analysis. in the sequence - feature_level, sample_level, dataset_level, datamatrix dataframes.

In [47]:
merge_sample = cohort1.merge_data('sample')

In [48]:
merge_sample

chd,dataset_id,patient,barcode,sample,shortLetterCode,definition,sample_submitter_id,sample_type_id,oct_embedded,sample_id,submitter_id,state,is_ffpe,sample_type,tissue_type,days_to_collection,initial_weight,pathology_report_uuid,synchronous_malignancy,ajcc_pathologic_stage,tumor_stage,days_to_diagnosis,last_known_disease_status,tissue_or_organ_of_origin,days_to_last_follow_up,primary_diagnosis,age_at_diagnosis,prior_malignancy,year_of_diagnosis,prior_treatment,ajcc_staging_system_edition,ajcc_pathologic_t,morphology,ajcc_pathologic_n,ajcc_pathologic_m,classification_of_tumor,diagnosis_id,icd_10_code,site_of_resection_or_biopsy,tumor_grade,progression_or_recurrence,alcohol_history,exposure_id,race,ethnicity,gender,vital_status.x,age_at_index,days_to_birth.x,year_of_birth,demographic_id,bcr_patient_barcode,primary_site,project_id,disease_type,name,releasable,released,paper_patient,paper_Tumor.Type,paper_Included_in_previous_marker_papers,paper_vital_status,paper_days_to_birth,paper_days_to_death,paper_days_to_last_followup,paper_age_at_initial_pathologic_diagnosis,paper_pathologic_stage,paper_Tumor_Grade,paper_BRCA_Pathology,paper_BRCA_Subtype_PAM50,paper_MSI_status,paper_HPV_Status,paper_tobacco_smoking_history,paper_CNV.Clusters,paper_Mutation.Clusters,paper_DNA.Methylation.Clusters,paper_mRNA.Clusters,paper_miRNA.Clusters,paper_lncRNA.Clusters,paper_Protein.Clusters,paper_PARADIGM.Clusters,paper_Pan.Gyn.Clusters,Tumor.Type,Included_in_previous_marker_papers,vital_status.y,days_to_birth.y,days_to_death.y,days_to_last_followup,age_at_initial_pathologic_diagnosis,pathologic_stage,Tumor_Grade,BRCA_Pathology,BRCA_Subtype_PAM50,MSI_status,HPV_Status,tobacco_smoking_history,CNV.Clusters,Mutation.Clusters,DNA.Methylation.Clusters,mRNA.Clusters,miRNA.Clusters,lncRNA.Clusters,Protein.Clusters,PARADIGM.Clusters,Pan.Gyn.Clusters,subtype,kw_curated_cell_line,kw_curated_cell_type,kw_curated_genetic_mod_type,kw_curated_modified_gene,kw_curated_tissue,kw_curated_drug,kw_curated_disease,curated_age_unit,curated_min_age,curated_max_age
cid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1
TCGA-3C-AALK-01A-11D-A41F-09,BRCA_Mutation_TCGA-3C-AALK-01A-11D-A41F-09,TCGA-3C-AALK,TCGA-3C-AALK-01A-11R-A41B-07,TCGA-3C-AALK-01A,TP,Primary solid Tumor,TCGA-3C-AALK-01A,1,True,2a532346-0008-4af8-99bc-d709585da1d6,TCGA-3C-AALK,released,False,Primary Tumor,Not Reported,1022,140,F43B01E6-E1DB-44B1-8003-93870606346A,No,Stage IA,stage ia,0,not reported,Breast-- NOS,1448,Infiltrating duct carcinoma-- NOS,19074,no,2011,No,7th,T1c,8500/3,N0 (i+),M0,not reported,8c90b19d-54f7-5788-a5eb-49abe239ef0b,C50.9,Breast-- NOS,not reported,not reported,Not Reported,a97db788-0772-5d32-878c-d2080d979c37,black or african american,not hispanic or latino,female,Alive,52,-19074,1959,e54b1469-fffc-5291-a8e3-df2092ab5f34,TCGA-3C-AALK-01A,Breast,TCGA-BRCA,"c(""Ductal and Lobular Neoplasms""-- ""Complex Ep...",Breast Invasive Carcinoma,True,True,TCGA-3C-AALK,BRCA,NO,Alive,-19074,,1448,52,Stage_I,,,LumA,,,,C1,C5,C2,C2,C2,,C2,C6,C4,BRCA,NO,Alive,-19074,,1448,52,Stage_I,,,LumA,,,,C1,C5,C2,C2,C2,,C2,C6,C4,LumA,none,none,none,none,breast,none,Breast Neoplasms,Years,52.0,52.0
TCGA-5L-AAT0-01A-12D-A41F-09,BRCA_Mutation_TCGA-5L-AAT0-01A-12D-A41F-09,TCGA-5L-AAT0,TCGA-5L-AAT0-01A-12R-A41B-07,TCGA-5L-AAT0-01A,TP,Primary solid Tumor,TCGA-5L-AAT0-01A,1,False,8434f4a2-0ead-40f6-9b63-9c958a3fb812,TCGA-5L-AAT0,released,False,Primary Tumor,Not Reported,1320,80,F9B6971F-23C0-465F-BFEC-778BF228A1AE,Not Reported,Stage IIA,stage iia,0,not reported,Breast-- NOS,1477,Lobular carcinoma-- NOS,15393,yes,2010,No,7th,T2,8520/3,N0,M0,not reported,ebff6a7b-3b6c-5f71-86b0-1bdd3b78edd4,C50.9,Breast-- NOS,not reported,not reported,Not Reported,47c9213f-b2b8-5297-a0b6-21bce2cfa3f8,white,hispanic or latino,female,Alive,42,-15393,1968,776bb6f9-f5c8-57b6-bf4d-4014b8025a06,TCGA-5L-AAT0-01A,Breast,TCGA-BRCA,"c(""Ductal and Lobular Neoplasms""-- ""Complex Ep...",Breast Invasive Carcinoma,True,True,TCGA-5L-AAT0,BRCA,NO,Alive,-15393,,1477,42,Stage_II,,,LumA,,,,C1,C9,C1,C2,C2,,,C6,,BRCA,NO,Alive,-15393,,1477,42,Stage_II,,,LumA,,,,C1,C9,C1,C2,C2,,,C6,,LumA,none,none,none,none,breast,none,Breast Neoplasms,Years,42.0,42.0
TCGA-A1-A0SE-01A-11D-A099-09,BRCA_Mutation_TCGA-A1-A0SE-01A-11D-A099-09,TCGA-A1-A0SE,TCGA-A1-A0SE-01A-11R-A084-07,TCGA-A1-A0SE-01A,TP,Primary solid Tumor,TCGA-A1-A0SE-01A,1,True,2fd1998c-1ea4-42eb-84b9-db352da9cf25,TCGA-A1-A0SE,released,False,Primary Tumor,Not Reported,1672,90,CFF567FC-8867-4CA4-85AE-81A7D30095E7,No,Stage I,stage i,0,not reported,Breast-- NOS,1321,Infiltrating duct and lobular carcinoma,20717,no,2005,No,6th,T1c,8522/3,N0 (i-),M0,not reported,003f1965-f1c7-5613-87cd-c286b6a9eec5,C50.9,Breast-- NOS,not reported,not reported,Not Reported,3bc5fdfe-9a63-5f7e-888e-71d4724a7f4b,white,not hispanic or latino,female,Alive,56,-20717,1949,875022cb-fd30-5660-97b1-f85db82461c2,TCGA-A1-A0SE-01A,Breast,TCGA-BRCA,"c(""Ductal and Lobular Neoplasms""-- ""Complex Ep...",Breast Invasive Carcinoma,True,True,TCGA-A1-A0SE,BRCA,YES,Alive,-20717,,1321,56,Stage_I,,ILC,LumA,,,,C1,C4,C1,C1,C3,,,C5,,BRCA,YES,Alive,-20717,,1321,56,Stage_I,,ILC,LumA,,,,C1,C4,C1,C1,C3,,,C5,,LumA,none,none,none,none,breast,none,Breast Neoplasms,Years,56.0,56.0
TCGA-5L-AAT1-01A-12D-A41F-09,BRCA_Mutation_TCGA-5L-AAT1-01A-12D-A41F-09,TCGA-5L-AAT1,TCGA-5L-AAT1-01A-12R-A41B-07,TCGA-5L-AAT1-01A,TP,Primary solid Tumor,TCGA-5L-AAT1-01A,1,False,a31dec9b-3626-4cb0-8265-1425d5ce1c1e,TCGA-5L-AAT1,released,False,Primary Tumor,Not Reported,1259,130,B5CA42BB-9514-42C6-9FB0-C8889C1DC51A,Not Reported,Stage IV,stage iv,0,not reported,Breast-- NOS,1471,Lobular carcinoma-- NOS,23225,yes,2010,No,7th,T2,8520/3,N0,M1,not reported,b213c52a-983e-5907-80c9-b30320eee559,C50.9,Breast-- NOS,not reported,not reported,Not Reported,760f5e90-3d54-53d0-bd5c-f9e4e36c86b8,white,hispanic or latino,female,Alive,63,-23225,1947,8c9b63e8-9c76-5eaa-9acd-04f2b8e2449d,TCGA-5L-AAT1-01A,Breast,TCGA-BRCA,"c(""Ductal and Lobular Neoplasms""-- ""Complex Ep...",Breast Invasive Carcinoma,True,True,TCGA-5L-AAT1,BRCA,NO,Alive,-23225,,1471,63,Stage_IV,,,LumA,,,,,C9,C1,C2,,,C2,,C1,BRCA,NO,Alive,-23225,,1471,63,Stage_IV,,,LumA,,,,,C9,C1,C2,,,C2,,C1,LumA,none,none,none,none,breast,none,Breast Neoplasms,Years,63.0,63.0
TCGA-A1-A0SI-01A-11D-A142-09,BRCA_Mutation_TCGA-A1-A0SI-01A-11D-A142-09,TCGA-A1-A0SI,TCGA-A1-A0SI-01A-11R-A144-07,TCGA-A1-A0SI-01A,TP,Primary solid Tumor,TCGA-A1-A0SI-01A,1,True,c5181ae5-bd05-459f-920d-26bd31bd9088,TCGA-A1-A0SI,released,False,Primary Tumor,Not Reported,1267,40,4CD6BDB7-9629-41C6-8514-7661E3D33A66,No,Stage IIB,stage iib,0,not reported,Breast-- NOS,635,Infiltrating duct carcinoma-- NOS,19250,no,2007,No,6th,T2,8500/3,N1a,M0,not reported,ce2b4bc1-d626-51df-92ee-96bc823134e0,C50.9,Breast-- NOS,not reported,not reported,Not Reported,c579dcd1-acba-5235-8004-a4ae1c285231,white,not hispanic or latino,female,Alive,52,-19250,1955,ba9e0d8a-b060-5988-815a-43e28464ad3f,TCGA-A1-A0SI-01A,Breast,TCGA-BRCA,"c(""Ductal and Lobular Neoplasms""-- ""Complex Ep...",Breast Invasive Carcinoma,True,True,TCGA-A1-A0SI,BRCA,YES,Alive,-19250,,635,52,Stage_II,,IDC,LumB,,,,C1,C9,C1,C1,C3,,,C6,,BRCA,YES,Alive,-19250,,635,52,Stage_II,,IDC,LumB,,,,C1,C9,C1,C1,C3,,,C6,,LumB,none,none,none,none,breast,none,Breast Neoplasms,Years,52.0,52.0


In [49]:
merge_dataset = cohort1.merge_data('dataset')

In [50]:
merge_dataset

Unnamed: 0,dataset_id,disease,kw_disease_type,kw_disease_stage,kw_sample_type,gender_x,kw_vital_status,kw_molecular_subtype,kw_drug,publication,patient_id,tissue,organism,kw_data_type,description,kw_cell_line,kw_cell_type,dataset_source,curation_version,total_num_samples,kw_repo,kw_package,kw_key,kw_bucket,kw_filetype,kw_region,kw_location,kw_timestamp
0,BRCA_Mutation_TCGA-3C-AALK-01A-11D-A41F-09,[Breast Neoplasms],,,Tumor,gender,vital_status.x,LumA,[none],https://www.cell.com/cancer-cell/fulltext/S153...,TCGA-3C-AALK-01A,[breast],Homo sapiens,Mutation,Breast invasive carcinoma RNASeq data for TCGA...,[None],[None],TCGA,g3,1,tcga,TCGA_data_lake/data,TCGA_data_lake/data/BRCA/Mutation/BRCA_Mutatio...,discover-prod-datalake-v1,gct,us-west-2,https://discover-prod-datalake-v1.s3-us-west-2...,1642897542005
1,BRCA_Mutation_TCGA-5L-AAT0-01A-12D-A41F-09,[Breast Neoplasms],,,Tumor,gender,vital_status.x,LumA,[none],https://www.cell.com/cancer-cell/fulltext/S153...,TCGA-5L-AAT0-01A,[breast],Homo sapiens,Mutation,Breast invasive carcinoma RNASeq data for TCGA...,[None],[None],TCGA,g3,1,tcga,TCGA_data_lake/data,TCGA_data_lake/data/BRCA/Mutation/BRCA_Mutatio...,discover-prod-datalake-v1,gct,us-west-2,https://discover-prod-datalake-v1.s3-us-west-2...,1642897526634
2,BRCA_Mutation_TCGA-A1-A0SE-01A-11D-A099-09,[Breast Neoplasms],,,Tumor,gender,vital_status.x,LumA,[none],https://www.cell.com/cancer-cell/fulltext/S153...,TCGA-A1-A0SE-01A,[breast],Homo sapiens,Mutation,Breast invasive carcinoma RNASeq data for TCGA...,[None],[None],TCGA,g3,1,tcga,TCGA_data_lake/data,TCGA_data_lake/data/BRCA/Mutation/BRCA_Mutatio...,discover-prod-datalake-v1,gct,us-west-2,https://discover-prod-datalake-v1.s3-us-west-2...,1642897576012
3,BRCA_Mutation_TCGA-5L-AAT1-01A-12D-A41F-09,[Breast Neoplasms],,,Tumor,gender,vital_status.x,LumA,[none],https://www.cell.com/cancer-cell/fulltext/S153...,TCGA-5L-AAT1-01A,[breast],Homo sapiens,Mutation,Breast invasive carcinoma RNASeq data for TCGA...,[None],[None],TCGA,g3,1,tcga,TCGA_data_lake/data,TCGA_data_lake/data/BRCA/Mutation/BRCA_Mutatio...,discover-prod-datalake-v1,gct,us-west-2,https://discover-prod-datalake-v1.s3-us-west-2...,1642897558231
4,BRCA_Mutation_TCGA-A1-A0SI-01A-11D-A142-09,[Breast Neoplasms],,,Tumor,gender,vital_status.x,LumB,[none],https://www.cell.com/cancer-cell/fulltext/S153...,TCGA-A1-A0SI-01A,[breast],Homo sapiens,Mutation,Breast invasive carcinoma RNASeq data for TCGA...,[None],[None],TCGA,g3,1,tcga,TCGA_data_lake/data,TCGA_data_lake/data/BRCA/Mutation/BRCA_Mutatio...,discover-prod-datalake-v1,gct,us-west-2,https://discover-prod-datalake-v1.s3-us-west-2...,1642897543766


In [51]:
merge_feature = cohort1.merge_data('feature')

In [52]:
merge_feature

rhd,dataset_id,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,dbSNP_RS,dbSNP_Val_Status,Tumor_Sample_Barcode,Matched_Norm_Sample_Barcode,Match_Norm_Seq_Allele1,Match_Norm_Seq_Allele2,Tumor_Validation_Allele1,Tumor_Validation_Allele2,Match_Norm_Validation_Allele1,Match_Norm_Validation_Allele2,Verification_Status,Validation_Status,Mutation_Status,Sequencing_Phase,Sequence_Source,Validation_Method,Score,BAM_File,Sequencer,Tumor_Sample_UUID,Matched_Norm_Sample_UUID,HGVSc,HGVSp,HGVSp_Short,Transcript_ID,Exon_Number,t_depth,t_ref_count,t_alt_count,n_depth,n_ref_count,n_alt_count,Allele,Gene,Feature,Feature_type,One_Consequence,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,ALLELE_NUM,DISTANCE,TRANSCRIPT_STRAND,SYMBOL,SYMBOL_SOURCE,HGNC_ID,BIOTYPE,CANONICAL,CCDS,ENSP,SWISSPROT,TREMBL,UNIPARC,RefSeq,SIFT,PolyPhen,EXON,INTRON,DOMAINS,GMAF,AFR_MAF,AMR_MAF,ASN_MAF,EAS_MAF,EUR_MAF,SAS_MAF,AA_MAF,EA_MAF,CLIN_SIG,SOMATIC,PUBMED,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,IMPACT,PICK,VARIANT_CLASS,TSL,HGVS_OFFSET,PHENO,MINIMISED,ExAC_AF,ExAC_AF_Adj,ExAC_AF_AFR,ExAC_AF_AMR,ExAC_AF_EAS,ExAC_AF_FIN,ExAC_AF_NFE,ExAC_AF_OTH,ExAC_AF_SAS,GENE_PHENO,FILTER,CONTEXT,src_vcf_id,tumor_bam_uuid,normal_bam_uuid,case_id,GDC_FILTER,COSMIC,MC3_Overlap,GDC_Validation_Status
rid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1
RBMXL1_c.479C>G,BRCA_Mutation_TCGA-3C-AALK-01A-11D-A41F-09,RBMXL1,494115,WUGSC,GRCh38,chr1,88983348,88983348,+,Missense_Mutation,SNP,G,G,C,novel,,TCGA-3C-AALK-01A-11D-A41F-09,TCGA-3C-AALK-10A-01D-A41F-09,,,,,,,,,Somatic,,,,,,Illumina HiSeq 2000,e15261e6-6ab7-4a24-a04e-4f992a1da251,bcf4018a-98f3-4f38-a23a-7bb3e84d596c,c.479C>G,p.Pro160Arg,p.P160R,ENST00000321792,2/2,43,29,14,114,,,C,ENSG00000213516,ENST00000321792,Transcript,missense_variant,missense_variant,907/4799,479/1173,160/390,P/R,cCt/cGt,,1,,-1,RBMXL1,HGNC,HGNC:25073,protein_coding,,CCDS716.1,ENSP00000318415,Q96E39,,UPI000006DA18,NM_019610.5,deleterious(0),probably_damaging(0.953),2/2,,Low_complexity_(Seg):Seg,,,,,,,,,,,,,,,,,MODERATE,,SNV,1.0,,,1,,,,,,,,,,,PASS,GAGAAGGACCC,1cf25007-3904-4bdf-8223-0aeef0f54560,26eaaf91-6652-47fe-8c31-15c9ace634b5,c6a1556d-6bbf-47a7-9797-e076c5534b45,c31900a4-5dcd-4022-97ac-638e86e889e4,,,TRUE,Unknown
FSIP2_c.17434T>C,BRCA_Mutation_TCGA-3C-AALK-01A-11D-A41F-09,FSIP2,401024,WUGSC,GRCh38,chr2,185806740,185806740,+,Missense_Mutation,SNP,T,T,C,novel,,TCGA-3C-AALK-01A-11D-A41F-09,TCGA-3C-AALK-10A-01D-A41F-09,,,,,,,,,Somatic,,,,,,Illumina HiSeq 2000,e15261e6-6ab7-4a24-a04e-4f992a1da251,bcf4018a-98f3-4f38-a23a-7bb3e84d596c,c.17434T>C,p.Cys5812Arg,p.C5812R,ENST00000424728,17/23,24,20,4,70,,,C,ENSG00000188738,ENST00000424728,Transcript,missense_variant,missense_variant,17434/20788,17434/20724,5812/6907,C/R,Tgt/Cgt,,1,,1,FSIP2,HGNC,HGNC:21675,protein_coding,,,ENSP00000401306,Q5CZC0,,UPI000198D023,,,benign(0.003),17/23,,,,,,,,,,,,,,,,,,,MODERATE,,SNV,5.0,,,1,,,,,,,,,,,PASS,ATAGATGTCAA,1cf25007-3904-4bdf-8223-0aeef0f54560,26eaaf91-6652-47fe-8c31-15c9ace634b5,c6a1556d-6bbf-47a7-9797-e076c5534b45,c31900a4-5dcd-4022-97ac-638e86e889e4,,,TRUE,Unknown
PCNP_c.*847C>T,BRCA_Mutation_TCGA-3C-AALK-01A-11D-A41F-09,PCNP,57092,WUGSC,GRCh38,chr3,101593600,101593600,+,3'UTR,SNP,C,C,T,novel,,TCGA-3C-AALK-01A-11D-A41F-09,TCGA-3C-AALK-10A-01D-A41F-09,,,,,,,,,Somatic,,,,,,Illumina HiSeq 2000,e15261e6-6ab7-4a24-a04e-4f992a1da251,bcf4018a-98f3-4f38-a23a-7bb3e84d596c,c.*847C>T,,,ENST00000265260,5/5,57,42,15,128,,,T,ENSG00000081154,ENST00000265260,Transcript,3_prime_UTR_variant,3_prime_UTR_variant,1505/2342,-/537,-/178,,,,1,,1,PCNP,HGNC,HGNC:30023,protein_coding,YES,CCDS2942.1,ENSP00000265260,Q8WW12,,UPI0000070241,NM_020357.1,,,5/5,,,,,,,,,,,,,,,,,,,MODIFIER,1.0,SNV,1.0,,,1,,,,,,,,,,,PASS,TTTGGCTGTTT,1cf25007-3904-4bdf-8223-0aeef0f54560,26eaaf91-6652-47fe-8c31-15c9ace634b5,c6a1556d-6bbf-47a7-9797-e076c5534b45,c31900a4-5dcd-4022-97ac-638e86e889e4,,,TRUE,Unknown
WDR49_c.1079G>A,BRCA_Mutation_TCGA-3C-AALK-01A-11D-A41F-09,WDR49,151790,WUGSC,GRCh38,chr3,167531198,167531198,+,Missense_Mutation,SNP,C,C,T,rs199677479,byCluster;byFrequency,TCGA-3C-AALK-01A-11D-A41F-09,TCGA-3C-AALK-10A-01D-A41F-09,,,,,,,,,Somatic,,,,,,Illumina HiSeq 2000,e15261e6-6ab7-4a24-a04e-4f992a1da251,bcf4018a-98f3-4f38-a23a-7bb3e84d596c,c.1079G>A,p.Arg360His,p.R360H,ENST00000308378,9/15,76,50,26,172,,,T,ENSG00000174776,ENST00000308378,Transcript,missense_variant,missense_variant,1385/2594,1079/2094,360/697,R/H,cGc/cAc,rs199677479,1,,-1,WDR49,HGNC,HGNC:26587,protein_coding,YES,CCDS3201.1,ENSP00000311343,Q8IV35,,,NM_178824.3,tolerated(0.56),benign(0.001),9/15,,SMART_domains:SM00320;PROSITE_profiles:PS50294...,,,,,,,,0.0002,0.0001,,,,,,,,MODERATE,1.0,SNV,1.0,,,1,0.000091,0.000091,0.000192,0.0,0.0,0.0,0.00012,0.0,0.000061,,PASS,AGTTGCGGACT,1cf25007-3904-4bdf-8223-0aeef0f54560,26eaaf91-6652-47fe-8c31-15c9ace634b5,c6a1556d-6bbf-47a7-9797-e076c5534b45,c31900a4-5dcd-4022-97ac-638e86e889e4,,COSM165576;COSM4115070,TRUE,Unknown
PIK3CA_c.1624G>A,BRCA_Mutation_TCGA-3C-AALK-01A-11D-A41F-09,PIK3CA,5290,WUGSC,GRCh38,chr3,179218294,179218294,+,Missense_Mutation,SNP,G,G,A,rs121913273,,TCGA-3C-AALK-01A-11D-A41F-09,TCGA-3C-AALK-10A-01D-A41F-09,,,,,,,,,Somatic,,,,,,Illumina HiSeq 2000,e15261e6-6ab7-4a24-a04e-4f992a1da251,bcf4018a-98f3-4f38-a23a-7bb3e84d596c,c.1624G>A,p.Glu542Lys,p.E542K,ENST00000263967,10/21,115,90,25,229,,,A,ENSG00000121879,ENST00000263967,Transcript,missense_variant,missense_variant,1781/9093,1624/3207,542/1068,E/K,Gaa/Aaa,rs121913273,1,,1,PIK3CA,HGNC,HGNC:8975,protein_coding,YES,CCDS43171.1,ENSP00000263967,P42336,,UPI000013D494,NM_006218.2,deleterious(0.04),probably_damaging(0.96),10/21,,Pfam_domain:PF00613;SMART_domains:SM00145;Supe...,,,,,,,,,,pathogenic,,,,,,,MODERATE,1.0,SNV,2.0,,1.0,1,,,,,,,,,,,PASS,TCTCTGAAATC,1cf25007-3904-4bdf-8223-0aeef0f54560,26eaaf91-6652-47fe-8c31-15c9ace634b5,c6a1556d-6bbf-47a7-9797-e076c5534b45,c31900a4-5dcd-4022-97ac-638e86e889e4,,COSM125369;COSM760,TRUE,Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WNK3_c.4279G>A,BRCA_Mutation_TCGA-A1-A0SI-01A-11D-A142-09,WNK3,65267,WUGSC,GRCh38,chrX,54237287,54237287,+,Missense_Mutation,SNP,C,C,T,,,TCGA-A1-A0SI-01A-11D-A142-09,TCGA-A1-A0SI-10B-01D-A142-09,,,,,,,,,Somatic,,,,,,Illumina HiSeq 2000,e218c272-a7e1-4bc9-b8c5-d2d1c903550f,fbcab9dc-4a6b-4928-9459-699c9932e3e1,c.4279G>A,p.Glu1427Lys,p.E1427K,ENST00000354646,20/24,66,49,17,54,,,T,ENSG00000196632,ENST00000354646,Transcript,missense_variant,missense_variant,4718/11341,4279/5403,1427/1800,E/K,Gag/Aag,,1,,-1,WNK3,HGNC,HGNC:14543,protein_coding,YES,CCDS14357.1,ENSP00000346667,Q9BYP7,,UPI00001AF003,NM_020922.4,tolerated(0.13),benign(0.033),20/24,,,,,,,,,,,,,,,,,,,MODERATE,1.0,SNV,1.0,,,1,,,,,,,,,,,PASS,AGTCTCACAAG,d555a7a4-1ef4-429d-a66a-a6d5334e4737,d547b222-941c-4072-9f44-777b4627f14e,c8ae556d-d5ef-4112-9b14-03ada7e6d6cb,16368c32-2118-4fcf-8693-6c89995e49d8,,COSM1491120,TRUE,Unknown
TEX11_c.2518G>T,BRCA_Mutation_TCGA-A1-A0SI-01A-11D-A142-09,TEX11,56159,WUGSC,GRCh38,chrX,70552173,70552173,+,Nonsense_Mutation,SNP,C,C,A,,,TCGA-A1-A0SI-01A-11D-A142-09,TCGA-A1-A0SI-10B-01D-A142-09,,,,,,,,,Somatic,,,,,,Illumina HiSeq 2000,e218c272-a7e1-4bc9-b8c5-d2d1c903550f,fbcab9dc-4a6b-4928-9459-699c9932e3e1,c.2518G>T,p.Glu840Ter,p.E840*,ENST00000344304,27/29,41,29,12,41,,,A,ENSG00000120498,ENST00000344304,Transcript,stop_gained,stop_gained,2518/2977,2518/2823,840/940,E/*,Gaa/Taa,,1,,-1,TEX11,HGNC,HGNC:11733,protein_coding,,CCDS35323.1,ENSP00000340995,Q8IYF3,,UPI000013CA89,,,,27/29,,,,,,,,,,,,,,,,,,,HIGH,,SNV,5.0,,,1,,,,,,,,,,,PASS,AACTTCTTCCA,d555a7a4-1ef4-429d-a66a-a6d5334e4737,d547b222-941c-4072-9f44-777b4627f14e,c8ae556d-d5ef-4112-9b14-03ada7e6d6cb,16368c32-2118-4fcf-8693-6c89995e49d8,,COSM1491202;COSM4813867,TRUE,Unknown
RBM41_c.526G>T,BRCA_Mutation_TCGA-A1-A0SI-01A-11D-A142-09,RBM41,55285,WUGSC,GRCh38,chrX,107088837,107088837,+,Nonsense_Mutation,SNP,C,C,A,,,TCGA-A1-A0SI-01A-11D-A142-09,TCGA-A1-A0SI-10B-01D-A142-09,,,,,,,,,Somatic,,,,,,Illumina HiSeq 2000,e218c272-a7e1-4bc9-b8c5-d2d1c903550f,fbcab9dc-4a6b-4928-9459-699c9932e3e1,c.526G>T,p.Glu176Ter,p.E176*,ENST00000372479,5/7,55,39,16,23,,,A,ENSG00000089682,ENST00000372479,Transcript,stop_gained,stop_gained;splice_region_variant,557/1662,526/1242,176/413,E/*,Gaa/Taa,,1,,-1,RBM41,HGNC,HGNC:25617,protein_coding,YES,CCDS14526.1,ENSP00000361557,Q96IZ5,,UPI000013CC0E,NM_018301.3,,,5/7,,,,,,,,,,,,,,,,,,,HIGH,1.0,SNV,1.0,,,1,,,,,,,,,,,PASS,GGGTTCATCTG,d555a7a4-1ef4-429d-a66a-a6d5334e4737,d547b222-941c-4072-9f44-777b4627f14e,c8ae556d-d5ef-4112-9b14-03ada7e6d6cb,16368c32-2118-4fcf-8693-6c89995e49d8,,COSM385759;COSM4813851;COSM5835849,TRUE,Unknown
GRIA3_c.880G>A,BRCA_Mutation_TCGA-A1-A0SI-01A-11D-A142-09,GRIA3,2892,WUGSC,GRCh38,chrX,123395097,123395097,+,Missense_Mutation,SNP,G,G,A,,,TCGA-A1-A0SI-01A-11D-A142-09,TCGA-A1-A0SI-10B-01D-A142-09,,,,,,,,,Somatic,,,,,,Illumina HiSeq 2000,e218c272-a7e1-4bc9-b8c5-d2d1c903550f,fbcab9dc-4a6b-4928-9459-699c9932e3e1,c.880G>A,p.Glu294Lys,p.E294K,ENST00000541091,6/15,31,21,10,35,,,A,ENSG00000125675,ENST00000541091,Transcript,missense_variant,missense_variant,933/2740,880/2685,294/894,E/K,Gaa/Aaa,,1,,1,GRIA3,HGNC,HGNC:4573,protein_coding,,CCDS14604.1,ENSP00000446440,P42263,,UPI000013D503,,tolerated(0.18),probably_damaging(0.997),6/15,,Pfam_domain:PF01094;Superfamily_domains:SSF53822,,,,,,,,,,,,,,,,,MODERATE,,SNV,1.0,,,1,,,,,,,,,,,PASS,AAAGGGAATTC,d555a7a4-1ef4-429d-a66a-a6d5334e4737,d547b222-941c-4072-9f44-777b4627f14e,c8ae556d-d5ef-4112-9b14-03ada7e6d6cb,16368c32-2118-4fcf-8693-6c89995e49d8,,COSM1490440;COSM1490441;COSM1490442;COSM5192452,TRUE,Unknown


In [53]:
merge_data_matrix = cohort1.merge_data('data_matrix')

In [54]:
merge_data_matrix

cid,TCGA-3C-AALK-01A-11D-A41F-09,TCGA-5L-AAT0-01A-12D-A41F-09,TCGA-5L-AAT1-01A-12D-A41F-09,TCGA-A1-A0SE-01A-11D-A099-09,TCGA-A1-A0SI-01A-11D-A142-09
AARS_c.2569C>A,,,1.0,,
ABAT_c.423G>A,,,,,1.0
ABCA10_c.4056G>T,,,1.0,,
ABCA13_c.10798G>A,,,1.0,,
ABCA13_c.6769C>T,,,1.0,,
...,...,...,...,...,...
ZRANB3_c.2719G>T,,,1.0,,
ZSCAN1_c.279G>C,,,1.0,,
ZSCAN23_c.547C>G,,,1.0,,
ZSCAN2_c.406+2347C>T,,,1.0,,


### edit_cohort
### Input:
new_cohort_name(str): Optional Argument: new identifier name for the cohort.<br>
new_description(str): Optional Argument: new description about the cohort.<br>
### Output:
A confirmation message on updation of cohort.

In [55]:
cohort2.edit_cohort("high_edited","tcga_tp53_brca_mutation_high_edited")

### remove_from_cohort
### Input:
entity_id(list): list of dataset_id or sample_id to be removed from the cohort.
### output:
A confirmation message on removal of dataset_id or sample_id from cohort.

In [56]:
cohort2.remove_from_cohort(high_data[0:2])

### is_valid()
This function is used to check if a cohort is valid or not.
### input:
None
### Output:
A boolean result based on the validity of the cohort.

In [57]:
cohort2.is_valid()

True

### delete_cohort
### Input:
None
### Output:
A confirmation message on deletion of cohort.

In [58]:
cohort1.delete_cohort()

### upload cohort file to workspace

In [59]:
workspaces.upload_to_workspaces(workspace_id = 9009, workspace_path = "/high_edited.pco", 
                                local_path = "/import/tcga_cohort/high_edited.pco")

### download cohort file from workspace

In [61]:
workspaces.download_from_workspaces(workspace_id = 9009, workspace_path = "/high_edited.pco")

### load_cohort
### Input:
local_path(str): local path of the cohort.
### Output:
A confirmation message on instantiation of the cohort.

In [62]:
cohort1.load_cohort("/import/high_edited.pco")

### summarizing cohort

In [63]:
metadata, cohort_details = cohort1.summarize_cohort()

### taking out details

In [64]:
metadata

{'cohort_name': 'high_edited',
 'number_of_samples': 96,
 'description': 'tcga_tp53_brca_mutation_high_edited'}

In [65]:
cohort_details

Unnamed: 0,source_omixatlas,datatype,dataset_id
0,tcga,Mutation,BRCA_Mutation_TCGA-E2-A15D-01A-11D-A10Y-09
1,tcga,Mutation,BRCA_Mutation_TCGA-BH-A18V-01A-11D-A12B-09
2,tcga,Mutation,BRCA_Mutation_TCGA-OL-A5RV-01A-12D-A28B-09
3,tcga,Mutation,BRCA_Mutation_TCGA-AC-A6IV-01A-12D-A33E-09
4,tcga,Mutation,BRCA_Mutation_TCGA-A8-A09Q-01A-11W-A019-09
5,tcga,Mutation,BRCA_Mutation_TCGA-A2-A3Y0-01A-11D-A23C-09
6,tcga,Mutation,BRCA_Mutation_TCGA-AN-A03Y-01A-21W-A019-09
7,tcga,Mutation,BRCA_Mutation_TCGA-EW-A1OY-01A-11D-A142-09
8,tcga,Mutation,BRCA_Mutation_TCGA-EW-A1OZ-01A-11D-A142-09
9,tcga,Mutation,BRCA_Mutation_TCGA-LL-A9Q3-01A-11D-A41F-09
