# 

In [1]:
import pandas as pd
import pathlib
import os
import json
import itertools
import numpy as np

root = pathlib.Path("/data/rbg/shared/datasets/TCGA/")

In [2]:

def add_outcome(elem):
    vital_status = elem['demographic'].get('vital_status')

    if vital_status == 'Dead':
        patient_death = elem['demographic'].get('days_to_death')
        return [1, int(patient_death)]

    elif vital_status == "Alive":
        follow_ups = elem['follow_ups']
        follow_up_days = [
            f.get('days_to_follow_up')
            for f in follow_ups
            if f.get('days_to_follow_up') is not None
        ]
        diagnosis_days = [
            d.get('days_to_last_follow_up')
            for d in elem['diagnoses']
            if d.get('days_to_last_follow_up') is not None
        ]
        all_days = follow_up_days + diagnosis_days
        date_of_last_checkup = max(all_days) if all_days else None

        return [0, int(date_of_last_checkup)]
    else:
        typing.assert_never("vital status not in Alive/Dead")


def remove_sparse_features(X, threshold=0.05):
    non_zero_fraction = (X != 0).sum(axis=0) / X.shape[0]
    keep = non_zero_fraction > threshold
    return X.loc[:, keep]


def remove_perfect_correlations(X):
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] == 1)]
    return X.drop(columns=to_drop)


# ids = pd.read_json("ids/tcga_cohort.json")["case_id"]
ids = pd.read_csv("sample_ids.csv")["Sample"]

case_ids_raw = pd.read_json("./clinical-tcga-brca.json").set_index("submitter_id")
case_ids = case_ids_raw.loc[ids]
case_ids["outcome"] = case_ids.apply(add_outcome, axis=1)

case_ids = case_ids.reset_index()
print(case_ids.shape)

case_ids = case_ids.drop_duplicates(subset=["case_id"])
print(case_ids.shape)

case_ids.head()


(133, 16)
(133, 16)


Unnamed: 0,submitter_id,disease_type,project,days_to_consent,diagnoses,consent_type,demographic,primary_site,updated_datetime,case_id,follow_ups,index_date,state,lost_to_followup,exposures,outcome
0,TCGA-A1-A0SP,Ductal and Lobular Neoplasms,{'project_id': 'TCGA-BRCA'},55.0,"[{'tissue_or_organ_of_origin': 'Breast, NOS', ...",Informed Consent,{'demographic_id': 'f7ed8691-ed3d-54bf-94c5-7f...,Breast,2025-01-06T07:38:56.031656-06:00,a9bb8159-32f0-454c-a946-b3286a52b9d5,"[{'timepoint_category': 'Last Contact', 'follo...",Diagnosis,released,,,"[0, 584]"
1,TCGA-A2-A04P,Ductal and Lobular Neoplasms,{'project_id': 'TCGA-BRCA'},22.0,"[{'morphology': '8500/3', 'submitter_id': 'TCG...",Informed Consent,{'demographic_id': '92faa22e-8e19-5502-987c-a6...,Breast,2025-01-06T07:38:16.470569-06:00,ccd4a24b-d8cc-4686-9dee-c98b0c5a8d21,"[{'days_to_progression': 102, 'timepoint_categ...",Diagnosis,released,No,,"[1, 548]"
2,TCGA-A2-A04T,Ductal and Lobular Neoplasms,{'project_id': 'TCGA-BRCA'},29.0,"[{'synchronous_malignancy': 'No', 'ajcc_pathol...",Informed Consent,{'demographic_id': 'dd68f2a1-f279-567f-83b6-4c...,Breast,2025-01-05T21:21:42.777520-06:00,b58ad350-5140-4fa8-bc2c-24bca8395f3a,"[{'timepoint_category': 'Last Contact', 'follo...",Diagnosis,released,,,"[0, 2246]"
3,TCGA-A2-A04U,Ductal and Lobular Neoplasms,{'project_id': 'TCGA-BRCA'},23.0,"[{'synchronous_malignancy': 'No', 'ajcc_pathol...",Informed Consent,{'demographic_id': '9145b76c-61b5-56e6-876e-58...,Breast,2025-01-05T23:51:14.306282-06:00,1c3610f7-e0aa-48d7-9a27-0dbaf6e244f9,"[{'timepoint_category': 'Follow-up', 'follow_u...",Diagnosis,released,Yes,,"[0, 2654]"
4,TCGA-A2-A0CM,Ductal and Lobular Neoplasms,{'project_id': 'TCGA-BRCA'},-212.0,"[{'synchronous_malignancy': 'No', 'ajcc_pathol...",Informed Consent,{'demographic_id': '0f8728a4-3abb-5a78-84c3-3d...,Breast,2025-01-06T07:33:56.519794-06:00,eb2dbb4f-66b6-4525-8323-431970f7a64e,"[{'timepoint_category': 'Last Contact', 'follo...",Diagnosis,released,,,"[1, 754]"


In [3]:
dna_mapping = "/data/rbg/shared/datasets/TCGA/Genomic/copy_number.gdc_sample_sheet.2025-07-09.tsv"
caseid_copynum_mapping = pd.read_csv(dna_mapping, delimiter='\t')

caseid_copynum_mapping['Case ID'] = caseid_copynum_mapping['Case ID'].str.split(',')
# print(caseid_copynum_mapping.shape)
caseid_copynum_mapping = caseid_copynum_mapping.explode('Case ID')
caseid_copynum_mapping['Case ID'] = caseid_copynum_mapping['Case ID'].str.strip()
# print(caseid_copynum_mapping.shape)
caseid_copynum_mapping = caseid_copynum_mapping.drop_duplicates(subset=["Case ID"]) # I'm pretty sure this is valid bc I explode only on Case ID - meaning everything else is duplicated. This whole process might actualyl be unncesary. TODO: check if all "Case IDs pairs" are just twins
print(caseid_copynum_mapping.shape)

caseid_copynum_mapping.head()

# stupid test to check whether any of hte records had duplicate sample IDs
# def foo(x):
#     a = list(map(lambda x: x.strip(), x["Sample ID"].split(",")))
#     print(a[0])
#     print(a[1])
#     # return a[0] == a[1]
# caseid_copynum_mapping = caseid_copynum_mapping[caseid_copynum_mapping.apply(foo, axis=1)]

(2690, 11)
(5380, 11)
(1845, 11)


Unnamed: 0,File ID,File Name,Data Category,Data Type,Project ID,Case ID,Sample ID,Tissue Type,Tumor Descriptor,Specimen Type,Preservation Method
0,b260bc0a-bf82-4fe8-b1ca-1ba43a4d741a,TCGA-C8-A27B-01A-11D-A89A-36.WholeGenome.RP-16...,Copy Number Variation,Copy Number Segment,TCGA-BRCA,TCGA-C8-A27B,"TCGA-C8-A27B-10A, TCGA-C8-A27B-01A","Normal, Tumor","Not Applicable, Primary","Peripheral Blood NOS, Solid Tissue","Unknown, OCT"
1,8ca11193-5883-4b75-9e34-f9dcbf7040a7,TCGA-B6-A0WW-01A-11D-A893-36.WholeGenome.RP-16...,Copy Number Variation,Copy Number Segment,TCGA-BRCA,TCGA-B6-A0WW,"TCGA-B6-A0WW-01A, TCGA-B6-A0WW-10A","Tumor, Normal","Primary, Not Applicable","Solid Tissue, Peripheral Blood NOS","OCT, Unknown"
2,d9de4dd4-6dd8-4a78-901a-35251c228ec6,TCGA-E9-A1R0-01A-22D-A898-36.WholeGenome.RP-16...,Copy Number Variation,Copy Number Segment,TCGA-BRCA,TCGA-E9-A1R0,"TCGA-E9-A1R0-10A, TCGA-E9-A1R0-01A","Normal, Tumor","Not Applicable, Primary","Peripheral Blood NOS, Solid Tissue","Unknown, OCT"
3,1eb79fa2-5f92-40f7-b36f-8a9844ebedf1,66a9e6d2-6dea-4475-ab39-7b5ffa3d9faf.wgs.ASCAT...,Copy Number Variation,Gene Level Copy Number,TCGA-BRCA,TCGA-BH-A0C1,"TCGA-BH-A0C1-01B, TCGA-BH-A0C1-10A","Tumor, Normal","Primary, Not Applicable","Solid Tissue, Peripheral Blood NOS","OCT, Unknown"
5,42e20b03-cc2a-40e2-ba66-d34ca7890b96,TCGA-AC-A5EH-01A-11D-A89D-36.WholeGenome.RP-16...,Copy Number Variation,Copy Number Segment,TCGA-BRCA,TCGA-AC-A5EH,"TCGA-AC-A5EH-01A, TCGA-AC-A5EH-10A","Tumor, Normal","Primary, Not Applicable","Solid Tissue, Peripheral Blood NOS","Unknown, Unknown"


In [34]:
# add copynumbesr to "IDs" dataset
# FIXME: figure out why 21 records are being dropped - the final result should still be 133
print(case_ids.shape)
data= pd.merge(
    case_ids,
    caseid_copynum_mapping[['Case ID', 'Sample ID', 'File ID', "File Name"]],  # select only the needed column
    left_on='submitter_id',
    right_on='Case ID',
    how="inner"
)
print(data.shape)
data.head()

(133, 16)
(112, 20)


Unnamed: 0,submitter_id,disease_type,project,days_to_consent,diagnoses,consent_type,demographic,primary_site,updated_datetime,case_id,follow_ups,index_date,state,lost_to_followup,exposures,outcome,Case ID,Sample ID,File ID,File Name
0,TCGA-A1-A0SP,Ductal and Lobular Neoplasms,{'project_id': 'TCGA-BRCA'},55.0,"[{'tissue_or_organ_of_origin': 'Breast, NOS', ...",Informed Consent,{'demographic_id': 'f7ed8691-ed3d-54bf-94c5-7f...,Breast,2025-01-06T07:38:56.031656-06:00,a9bb8159-32f0-454c-a946-b3286a52b9d5,"[{'timepoint_category': 'Last Contact', 'follo...",Diagnosis,released,,,"[0, 584]",TCGA-A1-A0SP,"TCGA-A1-A0SP-01A, TCGA-A1-A0SP-10A",4dfa8f5b-e9de-46d5-9aa5-15a2dc535ec8,TCGA-A1-A0SP-01A-11D-A893-36.WholeGenome.RP-16...
1,TCGA-A2-A04P,Ductal and Lobular Neoplasms,{'project_id': 'TCGA-BRCA'},22.0,"[{'morphology': '8500/3', 'submitter_id': 'TCG...",Informed Consent,{'demographic_id': '92faa22e-8e19-5502-987c-a6...,Breast,2025-01-06T07:38:16.470569-06:00,ccd4a24b-d8cc-4686-9dee-c98b0c5a8d21,"[{'days_to_progression': 102, 'timepoint_categ...",Diagnosis,released,No,,"[1, 548]",TCGA-A2-A04P,"TCGA-A2-A04P-10A, TCGA-A2-A04P-01A",e175ae5e-11da-4bd1-bfb4-5919c85a72fc,TCGA-A2-A04P-01A-31D-A89H-36.WholeGenome.RP-16...
2,TCGA-A2-A04T,Ductal and Lobular Neoplasms,{'project_id': 'TCGA-BRCA'},29.0,"[{'synchronous_malignancy': 'No', 'ajcc_pathol...",Informed Consent,{'demographic_id': 'dd68f2a1-f279-567f-83b6-4c...,Breast,2025-01-05T21:21:42.777520-06:00,b58ad350-5140-4fa8-bc2c-24bca8395f3a,"[{'timepoint_category': 'Last Contact', 'follo...",Diagnosis,released,,,"[0, 2246]",TCGA-A2-A04T,"TCGA-A2-A04T-01A, TCGA-A2-A04T-10A",017a228b-af6a-46a0-840f-f50d64f4e434,b752b444-f033-4be4-9d24-e5e80b4181af_wgs_gdc_r...
3,TCGA-A2-A04U,Ductal and Lobular Neoplasms,{'project_id': 'TCGA-BRCA'},23.0,"[{'synchronous_malignancy': 'No', 'ajcc_pathol...",Informed Consent,{'demographic_id': '9145b76c-61b5-56e6-876e-58...,Breast,2025-01-05T23:51:14.306282-06:00,1c3610f7-e0aa-48d7-9a27-0dbaf6e244f9,"[{'timepoint_category': 'Follow-up', 'follow_u...",Diagnosis,released,Yes,,"[0, 2654]",TCGA-A2-A04U,"TCGA-A2-A04U-10A, TCGA-A2-A04U-01A",bf33ef54-ff2a-49f6-a24e-6d6052df4e73,TCGA-A2-A04U-01A-11D-A89H-36.WholeGenome.RP-16...
4,TCGA-A2-A0CM,Ductal and Lobular Neoplasms,{'project_id': 'TCGA-BRCA'},-212.0,"[{'synchronous_malignancy': 'No', 'ajcc_pathol...",Informed Consent,{'demographic_id': '0f8728a4-3abb-5a78-84c3-3d...,Breast,2025-01-06T07:33:56.519794-06:00,eb2dbb4f-66b6-4525-8323-431970f7a64e,"[{'timepoint_category': 'Last Contact', 'follo...",Diagnosis,released,,,"[1, 754]",TCGA-A2-A0CM,"TCGA-A2-A0CM-01A, TCGA-A2-A0CM-10A",a59ae3ed-af09-4cb2-bf60-518f85798d53,TCGA-A2-A0CM-01A-31D-A890-36.WholeGenome.RP-16...


In [7]:
elem = data.iloc[10]
path = f"{root}/Genomic/downloads/{elem['File ID']}/{elem['File Name']}"
CNAs = pd.read_csv(path, delimiter="\t")
CNAs

Unnamed: 0,GDC_Aliquot_ID,Chromosome,Start,End,Num_Probes,Segment_Mean
0,TCGA-A2-A0YM-01A-11D-A894-36,chr1,17001,3723000,2754,-0.048360
1,TCGA-A2-A0YM-01A-11D-A894-36,chr1,3723001,5795000,2061,-0.080328
2,TCGA-A2-A0YM-01A-11D-A894-36,chr1,5795001,7432000,1634,-0.028879
3,TCGA-A2-A0YM-01A-11D-A894-36,chr1,7432001,9532000,2094,-0.017117
4,TCGA-A2-A0YM-01A-11D-A894-36,chr1,9532001,9540000,8,-0.181266
...,...,...,...,...,...,...
3737,TCGA-A2-A0YM-01A-11D-A894-36,chrY,26403001,26442000,5,-1.650024
3738,TCGA-A2-A0YM-01A-11D-A894-36,chrY,26638001,56708000,6,-9.341383
3739,TCGA-A2-A0YM-01A-11D-A894-36,chrY,56821001,56882000,61,-0.278835
3740,TCGA-A2-A0YM-01A-11D-A894-36,chrY,56882001,56887000,5,-2.367349


In [129]:
# Joins CNAs (CHR, start, end) w/ a generated (CHR, start, end -> gene_name) mapping based on overlaping ranges
# several genes can exist in a range -> increase df size

import pandas as pd
import pyranges as pr

if isinstance(CNAs, pd.DataFrame):
    CNAs = pr.PyRanges(CNAs)
elif isinstance(CNAs, pr.PyRanges):
    CNAs = CNAs
else:
    typing.assert_never("impossible CNAs datatype")

def parse_gtf(filepath):
    # GTF files are tab-separated, and comments start with '#'
    gtf = pd.read_csv(filepath, sep='\t', comment='#', header=None,
                      names=["seqname", "source", "feature", "start", "end", "score", "strand", "frame", "attribute"])
    
    genes = gtf[gtf["feature"] == "gene"].copy()
    
    def extract_attr(attr_str, key):
        import re
        match = re.search(f'{key} "([^"]+)"', attr_str)
        return match.group(1) if match else None
    
    genes["gene_name"] = genes["attribute"].apply(lambda x: extract_attr(x, "gene_name"))
    genes["gene_id"] = genes["attribute"].apply(lambda x: extract_attr(x, "gene_id"))

    genes = genes[["seqname", "start", "end", "gene_name", "gene_id"]]
    genes.columns = ["Chromosome", "Start", "End", "gene_name", "gene_id"]

    genes["Chromosome"] = genes["Chromosome"].astype(str)
    genes["Chromosome"] = genes["Chromosome"].apply(lambda x: x if x.startswith("chr") else f"chr{x}")
    
    return genes

genes_df = parse_gtf("gencode.v38.annotation.gtf.gz")
gene_ranges = pr.PyRanges(genes_df)

CNAs_with_gene = CNAs.join( gene_ranges ).df

print(CNAs_with_gene.shape)
CNAs_with_gene

(62307, 10)


Unnamed: 0,GDC_Aliquot_ID,Chromosome,Start,End,Num_Probes,Segment_Mean,Start_b,End_b,gene_name,gene_id
0,TCGA-A2-A0YM-01A-11D-A894-36,chr1,17001,3723000,2754,-0.048360,14404,29570,WASH7P,ENSG00000227232.5
1,TCGA-A2-A0YM-01A-11D-A894-36,chr1,17001,3723000,2754,-0.048360,17369,17436,MIR6859-1,ENSG00000278267.1
2,TCGA-A2-A0YM-01A-11D-A894-36,chr1,17001,3723000,2754,-0.048360,29554,31109,MIR1302-2HG,ENSG00000243485.5
3,TCGA-A2-A0YM-01A-11D-A894-36,chr1,17001,3723000,2754,-0.048360,30366,30503,MIR1302-2,ENSG00000284332.1
4,TCGA-A2-A0YM-01A-11D-A894-36,chr1,17001,3723000,2754,-0.048360,34554,36081,FAM138A,ENSG00000237613.2
...,...,...,...,...,...,...,...,...,...,...
62302,TCGA-A2-A0YM-01A-11D-A894-36,chrY,11643001,15897000,11,-0.629715,14941746,14942715,AC006989.2,ENSG00000224518.2
62303,TCGA-A2-A0YM-01A-11D-A894-36,chrY,11643001,15897000,11,-0.629715,15348662,15456074,PUDPP1,ENSG00000234620.1
62304,TCGA-A2-A0YM-01A-11D-A894-36,chrY,11643001,15897000,11,-0.629715,15547216,15593331,STSP1,ENSG00000227166.1
62305,TCGA-A2-A0YM-01A-11D-A894-36,chrY,26403001,26442000,5,-1.650024,26409815,26420535,ANKRD36P1,ENSG00000188399.5


In [130]:
# Build Entrez/Gene-name -> HG38 Mapping

import shelve
import functools
import types
import mygene
import pandas as pd


def wrap_class_method_with_disk_cache(cls, method_name, cache_file):
    orig = getattr(cls, method_name)

    @functools.wraps(orig)
    def wrapper(self, *args, **kwargs):
        key = (args, tuple(sorted(kwargs.items())))
        key_str = str(key)
        with shelve.open(cache_file) as db:
            if key_str in db:
                return db[key_str]
            result = orig(self, *args, **kwargs)
            db[key_str] = result
            return result

    setattr(cls, method_name, wrapper)


# import mygene

# mg = mygene.MyGeneInfo()
# ids = overlap.df['gene_id'].tolist()
# res = mg.querymany(ids, scopes='ensembl.gene', fields='entrezgene', species='human')

# # entrez_df = pd.DataFrame(res)[["query", "entrezgene"]].rename(columns={"query": "gene_id"})
# # final_df = overlap.df.merge(entrez_df, on="gene_id", how="left")


wrap_class_method_with_disk_cache(mygene.MyGeneInfo, 'querymany', 'querymany_cache.db')
mg = mygene.MyGeneInfo()
gene_to_id = mg.querymany(CNAs_with_gene['gene_name'].tolist(), scopes='symbol', fields='entrezgene', species='human')
gene_to_id = pd.DataFrame(gene_to_id)

print(gene_to_id.shape)
gene_to_id


Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
8062 input query terms found dup hits:	[('WASH7P', 2), ('MIR1302-2HG', 2), ('DDX11L17', 2), ('WASH9P', 2), ('WBP1LP7', 2), ('CICP7', 2), ('
20958 input query terms found no hit:	['RP11-34P13.7', 'RP11-34P13.8', 'RP11-34P13.15', 'RP11-34P13.16', 'RP11-34P13.14', 'RP11-34P13.13',


(79361, 5)


Unnamed: 0,query,_id,_score,entrezgene,notfound
0,WASH7P,653635,8.569510,653635,
1,WASH7P,ENSG00000227232,8.569510,,
2,MIR6859-1,102466751,27.357260,102466751,
3,MIR1302-2HG,ENSG00000243485,26.562847,,
4,MIR1302-2HG,107985730,26.562847,107985730,
...,...,...,...,...,...
79356,ANKRD36P1,ENSG00000188399,8.568661,,
79357,ANKRD36P1,100132420,8.568661,100132420,
79358,CTBP2P1,ENSG00000235857,8.401274,,
79359,CTBP2P1,ENSG00000281211,8.401274,,


In [131]:

CNAs_with_gene_dd = CNAs_with_gene.drop_duplicates()
gene_to_id_dd = gene_to_id.drop_duplicates(subset=["query"])

CNAs_with_entrez = pd.merge(
    CNAs_with_gene_dd,
    gene_to_id_dd,
    left_on='gene_name',
    right_on='query',
    how='inner'
)

print(CNAs_with_gene.shape)
print(gene_to_id.shape)
print(CNAs_with_entrez.shape)
df_with_entrez["entrezgene"].isna().sum()

# # making sure that this is about to be annoying
# (CNAs_with_id.groupby(by=["query"]).apply(lambda x: len(set(x["entrezgene"])) == 1))
CNAs_with_entrez.head()

(62307, 10)
(79361, 5)
(62307, 15)


Unnamed: 0,GDC_Aliquot_ID,Chromosome,Start,End,Num_Probes,Segment_Mean,Start_b,End_b,gene_name,gene_id,query,_id,_score,entrezgene,notfound
0,TCGA-A2-A0YM-01A-11D-A894-36,chr1,17001,3723000,2754,-0.04836,14404,29570,WASH7P,ENSG00000227232.5,WASH7P,653635,8.56951,653635.0,
1,TCGA-A2-A0YM-01A-11D-A894-36,chr1,17001,3723000,2754,-0.04836,17369,17436,MIR6859-1,ENSG00000278267.1,MIR6859-1,102466751,27.35726,102466751.0,
2,TCGA-A2-A0YM-01A-11D-A894-36,chr1,17001,3723000,2754,-0.04836,29554,31109,MIR1302-2HG,ENSG00000243485.5,MIR1302-2HG,ENSG00000243485,26.562847,,
3,TCGA-A2-A0YM-01A-11D-A894-36,chr1,17001,3723000,2754,-0.04836,30366,30503,MIR1302-2,ENSG00000284332.1,MIR1302-2,100302278,26.043947,100302278.0,
4,TCGA-A2-A0YM-01A-11D-A894-36,chr1,17001,3723000,2754,-0.04836,34554,36081,FAM138A,ENSG00000237613.2,FAM138A,645520,27.357826,645520.0,


In [136]:




signature_gene_mapping = pd.read_excel("copynum_signatures.xlsx").drop(0).rename(columns={"Supplementary Data 2. Annotation of copy number segments": "Signature"}).set_index("Signature")

# a =  foo["Supplementary Data 2. Annotation of copy number segments"]
# output = pd.DataFrame(columns=a)

# perfect intersection rarely possible -> introduces error

for idx, row in list(signature_gene_mapping.iterrows())[0:1]:
    a = row.dropna().apply(lambda x: str(int(x))).to_numpy()

    b = CNAs_with_entrez.set_index("entrezgene").index.to_numpy().astype(str)

    # c = list(set(a).intersection(set(b)))
    
    # filtered = CNAs_with_entrez[CNAs_with_entrez["entrezgene"].astype(str).isin(c)]
    # print(filtered["Segment_Mean"].hist())

    # print(len(a))
    # print(len(b))
    # print(len(c))
    # print(type(a[0]))
    # print(type(b[0]))
    # print(a)
    # print(filtered.head())
    # print(type(a))
    # print(type(b))
    
    # row.dropna().to_list()]

    
    

['26751' '52' '285016' '129787' '391343' '54221' '7173' '7837' '23040'
 '730811' '7260' '51112' '55256' '246243' '6201' '78989' '55821' '6664'
 '150622' '400940' '129607' '91543' '9781' '339788' '3398' '57498'
 '129642' '8853' '9270' '51692' '285148' '6868' '10971' '9014' '29841'
 '8462' '192668' '6241' '348738' '3241' '4953' '79954' '245973' '10130'
 '3754' '130814' '9475' '1876' '9687' '23620' '23175' '28951' '151354'
 '51594']


In [137]:
CNAs_with_entrez

Unnamed: 0,GDC_Aliquot_ID,Chromosome,Start,End,Num_Probes,Segment_Mean,Start_b,End_b,gene_name,gene_id,query,_id,_score,entrezgene,notfound
0,TCGA-A2-A0YM-01A-11D-A894-36,chr1,17001,3723000,2754,-0.048360,14404,29570,WASH7P,ENSG00000227232.5,WASH7P,653635,8.569510,653635,
1,TCGA-A2-A0YM-01A-11D-A894-36,chr1,17001,3723000,2754,-0.048360,17369,17436,MIR6859-1,ENSG00000278267.1,MIR6859-1,102466751,27.357260,102466751,
2,TCGA-A2-A0YM-01A-11D-A894-36,chr1,17001,3723000,2754,-0.048360,29554,31109,MIR1302-2HG,ENSG00000243485.5,MIR1302-2HG,ENSG00000243485,26.562847,,
3,TCGA-A2-A0YM-01A-11D-A894-36,chr1,17001,3723000,2754,-0.048360,30366,30503,MIR1302-2,ENSG00000284332.1,MIR1302-2,100302278,26.043947,100302278,
4,TCGA-A2-A0YM-01A-11D-A894-36,chr1,17001,3723000,2754,-0.048360,34554,36081,FAM138A,ENSG00000237613.2,FAM138A,645520,27.357826,645520,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62302,TCGA-A2-A0YM-01A-11D-A894-36,chrY,11643001,15897000,11,-0.629715,14941746,14942715,AC006989.2,ENSG00000224518.2,AC006989.2,,,,True
62303,TCGA-A2-A0YM-01A-11D-A894-36,chrY,11643001,15897000,11,-0.629715,15348662,15456074,PUDPP1,ENSG00000234620.1,PUDPP1,ENSG00000234620,8.569693,,
62304,TCGA-A2-A0YM-01A-11D-A894-36,chrY,11643001,15897000,11,-0.629715,15547216,15593331,STSP1,ENSG00000227166.1,STSP1,6803,8.824922,6803,
62305,TCGA-A2-A0YM-01A-11D-A894-36,chrY,26403001,26442000,5,-1.650024,26409815,26420535,ANKRD36P1,ENSG00000188399.5,ANKRD36P1,ENSG00000188399,8.568661,,
