# facts
Signatures are continuous segments of chromosomes
Genes can (and often do) belong to multiple signatures


In [1]:
import pandas as pd
import pathlib
import os
import json
import itertools
import numpy as np

root = pathlib.Path("/data/rbg/shared/datasets/TCGA/")

In [2]:

def add_outcome(elem):
    vital_status = elem['demographic'].get('vital_status')

    if vital_status == 'Dead':
        patient_death = elem['demographic'].get('days_to_death')
        return [1, int(patient_death)]

    elif vital_status == "Alive":
        follow_ups = elem['follow_ups']
        follow_up_days = [
            f.get('days_to_follow_up')
            for f in follow_ups
            if f.get('days_to_follow_up') is not None
        ]
        diagnosis_days = [
            d.get('days_to_last_follow_up')
            for d in elem['diagnoses']
            if d.get('days_to_last_follow_up') is not None
        ]
        all_days = follow_up_days + diagnosis_days
        date_of_last_checkup = max(all_days) if all_days else None

        return [0, int(date_of_last_checkup)]
    else:
        typing.assert_never("vital status not in Alive/Dead")


def remove_sparse_features(X, threshold=0.05):
    non_zero_fraction = (X != 0).sum(axis=0) / X.shape[0]
    keep = non_zero_fraction > threshold
    return X.loc[:, keep]


def remove_perfect_correlations(X):
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] == 1)]
    return X.drop(columns=to_drop)


# ids = pd.read_json("ids/tcga_cohort.json")["case_id"]
ids = pd.read_csv("sample_ids.csv")["Sample"]

case_ids_raw = pd.read_json("./clinical-tcga-brca.json").set_index("submitter_id")
case_ids = case_ids_raw.loc[ids]
case_ids["outcome"] = case_ids.apply(add_outcome, axis=1)

case_ids = case_ids.reset_index()
print(case_ids.shape)

case_ids = case_ids.drop_duplicates(subset=["case_id"])
print(case_ids.shape)

case_ids.head()


(133, 16)
(133, 16)


Unnamed: 0,submitter_id,disease_type,project,days_to_consent,diagnoses,consent_type,demographic,primary_site,updated_datetime,case_id,follow_ups,index_date,state,lost_to_followup,exposures,outcome
0,TCGA-A1-A0SP,Ductal and Lobular Neoplasms,{'project_id': 'TCGA-BRCA'},55.0,"[{'tissue_or_organ_of_origin': 'Breast, NOS', ...",Informed Consent,{'demographic_id': 'f7ed8691-ed3d-54bf-94c5-7f...,Breast,2025-01-06T07:38:56.031656-06:00,a9bb8159-32f0-454c-a946-b3286a52b9d5,"[{'timepoint_category': 'Last Contact', 'follo...",Diagnosis,released,,,"[0, 584]"
1,TCGA-A2-A04P,Ductal and Lobular Neoplasms,{'project_id': 'TCGA-BRCA'},22.0,"[{'morphology': '8500/3', 'submitter_id': 'TCG...",Informed Consent,{'demographic_id': '92faa22e-8e19-5502-987c-a6...,Breast,2025-01-06T07:38:16.470569-06:00,ccd4a24b-d8cc-4686-9dee-c98b0c5a8d21,"[{'days_to_progression': 102, 'timepoint_categ...",Diagnosis,released,No,,"[1, 548]"
2,TCGA-A2-A04T,Ductal and Lobular Neoplasms,{'project_id': 'TCGA-BRCA'},29.0,"[{'synchronous_malignancy': 'No', 'ajcc_pathol...",Informed Consent,{'demographic_id': 'dd68f2a1-f279-567f-83b6-4c...,Breast,2025-01-05T21:21:42.777520-06:00,b58ad350-5140-4fa8-bc2c-24bca8395f3a,"[{'timepoint_category': 'Last Contact', 'follo...",Diagnosis,released,,,"[0, 2246]"
3,TCGA-A2-A04U,Ductal and Lobular Neoplasms,{'project_id': 'TCGA-BRCA'},23.0,"[{'synchronous_malignancy': 'No', 'ajcc_pathol...",Informed Consent,{'demographic_id': '9145b76c-61b5-56e6-876e-58...,Breast,2025-01-05T23:51:14.306282-06:00,1c3610f7-e0aa-48d7-9a27-0dbaf6e244f9,"[{'timepoint_category': 'Follow-up', 'follow_u...",Diagnosis,released,Yes,,"[0, 2654]"
4,TCGA-A2-A0CM,Ductal and Lobular Neoplasms,{'project_id': 'TCGA-BRCA'},-212.0,"[{'synchronous_malignancy': 'No', 'ajcc_pathol...",Informed Consent,{'demographic_id': '0f8728a4-3abb-5a78-84c3-3d...,Breast,2025-01-06T07:33:56.519794-06:00,eb2dbb4f-66b6-4525-8323-431970f7a64e,"[{'timepoint_category': 'Last Contact', 'follo...",Diagnosis,released,,,"[1, 754]"


In [3]:
dna_mapping = "/data/rbg/shared/datasets/TCGA/Genomic/copy_number.gdc_sample_sheet.2025-07-09.tsv"
caseid_copynum_mapping = pd.read_csv(dna_mapping, delimiter='\t')

caseid_copynum_mapping['Case ID'] = caseid_copynum_mapping['Case ID'].str.split(',')
# print(caseid_copynum_mapping.shape)
caseid_copynum_mapping = caseid_copynum_mapping.explode('Case ID')
caseid_copynum_mapping['Case ID'] = caseid_copynum_mapping['Case ID'].str.strip()
# print(caseid_copynum_mapping.shape)
caseid_copynum_mapping = caseid_copynum_mapping.drop_duplicates(subset=["Case ID"]) # I'm pretty sure this is valid bc I explode only on Case ID - meaning everything else is duplicated. This whole process might actualyl be unncesary. TODO: check if all "Case IDs pairs" are just twins
print(caseid_copynum_mapping.shape)

caseid_copynum_mapping.head()

# stupid test to check whether any of hte records had duplicate sample IDs
# def foo(x):
#     a = list(map(lambda x: x.strip(), x["Sample ID"].split(",")))
#     print(a[0])
#     print(a[1])
#     # return a[0] == a[1]
# caseid_copynum_mapping = caseid_copynum_mapping[caseid_copynum_mapping.apply(foo, axis=1)]

(2690, 11)
(5380, 11)
(1845, 11)


Unnamed: 0,File ID,File Name,Data Category,Data Type,Project ID,Case ID,Sample ID,Tissue Type,Tumor Descriptor,Specimen Type,Preservation Method
0,b260bc0a-bf82-4fe8-b1ca-1ba43a4d741a,TCGA-C8-A27B-01A-11D-A89A-36.WholeGenome.RP-16...,Copy Number Variation,Copy Number Segment,TCGA-BRCA,TCGA-C8-A27B,"TCGA-C8-A27B-10A, TCGA-C8-A27B-01A","Normal, Tumor","Not Applicable, Primary","Peripheral Blood NOS, Solid Tissue","Unknown, OCT"
1,8ca11193-5883-4b75-9e34-f9dcbf7040a7,TCGA-B6-A0WW-01A-11D-A893-36.WholeGenome.RP-16...,Copy Number Variation,Copy Number Segment,TCGA-BRCA,TCGA-B6-A0WW,"TCGA-B6-A0WW-01A, TCGA-B6-A0WW-10A","Tumor, Normal","Primary, Not Applicable","Solid Tissue, Peripheral Blood NOS","OCT, Unknown"
2,d9de4dd4-6dd8-4a78-901a-35251c228ec6,TCGA-E9-A1R0-01A-22D-A898-36.WholeGenome.RP-16...,Copy Number Variation,Copy Number Segment,TCGA-BRCA,TCGA-E9-A1R0,"TCGA-E9-A1R0-10A, TCGA-E9-A1R0-01A","Normal, Tumor","Not Applicable, Primary","Peripheral Blood NOS, Solid Tissue","Unknown, OCT"
3,1eb79fa2-5f92-40f7-b36f-8a9844ebedf1,66a9e6d2-6dea-4475-ab39-7b5ffa3d9faf.wgs.ASCAT...,Copy Number Variation,Gene Level Copy Number,TCGA-BRCA,TCGA-BH-A0C1,"TCGA-BH-A0C1-01B, TCGA-BH-A0C1-10A","Tumor, Normal","Primary, Not Applicable","Solid Tissue, Peripheral Blood NOS","OCT, Unknown"
5,42e20b03-cc2a-40e2-ba66-d34ca7890b96,TCGA-AC-A5EH-01A-11D-A89D-36.WholeGenome.RP-16...,Copy Number Variation,Copy Number Segment,TCGA-BRCA,TCGA-AC-A5EH,"TCGA-AC-A5EH-01A, TCGA-AC-A5EH-10A","Tumor, Normal","Primary, Not Applicable","Solid Tissue, Peripheral Blood NOS","Unknown, Unknown"


In [34]:
# add copynumbesr to "IDs" dataset
# FIXME: figure out why 21 records are being dropped - the final result should still be 133
print(case_ids.shape)
data= pd.merge(
    case_ids,
    caseid_copynum_mapping[['Case ID', 'Sample ID', 'File ID', "File Name"]],  # select only the needed column
    left_on='submitter_id',
    right_on='Case ID',
    how="inner"
)
print(data.shape)
data.head()

(133, 16)
(112, 20)


Unnamed: 0,submitter_id,disease_type,project,days_to_consent,diagnoses,consent_type,demographic,primary_site,updated_datetime,case_id,follow_ups,index_date,state,lost_to_followup,exposures,outcome,Case ID,Sample ID,File ID,File Name
0,TCGA-A1-A0SP,Ductal and Lobular Neoplasms,{'project_id': 'TCGA-BRCA'},55.0,"[{'tissue_or_organ_of_origin': 'Breast, NOS', ...",Informed Consent,{'demographic_id': 'f7ed8691-ed3d-54bf-94c5-7f...,Breast,2025-01-06T07:38:56.031656-06:00,a9bb8159-32f0-454c-a946-b3286a52b9d5,"[{'timepoint_category': 'Last Contact', 'follo...",Diagnosis,released,,,"[0, 584]",TCGA-A1-A0SP,"TCGA-A1-A0SP-01A, TCGA-A1-A0SP-10A",4dfa8f5b-e9de-46d5-9aa5-15a2dc535ec8,TCGA-A1-A0SP-01A-11D-A893-36.WholeGenome.RP-16...
1,TCGA-A2-A04P,Ductal and Lobular Neoplasms,{'project_id': 'TCGA-BRCA'},22.0,"[{'morphology': '8500/3', 'submitter_id': 'TCG...",Informed Consent,{'demographic_id': '92faa22e-8e19-5502-987c-a6...,Breast,2025-01-06T07:38:16.470569-06:00,ccd4a24b-d8cc-4686-9dee-c98b0c5a8d21,"[{'days_to_progression': 102, 'timepoint_categ...",Diagnosis,released,No,,"[1, 548]",TCGA-A2-A04P,"TCGA-A2-A04P-10A, TCGA-A2-A04P-01A",e175ae5e-11da-4bd1-bfb4-5919c85a72fc,TCGA-A2-A04P-01A-31D-A89H-36.WholeGenome.RP-16...
2,TCGA-A2-A04T,Ductal and Lobular Neoplasms,{'project_id': 'TCGA-BRCA'},29.0,"[{'synchronous_malignancy': 'No', 'ajcc_pathol...",Informed Consent,{'demographic_id': 'dd68f2a1-f279-567f-83b6-4c...,Breast,2025-01-05T21:21:42.777520-06:00,b58ad350-5140-4fa8-bc2c-24bca8395f3a,"[{'timepoint_category': 'Last Contact', 'follo...",Diagnosis,released,,,"[0, 2246]",TCGA-A2-A04T,"TCGA-A2-A04T-01A, TCGA-A2-A04T-10A",017a228b-af6a-46a0-840f-f50d64f4e434,b752b444-f033-4be4-9d24-e5e80b4181af_wgs_gdc_r...
3,TCGA-A2-A04U,Ductal and Lobular Neoplasms,{'project_id': 'TCGA-BRCA'},23.0,"[{'synchronous_malignancy': 'No', 'ajcc_pathol...",Informed Consent,{'demographic_id': '9145b76c-61b5-56e6-876e-58...,Breast,2025-01-05T23:51:14.306282-06:00,1c3610f7-e0aa-48d7-9a27-0dbaf6e244f9,"[{'timepoint_category': 'Follow-up', 'follow_u...",Diagnosis,released,Yes,,"[0, 2654]",TCGA-A2-A04U,"TCGA-A2-A04U-10A, TCGA-A2-A04U-01A",bf33ef54-ff2a-49f6-a24e-6d6052df4e73,TCGA-A2-A04U-01A-11D-A89H-36.WholeGenome.RP-16...
4,TCGA-A2-A0CM,Ductal and Lobular Neoplasms,{'project_id': 'TCGA-BRCA'},-212.0,"[{'synchronous_malignancy': 'No', 'ajcc_pathol...",Informed Consent,{'demographic_id': '0f8728a4-3abb-5a78-84c3-3d...,Breast,2025-01-06T07:33:56.519794-06:00,eb2dbb4f-66b6-4525-8323-431970f7a64e,"[{'timepoint_category': 'Last Contact', 'follo...",Diagnosis,released,,,"[1, 754]",TCGA-A2-A0CM,"TCGA-A2-A0CM-01A, TCGA-A2-A0CM-10A",a59ae3ed-af09-4cb2-bf60-518f85798d53,TCGA-A2-A0CM-01A-31D-A890-36.WholeGenome.RP-16...


In [7]:
elem = data.iloc[10]
path = f"{root}/Genomic/downloads/{elem['File ID']}/{elem['File Name']}"
CNAs = pd.read_csv(path, delimiter="\t")
CNAs

Unnamed: 0,GDC_Aliquot_ID,Chromosome,Start,End,Num_Probes,Segment_Mean
0,TCGA-A2-A0YM-01A-11D-A894-36,chr1,17001,3723000,2754,-0.048360
1,TCGA-A2-A0YM-01A-11D-A894-36,chr1,3723001,5795000,2061,-0.080328
2,TCGA-A2-A0YM-01A-11D-A894-36,chr1,5795001,7432000,1634,-0.028879
3,TCGA-A2-A0YM-01A-11D-A894-36,chr1,7432001,9532000,2094,-0.017117
4,TCGA-A2-A0YM-01A-11D-A894-36,chr1,9532001,9540000,8,-0.181266
...,...,...,...,...,...,...
3737,TCGA-A2-A0YM-01A-11D-A894-36,chrY,26403001,26442000,5,-1.650024
3738,TCGA-A2-A0YM-01A-11D-A894-36,chrY,26638001,56708000,6,-9.341383
3739,TCGA-A2-A0YM-01A-11D-A894-36,chrY,56821001,56882000,61,-0.278835
3740,TCGA-A2-A0YM-01A-11D-A894-36,chrY,56882001,56887000,5,-2.367349


In [164]:
import re
import pandas as pd

# Example
# signatures = pd.DataFrame({"signature": [...]})
signatures = pd.read_excel("copynum_signatures.xlsx").drop(0).rename(columns={"Supplementary Data 2. Annotation of copy number segments": "signature"})

def extract_region(sig):
    # Match first chr/start-end pattern
    match = re.search(r"(chr)?(\d+|X|Y|M):(\d+)-(\d+)", sig)
    if match:
        chrom = match.group(1) + match.group(2) if match.group(1) else "chr" + match.group(2)
        start = int(match.group(3))
        end = int(match.group(4))
        return chrom, start, end
    return None, None, None

signatures = pd.concat([
    signatures["signature"],
    signatures["signature"].apply( lambda x: pd.Series(extract_region(x)) )
], axis=1)

signatures = signatures.rename(columns={0: "chromosome", 1: "start", 2: "end"})

signatures.head()

Unnamed: 0,signature,chromosome,start,end
1,chr2:1-15244284.BeroukhimS2.2p25.3.del.,chr2,1,15244284
2,2.p wholearm.chr2:1-93300000,chr2,1,93300000
3,chr15:56490060-57176541.Basal.15q22.1-107.del,chr15,56490060,57176541
4,chr22:23139372-23249329.BeroukhimS5..amp.,chr22,23139372,23249329
5,chr12:97551177-99047626.BeroukhimS2.12q23.1.de...,chr12,97551177,99047626


In [173]:
merged = CNAs.df.merge(
    signatures, 
    left_on='Chromosome', 
    right_on='chromosome', 
    how='left'
)
mask = (merged['Start'] >= merged['start'])

result = merged.loc[mask, CNAs.columns.tolist() + ['signature']]

In [177]:
result

Unnamed: 0,GDC_Aliquot_ID,Chromosome,Start,End,Num_Probes,Segment_Mean,signature
25,TCGA-A2-A0YM-01A-11D-A894-36,chr1,17001,3723000,2754,-0.048360,1.p wholearm.chr1:1-124300000
53,TCGA-A2-A0YM-01A-11D-A894-36,chr1,3723001,5795000,2061,-0.080328,1.p wholearm.chr1:1-124300000
57,TCGA-A2-A0YM-01A-11D-A894-36,chr1,5795001,7432000,1634,-0.028879,chr1:3756302-6867390.BeroukhimS2.1p36.31.del.
81,TCGA-A2-A0YM-01A-11D-A894-36,chr1,5795001,7432000,1634,-0.028879,1.p wholearm.chr1:1-124300000
85,TCGA-A2-A0YM-01A-11D-A894-36,chr1,7432001,9532000,2094,-0.017117,chr1:3756302-6867390.BeroukhimS2.1p36.31.del.
...,...,...,...,...,...,...,...
82592,TCGA-A2-A0YM-01A-11D-A894-36,chrX,155967001,156026000,43,-0.374156,X.qwholearm.chrX:60000000-154913754
82593,TCGA-A2-A0YM-01A-11D-A894-36,chrX,155967001,156026000,43,-0.374156,chrX:31041721-34564697.BeroukhimS2.Xp21.2.del.DMD
82594,TCGA-A2-A0YM-01A-11D-A894-36,chrX,155967001,156026000,43,-0.374156,chrX:8707411-9925292.BeroukhimS5..del.
82595,TCGA-A2-A0YM-01A-11D-A894-36,chrX,155967001,156026000,43,-0.374156,chrX:148267454-148753845.BeroukhimS5..amp.


In [181]:
result.groupby(by=["signature"])["Segment_Mean"].apply(lambda x: x.unique())

signature
1.p   wholearm.chr1:1-124300000                   [-0.04836, -0.080328, -0.028879, -0.017117, -0...
1.q   wholearm.chr1:124300000-247249719           [1.125795, -3.244125, 1.782699, 0.448627, 1.30...
10.p wholearm.chr10:1-40300000                    [0.282643, 0.220832, 0.302888, 0.250967, 0.263...
10.q wholearm.chr10:40300000-135374737            [0.470294, -0.080945, -0.786112, -0.08005, 0.5...
11.p wholearm.chr11:1-52900000                    [-0.073111, -0.3931, -0.108993, -0.088036, -0....
                                                                        ...                        
chrX:43505580-46847125.BeroukhimS5..amp.          [0.582329, -0.374064, 0.554671, -0.376845, -0....
chrX:66436234-67090514.BeroukhimS2.Xq12.amp.AR    [0.582329, -0.374064, 0.554671, -0.376845, -0....
chrX:6695466-8345492.BeroukhimS5..del.            [0.031391, 0.582329, -0.374064, 0.554671, -0.3...
chrX:69545961-73353657.BeroukhimS5..amp.          [0.582329, -0.374064, 0.554671, -0.37684

In [190]:
signature_scores = result.groupby(by=["signature"])["Segment_Mean"].apply(lambda x: x.mean())
signature_scores

signature
1.p   wholearm.chr1:1-124300000                   0.562351
1.q   wholearm.chr1:124300000-247249719           0.512755
10.p wholearm.chr10:1-40300000                    0.043622
10.q wholearm.chr10:40300000-135374737           -0.006903
11.p wholearm.chr11:1-52900000                   -0.221676
                                                    ...   
chrX:43505580-46847125.BeroukhimS5..amp.         -0.184523
chrX:66436234-67090514.BeroukhimS2.Xq12.amp.AR   -0.184523
chrX:6695466-8345492.BeroukhimS5..del.           -0.164894
chrX:69545961-73353657.BeroukhimS5..amp.         -0.184523
chrX:8707411-9925292.BeroukhimS5..del.           -0.164894
Name: Segment_Mean, Length: 528, dtype: float64

In [193]:
(signature_scores > 0.3).sum() / signature_scores.shape[0]

np.float64(0.22348484848484848)

In [194]:
signature_scores.loc["Case ID"] = elem["submitter_id"]

In [195]:
signature_scores

signature
1.p   wholearm.chr1:1-124300000                       0.562351
1.q   wholearm.chr1:124300000-247249719               0.512755
10.p wholearm.chr10:1-40300000                        0.043622
10.q wholearm.chr10:40300000-135374737               -0.006903
11.p wholearm.chr11:1-52900000                       -0.221676
                                                      ...     
chrX:66436234-67090514.BeroukhimS2.Xq12.amp.AR       -0.184523
chrX:6695466-8345492.BeroukhimS5..del.               -0.164894
chrX:69545961-73353657.BeroukhimS5..amp.             -0.184523
chrX:8707411-9925292.BeroukhimS5..del.               -0.164894
Case ID                                           TCGA-A2-A0YM
Name: Segment_Mean, Length: 529, dtype: object