This notebook takes the ClinVar, GPD and Annovar annotations for each dataset and generates processed feature files for training. Each variant is binned along two independent axes: its GPD unit (PIU/LU/NCU) and its ClinVar significance category (Pathogenic/VUS/Benign), giving six bins in total. Within each bin, all variants in the same gene for a patient are aggregated using count(variants), max(Annovar score), sum(Annovar score) and mean(Annovar score).

In [1]:
import pandas as pd
import numpy as np

In [23]:
import warnings
# Silences every warning for the rest of the notebook, including pandas
# deprecation warnings raised by the cells below.
warnings.filterwarnings("ignore")

In [105]:
# Raw ClinVar CLNSIG labels that get_clinvar_supercategory maps to "Pathogenic".
# NOTE(review): drug_response / risk_factor / Likely_risk_allele labels are
# grouped here as well - confirm that this is the intended categorisation.
PATHOGENIC_ANNOTATIONS = [
    "Pathogenic",
    "Pathogenic|drug_response|other",
    "Pathogenic/Likely_pathogenic",
    "Likely_pathogenic",
    "Pathogenic/Likely_pathogenic|other",
    "drug_response",
    "Likely_pathogenic|other",
    "Pathogenic|risk_factor",
    "Pathogenic/Likely_pathogenic|drug_response",
    "Likely_risk_allele",
    "risk_factor",
]
# Raw CLNSIG labels mapped to "VUS". The "." entry is the placeholder that the
# ClinVar annotation tables below use for an empty CLNSIG field.
VUS_ANNOTATIONS = [
    ".",
    "Uncertain_significance",
    "Conflicting_interpretations_of_pathogenicity",
    "not_provided",
    "Conflicting_interpretations_of_pathogenicity|other",
    "Uncertain_significance|drug_response",
    "other",
]
# Raw CLNSIG labels mapped to "Benign".
BENIGN_ANNOTATIONS = [
    "Likely_benign",
    "Benign/Likely_benign",
    "Benign",
]

In [106]:
def get_clinvar_supercategory(x):
    """Collapse a raw ClinVar CLNSIG label into a coarse category.

    The label is looked up, in order, in PATHOGENIC_ANNOTATIONS,
    VUS_ANNOTATIONS and BENIGN_ANNOTATIONS; the first list containing it
    decides the result ("Pathogenic", "VUS" or "Benign"). Any value found
    in none of the lists yields the string "NA".
    """
    ordered_buckets = (
        ("Pathogenic", PATHOGENIC_ANNOTATIONS),
        ("VUS", VUS_ANNOTATIONS),
        ("Benign", BENIGN_ANNOTATIONS),
    )
    for category, labels in ordered_buckets:
        if x in labels:
            return category
    return "NA"

In [18]:
# Gene symbols taken from the first column of gene2ind.txt (read without a
# header row). get_matrices uses this list to fix both the set and the order
# of the gene columns in every feature matrix.
GENES_324 = list(pd.read_csv("../data/raw/gene2ind.txt", header=None)[0])

In [66]:
def get_matrices(df, merged_df, criteria="GPD_unit", criteria_value="PIU", index_name="DepMap_ID", column_name="Hugo_Symbol"):
    """Build a dense sample x gene matrix of ``xon17_score`` for one bin.

    Parameters
    ----------
    df : pandas.DataFrame
        Aggregated scores with the columns ``index_name``, ``column_name``,
        ``criteria`` and ``xon17_score``.
    merged_df : pandas.DataFrame
        Frame whose ``index_name`` column lists every sample that must get a
        row in the result, whether or not it has data in the selected bin.
    criteria : str
        Column of ``df`` that holds the bin label.
    criteria_value : str
        Bin to keep; only rows with ``df[criteria] == criteria_value`` are used.
    index_name : str
        Column holding the sample identifier (becomes the row index).
    column_name : str
        Column holding the gene symbol (becomes the column labels).

    Returns
    -------
    pandas.DataFrame
        Rows sorted by sample id, columns exactly ``GENES_324`` in that order.
        Cells with no data are 0; genes outside ``GENES_324`` are dropped.

    The shape of the pivot before padding is printed as a side effect.
    """
    selected = df[df[criteria] == criteria_value]
    matrix = pd.pivot_table(selected, index=index_name, columns=column_name, values="xon17_score")
    print(matrix.shape)

    # Pad to the full sample x gene grid in a single reindex. This replaces
    # the per-row DataFrame.append loop (removed in pandas 2.0, quadratic in
    # the number of missing samples) and the per-gene column insertion.
    every_sample = matrix.index.union(pd.Index(merged_df[index_name].unique()))
    matrix = matrix.reindex(index=every_sample, columns=GENES_324)
    matrix.index.name = index_name

    # Samples/genes without any variant in this bin contribute a score of 0.
    return matrix.fillna(0).sort_index()

### CCLE dataset

In [4]:
# Load the CCLE table with one row per (cell line, mutation), carrying the
# predictor flags, the 1plusxon17_score and the GPD_unit assignment.
ccle_annovar_gpd_annotated_df = pd.read_csv("../data/processed/ccle_21q3_annovar_gpd_annot_per_patient_per_mutation.csv",)
ccle_annovar_gpd_annotated_df

Unnamed: 0,Hugo_Symbol,DepMap_ID,Protein_Change,Variant_annotation,mutation,sift_pred,sift4g_pred,lrt_pred,mutationtaster_pred,mutationassessor_pred,...,bayesdel_addaf_pred,bayesdel_noaf_pred,clinpred_pred,list_s2_pred,fathmm_mkl_coding_pred,fathmm_xf_coding_pred,gene,HGVSp,1plusxon17_score,GPD_unit
0,TSC2,ACH-000001,p.F297fs,damaging,TSC2 F297fs,0,0,0,0,0,...,0,0,0,0,0,0,0,p.F297fs,1.000000,PIU
1,TP53,ACH-000001,0,other non-conserving,TP53,0,0,0,0,0,...,0,0,0,0,0,0,0,p.,1.000000,NCU
2,CD79B,ACH-000001,p.L21L,silent,CD79B L21L,0,0,0,0,0,...,0,0,0,0,0,0,0,p.L21L,1.000000,NCU
3,SOX9,ACH-000001,p.L3L,silent,SOX9 L3L,0,0,0,0,0,...,0,0,0,0,0,0,0,p.L3L,1.000000,NCU
4,NOTCH3,ACH-000001,p.R1981C,other non-conserving,NOTCH3 R1981C,1,1,0,1,0,...,0,1,1,1,1,1,NOTCH3,p.R1981C,1.647059,PIU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62307,BRCA1,ACH-002512,p.H888Y,other non-conserving,BRCA1 H888Y,0,0,0,0,1,...,0,0,0,0,0,0,BRCA1,p.H888Y,1.176471,LU
62308,SMARCA4,ACH-002512,p.A400A,silent,SMARCA4 A400A,1,1,1,1,1,...,0,0,1,1,1,1,SMARCA4,p.A400A,1.764706,NCU
62309,SMARCA4,ACH-002512,p.A400A,silent,SMARCA4 A400A,1,1,1,1,1,...,1,1,1,1,1,1,SMARCA4,p.A400A,2.000000,NCU
62310,SMARCA4,ACH-002512,p.A400A,silent,SMARCA4 A400A,1,1,1,1,1,...,1,1,1,1,1,1,SMARCA4,p.A400A,2.000000,NCU


In [5]:
# Derive xon17_score by removing the +1 offset baked into 1plusxon17_score.
ccle_annovar_gpd_annotated_df["xon17_score"] = ccle_annovar_gpd_annotated_df["1plusxon17_score"] - 1 # removing the 1
ccle_annovar_gpd_annotated_df

Unnamed: 0,Hugo_Symbol,DepMap_ID,Protein_Change,Variant_annotation,mutation,sift_pred,sift4g_pred,lrt_pred,mutationtaster_pred,mutationassessor_pred,...,bayesdel_noaf_pred,clinpred_pred,list_s2_pred,fathmm_mkl_coding_pred,fathmm_xf_coding_pred,gene,HGVSp,1plusxon17_score,GPD_unit,xon17_score
0,TSC2,ACH-000001,p.F297fs,damaging,TSC2 F297fs,0,0,0,0,0,...,0,0,0,0,0,0,p.F297fs,1.000000,PIU,0.000000
1,TP53,ACH-000001,0,other non-conserving,TP53,0,0,0,0,0,...,0,0,0,0,0,0,p.,1.000000,NCU,0.000000
2,CD79B,ACH-000001,p.L21L,silent,CD79B L21L,0,0,0,0,0,...,0,0,0,0,0,0,p.L21L,1.000000,NCU,0.000000
3,SOX9,ACH-000001,p.L3L,silent,SOX9 L3L,0,0,0,0,0,...,0,0,0,0,0,0,p.L3L,1.000000,NCU,0.000000
4,NOTCH3,ACH-000001,p.R1981C,other non-conserving,NOTCH3 R1981C,1,1,0,1,0,...,1,1,1,1,1,NOTCH3,p.R1981C,1.647059,PIU,0.647059
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62307,BRCA1,ACH-002512,p.H888Y,other non-conserving,BRCA1 H888Y,0,0,0,0,1,...,0,0,0,0,0,BRCA1,p.H888Y,1.176471,LU,0.176471
62308,SMARCA4,ACH-002512,p.A400A,silent,SMARCA4 A400A,1,1,1,1,1,...,0,1,1,1,1,SMARCA4,p.A400A,1.764706,NCU,0.764706
62309,SMARCA4,ACH-002512,p.A400A,silent,SMARCA4 A400A,1,1,1,1,1,...,1,1,1,1,1,SMARCA4,p.A400A,2.000000,NCU,1.000000
62310,SMARCA4,ACH-002512,p.A400A,silent,SMARCA4 A400A,1,1,1,1,1,...,1,1,1,1,1,SMARCA4,p.A400A,2.000000,NCU,1.000000


In [6]:
# Load the per-mutation ClinVar/Annovar annotations for CCLE; the first CSV
# column (the mutation string, named "input") becomes the index.
ccle_clinvar_df = pd.read_csv("../data/processed/clinvar_anno_features_per_mutation_ccle.csv", index_col=0)
ccle_clinvar_df

Unnamed: 0_level_0,SIFT_score,SIFT_converted_rankscore,SIFT_pred,SIFT4G_score,SIFT4G_converted_rankscore,SIFT4G_pred,LRT_score,LRT_converted_rankscore,LRT_pred,MutationTaster_score,...,GERP++_RS_rankscore,phastCons100way_vertebrate,phastCons100way_vertebrate_rankscore,phastCons30way_mammalian,phastCons30way_mammalian_rankscore,CLNALLELEID,CLNDN,CLNDISDB,CLNREVSTAT,CLNSIG
input,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CD79B L21L,.,.,.,.,.,.,.,.,.,.,...,.,.,.,.,.,.,.,.,.,.
SOX9 L3L,.,.,.,.,.,.,.,.,.,.,...,.,.,.,.,.,.,.,.,.,.
NOTCH3 R1981C,0.0,0.913,D,0.0,0.928,D,0.01,0.302,N,1,...,0.336,1.0,0.716,0.393,0.256,.,.,.,.,.
PPP2R1A E100K,0.001,0.785,D,0.012,0.639,D,0.0,0.629,D,1.0,...,0.536,1.0,0.716,0.995,0.577,.,.,.,.,.
PPP2R1A S120S,.,.,.,.,.,.,.,.,.,.,...,.,.,.,.,.,.,.,.,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NFKBIA L229L,.,.,.,.,.,.,.,.,.,.,...,.,.,.,.,.,.,.,.,.,.
ALK Y1604H,0.004,0.654,D,0.116,0.365,T,0.026,0.259,N,0.999,...,0.378,0.602,0.278,0.015,0.131,.,.,.,.,.
ERBB3 V104L,0.131,0.371,T,0.274,0.483,T,0.001,0.432,N,1.0,...,0.644,1.0,0.716,0.71,0.312,.,.,.,.,.
GSK3B S21L,0.061,0.389,T,0.278,0.218,T,0.0,0.629,D,1.0,...,0.576,1.0,0.716,1.0,0.863,.,.,.,.,.


In [7]:
# Frequency of each raw CLNSIG label (computed before de-duplication below).
ccle_clinvar_df.CLNSIG.value_counts()

.                                                     24598
Pathogenic                                             6807
Pathogenic|drug_response|other                         5120
Uncertain_significance                                 2959
Pathogenic/Likely_pathogenic                           1657
Conflicting_interpretations_of_pathogenicity           1634
Likely_benign                                          1214
Likely_pathogenic                                       527
Benign/Likely_benign                                    248
Benign                                                  152
not_provided                                             56
Conflicting_interpretations_of_pathogenicity|other       24
Pathogenic/Likely_pathogenic|other                        6
drug_response                                             5
Uncertain_significance|drug_response                      3
Likely_pathogenic|other                                   2
Pathogenic|risk_factor                  

In [8]:
# Remove duplicate mappings from input to ClinVar annotations: "input" is
# moved from the index back to a column, then only the first row for each
# mutation string is kept (drop_duplicates default, keep="first").
ccle_clinvar_df = ccle_clinvar_df.reset_index().drop_duplicates(subset=["input"])
ccle_clinvar_df.shape

(24894, 74)

In [9]:
# Map every raw CLNSIG label to its coarse category and show the class balance.
ccle_clinvar_df["ClinVar_annotations_categorized"] = ccle_clinvar_df["CLNSIG"].map(
    get_clinvar_supercategory
)
ccle_clinvar_df["ClinVar_annotations_categorized"].value_counts()

VUS           22878
Benign         1181
Pathogenic      835
Name: ClinVar_annotations_categorized, dtype: int64

In [10]:
# Left-join the ClinVar annotations onto the per-sample mutation table, so
# mutations without a ClinVar row are kept with NaN annotation columns.
merged_ccle = ccle_annovar_gpd_annotated_df.merge(
    ccle_clinvar_df,
    how="left",
    left_on="mutation",
    right_on="input",
)
merged_ccle

Unnamed: 0,Hugo_Symbol,DepMap_ID,Protein_Change,Variant_annotation,mutation,sift_pred,sift4g_pred,lrt_pred,mutationtaster_pred,mutationassessor_pred,...,phastCons100way_vertebrate,phastCons100way_vertebrate_rankscore,phastCons30way_mammalian,phastCons30way_mammalian_rankscore,CLNALLELEID,CLNDN,CLNDISDB,CLNREVSTAT,CLNSIG,ClinVar_annotations_categorized
0,TSC2,ACH-000001,p.F297fs,damaging,TSC2 F297fs,0,0,0,0,0,...,,,,,,,,,,
1,TP53,ACH-000001,0,other non-conserving,TP53,0,0,0,0,0,...,,,,,,,,,,
2,CD79B,ACH-000001,p.L21L,silent,CD79B L21L,0,0,0,0,0,...,.,.,.,.,.,.,.,.,.,VUS
3,SOX9,ACH-000001,p.L3L,silent,SOX9 L3L,0,0,0,0,0,...,.,.,.,.,.,.,.,.,.,VUS
4,NOTCH3,ACH-000001,p.R1981C,other non-conserving,NOTCH3 R1981C,1,1,0,1,0,...,1.0,0.716,0.393,0.256,.,.,.,.,.,VUS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62307,BRCA1,ACH-002512,p.H888Y,other non-conserving,BRCA1 H888Y,0,0,0,0,1,...,0.001,0.138,0.912,0.384,69299,Hereditary_breast_ovarian_cancer_syndrome|Brea...,"MONDO:MONDO:0003582,MeSH:D061325,MedGen:C06777...","criteria_provided,_conflicting_interpretations",Conflicting_interpretations_of_pathogenicity,VUS
62308,SMARCA4,ACH-002512,p.A400A,silent,SMARCA4 A400A,1,1,1,1,1,...,.,.,.,.,469191,Rhabdoid_tumor_predisposition_syndrome_2|Hered...,"MONDO:MONDO:0013224,MedGen:C2750074,OMIM:61332...","criteria_provided,_multiple_submitters,_no_con...",Likely_benign,Benign
62309,SMARCA4,ACH-002512,p.A400A,silent,SMARCA4 A400A,1,1,1,1,1,...,.,.,.,.,469191,Rhabdoid_tumor_predisposition_syndrome_2|Hered...,"MONDO:MONDO:0013224,MedGen:C2750074,OMIM:61332...","criteria_provided,_multiple_submitters,_no_con...",Likely_benign,Benign
62310,SMARCA4,ACH-002512,p.A400A,silent,SMARCA4 A400A,1,1,1,1,1,...,.,.,.,.,469191,Rhabdoid_tumor_predisposition_syndrome_2|Hered...,"MONDO:MONDO:0013224,MedGen:C2750074,OMIM:61332...","criteria_provided,_multiple_submitters,_no_con...",Likely_benign,Benign


In [11]:
# Sanity check: number of rows with a missing gene symbol. The aggregations
# below group by Hugo_Symbol, so the expected value is 0.
merged_ccle.Hugo_Symbol.isna().sum() # to ensure no NaNs exist

0

#### GPD based features

In [12]:
# Max feature: highest xon17_score per (cell line, gene, GPD unit).
# The score column is selected *before* aggregating. Reducing the whole merged
# frame makes pandas attempt the aggregation on every annotation column, which
# is slow and, depending on the pandas version, warns or raises on the
# non-numeric ones.
ccle_gpd_max = (
    merged_ccle.groupby(["DepMap_ID", "Hugo_Symbol", "GPD_unit"])["xon17_score"]
    .max()
    .reset_index()
)
ccle_gpd_max

  ccle_gpd_max = merged_ccle.groupby(["DepMap_ID", "Hugo_Symbol", "GPD_unit"]).aggregate("max")[["xon17_score"]].reset_index()


Unnamed: 0,DepMap_ID,Hugo_Symbol,GPD_unit,xon17_score
0,ACH-000001,CD79B,NCU,0.000000
1,ACH-000001,NOTCH1,PIU,0.941176
2,ACH-000001,NOTCH3,PIU,0.647059
3,ACH-000001,PIK3R1,NCU,0.000000
4,ACH-000001,PPP2R1A,LU,0.882353
...,...,...,...,...
30922,ACH-002512,BRCA1,LU,0.176471
30923,ACH-002512,MLL2,NCU,0.000000
30924,ACH-002512,NTRK1,NCU,0.000000
30925,ACH-002512,SMARCA4,NCU,1.000000


In [24]:
# One sample x gene matrix of max scores per GPD unit, built in LU, PIU, NCU order.
ccle_gpd_max_lu_matrix, ccle_gpd_max_piu_matrix, ccle_gpd_max_ncu_matrix = (
    get_matrices(
        ccle_gpd_max,
        merged_ccle,
        criteria="GPD_unit",
        criteria_value=gpd_unit,
        index_name="DepMap_ID",
        column_name="Hugo_Symbol",
    )
    for gpd_unit in ("LU", "PIU", "NCU")
)
# Final (padded) shapes, one per line.
print(
    ccle_gpd_max_lu_matrix.shape,
    ccle_gpd_max_piu_matrix.shape,
    ccle_gpd_max_ncu_matrix.shape,
    sep="\n",
)

(1570, 297)
(1709, 316)
(1620, 322)
(1744, 324)
(1744, 324)
(1744, 324)


In [27]:
# Sum feature: total xon17_score per (cell line, gene, GPD unit).
# Only the score column is aggregated. Summing the whole merged frame would
# also try to "sum" the string annotation columns, which is wasteful and
# version-dependent (concatenation, warning or error).
ccle_gpd_sum = (
    merged_ccle.groupby(["DepMap_ID", "Hugo_Symbol", "GPD_unit"])["xon17_score"]
    .sum()
    .reset_index()
)
ccle_gpd_sum

Unnamed: 0,DepMap_ID,Hugo_Symbol,GPD_unit,xon17_score
0,ACH-000001,CD79B,NCU,0.000000
1,ACH-000001,NOTCH1,PIU,16.000000
2,ACH-000001,NOTCH3,PIU,0.647059
3,ACH-000001,PIK3R1,NCU,0.000000
4,ACH-000001,PPP2R1A,LU,0.882353
...,...,...,...,...
30922,ACH-002512,BRCA1,LU,0.176471
30923,ACH-002512,MLL2,NCU,0.000000
30924,ACH-002512,NTRK1,NCU,0.000000
30925,ACH-002512,SMARCA4,NCU,2.764706


In [28]:
# One sample x gene matrix of summed scores per GPD unit, built in LU, PIU, NCU order.
ccle_gpd_sum_lu_matrix, ccle_gpd_sum_piu_matrix, ccle_gpd_sum_ncu_matrix = (
    get_matrices(
        ccle_gpd_sum,
        merged_ccle,
        criteria="GPD_unit",
        criteria_value=gpd_unit,
        index_name="DepMap_ID",
        column_name="Hugo_Symbol",
    )
    for gpd_unit in ("LU", "PIU", "NCU")
)
# Final (padded) shapes, one per line.
print(
    ccle_gpd_sum_lu_matrix.shape,
    ccle_gpd_sum_piu_matrix.shape,
    ccle_gpd_sum_ncu_matrix.shape,
    sep="\n",
)

(1570, 297)
(1709, 316)
(1620, 322)
(1744, 324)
(1744, 324)
(1744, 324)


In [29]:
# Mean feature: average xon17_score per (cell line, gene, GPD unit).
# Only the score column is aggregated. A whole-frame "mean" has no meaning for
# the string annotation columns and warns or raises on them depending on the
# pandas version.
ccle_gpd_mean = (
    merged_ccle.groupby(["DepMap_ID", "Hugo_Symbol", "GPD_unit"])["xon17_score"]
    .mean()
    .reset_index()
)
ccle_gpd_mean

Unnamed: 0,DepMap_ID,Hugo_Symbol,GPD_unit,xon17_score
0,ACH-000001,CD79B,NCU,0.000000
1,ACH-000001,NOTCH1,PIU,0.941176
2,ACH-000001,NOTCH3,PIU,0.647059
3,ACH-000001,PIK3R1,NCU,0.000000
4,ACH-000001,PPP2R1A,LU,0.882353
...,...,...,...,...
30922,ACH-002512,BRCA1,LU,0.176471
30923,ACH-002512,MLL2,NCU,0.000000
30924,ACH-002512,NTRK1,NCU,0.000000
30925,ACH-002512,SMARCA4,NCU,0.921569


In [30]:
# One sample x gene matrix of mean scores per GPD unit, built in LU, PIU, NCU order.
ccle_gpd_mean_lu_matrix, ccle_gpd_mean_piu_matrix, ccle_gpd_mean_ncu_matrix = (
    get_matrices(
        ccle_gpd_mean,
        merged_ccle,
        criteria="GPD_unit",
        criteria_value=gpd_unit,
        index_name="DepMap_ID",
        column_name="Hugo_Symbol",
    )
    for gpd_unit in ("LU", "PIU", "NCU")
)
# Final (padded) shapes, one per line.
print(
    ccle_gpd_mean_lu_matrix.shape,
    ccle_gpd_mean_piu_matrix.shape,
    ccle_gpd_mean_ncu_matrix.shape,
    sep="\n",
)

(1570, 297)
(1709, 316)
(1620, 322)
(1744, 324)
(1744, 324)
(1744, 324)


In [31]:
# Count feature: number of variants (non-null xon17_score values) per
# (cell line, gene, GPD unit). Only the score column is counted; counting
# every column of the merged frame just to keep one of them is wasted work.
ccle_gpd_count = (
    merged_ccle.groupby(["DepMap_ID", "Hugo_Symbol", "GPD_unit"])["xon17_score"]
    .count()
    .reset_index()
)
ccle_gpd_count

Unnamed: 0,DepMap_ID,Hugo_Symbol,GPD_unit,xon17_score
0,ACH-000001,CD79B,NCU,1
1,ACH-000001,NOTCH1,PIU,17
2,ACH-000001,NOTCH3,PIU,1
3,ACH-000001,PIK3R1,NCU,1
4,ACH-000001,PPP2R1A,LU,1
...,...,...,...,...
30922,ACH-002512,BRCA1,LU,1
30923,ACH-002512,MLL2,NCU,1
30924,ACH-002512,NTRK1,NCU,1
30925,ACH-002512,SMARCA4,NCU,3


In [32]:
# One sample x gene matrix of variant counts per GPD unit, built in LU, PIU, NCU order.
ccle_gpd_count_lu_matrix, ccle_gpd_count_piu_matrix, ccle_gpd_count_ncu_matrix = (
    get_matrices(
        ccle_gpd_count,
        merged_ccle,
        criteria="GPD_unit",
        criteria_value=gpd_unit,
        index_name="DepMap_ID",
        column_name="Hugo_Symbol",
    )
    for gpd_unit in ("LU", "PIU", "NCU")
)
# Final (padded) shapes, one per line.
print(
    ccle_gpd_count_lu_matrix.shape,
    ccle_gpd_count_piu_matrix.shape,
    ccle_gpd_count_ncu_matrix.shape,
    sep="\n",
)

(1570, 297)
(1709, 316)
(1620, 322)
(1744, 324)
(1744, 324)
(1744, 324)


#### ClinVar based features

In [33]:
# Max feature: highest xon17_score per (cell line, gene, ClinVar category).
# The score column is selected before aggregating so the remaining annotation
# columns are never reduced (slow, and version-dependent for non-numeric data).
# Rows whose ClinVar category is NaN (mutations with no ClinVar match after
# the left merge) are left out, because groupby drops NaN keys by default.
ccle_clinvar_max = (
    merged_ccle.groupby(["DepMap_ID", "Hugo_Symbol", "ClinVar_annotations_categorized"])["xon17_score"]
    .max()
    .reset_index()
)
ccle_clinvar_max

Unnamed: 0,DepMap_ID,Hugo_Symbol,ClinVar_annotations_categorized,xon17_score
0,ACH-000001,CD79B,VUS,0.000000
1,ACH-000001,NOTCH1,VUS,0.941176
2,ACH-000001,NOTCH3,VUS,0.647059
3,ACH-000001,PPP2R1A,VUS,0.882353
4,ACH-000001,SOX9,VUS,0.000000
...,...,...,...,...
24454,ACH-002511,SDHD,VUS,0.470588
24455,ACH-002512,BRCA1,VUS,0.176471
24456,ACH-002512,NTRK1,Benign,0.000000
24457,ACH-002512,SMARCA4,Benign,1.000000


In [34]:
# One sample x gene matrix of max scores per ClinVar category,
# built in Pathogenic, VUS, Benign order.
ccle_clinvar_max_pathogenic_matrix, ccle_clinvar_max_vus_matrix, ccle_clinvar_max_benign_matrix = (
    get_matrices(
        ccle_clinvar_max,
        merged_ccle,
        criteria="ClinVar_annotations_categorized",
        criteria_value=clinvar_category,
        index_name="DepMap_ID",
        column_name="Hugo_Symbol",
    )
    for clinvar_category in ("Pathogenic", "VUS", "Benign")
)
# Final (padded) shapes, one per line.
print(
    ccle_clinvar_max_pathogenic_matrix.shape,
    ccle_clinvar_max_vus_matrix.shape,
    ccle_clinvar_max_benign_matrix.shape,
    sep="\n",
)

(1147, 136)
(1732, 313)
(733, 177)
(1744, 324)
(1744, 324)
(1744, 324)


In [35]:
# Sum feature: total xon17_score per (cell line, gene, ClinVar category).
# Only the score column is aggregated; summing the whole merged frame would
# also try to "sum" the string annotation columns.
# Rows with a NaN ClinVar category are left out (groupby drops NaN keys).
ccle_clinvar_sum = (
    merged_ccle.groupby(["DepMap_ID", "Hugo_Symbol", "ClinVar_annotations_categorized"])["xon17_score"]
    .sum()
    .reset_index()
)
ccle_clinvar_sum

Unnamed: 0,DepMap_ID,Hugo_Symbol,ClinVar_annotations_categorized,xon17_score
0,ACH-000001,CD79B,VUS,0.000000
1,ACH-000001,NOTCH1,VUS,16.000000
2,ACH-000001,NOTCH3,VUS,0.647059
3,ACH-000001,PPP2R1A,VUS,0.882353
4,ACH-000001,SOX9,VUS,0.000000
...,...,...,...,...
24454,ACH-002511,SDHD,VUS,0.470588
24455,ACH-002512,BRCA1,VUS,0.176471
24456,ACH-002512,NTRK1,Benign,0.000000
24457,ACH-002512,SMARCA4,Benign,2.764706


In [36]:
# One sample x gene matrix of summed scores per ClinVar category,
# built in Pathogenic, VUS, Benign order.
ccle_clinvar_sum_pathogenic_matrix, ccle_clinvar_sum_vus_matrix, ccle_clinvar_sum_benign_matrix = (
    get_matrices(
        ccle_clinvar_sum,
        merged_ccle,
        criteria="ClinVar_annotations_categorized",
        criteria_value=clinvar_category,
        index_name="DepMap_ID",
        column_name="Hugo_Symbol",
    )
    for clinvar_category in ("Pathogenic", "VUS", "Benign")
)
# Final (padded) shapes, one per line.
print(
    ccle_clinvar_sum_pathogenic_matrix.shape,
    ccle_clinvar_sum_vus_matrix.shape,
    ccle_clinvar_sum_benign_matrix.shape,
    sep="\n",
)

(1147, 136)
(1732, 313)
(733, 177)
(1744, 324)
(1744, 324)
(1744, 324)


In [37]:
# Mean feature: average xon17_score per (cell line, gene, ClinVar category).
# Only the score column is aggregated; a whole-frame "mean" warns or raises on
# the string annotation columns depending on the pandas version.
# Rows with a NaN ClinVar category are left out (groupby drops NaN keys).
ccle_clinvar_mean = (
    merged_ccle.groupby(["DepMap_ID", "Hugo_Symbol", "ClinVar_annotations_categorized"])["xon17_score"]
    .mean()
    .reset_index()
)
ccle_clinvar_mean

Unnamed: 0,DepMap_ID,Hugo_Symbol,ClinVar_annotations_categorized,xon17_score
0,ACH-000001,CD79B,VUS,0.000000
1,ACH-000001,NOTCH1,VUS,0.941176
2,ACH-000001,NOTCH3,VUS,0.647059
3,ACH-000001,PPP2R1A,VUS,0.441176
4,ACH-000001,SOX9,VUS,0.000000
...,...,...,...,...
24454,ACH-002511,SDHD,VUS,0.470588
24455,ACH-002512,BRCA1,VUS,0.176471
24456,ACH-002512,NTRK1,Benign,0.000000
24457,ACH-002512,SMARCA4,Benign,0.921569


In [38]:
# One sample x gene matrix of mean scores per ClinVar category,
# built in Pathogenic, VUS, Benign order.
ccle_clinvar_mean_pathogenic_matrix, ccle_clinvar_mean_vus_matrix, ccle_clinvar_mean_benign_matrix = (
    get_matrices(
        ccle_clinvar_mean,
        merged_ccle,
        criteria="ClinVar_annotations_categorized",
        criteria_value=clinvar_category,
        index_name="DepMap_ID",
        column_name="Hugo_Symbol",
    )
    for clinvar_category in ("Pathogenic", "VUS", "Benign")
)
# Final (padded) shapes, one per line.
print(
    ccle_clinvar_mean_pathogenic_matrix.shape,
    ccle_clinvar_mean_vus_matrix.shape,
    ccle_clinvar_mean_benign_matrix.shape,
    sep="\n",
)

(1147, 136)
(1732, 313)
(733, 177)
(1744, 324)
(1744, 324)
(1744, 324)


In [39]:
# Count feature: number of variants (non-null xon17_score values) per
# (cell line, gene, ClinVar category). Only the score column is counted
# instead of every column of the merged frame.
# Rows with a NaN ClinVar category are left out (groupby drops NaN keys).
ccle_clinvar_count = (
    merged_ccle.groupby(["DepMap_ID", "Hugo_Symbol", "ClinVar_annotations_categorized"])["xon17_score"]
    .count()
    .reset_index()
)
ccle_clinvar_count

Unnamed: 0,DepMap_ID,Hugo_Symbol,ClinVar_annotations_categorized,xon17_score
0,ACH-000001,CD79B,VUS,1
1,ACH-000001,NOTCH1,VUS,17
2,ACH-000001,NOTCH3,VUS,1
3,ACH-000001,PPP2R1A,VUS,2
4,ACH-000001,SOX9,VUS,1
...,...,...,...,...
24454,ACH-002511,SDHD,VUS,1
24455,ACH-002512,BRCA1,VUS,1
24456,ACH-002512,NTRK1,Benign,1
24457,ACH-002512,SMARCA4,Benign,3


In [40]:
# One sample x gene matrix of variant counts per ClinVar category,
# built in Pathogenic, VUS, Benign order.
ccle_clinvar_count_pathogenic_matrix, ccle_clinvar_count_vus_matrix, ccle_clinvar_count_benign_matrix = (
    get_matrices(
        ccle_clinvar_count,
        merged_ccle,
        criteria="ClinVar_annotations_categorized",
        criteria_value=clinvar_category,
        index_name="DepMap_ID",
        column_name="Hugo_Symbol",
    )
    for clinvar_category in ("Pathogenic", "VUS", "Benign")
)
# Final (padded) shapes, one per line.
print(
    ccle_clinvar_count_pathogenic_matrix.shape,
    ccle_clinvar_count_vus_matrix.shape,
    ccle_clinvar_count_benign_matrix.shape,
    sep="\n",
)

(1147, 136)
(1732, 313)
(733, 177)
(1744, 324)
(1744, 324)
(1744, 324)


In [88]:
# Add suffixes to identify columns: every gene column becomes
# "<gene>_<bin>_<aggregate>" so the 24 matrices can be concatenated side by
# side without name clashes.
# NOTE: each name is rebound to its suffixed version, so re-running this cell
# without rebuilding the matrices appends the suffix a second time.
ccle_gpd_max_piu_matrix = ccle_gpd_max_piu_matrix.add_suffix("_piu_max")
ccle_gpd_sum_piu_matrix = ccle_gpd_sum_piu_matrix.add_suffix("_piu_sum")
ccle_gpd_mean_piu_matrix = ccle_gpd_mean_piu_matrix.add_suffix("_piu_mean")
ccle_gpd_count_piu_matrix = ccle_gpd_count_piu_matrix.add_suffix("_piu_count")
ccle_gpd_max_lu_matrix = ccle_gpd_max_lu_matrix.add_suffix("_lu_max")
ccle_gpd_sum_lu_matrix = ccle_gpd_sum_lu_matrix.add_suffix("_lu_sum")
ccle_gpd_mean_lu_matrix = ccle_gpd_mean_lu_matrix.add_suffix("_lu_mean")
# Fixed: this line previously suffixed ccle_gpd_count_ncu_matrix, so the
# "_lu_count" features were a copy of the NCU counts. Feature files written
# by the earlier version need to be regenerated.
ccle_gpd_count_lu_matrix = ccle_gpd_count_lu_matrix.add_suffix("_lu_count")
ccle_gpd_max_ncu_matrix = ccle_gpd_max_ncu_matrix.add_suffix("_ncu_max")
ccle_gpd_sum_ncu_matrix = ccle_gpd_sum_ncu_matrix.add_suffix("_ncu_sum")
ccle_gpd_mean_ncu_matrix = ccle_gpd_mean_ncu_matrix.add_suffix("_ncu_mean")
ccle_gpd_count_ncu_matrix = ccle_gpd_count_ncu_matrix.add_suffix("_ncu_count")

ccle_clinvar_max_pathogenic_matrix = ccle_clinvar_max_pathogenic_matrix.add_suffix("_pathogenic_max")
ccle_clinvar_sum_pathogenic_matrix = ccle_clinvar_sum_pathogenic_matrix.add_suffix("_pathogenic_sum")
ccle_clinvar_mean_pathogenic_matrix = ccle_clinvar_mean_pathogenic_matrix.add_suffix("_pathogenic_mean")
ccle_clinvar_count_pathogenic_matrix = ccle_clinvar_count_pathogenic_matrix.add_suffix("_pathogenic_count")
ccle_clinvar_max_vus_matrix = ccle_clinvar_max_vus_matrix.add_suffix("_vus_max")
ccle_clinvar_sum_vus_matrix = ccle_clinvar_sum_vus_matrix.add_suffix("_vus_sum")
ccle_clinvar_mean_vus_matrix = ccle_clinvar_mean_vus_matrix.add_suffix("_vus_mean")
ccle_clinvar_count_vus_matrix = ccle_clinvar_count_vus_matrix.add_suffix("_vus_count")
ccle_clinvar_max_benign_matrix = ccle_clinvar_max_benign_matrix.add_suffix("_benign_max")
ccle_clinvar_sum_benign_matrix = ccle_clinvar_sum_benign_matrix.add_suffix("_benign_sum")
ccle_clinvar_mean_benign_matrix = ccle_clinvar_mean_benign_matrix.add_suffix("_benign_mean")
ccle_clinvar_count_benign_matrix = ccle_clinvar_count_benign_matrix.add_suffix("_benign_count")

In [89]:
# Column-wise concatenation of the 24 per-bin matrices. Block order is
# PIU, LU, NCU, Pathogenic, VUS, Benign; within each bin the order is
# Max, Sum, Mean, Count. Rows are aligned on the shared sample index.
ccle_feature_matrix = pd.concat(
    objs=[
        # GPD bins
        ccle_gpd_max_piu_matrix,
        ccle_gpd_sum_piu_matrix,
        ccle_gpd_mean_piu_matrix,
        ccle_gpd_count_piu_matrix,
        ccle_gpd_max_lu_matrix,
        ccle_gpd_sum_lu_matrix,
        ccle_gpd_mean_lu_matrix,
        ccle_gpd_count_lu_matrix,
        ccle_gpd_max_ncu_matrix,
        ccle_gpd_sum_ncu_matrix,
        ccle_gpd_mean_ncu_matrix,
        ccle_gpd_count_ncu_matrix,
        # ClinVar bins
        ccle_clinvar_max_pathogenic_matrix,
        ccle_clinvar_sum_pathogenic_matrix,
        ccle_clinvar_mean_pathogenic_matrix,
        ccle_clinvar_count_pathogenic_matrix,
        ccle_clinvar_max_vus_matrix,
        ccle_clinvar_sum_vus_matrix,
        ccle_clinvar_mean_vus_matrix,
        ccle_clinvar_count_vus_matrix,
        ccle_clinvar_max_benign_matrix,
        ccle_clinvar_sum_benign_matrix,
        ccle_clinvar_mean_benign_matrix,
        ccle_clinvar_count_benign_matrix,
    ],
    axis="columns",
)
ccle_feature_matrix.shape

(1744, 7776)

In [91]:
# Persist the CCLE feature matrix; the sample-id index is written as the first column.
ccle_feature_matrix.to_csv("../data/processed/clinvar_gpd_annovar_annotated_ccle_feature_matrix.csv")

### TCGA dataset

In [56]:
# Load the TCGA table with one row per (patient, mutation), carrying the
# predictor flags, the 1plusxon17_score and the GPD_unit assignment.
tcga_annovar_gpd_annotated_df = pd.read_csv("../data/processed/tcga_annovar_gpd_annot_per_patient_per_mutation.csv",)
tcga_annovar_gpd_annotated_df

Unnamed: 0,submitter_id,point_mutation,sift_pred,sift4g_pred,lrt_pred,mutationtaster_pred,mutationassessor_pred,fathmm_pred,provean_pred,metasvm_pred,...,deogen2_pred,bayesdel_addaf_pred,bayesdel_noaf_pred,clinpred_pred,list_s2_pred,fathmm_mkl_coding_pred,fathmm_xf_coding_pred,gene,1plusxon17_score,GPD_unit
0,TCGA-2E-A9G8,FBXW7 R505G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,FBXW7,1.000000,PIU
1,TCGA-2E-A9G8,TP53 E286_E287del,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TP53,1.000000,PIU
2,TCGA-A5-A1OH,AXL S447S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,AXL,1.000000,NCU
3,TCGA-A5-A1OH,BRD4 X1340_splice,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BRD4,1.000000,NCU
4,TCGA-A5-A1OH,HRAS E31K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,HRAS,1.000000,PIU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16826,TCGA-XF-AAN7,ZNF217 L231V,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,1.0,1.0,0.0,1.0,1.0,1.0,0.0,ZNF217,1.705882,LU
16827,TCGA-XF-AAN7,ZNF217 L231V,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,1.0,1.0,0.0,1.0,1.0,1.0,0.0,ZNF217,1.705882,LU
16828,TCGA-XF-AAN7,ZNF217 L231V,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,1.0,1.0,0.0,1.0,1.0,1.0,0.0,ZNF217,1.705882,LU
16829,TCGA-XF-AAN7,ZNF217 L231V,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,1.0,1.0,0.0,1.0,1.0,1.0,0.0,ZNF217,1.705882,LU


In [57]:
# Derive xon17_score by removing the +1 offset baked into 1plusxon17_score.
tcga_annovar_gpd_annotated_df["xon17_score"] = tcga_annovar_gpd_annotated_df["1plusxon17_score"] - 1 # removing the 1
tcga_annovar_gpd_annotated_df

Unnamed: 0,submitter_id,point_mutation,sift_pred,sift4g_pred,lrt_pred,mutationtaster_pred,mutationassessor_pred,fathmm_pred,provean_pred,metasvm_pred,...,bayesdel_addaf_pred,bayesdel_noaf_pred,clinpred_pred,list_s2_pred,fathmm_mkl_coding_pred,fathmm_xf_coding_pred,gene,1plusxon17_score,GPD_unit,xon17_score
0,TCGA-2E-A9G8,FBXW7 R505G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,FBXW7,1.000000,PIU,0.000000
1,TCGA-2E-A9G8,TP53 E286_E287del,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,TP53,1.000000,PIU,0.000000
2,TCGA-A5-A1OH,AXL S447S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,AXL,1.000000,NCU,0.000000
3,TCGA-A5-A1OH,BRD4 X1340_splice,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,BRD4,1.000000,NCU,0.000000
4,TCGA-A5-A1OH,HRAS E31K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,HRAS,1.000000,PIU,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16826,TCGA-XF-AAN7,ZNF217 L231V,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,1.0,1.0,0.0,ZNF217,1.705882,LU,0.705882
16827,TCGA-XF-AAN7,ZNF217 L231V,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,1.0,1.0,0.0,ZNF217,1.705882,LU,0.705882
16828,TCGA-XF-AAN7,ZNF217 L231V,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,1.0,1.0,0.0,ZNF217,1.705882,LU,0.705882
16829,TCGA-XF-AAN7,ZNF217 L231V,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,1.0,1.0,0.0,ZNF217,1.705882,LU,0.705882


In [58]:
# Load the per-mutation ClinVar/Annovar annotations for TCGA; the first CSV
# column (the mutation string, named "input") becomes the index.
tcga_clinvar_df = pd.read_csv("../data/processed/clinvar_anno_features_per_mutation_tcga.csv", index_col=0)
tcga_clinvar_df

Unnamed: 0_level_0,SIFT_score,SIFT_converted_rankscore,SIFT_pred,SIFT4G_score,SIFT4G_converted_rankscore,SIFT4G_pred,LRT_score,LRT_converted_rankscore,LRT_pred,MutationTaster_score,...,GERP++_RS_rankscore,phastCons100way_vertebrate,phastCons100way_vertebrate_rankscore,phastCons30way_mammalian,phastCons30way_mammalian_rankscore,CLNALLELEID,CLNDN,CLNDISDB,CLNREVSTAT,CLNSIG
input,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
FBXW7 R505G,0.001,0.913,D,0.015,0.616,D,0.0,0.629,D,1,...,0.631,1.0,0.716,1.0,0.863,363302,B-cell_chronic_lymphocytic_leukemia|Transition...,"Human_Phenotype_Ontology:HP:0005550,Human_Phen...",no_assertion_criteria_provided,Likely_pathogenic
HRAS E31K,0.109,0.311,T,0.345,0.177,T,0.0,0.843,D,1,...,0.338,1.0,0.716,0.826,0.344,.,.,.,.,.
PPP2R1A S256F,0.0,0.913,D,0.0,0.928,D,0.001,0.408,D,1.0,...,0.54,1.0,0.716,0.998,0.659,.,.,.,.,.
SPOP L282V,0.001,0.785,D,0.041,0.505,D,0.0,0.843,D,1,...,0.475,1.0,0.716,1.0,0.863,.,.,.,.,.
ERBB3 F219V,0.071,0.352,T,0.116,0.372,T,0.0,0.629,D,1,...,0.883,0.999,0.427,1.0,0.863,.,.,.,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
KRAS G12D,0.01,0.565,D,0.0,0.928,D,0.0,0.843,D,1,...,0.88,1.0,0.716,1.0,0.863,27621,Vascular_Tumors_Including_Pyogenic_Granuloma|A...,".|Human_Phenotype_Ontology:HP:0001914,Human_Ph...","criteria_provided,_multiple_submitters,_no_con...",Pathogenic
NOTCH1 V2119M,0.047,0.403,D,0.062,0.453,T,0.0,0.629,N,1.0,...,0.548,1.0,0.716,0.998,0.659,.,.,.,.,.
RB1 S834*,.,.,.,.,.,.,0.0,0.629,D,1,...,0.894,1.0,0.716,1.0,0.863,420582,Retinoblastoma|Hereditary_cancer-predisposing_...,"Human_Phenotype_Ontology:HP:0009919,MONDO:MOND...","criteria_provided,_multiple_submitters,_no_con...",Pathogenic
TP53 A161T,0.006,0.913,D,0.013,0.631,D,0.0,0.843,D,1.0,...,0.569,0.999,0.427,0.99,0.524,171616,Li-Fraumeni_syndrome_1|Malignant_tumor_of_pros...,"Gene:553989,MONDO:MONDO:0007903,MedGen:C183539...","criteria_provided,_conflicting_interpretations",Conflicting_interpretations_of_pathogenicity


In [59]:
# Frequency of each raw CLNSIG label (computed before de-duplication below).
tcga_clinvar_df.CLNSIG.value_counts()

.                                               3681
Uncertain_significance                           475
Pathogenic                                       461
Pathogenic/Likely_pathogenic                     151
Conflicting_interpretations_of_pathogenicity     135
Likely_pathogenic                                130
Pathogenic|drug_response|other                    22
Likely_benign                                     21
not_provided                                      11
Benign                                             8
Benign/Likely_benign                               8
Pathogenic/Likely_pathogenic|other                 2
drug_response                                      2
Pathogenic/Likely_pathogenic|drug_response         1
other                                              1
Name: CLNSIG, dtype: int64

In [60]:
# Remove duplicate mappings from input to ClinVar annotations: "input" is
# moved from the index back to a column, then only the first row for each
# mutation string is kept (drop_duplicates default, keep="first").
tcga_clinvar_df = tcga_clinvar_df.reset_index().drop_duplicates(subset=["input"])
tcga_clinvar_df.shape

(3855, 74)

In [61]:
# Map every raw CLNSIG label to its coarse category and show the class balance.
tcga_clinvar_df["ClinVar_annotations_categorized"] = tcga_clinvar_df["CLNSIG"].map(
    get_clinvar_supercategory
)
tcga_clinvar_df["ClinVar_annotations_categorized"].value_counts()

VUS           3445
Pathogenic     385
Benign          25
Name: ClinVar_annotations_categorized, dtype: int64

In [62]:
# Left-join the ClinVar annotations onto the per-patient mutation table, so
# mutations without a ClinVar row are kept with NaN annotation columns.
merged_tcga = tcga_annovar_gpd_annotated_df.merge(
    tcga_clinvar_df,
    how="left",
    left_on="point_mutation",
    right_on="input",
)
merged_tcga

Unnamed: 0,submitter_id,point_mutation,sift_pred,sift4g_pred,lrt_pred,mutationtaster_pred,mutationassessor_pred,fathmm_pred,provean_pred,metasvm_pred,...,phastCons100way_vertebrate,phastCons100way_vertebrate_rankscore,phastCons30way_mammalian,phastCons30way_mammalian_rankscore,CLNALLELEID,CLNDN,CLNDISDB,CLNREVSTAT,CLNSIG,ClinVar_annotations_categorized
0,TCGA-2E-A9G8,FBXW7 R505G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.716,1.0,0.863,363302,B-cell_chronic_lymphocytic_leukemia|Transition...,"Human_Phenotype_Ontology:HP:0005550,Human_Phen...",no_assertion_criteria_provided,Likely_pathogenic,Pathogenic
1,TCGA-2E-A9G8,TP53 E286_E287del,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,TCGA-A5-A1OH,AXL S447S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,TCGA-A5-A1OH,BRD4 X1340_splice,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
4,TCGA-A5-A1OH,HRAS E31K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.716,0.826,0.344,.,.,.,.,.,VUS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16826,TCGA-XF-AAN7,ZNF217 L231V,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,0.995,0.388,0.938,0.406,.,.,.,.,.,VUS
16827,TCGA-XF-AAN7,ZNF217 L231V,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,0.995,0.388,0.938,0.406,.,.,.,.,.,VUS
16828,TCGA-XF-AAN7,ZNF217 L231V,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,0.995,0.388,0.938,0.406,.,.,.,.,.,VUS
16829,TCGA-XF-AAN7,ZNF217 L231V,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,0.995,0.388,0.938,0.406,.,.,.,.,.,VUS


In [63]:
# Sanity check: number of rows with a missing gene (expected to be 0).
merged_tcga["gene"].isnull().sum()

0

#### GPD-based features

In [64]:
# Max feature: largest xon17_score within each (submitter_id, gene, GPD_unit) group.
# The score column is selected *before* reducing so only that column is
# aggregated, instead of reducing every annotation column in merged_tcga and
# discarding all but one afterwards (wasteful, and non-numeric columns can make
# the reduction fail on newer pandas).
tcga_gpd_max = (
    merged_tcga
    .groupby(["submitter_id", "gene", "GPD_unit"])[["xon17_score"]]
    .max()
    .reset_index()
)
tcga_gpd_max

Unnamed: 0,submitter_id,gene,GPD_unit,xon17_score
0,TCGA-05-4384,CD22,PIU,0.0
1,TCGA-05-4384,DOT1L,LU,0.0
2,TCGA-05-4384,ERBB2,NCU,0.0
3,TCGA-05-4384,KDR,PIU,0.0
4,TCGA-05-4384,KEAP1,PIU,0.0
...,...,...,...,...
5627,TCGA-Z7-A8R5,TBX3,NCU,0.0
5628,TCGA-Z7-A8R6,ALK,PIU,0.0
5629,TCGA-Z7-A8R6,STAG2,LU,0.0
5630,TCGA-Z7-A8R6,TBX3,PIU,0.0


In [67]:
# Build one patient x gene matrix per GPD unit from the max-aggregated scores.
# The units are processed in the order LU, PIU, NCU; get_matrices prints the
# raw pivot shape for each call, and the shapes of the returned matrices are
# printed afterwards.
tcga_gpd_max_lu_matrix, tcga_gpd_max_piu_matrix, tcga_gpd_max_ncu_matrix = (
    get_matrices(
        tcga_gpd_max,
        merged_tcga,
        criteria="GPD_unit",
        criteria_value=gpd_unit,
        index_name="submitter_id",
        column_name="gene",
    )
    for gpd_unit in ("LU", "PIU", "NCU")
)
print(
    tcga_gpd_max_lu_matrix.shape,
    tcga_gpd_max_piu_matrix.shape,
    tcga_gpd_max_ncu_matrix.shape,
    sep="\n",
)

(396, 240)
(551, 294)
(389, 287)
(596, 324)
(596, 324)
(596, 324)


In [68]:
# Sum feature: total xon17_score within each (submitter_id, gene, GPD_unit) group.
# The score column is selected *before* reducing: summing every column of
# merged_tcga and then keeping one is wasted work, and on pandas >= 2.0
# non-numeric columns are no longer silently dropped, so the reduction can raise.
tcga_gpd_sum = (
    merged_tcga
    .groupby(["submitter_id", "gene", "GPD_unit"])[["xon17_score"]]
    .sum()
    .reset_index()
)
tcga_gpd_sum

Unnamed: 0,submitter_id,gene,GPD_unit,xon17_score
0,TCGA-05-4384,CD22,PIU,0.0
1,TCGA-05-4384,DOT1L,LU,0.0
2,TCGA-05-4384,ERBB2,NCU,0.0
3,TCGA-05-4384,KDR,PIU,0.0
4,TCGA-05-4384,KEAP1,PIU,0.0
...,...,...,...,...
5627,TCGA-Z7-A8R5,TBX3,NCU,0.0
5628,TCGA-Z7-A8R6,ALK,PIU,0.0
5629,TCGA-Z7-A8R6,STAG2,LU,0.0
5630,TCGA-Z7-A8R6,TBX3,PIU,0.0


In [69]:
# Build one patient x gene matrix per GPD unit from the sum-aggregated scores.
# The units are processed in the order LU, PIU, NCU; get_matrices prints the
# raw pivot shape for each call, and the shapes of the returned matrices are
# printed afterwards.
tcga_gpd_sum_lu_matrix, tcga_gpd_sum_piu_matrix, tcga_gpd_sum_ncu_matrix = (
    get_matrices(
        tcga_gpd_sum,
        merged_tcga,
        criteria="GPD_unit",
        criteria_value=gpd_unit,
        index_name="submitter_id",
        column_name="gene",
    )
    for gpd_unit in ("LU", "PIU", "NCU")
)
print(
    tcga_gpd_sum_lu_matrix.shape,
    tcga_gpd_sum_piu_matrix.shape,
    tcga_gpd_sum_ncu_matrix.shape,
    sep="\n",
)

(396, 240)
(551, 294)
(389, 287)
(596, 324)
(596, 324)
(596, 324)


In [70]:
# Mean feature: average xon17_score within each (submitter_id, gene, GPD_unit) group.
# The score column is selected *before* reducing: averaging every column of
# merged_tcga and then keeping one is wasted work, and on pandas >= 2.0 the
# mean of the non-numeric annotation columns raises a TypeError instead of
# those columns being silently dropped.
tcga_gpd_mean = (
    merged_tcga
    .groupby(["submitter_id", "gene", "GPD_unit"])[["xon17_score"]]
    .mean()
    .reset_index()
)
tcga_gpd_mean

Unnamed: 0,submitter_id,gene,GPD_unit,xon17_score
0,TCGA-05-4384,CD22,PIU,0.0
1,TCGA-05-4384,DOT1L,LU,0.0
2,TCGA-05-4384,ERBB2,NCU,0.0
3,TCGA-05-4384,KDR,PIU,0.0
4,TCGA-05-4384,KEAP1,PIU,0.0
...,...,...,...,...
5627,TCGA-Z7-A8R5,TBX3,NCU,0.0
5628,TCGA-Z7-A8R6,ALK,PIU,0.0
5629,TCGA-Z7-A8R6,STAG2,LU,0.0
5630,TCGA-Z7-A8R6,TBX3,PIU,0.0


In [71]:
# Build one patient x gene matrix per GPD unit from the mean-aggregated scores.
# The units are processed in the order LU, PIU, NCU; get_matrices prints the
# raw pivot shape for each call, and the shapes of the returned matrices are
# printed afterwards.
tcga_gpd_mean_lu_matrix, tcga_gpd_mean_piu_matrix, tcga_gpd_mean_ncu_matrix = (
    get_matrices(
        tcga_gpd_mean,
        merged_tcga,
        criteria="GPD_unit",
        criteria_value=gpd_unit,
        index_name="submitter_id",
        column_name="gene",
    )
    for gpd_unit in ("LU", "PIU", "NCU")
)
print(
    tcga_gpd_mean_lu_matrix.shape,
    tcga_gpd_mean_piu_matrix.shape,
    tcga_gpd_mean_ncu_matrix.shape,
    sep="\n",
)

(396, 240)
(551, 294)
(389, 287)
(596, 324)
(596, 324)
(596, 324)


In [73]:
# Count feature: number of non-null xon17_score values within each
# (submitter_id, gene, GPD_unit) group.
# The score column is selected *before* counting so only that column is
# processed, rather than counting every column of merged_tcga and discarding
# all but one.
tcga_gpd_count = (
    merged_tcga
    .groupby(["submitter_id", "gene", "GPD_unit"])[["xon17_score"]]
    .count()
    .reset_index()
)
tcga_gpd_count

Unnamed: 0,submitter_id,gene,GPD_unit,xon17_score
0,TCGA-05-4384,CD22,PIU,1
1,TCGA-05-4384,DOT1L,LU,1
2,TCGA-05-4384,ERBB2,NCU,1
3,TCGA-05-4384,KDR,PIU,1
4,TCGA-05-4384,KEAP1,PIU,1
...,...,...,...,...
5627,TCGA-Z7-A8R5,TBX3,NCU,1
5628,TCGA-Z7-A8R6,ALK,PIU,1
5629,TCGA-Z7-A8R6,STAG2,LU,1
5630,TCGA-Z7-A8R6,TBX3,PIU,1


In [74]:
# Build one patient x gene matrix per GPD unit from the per-group counts.
# The units are processed in the order LU, PIU, NCU; get_matrices prints the
# raw pivot shape for each call, and the shapes of the returned matrices are
# printed afterwards.
tcga_gpd_count_lu_matrix, tcga_gpd_count_piu_matrix, tcga_gpd_count_ncu_matrix = (
    get_matrices(
        tcga_gpd_count,
        merged_tcga,
        criteria="GPD_unit",
        criteria_value=gpd_unit,
        index_name="submitter_id",
        column_name="gene",
    )
    for gpd_unit in ("LU", "PIU", "NCU")
)
print(
    tcga_gpd_count_lu_matrix.shape,
    tcga_gpd_count_piu_matrix.shape,
    tcga_gpd_count_ncu_matrix.shape,
    sep="\n",
)

(396, 240)
(551, 294)
(389, 287)
(596, 324)
(596, 324)
(596, 324)


#### ClinVar-based features

In [75]:
# Max feature: largest xon17_score within each
# (submitter_id, gene, ClinVar_annotations_categorized) group.
# Rows whose ClinVar category is NaN (no ClinVar match in the left merge) do
# not form a group, because groupby drops NaN keys by default.
# The score column is selected *before* reducing so only that column is
# aggregated, instead of reducing every annotation column in merged_tcga and
# discarding all but one afterwards (wasteful, and non-numeric columns can make
# the reduction fail on newer pandas).
tcga_clinvar_max = (
    merged_tcga
    .groupby(["submitter_id", "gene", "ClinVar_annotations_categorized"])[["xon17_score"]]
    .max()
    .reset_index()
)
tcga_clinvar_max

Unnamed: 0,submitter_id,gene,ClinVar_annotations_categorized,xon17_score
0,TCGA-05-4384,CD22,VUS,0.0
1,TCGA-05-4384,KDR,VUS,0.0
2,TCGA-05-4384,KEAP1,VUS,0.0
3,TCGA-05-4384,PPARG,VUS,0.0
4,TCGA-05-4384,SMO,VUS,0.0
...,...,...,...,...
3782,TCGA-XX-A899,PIK3CA,Pathogenic,0.0
3783,TCGA-Z7-A8R5,PIK3CA,Pathogenic,0.0
3784,TCGA-Z7-A8R6,ALK,Benign,0.0
3785,TCGA-Z7-A8R6,STAG2,VUS,0.0


In [76]:
# Build one patient x gene matrix per ClinVar supercategory from the
# max-aggregated scores. Categories are processed in the order Pathogenic,
# VUS, Benign; get_matrices prints the raw pivot shape for each call, and the
# shapes of the returned matrices are printed afterwards.
(
    tcga_clinvar_max_pathogenic_matrix,
    tcga_clinvar_max_vus_matrix,
    tcga_clinvar_max_benign_matrix,
) = (
    get_matrices(
        tcga_clinvar_max,
        merged_tcga,
        criteria="ClinVar_annotations_categorized",
        criteria_value=clinvar_category,
        index_name="submitter_id",
        column_name="gene",
    )
    for clinvar_category in ("Pathogenic", "VUS", "Benign")
)
print(
    tcga_clinvar_max_pathogenic_matrix.shape,
    tcga_clinvar_max_vus_matrix.shape,
    tcga_clinvar_max_benign_matrix.shape,
    sep="\n",
)

(398, 85)
(512, 305)
(22, 18)
(596, 324)
(596, 324)
(596, 324)


In [77]:
# Sum feature: total xon17_score within each
# (submitter_id, gene, ClinVar_annotations_categorized) group.
# Rows whose ClinVar category is NaN (no ClinVar match in the left merge) do
# not form a group, because groupby drops NaN keys by default.
# The score column is selected *before* reducing: summing every column of
# merged_tcga and then keeping one is wasted work, and on pandas >= 2.0
# non-numeric columns are no longer silently dropped, so the reduction can raise.
tcga_clinvar_sum = (
    merged_tcga
    .groupby(["submitter_id", "gene", "ClinVar_annotations_categorized"])[["xon17_score"]]
    .sum()
    .reset_index()
)
tcga_clinvar_sum

Unnamed: 0,submitter_id,gene,ClinVar_annotations_categorized,xon17_score
0,TCGA-05-4384,CD22,VUS,0.0
1,TCGA-05-4384,KDR,VUS,0.0
2,TCGA-05-4384,KEAP1,VUS,0.0
3,TCGA-05-4384,PPARG,VUS,0.0
4,TCGA-05-4384,SMO,VUS,0.0
...,...,...,...,...
3782,TCGA-XX-A899,PIK3CA,Pathogenic,0.0
3783,TCGA-Z7-A8R5,PIK3CA,Pathogenic,0.0
3784,TCGA-Z7-A8R6,ALK,Benign,0.0
3785,TCGA-Z7-A8R6,STAG2,VUS,0.0


In [78]:
# Build one patient x gene matrix per ClinVar supercategory from the
# sum-aggregated scores. Categories are processed in the order Pathogenic,
# VUS, Benign; get_matrices prints the raw pivot shape for each call, and the
# shapes of the returned matrices are printed afterwards.
(
    tcga_clinvar_sum_pathogenic_matrix,
    tcga_clinvar_sum_vus_matrix,
    tcga_clinvar_sum_benign_matrix,
) = (
    get_matrices(
        tcga_clinvar_sum,
        merged_tcga,
        criteria="ClinVar_annotations_categorized",
        criteria_value=clinvar_category,
        index_name="submitter_id",
        column_name="gene",
    )
    for clinvar_category in ("Pathogenic", "VUS", "Benign")
)
print(
    tcga_clinvar_sum_pathogenic_matrix.shape,
    tcga_clinvar_sum_vus_matrix.shape,
    tcga_clinvar_sum_benign_matrix.shape,
    sep="\n",
)

(398, 85)
(512, 305)
(22, 18)
(596, 324)
(596, 324)
(596, 324)


In [79]:
# Mean feature: average xon17_score within each
# (submitter_id, gene, ClinVar_annotations_categorized) group.
# Rows whose ClinVar category is NaN (no ClinVar match in the left merge) do
# not form a group, because groupby drops NaN keys by default.
# The score column is selected *before* reducing: averaging every column of
# merged_tcga and then keeping one is wasted work, and on pandas >= 2.0 the
# mean of the non-numeric annotation columns raises a TypeError instead of
# those columns being silently dropped.
tcga_clinvar_mean = (
    merged_tcga
    .groupby(["submitter_id", "gene", "ClinVar_annotations_categorized"])[["xon17_score"]]
    .mean()
    .reset_index()
)
tcga_clinvar_mean

Unnamed: 0,submitter_id,gene,ClinVar_annotations_categorized,xon17_score
0,TCGA-05-4384,CD22,VUS,0.0
1,TCGA-05-4384,KDR,VUS,0.0
2,TCGA-05-4384,KEAP1,VUS,0.0
3,TCGA-05-4384,PPARG,VUS,0.0
4,TCGA-05-4384,SMO,VUS,0.0
...,...,...,...,...
3782,TCGA-XX-A899,PIK3CA,Pathogenic,0.0
3783,TCGA-Z7-A8R5,PIK3CA,Pathogenic,0.0
3784,TCGA-Z7-A8R6,ALK,Benign,0.0
3785,TCGA-Z7-A8R6,STAG2,VUS,0.0


In [80]:
# Build one patient x gene matrix per ClinVar supercategory from the
# mean-aggregated scores. Categories are processed in the order Pathogenic,
# VUS, Benign; get_matrices prints the raw pivot shape for each call, and the
# shapes of the returned matrices are printed afterwards.
(
    tcga_clinvar_mean_pathogenic_matrix,
    tcga_clinvar_mean_vus_matrix,
    tcga_clinvar_mean_benign_matrix,
) = (
    get_matrices(
        tcga_clinvar_mean,
        merged_tcga,
        criteria="ClinVar_annotations_categorized",
        criteria_value=clinvar_category,
        index_name="submitter_id",
        column_name="gene",
    )
    for clinvar_category in ("Pathogenic", "VUS", "Benign")
)
print(
    tcga_clinvar_mean_pathogenic_matrix.shape,
    tcga_clinvar_mean_vus_matrix.shape,
    tcga_clinvar_mean_benign_matrix.shape,
    sep="\n",
)

(398, 85)
(512, 305)
(22, 18)
(596, 324)
(596, 324)
(596, 324)


In [81]:
# Count feature: number of non-null xon17_score values within each
# (submitter_id, gene, ClinVar_annotations_categorized) group.
# Rows whose ClinVar category is NaN (no ClinVar match in the left merge) do
# not form a group, because groupby drops NaN keys by default.
# The score column is selected *before* counting so only that column is
# processed, rather than counting every column of merged_tcga and discarding
# all but one.
tcga_clinvar_count = (
    merged_tcga
    .groupby(["submitter_id", "gene", "ClinVar_annotations_categorized"])[["xon17_score"]]
    .count()
    .reset_index()
)
tcga_clinvar_count

Unnamed: 0,submitter_id,gene,ClinVar_annotations_categorized,xon17_score
0,TCGA-05-4384,CD22,VUS,1
1,TCGA-05-4384,KDR,VUS,1
2,TCGA-05-4384,KEAP1,VUS,1
3,TCGA-05-4384,PPARG,VUS,1
4,TCGA-05-4384,SMO,VUS,1
...,...,...,...,...
3782,TCGA-XX-A899,PIK3CA,Pathogenic,1
3783,TCGA-Z7-A8R5,PIK3CA,Pathogenic,1
3784,TCGA-Z7-A8R6,ALK,Benign,1
3785,TCGA-Z7-A8R6,STAG2,VUS,1


In [82]:
# Build one patient x gene matrix per ClinVar supercategory from the per-group
# counts. Categories are processed in the order Pathogenic, VUS, Benign;
# get_matrices prints the raw pivot shape for each call, and the shapes of the
# returned matrices are printed afterwards.
(
    tcga_clinvar_count_pathogenic_matrix,
    tcga_clinvar_count_vus_matrix,
    tcga_clinvar_count_benign_matrix,
) = (
    get_matrices(
        tcga_clinvar_count,
        merged_tcga,
        criteria="ClinVar_annotations_categorized",
        criteria_value=clinvar_category,
        index_name="submitter_id",
        column_name="gene",
    )
    for clinvar_category in ("Pathogenic", "VUS", "Benign")
)
print(
    tcga_clinvar_count_pathogenic_matrix.shape,
    tcga_clinvar_count_vus_matrix.shape,
    tcga_clinvar_count_benign_matrix.shape,
    sep="\n",
)

(398, 85)
(512, 305)
(22, 18)
(596, 324)
(596, 324)
(596, 324)


In [92]:
# Add suffixes to identify columns: every gene column becomes
# "<gene>_<bucket>_<aggregation>" so the 24 matrices can be concatenated
# side by side without column-name collisions.
# NOTE: each matrix is rebound to its suffixed version, so re-running this cell
# without re-running the cells above would append the suffix a second time.
tcga_gpd_max_piu_matrix = tcga_gpd_max_piu_matrix.add_suffix("_piu_max")
tcga_gpd_sum_piu_matrix = tcga_gpd_sum_piu_matrix.add_suffix("_piu_sum")
tcga_gpd_mean_piu_matrix = tcga_gpd_mean_piu_matrix.add_suffix("_piu_mean")
tcga_gpd_count_piu_matrix = tcga_gpd_count_piu_matrix.add_suffix("_piu_count")
tcga_gpd_max_lu_matrix = tcga_gpd_max_lu_matrix.add_suffix("_lu_max")
tcga_gpd_sum_lu_matrix = tcga_gpd_sum_lu_matrix.add_suffix("_lu_sum")
tcga_gpd_mean_lu_matrix = tcga_gpd_mean_lu_matrix.add_suffix("_lu_mean")
# Fixed: this previously suffixed tcga_gpd_count_ncu_matrix, so the "_lu_count"
# features silently contained the NCU counts instead of the LU counts.
tcga_gpd_count_lu_matrix = tcga_gpd_count_lu_matrix.add_suffix("_lu_count")
tcga_gpd_max_ncu_matrix = tcga_gpd_max_ncu_matrix.add_suffix("_ncu_max")
tcga_gpd_sum_ncu_matrix = tcga_gpd_sum_ncu_matrix.add_suffix("_ncu_sum")
tcga_gpd_mean_ncu_matrix = tcga_gpd_mean_ncu_matrix.add_suffix("_ncu_mean")
tcga_gpd_count_ncu_matrix = tcga_gpd_count_ncu_matrix.add_suffix("_ncu_count")

tcga_clinvar_max_pathogenic_matrix = tcga_clinvar_max_pathogenic_matrix.add_suffix("_pathogenic_max")
tcga_clinvar_sum_pathogenic_matrix = tcga_clinvar_sum_pathogenic_matrix.add_suffix("_pathogenic_sum")
tcga_clinvar_mean_pathogenic_matrix = tcga_clinvar_mean_pathogenic_matrix.add_suffix("_pathogenic_mean")
tcga_clinvar_count_pathogenic_matrix = tcga_clinvar_count_pathogenic_matrix.add_suffix("_pathogenic_count")
tcga_clinvar_max_vus_matrix = tcga_clinvar_max_vus_matrix.add_suffix("_vus_max")
tcga_clinvar_sum_vus_matrix = tcga_clinvar_sum_vus_matrix.add_suffix("_vus_sum")
tcga_clinvar_mean_vus_matrix = tcga_clinvar_mean_vus_matrix.add_suffix("_vus_mean")
tcga_clinvar_count_vus_matrix = tcga_clinvar_count_vus_matrix.add_suffix("_vus_count")
tcga_clinvar_max_benign_matrix = tcga_clinvar_max_benign_matrix.add_suffix("_benign_max")
tcga_clinvar_sum_benign_matrix = tcga_clinvar_sum_benign_matrix.add_suffix("_benign_sum")
tcga_clinvar_mean_benign_matrix = tcga_clinvar_mean_benign_matrix.add_suffix("_benign_mean")
tcga_clinvar_count_benign_matrix = tcga_clinvar_count_benign_matrix.add_suffix("_benign_count")

In [93]:
# Column-wise concatenation of all 24 per-bucket matrices. Block order:
# PIU, LU, NCU, Pathogenic, VUS, Benign; within each block: Max, Sum, Mean, Count.
tcga_feature_matrix = pd.concat(
    [
        # GPD unit: PIU
        tcga_gpd_max_piu_matrix,
        tcga_gpd_sum_piu_matrix,
        tcga_gpd_mean_piu_matrix,
        tcga_gpd_count_piu_matrix,
        # GPD unit: LU
        tcga_gpd_max_lu_matrix,
        tcga_gpd_sum_lu_matrix,
        tcga_gpd_mean_lu_matrix,
        tcga_gpd_count_lu_matrix,
        # GPD unit: NCU
        tcga_gpd_max_ncu_matrix,
        tcga_gpd_sum_ncu_matrix,
        tcga_gpd_mean_ncu_matrix,
        tcga_gpd_count_ncu_matrix,
        # ClinVar: Pathogenic
        tcga_clinvar_max_pathogenic_matrix,
        tcga_clinvar_sum_pathogenic_matrix,
        tcga_clinvar_mean_pathogenic_matrix,
        tcga_clinvar_count_pathogenic_matrix,
        # ClinVar: VUS
        tcga_clinvar_max_vus_matrix,
        tcga_clinvar_sum_vus_matrix,
        tcga_clinvar_mean_vus_matrix,
        tcga_clinvar_count_vus_matrix,
        # ClinVar: Benign
        tcga_clinvar_max_benign_matrix,
        tcga_clinvar_sum_benign_matrix,
        tcga_clinvar_mean_benign_matrix,
        tcga_clinvar_count_benign_matrix,
    ],
    axis="columns",
)
tcga_feature_matrix.shape

(596, 7776)

In [94]:
# Persist the combined feature matrix (index included) for the training step.
tcga_feature_matrix.to_csv(
    path_or_buf="../data/processed/clinvar_gpd_annovar_annotated_tcga_feature_matrix.csv"
)

In [110]:
# Display the final feature matrix for a visual check.
tcga_feature_matrix

Unnamed: 0_level_0,ABL1_piu_max,ACVR1B_piu_max,AKT1_piu_max,AKT2_piu_max,AKT3_piu_max,ALK_piu_max,ALOX12B_piu_max,APC_piu_max,AR_piu_max,ARAF_piu_max,...,U2AF1_benign_count,VEGFA_benign_count,VHL_benign_count,WHSC1_benign_count,WHSC1L1_benign_count,WT1_benign_count,XPO1_benign_count,XRCC2_benign_count,ZNF217_benign_count,ZNF703_benign_count
submitter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-05-4384,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-05-4390,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-05-4398,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-05-4402,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-05-4427,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-XF-AAN5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-XF-AAN7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-XX-A899,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-Z7-A8R5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
