This notebook takes the ClinVar, GPD and Annovar annotations for each dataset (CCLE and TCGA) and generates the processed feature matrices used for training. Each variant is binned twice: once by GPD unit (PIU/LU/NCU) and once by ClinVar category (Pathogenic/VUS/Benign), giving six bins in total. Within each bin, all variants in the same gene for a patient are aggregated using count(variants), max(Annovar score), sum(Annovar score) and mean(Annovar score).

In [None]:
import pandas as pd
import numpy as np

In [None]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas deprecation/future warnings raised by
# the cells below - consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [None]:
# Raw ClinVar CLNSIG strings that get_clinvar_supercategory maps to "Pathogenic".
# NOTE(review): "drug_response", "risk_factor" and "Likely_risk_allele" are
# grouped with the pathogenic labels here - confirm this is intended.
PATHOGENIC_ANNOTATIONS = [
    "Pathogenic",
    "Pathogenic|drug_response|other",
    "Pathogenic/Likely_pathogenic",
    "Likely_pathogenic",
    "Pathogenic/Likely_pathogenic|other",
    "drug_response",
    "Likely_pathogenic|other",
    "Pathogenic|risk_factor",
    "Pathogenic/Likely_pathogenic|drug_response",
    "Likely_risk_allele",
    "risk_factor",
]
# Raw CLNSIG strings mapped to "VUS"; includes the "." placeholder value.
VUS_ANNOTATIONS = [
    ".",
    "Uncertain_significance",
    "Conflicting_interpretations_of_pathogenicity",
    "not_provided",
    "Conflicting_interpretations_of_pathogenicity|other",
    "Uncertain_significance|drug_response",
    "other",
]
# Raw CLNSIG strings mapped to "Benign".
# Any CLNSIG value that appears in none of the three lists is labelled "NA".
BENIGN_ANNOTATIONS = [
    "Likely_benign",
    "Benign/Likely_benign",
    "Benign",
]

In [None]:
def get_clinvar_supercategory(x):
    """Map a raw ClinVar CLNSIG value to its supercategory label.

    Returns "Pathogenic", "VUS" or "Benign" when ``x`` is listed in the
    corresponding module-level annotation list (checked in that order),
    and the string "NA" for any other value.
    """
    lookup_order = (
        ("Pathogenic", PATHOGENIC_ANNOTATIONS),
        ("VUS", VUS_ANNOTATIONS),
        ("Benign", BENIGN_ANNOTATIONS),
    )
    for label, members in lookup_order:
        if x in members:
            return label
    return "NA"

In [None]:
# Gene list taken from the first column of the header-less gene2ind.txt file;
# get_matrices uses it to fix the set and order of output columns.
GENES_324 = list(
    pd.read_csv("../data/raw/gene2ind.txt", header=None).iloc[:, 0]
)

In [None]:
def get_matrices(df, merged_df, criteria="GPD_unit", criteria_value="PIU", index_name = "DepMap_ID", column_name = "Hugo_Symbol", genes=None):
    """Build a dense sample x gene matrix of ``xon17_score`` for one bin.

    Parameters
    ----------
    df : DataFrame holding ``index_name``, ``column_name``, ``criteria`` and
        ``xon17_score`` columns (the callers pass per-group aggregates).
    merged_df : DataFrame whose ``index_name`` column lists every sample that
        must appear as a row in the result.
    criteria, criteria_value : only rows where ``df[criteria] == criteria_value``
        contribute to the matrix.
    index_name, column_name : columns of ``df`` used as row / column labels.
    genes : optional ordered iterable of column labels for the output;
        defaults to the module-level ``GENES_324``.

    Returns
    -------
    DataFrame indexed by ``index_name`` (sorted) with exactly ``genes`` as
    columns; every missing sample/gene combination is filled with 0.
    The shape of the intermediate pivot is printed as a side effect.
    """
    if genes is None:
        genes = GENES_324
    genes = list(genes)

    df_reduced = df[df[criteria] == criteria_value]
    # pivot_table averages duplicate (index, column) pairs by default.
    df_reduced_matrix = pd.pivot_table(df_reduced, index=index_name, columns=column_name, values="xon17_score")
    print(df_reduced_matrix.shape)

    # Rows: every sample seen in the pivot plus every sample in merged_df.
    all_patients = pd.Index(merged_df[index_name].unique(), name=index_name)
    row_labels = df_reduced_matrix.index.union(all_patients)

    # A single reindex adds the missing samples and genes and drops genes not
    # requested; it replaces the row-by-row DataFrame.append loop
    # (DataFrame.append was removed in pandas 2.0) and the per-gene column
    # insertion loop.
    df_reduced_matrix = (
        df_reduced_matrix
        .reindex(index=row_labels, columns=genes)
        .fillna(0)
        .sort_index()
    )
    df_reduced_matrix.index.name = index_name

    return df_reduced_matrix

### CCLE dataset

In [None]:
# Load the CCLE per-patient, per-mutation table with Annovar and GPD annotations.
ccle_annovar_gpd_annotated_df = pd.read_csv(
    filepath_or_buffer="../data/processed/ccle_21q3_annovar_gpd_annot_per_patient_per_mutation.csv"
)
ccle_annovar_gpd_annotated_df

In [None]:
# Remove the +1 offset carried by the stored "1plusxon17_score" column.
ccle_annovar_gpd_annotated_df["xon17_score"] = ccle_annovar_gpd_annotated_df["1plusxon17_score"].sub(1)
ccle_annovar_gpd_annotated_df

In [None]:
# Load the per-mutation ClinVar annotations for CCLE; the first CSV column becomes the index.
ccle_clinvar_df = pd.read_csv(
    filepath_or_buffer="../data/processed/clinvar_anno_features_per_mutation_ccle.csv",
    index_col=0,
)
ccle_clinvar_df

In [None]:
# Frequency of each raw CLNSIG value before categorisation.
ccle_clinvar_df["CLNSIG"].value_counts()

In [None]:
# Keep only the first ClinVar record for each "input" mutation so the later
# merge cannot multiply rows.
ccle_clinvar_df = ccle_clinvar_df.reset_index()
ccle_clinvar_df = ccle_clinvar_df.drop_duplicates(subset="input", keep="first")
ccle_clinvar_df.shape

In [None]:
# Collapse each raw CLNSIG value into Pathogenic / VUS / Benign / NA.
ccle_clinvar_df["ClinVar_annotations_categorized"] = ccle_clinvar_df["CLNSIG"].map(get_clinvar_supercategory)
ccle_clinvar_df["ClinVar_annotations_categorized"].value_counts()

In [None]:
# Left join: every Annovar/GPD row is kept; ClinVar columns are NaN where no
# "input" matches the row's "mutation".
merged_ccle = ccle_annovar_gpd_annotated_df.merge(
    ccle_clinvar_df,
    how="left",
    left_on="mutation",
    right_on="input",
)
merged_ccle

In [None]:
# Sanity check: number of rows without a gene symbol (expected to be 0).
merged_ccle["Hugo_Symbol"].isna().sum()

#### GPD based features

In [None]:
# Max feature: largest xon17_score per (DepMap_ID, Hugo_Symbol, GPD_unit) group.
# The score column is selected before aggregating so that unrelated
# (possibly non-numeric) columns are never reduced.
ccle_gpd_max = merged_ccle.groupby(["DepMap_ID", "Hugo_Symbol", "GPD_unit"])[["xon17_score"]].aggregate("max").reset_index()
ccle_gpd_max

In [None]:
# One sample x gene matrix of max scores per GPD unit, built in LU, PIU, NCU order.
ccle_gpd_max_lu_matrix, ccle_gpd_max_piu_matrix, ccle_gpd_max_ncu_matrix = (
    get_matrices(ccle_gpd_max, merged_ccle, criteria="GPD_unit", criteria_value=unit, index_name='DepMap_ID', column_name='Hugo_Symbol')
    for unit in ("LU", "PIU", "NCU")
)
print(ccle_gpd_max_lu_matrix.shape, ccle_gpd_max_piu_matrix.shape, ccle_gpd_max_ncu_matrix.shape, sep="\n")

In [None]:
# Sum feature: total xon17_score per (DepMap_ID, Hugo_Symbol, GPD_unit) group.
# The score column is selected before aggregating so that unrelated
# (possibly non-numeric) columns are never reduced.
ccle_gpd_sum = merged_ccle.groupby(["DepMap_ID", "Hugo_Symbol", "GPD_unit"])[["xon17_score"]].aggregate("sum").reset_index()
ccle_gpd_sum

In [None]:
# One sample x gene matrix of summed scores per GPD unit, built in LU, PIU, NCU order.
ccle_gpd_sum_lu_matrix, ccle_gpd_sum_piu_matrix, ccle_gpd_sum_ncu_matrix = (
    get_matrices(ccle_gpd_sum, merged_ccle, criteria="GPD_unit", criteria_value=unit, index_name='DepMap_ID', column_name='Hugo_Symbol')
    for unit in ("LU", "PIU", "NCU")
)
print(ccle_gpd_sum_lu_matrix.shape, ccle_gpd_sum_piu_matrix.shape, ccle_gpd_sum_ncu_matrix.shape, sep="\n")

In [None]:
# Mean feature: average xon17_score per (DepMap_ID, Hugo_Symbol, GPD_unit) group.
# The score column is selected before aggregating: taking the mean of the whole
# frame raises TypeError on pandas >= 2.0 when non-numeric columns are present.
ccle_gpd_mean = merged_ccle.groupby(["DepMap_ID", "Hugo_Symbol", "GPD_unit"])[["xon17_score"]].aggregate("mean").reset_index()
ccle_gpd_mean

In [None]:
# One sample x gene matrix of mean scores per GPD unit, built in LU, PIU, NCU order.
ccle_gpd_mean_lu_matrix, ccle_gpd_mean_piu_matrix, ccle_gpd_mean_ncu_matrix = (
    get_matrices(ccle_gpd_mean, merged_ccle, criteria="GPD_unit", criteria_value=unit, index_name='DepMap_ID', column_name='Hugo_Symbol')
    for unit in ("LU", "PIU", "NCU")
)
print(ccle_gpd_mean_lu_matrix.shape, ccle_gpd_mean_piu_matrix.shape, ccle_gpd_mean_ncu_matrix.shape, sep="\n")

In [None]:
# Count feature: number of non-null xon17_score values per
# (DepMap_ID, Hugo_Symbol, GPD_unit) group.
# The score column is selected before aggregating so only it is counted.
ccle_gpd_count = merged_ccle.groupby(["DepMap_ID", "Hugo_Symbol", "GPD_unit"])[["xon17_score"]].aggregate("count").reset_index()
ccle_gpd_count

In [None]:
# One sample x gene matrix of variant counts per GPD unit, built in LU, PIU, NCU order.
ccle_gpd_count_lu_matrix, ccle_gpd_count_piu_matrix, ccle_gpd_count_ncu_matrix = (
    get_matrices(ccle_gpd_count, merged_ccle, criteria="GPD_unit", criteria_value=unit, index_name='DepMap_ID', column_name='Hugo_Symbol')
    for unit in ("LU", "PIU", "NCU")
)
print(ccle_gpd_count_lu_matrix.shape, ccle_gpd_count_piu_matrix.shape, ccle_gpd_count_ncu_matrix.shape, sep="\n")

#### ClinVar based features

In [None]:
# Max feature: largest xon17_score per (DepMap_ID, Hugo_Symbol, ClinVar category) group.
# The score column is selected before aggregating so that unrelated
# (possibly non-numeric) columns are never reduced.
# NOTE(review): rows with no ClinVar match carry a NaN category after the left
# merge and are dropped by groupby's default dropna=True - confirm intended.
ccle_clinvar_max = merged_ccle.groupby(["DepMap_ID", "Hugo_Symbol", "ClinVar_annotations_categorized"])[["xon17_score"]].aggregate("max").reset_index()
ccle_clinvar_max

In [None]:
# One sample x gene matrix of max scores per ClinVar category (Pathogenic, VUS, Benign order).
ccle_clinvar_max_pathogenic_matrix, ccle_clinvar_max_vus_matrix, ccle_clinvar_max_benign_matrix = (
    get_matrices(ccle_clinvar_max, merged_ccle, criteria="ClinVar_annotations_categorized", criteria_value=category, index_name='DepMap_ID', column_name='Hugo_Symbol')
    for category in ("Pathogenic", "VUS", "Benign")
)
print(ccle_clinvar_max_pathogenic_matrix.shape, ccle_clinvar_max_vus_matrix.shape, ccle_clinvar_max_benign_matrix.shape, sep="\n")

In [None]:
# Sum feature: total xon17_score per (DepMap_ID, Hugo_Symbol, ClinVar category) group.
# The score column is selected before aggregating so that unrelated
# (possibly non-numeric) columns are never reduced.
ccle_clinvar_sum = merged_ccle.groupby(["DepMap_ID", "Hugo_Symbol", "ClinVar_annotations_categorized"])[["xon17_score"]].aggregate("sum").reset_index()
ccle_clinvar_sum

In [None]:
# One sample x gene matrix of summed scores per ClinVar category (Pathogenic, VUS, Benign order).
ccle_clinvar_sum_pathogenic_matrix, ccle_clinvar_sum_vus_matrix, ccle_clinvar_sum_benign_matrix = (
    get_matrices(ccle_clinvar_sum, merged_ccle, criteria="ClinVar_annotations_categorized", criteria_value=category, index_name='DepMap_ID', column_name='Hugo_Symbol')
    for category in ("Pathogenic", "VUS", "Benign")
)
print(ccle_clinvar_sum_pathogenic_matrix.shape, ccle_clinvar_sum_vus_matrix.shape, ccle_clinvar_sum_benign_matrix.shape, sep="\n")

In [None]:
# Mean feature: average xon17_score per (DepMap_ID, Hugo_Symbol, ClinVar category) group.
# The score column is selected before aggregating: taking the mean of the whole
# frame raises TypeError on pandas >= 2.0 when non-numeric columns are present.
ccle_clinvar_mean = merged_ccle.groupby(["DepMap_ID", "Hugo_Symbol", "ClinVar_annotations_categorized"])[["xon17_score"]].aggregate("mean").reset_index()
ccle_clinvar_mean

In [None]:
# One sample x gene matrix of mean scores per ClinVar category (Pathogenic, VUS, Benign order).
ccle_clinvar_mean_pathogenic_matrix, ccle_clinvar_mean_vus_matrix, ccle_clinvar_mean_benign_matrix = (
    get_matrices(ccle_clinvar_mean, merged_ccle, criteria="ClinVar_annotations_categorized", criteria_value=category, index_name='DepMap_ID', column_name='Hugo_Symbol')
    for category in ("Pathogenic", "VUS", "Benign")
)
print(ccle_clinvar_mean_pathogenic_matrix.shape, ccle_clinvar_mean_vus_matrix.shape, ccle_clinvar_mean_benign_matrix.shape, sep="\n")

In [None]:
# Count feature: number of non-null xon17_score values per
# (DepMap_ID, Hugo_Symbol, ClinVar category) group.
# The score column is selected before aggregating so only it is counted.
ccle_clinvar_count = merged_ccle.groupby(["DepMap_ID", "Hugo_Symbol", "ClinVar_annotations_categorized"])[["xon17_score"]].aggregate("count").reset_index()
ccle_clinvar_count

In [None]:
# One sample x gene matrix of variant counts per ClinVar category (Pathogenic, VUS, Benign order).
ccle_clinvar_count_pathogenic_matrix, ccle_clinvar_count_vus_matrix, ccle_clinvar_count_benign_matrix = (
    get_matrices(ccle_clinvar_count, merged_ccle, criteria="ClinVar_annotations_categorized", criteria_value=category, index_name='DepMap_ID', column_name='Hugo_Symbol')
    for category in ("Pathogenic", "VUS", "Benign")
)
print(ccle_clinvar_count_pathogenic_matrix.shape, ccle_clinvar_count_vus_matrix.shape, ccle_clinvar_count_benign_matrix.shape, sep="\n")

In [None]:
# Add suffixes to identify columns: every gene column becomes
# "<gene>_<bin>_<aggregate>" so the matrices can be concatenated side by side.
# NOTE: this cell is not idempotent - re-running it appends the suffix again.
ccle_gpd_max_piu_matrix = ccle_gpd_max_piu_matrix.add_suffix('_piu_max')
ccle_gpd_sum_piu_matrix = ccle_gpd_sum_piu_matrix.add_suffix("_piu_sum")
ccle_gpd_mean_piu_matrix = ccle_gpd_mean_piu_matrix.add_suffix("_piu_mean")
ccle_gpd_count_piu_matrix = ccle_gpd_count_piu_matrix.add_suffix("_piu_count")
ccle_gpd_max_lu_matrix = ccle_gpd_max_lu_matrix.add_suffix("_lu_max")
ccle_gpd_sum_lu_matrix = ccle_gpd_sum_lu_matrix.add_suffix("_lu_sum")
ccle_gpd_mean_lu_matrix = ccle_gpd_mean_lu_matrix.add_suffix("_lu_mean")
# Fixed: the LU count matrix was previously built from the NCU count matrix,
# which made every "_lu_count" feature a copy of the NCU counts.
ccle_gpd_count_lu_matrix = ccle_gpd_count_lu_matrix.add_suffix("_lu_count")
ccle_gpd_max_ncu_matrix = ccle_gpd_max_ncu_matrix.add_suffix("_ncu_max")
ccle_gpd_sum_ncu_matrix = ccle_gpd_sum_ncu_matrix.add_suffix("_ncu_sum")
ccle_gpd_mean_ncu_matrix = ccle_gpd_mean_ncu_matrix.add_suffix("_ncu_mean")
ccle_gpd_count_ncu_matrix = ccle_gpd_count_ncu_matrix.add_suffix("_ncu_count")

ccle_clinvar_max_pathogenic_matrix = ccle_clinvar_max_pathogenic_matrix.add_suffix("_pathogenic_max")
ccle_clinvar_sum_pathogenic_matrix = ccle_clinvar_sum_pathogenic_matrix.add_suffix("_pathogenic_sum")
ccle_clinvar_mean_pathogenic_matrix = ccle_clinvar_mean_pathogenic_matrix.add_suffix("_pathogenic_mean")
ccle_clinvar_count_pathogenic_matrix = ccle_clinvar_count_pathogenic_matrix.add_suffix("_pathogenic_count")
ccle_clinvar_max_vus_matrix = ccle_clinvar_max_vus_matrix.add_suffix("_vus_max")
ccle_clinvar_sum_vus_matrix = ccle_clinvar_sum_vus_matrix.add_suffix("_vus_sum")
ccle_clinvar_mean_vus_matrix = ccle_clinvar_mean_vus_matrix.add_suffix("_vus_mean")
ccle_clinvar_count_vus_matrix = ccle_clinvar_count_vus_matrix.add_suffix("_vus_count")
ccle_clinvar_max_benign_matrix = ccle_clinvar_max_benign_matrix.add_suffix("_benign_max")
ccle_clinvar_sum_benign_matrix = ccle_clinvar_sum_benign_matrix.add_suffix("_benign_sum")
ccle_clinvar_mean_benign_matrix = ccle_clinvar_mean_benign_matrix.add_suffix("_benign_mean")
ccle_clinvar_count_benign_matrix = ccle_clinvar_count_benign_matrix.add_suffix("_benign_count")

In [None]:
# Join the 24 suffixed matrices column-wise. Block order: PIU, LU, NCU,
# Pathogenic, VUS, Benign; within each block: max, sum, mean, count.
ccle_feature_matrix = pd.concat(
    objs=[
        # GPD units
        ccle_gpd_max_piu_matrix, ccle_gpd_sum_piu_matrix, ccle_gpd_mean_piu_matrix, ccle_gpd_count_piu_matrix,
        ccle_gpd_max_lu_matrix, ccle_gpd_sum_lu_matrix, ccle_gpd_mean_lu_matrix, ccle_gpd_count_lu_matrix,
        ccle_gpd_max_ncu_matrix, ccle_gpd_sum_ncu_matrix, ccle_gpd_mean_ncu_matrix, ccle_gpd_count_ncu_matrix,
        # ClinVar categories
        ccle_clinvar_max_pathogenic_matrix, ccle_clinvar_sum_pathogenic_matrix, ccle_clinvar_mean_pathogenic_matrix, ccle_clinvar_count_pathogenic_matrix,
        ccle_clinvar_max_vus_matrix, ccle_clinvar_sum_vus_matrix, ccle_clinvar_mean_vus_matrix, ccle_clinvar_count_vus_matrix,
        ccle_clinvar_max_benign_matrix, ccle_clinvar_sum_benign_matrix, ccle_clinvar_mean_benign_matrix, ccle_clinvar_count_benign_matrix,
    ],
    axis="columns",
)
ccle_feature_matrix.shape

In [None]:
# Write the CCLE feature matrix (index included) to the processed-data folder.
ccle_feature_matrix.to_csv(
    path_or_buf="../data/processed/clinvar_gpd_annovar_annotated_ccle_feature_matrix.csv"
)

### TCGA dataset

In [None]:
# Load the TCGA per-patient, per-mutation table with Annovar and GPD annotations.
tcga_annovar_gpd_annotated_df = pd.read_csv(
    filepath_or_buffer="../data/processed/tcga_annovar_gpd_annot_per_patient_per_mutation.csv"
)
tcga_annovar_gpd_annotated_df

In [None]:
# Remove the +1 offset carried by the stored "1plusxon17_score" column.
tcga_annovar_gpd_annotated_df["xon17_score"] = tcga_annovar_gpd_annotated_df["1plusxon17_score"].sub(1)
tcga_annovar_gpd_annotated_df

In [None]:
# Load the per-mutation ClinVar annotations for TCGA; the first CSV column becomes the index.
tcga_clinvar_df = pd.read_csv(
    filepath_or_buffer="../data/processed/clinvar_anno_features_per_mutation_tcga.csv",
    index_col=0,
)
tcga_clinvar_df

In [None]:
# Frequency of each raw CLNSIG value before categorisation.
tcga_clinvar_df["CLNSIG"].value_counts()

In [None]:
# Keep only the first ClinVar record for each "input" mutation so the later
# merge cannot multiply rows.
tcga_clinvar_df = tcga_clinvar_df.reset_index()
tcga_clinvar_df = tcga_clinvar_df.drop_duplicates(subset="input", keep="first")
tcga_clinvar_df.shape

In [None]:
# Collapse each raw CLNSIG value into Pathogenic / VUS / Benign / NA.
tcga_clinvar_df["ClinVar_annotations_categorized"] = tcga_clinvar_df["CLNSIG"].map(get_clinvar_supercategory)
tcga_clinvar_df["ClinVar_annotations_categorized"].value_counts()

In [None]:
# Left join: every Annovar/GPD row is kept; ClinVar columns are NaN where no
# "input" matches the row's "point_mutation".
merged_tcga = tcga_annovar_gpd_annotated_df.merge(
    tcga_clinvar_df,
    how="left",
    left_on="point_mutation",
    right_on="input",
)
merged_tcga

In [None]:
# Sanity check: number of rows without a gene symbol (expected to be 0).
merged_tcga["gene"].isna().sum()

#### GPD based features

In [None]:
# Max feature: largest xon17_score per (submitter_id, gene, GPD_unit) group.
# The score column is selected before aggregating so that unrelated
# (possibly non-numeric) columns are never reduced.
tcga_gpd_max = merged_tcga.groupby(["submitter_id", "gene", "GPD_unit"])[["xon17_score"]].aggregate("max").reset_index()
tcga_gpd_max

In [None]:
# One sample x gene matrix of max scores per GPD unit, built in LU, PIU, NCU order.
tcga_gpd_max_lu_matrix, tcga_gpd_max_piu_matrix, tcga_gpd_max_ncu_matrix = (
    get_matrices(tcga_gpd_max, merged_tcga, criteria="GPD_unit", criteria_value=unit, index_name='submitter_id', column_name='gene')
    for unit in ("LU", "PIU", "NCU")
)
print(tcga_gpd_max_lu_matrix.shape, tcga_gpd_max_piu_matrix.shape, tcga_gpd_max_ncu_matrix.shape, sep="\n")

In [None]:
# Sum feature: total xon17_score per (submitter_id, gene, GPD_unit) group.
# The score column is selected before aggregating so that unrelated
# (possibly non-numeric) columns are never reduced.
tcga_gpd_sum = merged_tcga.groupby(["submitter_id", "gene", "GPD_unit"])[["xon17_score"]].aggregate("sum").reset_index()
tcga_gpd_sum

In [None]:
# One sample x gene matrix of summed scores per GPD unit, built in LU, PIU, NCU order.
tcga_gpd_sum_lu_matrix, tcga_gpd_sum_piu_matrix, tcga_gpd_sum_ncu_matrix = (
    get_matrices(tcga_gpd_sum, merged_tcga, criteria="GPD_unit", criteria_value=unit, index_name='submitter_id', column_name='gene')
    for unit in ("LU", "PIU", "NCU")
)
print(tcga_gpd_sum_lu_matrix.shape, tcga_gpd_sum_piu_matrix.shape, tcga_gpd_sum_ncu_matrix.shape, sep="\n")

In [None]:
# Mean feature: average xon17_score per (submitter_id, gene, GPD_unit) group.
# The score column is selected before aggregating: taking the mean of the whole
# frame raises TypeError on pandas >= 2.0 when non-numeric columns are present.
tcga_gpd_mean = merged_tcga.groupby(["submitter_id", "gene", "GPD_unit"])[["xon17_score"]].aggregate("mean").reset_index()
tcga_gpd_mean

In [None]:
# One sample x gene matrix of mean scores per GPD unit, built in LU, PIU, NCU order.
tcga_gpd_mean_lu_matrix, tcga_gpd_mean_piu_matrix, tcga_gpd_mean_ncu_matrix = (
    get_matrices(tcga_gpd_mean, merged_tcga, criteria="GPD_unit", criteria_value=unit, index_name='submitter_id', column_name='gene')
    for unit in ("LU", "PIU", "NCU")
)
print(tcga_gpd_mean_lu_matrix.shape, tcga_gpd_mean_piu_matrix.shape, tcga_gpd_mean_ncu_matrix.shape, sep="\n")

In [None]:
# Count feature: number of non-null xon17_score values per
# (submitter_id, gene, GPD_unit) group.
# The score column is selected before aggregating so only it is counted.
tcga_gpd_count = merged_tcga.groupby(["submitter_id", "gene", "GPD_unit"])[["xon17_score"]].aggregate("count").reset_index()
tcga_gpd_count

In [None]:
# One sample x gene matrix of variant counts per GPD unit, built in LU, PIU, NCU order.
tcga_gpd_count_lu_matrix, tcga_gpd_count_piu_matrix, tcga_gpd_count_ncu_matrix = (
    get_matrices(tcga_gpd_count, merged_tcga, criteria="GPD_unit", criteria_value=unit, index_name='submitter_id', column_name='gene')
    for unit in ("LU", "PIU", "NCU")
)
print(tcga_gpd_count_lu_matrix.shape, tcga_gpd_count_piu_matrix.shape, tcga_gpd_count_ncu_matrix.shape, sep="\n")

#### ClinVar based features

In [None]:
# Max feature: largest xon17_score per (submitter_id, gene, ClinVar category) group.
# The score column is selected before aggregating so that unrelated
# (possibly non-numeric) columns are never reduced.
# NOTE(review): rows with no ClinVar match carry a NaN category after the left
# merge and are dropped by groupby's default dropna=True - confirm intended.
tcga_clinvar_max = merged_tcga.groupby(["submitter_id", "gene", "ClinVar_annotations_categorized"])[["xon17_score"]].aggregate("max").reset_index()
tcga_clinvar_max

In [None]:
# One sample x gene matrix of max scores per ClinVar category (Pathogenic, VUS, Benign order).
tcga_clinvar_max_pathogenic_matrix, tcga_clinvar_max_vus_matrix, tcga_clinvar_max_benign_matrix = (
    get_matrices(tcga_clinvar_max, merged_tcga, criteria="ClinVar_annotations_categorized", criteria_value=category, index_name='submitter_id', column_name='gene')
    for category in ("Pathogenic", "VUS", "Benign")
)
print(tcga_clinvar_max_pathogenic_matrix.shape, tcga_clinvar_max_vus_matrix.shape, tcga_clinvar_max_benign_matrix.shape, sep="\n")

In [None]:
# Sum feature: total xon17_score per (submitter_id, gene, ClinVar category) group.
# The score column is selected before aggregating so that unrelated
# (possibly non-numeric) columns are never reduced.
tcga_clinvar_sum = merged_tcga.groupby(["submitter_id", "gene", "ClinVar_annotations_categorized"])[["xon17_score"]].aggregate("sum").reset_index()
tcga_clinvar_sum

In [None]:
# One sample x gene matrix of summed scores per ClinVar category (Pathogenic, VUS, Benign order).
tcga_clinvar_sum_pathogenic_matrix, tcga_clinvar_sum_vus_matrix, tcga_clinvar_sum_benign_matrix = (
    get_matrices(tcga_clinvar_sum, merged_tcga, criteria="ClinVar_annotations_categorized", criteria_value=category, index_name='submitter_id', column_name='gene')
    for category in ("Pathogenic", "VUS", "Benign")
)
print(tcga_clinvar_sum_pathogenic_matrix.shape, tcga_clinvar_sum_vus_matrix.shape, tcga_clinvar_sum_benign_matrix.shape, sep="\n")

In [None]:
# Mean feature: average xon17_score per (submitter_id, gene, ClinVar category) group.
# The score column is selected before aggregating: taking the mean of the whole
# frame raises TypeError on pandas >= 2.0 when non-numeric columns are present.
tcga_clinvar_mean = merged_tcga.groupby(["submitter_id", "gene", "ClinVar_annotations_categorized"])[["xon17_score"]].aggregate("mean").reset_index()
tcga_clinvar_mean

In [None]:
# One sample x gene matrix of mean scores per ClinVar category (Pathogenic, VUS, Benign order).
tcga_clinvar_mean_pathogenic_matrix, tcga_clinvar_mean_vus_matrix, tcga_clinvar_mean_benign_matrix = (
    get_matrices(tcga_clinvar_mean, merged_tcga, criteria="ClinVar_annotations_categorized", criteria_value=category, index_name='submitter_id', column_name='gene')
    for category in ("Pathogenic", "VUS", "Benign")
)
print(tcga_clinvar_mean_pathogenic_matrix.shape, tcga_clinvar_mean_vus_matrix.shape, tcga_clinvar_mean_benign_matrix.shape, sep="\n")

In [None]:
# Count feature: number of non-null xon17_score values per
# (submitter_id, gene, ClinVar category) group.
# The score column is selected before aggregating so only it is counted.
tcga_clinvar_count = merged_tcga.groupby(["submitter_id", "gene", "ClinVar_annotations_categorized"])[["xon17_score"]].aggregate("count").reset_index()
tcga_clinvar_count

In [None]:
# One sample x gene matrix of variant counts per ClinVar category (Pathogenic, VUS, Benign order).
tcga_clinvar_count_pathogenic_matrix, tcga_clinvar_count_vus_matrix, tcga_clinvar_count_benign_matrix = (
    get_matrices(tcga_clinvar_count, merged_tcga, criteria="ClinVar_annotations_categorized", criteria_value=category, index_name='submitter_id', column_name='gene')
    for category in ("Pathogenic", "VUS", "Benign")
)
print(tcga_clinvar_count_pathogenic_matrix.shape, tcga_clinvar_count_vus_matrix.shape, tcga_clinvar_count_benign_matrix.shape, sep="\n")

In [None]:
# Add suffixes to identify columns: every gene column becomes
# "<gene>_<bin>_<aggregate>" so the matrices can be concatenated side by side.
# NOTE: this cell is not idempotent - re-running it appends the suffix again.
tcga_gpd_max_piu_matrix = tcga_gpd_max_piu_matrix.add_suffix('_piu_max')
tcga_gpd_sum_piu_matrix = tcga_gpd_sum_piu_matrix.add_suffix("_piu_sum")
tcga_gpd_mean_piu_matrix = tcga_gpd_mean_piu_matrix.add_suffix("_piu_mean")
tcga_gpd_count_piu_matrix = tcga_gpd_count_piu_matrix.add_suffix("_piu_count")
tcga_gpd_max_lu_matrix = tcga_gpd_max_lu_matrix.add_suffix("_lu_max")
tcga_gpd_sum_lu_matrix = tcga_gpd_sum_lu_matrix.add_suffix("_lu_sum")
tcga_gpd_mean_lu_matrix = tcga_gpd_mean_lu_matrix.add_suffix("_lu_mean")
# Fixed: the LU count matrix was previously built from the NCU count matrix,
# which made every "_lu_count" feature a copy of the NCU counts.
tcga_gpd_count_lu_matrix = tcga_gpd_count_lu_matrix.add_suffix("_lu_count")
tcga_gpd_max_ncu_matrix = tcga_gpd_max_ncu_matrix.add_suffix("_ncu_max")
tcga_gpd_sum_ncu_matrix = tcga_gpd_sum_ncu_matrix.add_suffix("_ncu_sum")
tcga_gpd_mean_ncu_matrix = tcga_gpd_mean_ncu_matrix.add_suffix("_ncu_mean")
tcga_gpd_count_ncu_matrix = tcga_gpd_count_ncu_matrix.add_suffix("_ncu_count")

tcga_clinvar_max_pathogenic_matrix = tcga_clinvar_max_pathogenic_matrix.add_suffix("_pathogenic_max")
tcga_clinvar_sum_pathogenic_matrix = tcga_clinvar_sum_pathogenic_matrix.add_suffix("_pathogenic_sum")
tcga_clinvar_mean_pathogenic_matrix = tcga_clinvar_mean_pathogenic_matrix.add_suffix("_pathogenic_mean")
tcga_clinvar_count_pathogenic_matrix = tcga_clinvar_count_pathogenic_matrix.add_suffix("_pathogenic_count")
tcga_clinvar_max_vus_matrix = tcga_clinvar_max_vus_matrix.add_suffix("_vus_max")
tcga_clinvar_sum_vus_matrix = tcga_clinvar_sum_vus_matrix.add_suffix("_vus_sum")
tcga_clinvar_mean_vus_matrix = tcga_clinvar_mean_vus_matrix.add_suffix("_vus_mean")
tcga_clinvar_count_vus_matrix = tcga_clinvar_count_vus_matrix.add_suffix("_vus_count")
tcga_clinvar_max_benign_matrix = tcga_clinvar_max_benign_matrix.add_suffix("_benign_max")
tcga_clinvar_sum_benign_matrix = tcga_clinvar_sum_benign_matrix.add_suffix("_benign_sum")
tcga_clinvar_mean_benign_matrix = tcga_clinvar_mean_benign_matrix.add_suffix("_benign_mean")
tcga_clinvar_count_benign_matrix = tcga_clinvar_count_benign_matrix.add_suffix("_benign_count")

In [None]:
# Join the 24 suffixed matrices column-wise. Block order: PIU, LU, NCU,
# Pathogenic, VUS, Benign; within each block: max, sum, mean, count.
tcga_feature_matrix = pd.concat(
    objs=[
        # GPD units
        tcga_gpd_max_piu_matrix, tcga_gpd_sum_piu_matrix, tcga_gpd_mean_piu_matrix, tcga_gpd_count_piu_matrix,
        tcga_gpd_max_lu_matrix, tcga_gpd_sum_lu_matrix, tcga_gpd_mean_lu_matrix, tcga_gpd_count_lu_matrix,
        tcga_gpd_max_ncu_matrix, tcga_gpd_sum_ncu_matrix, tcga_gpd_mean_ncu_matrix, tcga_gpd_count_ncu_matrix,
        # ClinVar categories
        tcga_clinvar_max_pathogenic_matrix, tcga_clinvar_sum_pathogenic_matrix, tcga_clinvar_mean_pathogenic_matrix, tcga_clinvar_count_pathogenic_matrix,
        tcga_clinvar_max_vus_matrix, tcga_clinvar_sum_vus_matrix, tcga_clinvar_mean_vus_matrix, tcga_clinvar_count_vus_matrix,
        tcga_clinvar_max_benign_matrix, tcga_clinvar_sum_benign_matrix, tcga_clinvar_mean_benign_matrix, tcga_clinvar_count_benign_matrix,
    ],
    axis="columns",
)
tcga_feature_matrix.shape

In [None]:
# Write the TCGA feature matrix (index included) to the processed-data folder.
tcga_feature_matrix.to_csv(
    path_or_buf="../data/processed/clinvar_gpd_annovar_annotated_tcga_feature_matrix.csv"
)

In [None]:
# Display the final TCGA feature matrix for a visual check.
tcga_feature_matrix