In [2]:
import sys

sys.path.append("../../vae_zinb_reprn/")
sys.path.append("../src/")

In [3]:
import numpy as np
import pandas as pd

import datetime
import logging
import os
import time
from functools import cached_property

from sklearn.metrics import average_precision_score, ndcg_score, roc_auc_score

In [4]:
from data import (
    construct_anno_features,
    construct_raw_mutation_features,
    construct_raw_cnv_features,
    canonicalize_mutations,
    construct_anno_features_xon17
)

In [5]:
tcga_mutations_df = pd.read_csv("../data/raw/tcga_point_mutations_incl_alive.csv")
tcga_mutations_df

Unnamed: 0,submitter_id,input
0,TCGA-2E-A9G8,FBXW7 R505G
1,TCGA-2E-A9G8,TP53 E286_E287del
2,TCGA-4E-A92E,ARID1A S2269*
3,TCGA-4E-A92E,PIK3C2B I19I
4,TCGA-4E-A92E,CTNNB1 S37F
...,...,...
55794,TCGA-ZF-AA5N,ARID1A Q505Hfs*113
55795,TCGA-ZF-AA5N,HRAS K117N
55796,TCGA-ZF-AA5P,PPARG S249L
55797,TCGA-ZF-AA5P,TP53 H193R


In [6]:
tcga_response = pd.read_csv("../data/processed/TCGA_drug_response_010222.csv")
tcga_response.rename(
    columns={
        "patient.arr": "submitter_id",
        "drug": "drug_name",
        "response": "response_description",
        "response_cat": "response",
    },
    inplace=True,
)
tcga_response

Unnamed: 0,submitter_id,drug.name,response_description,response,drug_name
0,TCGA-G2-A2EC,Methotrexate,Partial Response,1,METHOTREXATE
1,TCGA-G2-A2EC,Doxorubicin,Partial Response,1,DOXORUBICIN
2,TCGA-G2-A2EC,Vinblastine,Partial Response,1,VINBLASTINE
3,TCGA-G2-A2EC,Cisplatin,Partial Response,1,CISPLATIN
4,TCGA-G2-A2EJ,Paclitaxel,Stable Disease,0,PACLITAXEL
...,...,...,...,...,...
1244,TCGA-BG-A0VZ,Cisplatin,Complete Response,1,CISPLATIN
1245,TCGA-BG-A0VZ,Paclitaxel,Complete Response,1,PACLITAXEL
1246,TCGA-BG-A0VZ,Doxorubicin,Complete Response,1,DOXORUBICIN
1247,TCGA-BG-A0VT,Carboplatin,Complete Response,1,CARBOPLATIN


In [7]:
tcga_mutations_df = tcga_mutations_df[tcga_mutations_df.submitter_id.isin(tcga_response.submitter_id.unique())].copy()
tcga_mutations_df.shape

(6121, 2)

## Variant Annotations with Annovar/Transvar

In [8]:
genes_324 = pd.read_csv("../../data/druid_1.3_data/gene2ind.txt", header=None)[
    0
].tolist()

In [8]:
# Extract point mutations for the 324 F1 genes
tcga_mutations = []
for submitter_id in tcga_mutations_df.submitter_id.unique():
    filtered_df = tcga_mutations_df[tcga_mutations_df.submitter_id == submitter_id]
    point_mutations = filtered_df.input.values
    point_mutations = canonicalize_mutations(point_mutations)
    for gene in genes_324:
        for point_mutation in point_mutations:
            if f"{gene} " in point_mutation:
                tcga_mutations.append((submitter_id, point_mutation.strip()))
                
tcga_mutations_filtered = pd.DataFrame(
    tcga_mutations, columns=["submitter_id", "point_mutation"]
)
tcga_mutations_filtered

Unnamed: 0,submitter_id,point_mutation
0,TCGA-2E-A9G8,FBXW7 R505G
1,TCGA-2E-A9G8,TP53 E286_E287del
2,TCGA-A5-A1OH,AXL S447S
3,TCGA-A5-A1OH,BRD4 X1340_splice
4,TCGA-A5-A1OH,HRAS E31K
...,...,...
6116,TCGA-XF-AAN7,RB1 S834*
6117,TCGA-XF-AAN7,SMAD4 A406A
6118,TCGA-XF-AAN7,TP53 A161T
6119,TCGA-XF-AAN7,XPO1 F599F


In [9]:
# import dask

# from tqdm import tqdm
# from dask.distributed import Client


# client = Client()
# client.cluster.scale(30)

In [10]:
# %load_ext jupyterlab_notify

In [11]:
# %%notify

# futures = []
# for submitter_id in tcga_mutations_filtered.submitter_id.unique():
#     filtered_df = tcga_mutations_filtered[
#         tcga_mutations_filtered.submitter_id == submitter_id
#     ]
#     point_mutations = list(filtered_df.point_mutation.values)
#     future = client.submit(construct_anno_features_xon17, submitter_id, point_mutations, False)
#     futures.append(future)

# anno_features_combined = client.gather(futures, errors="skip")
# client.shutdown()

In [46]:
# variant annotation on these point mutations
import sys
sys.path.append("../src/")
from data import (construct_anno_features, 
                  construct_raw_cnv_features, 
                  construct_raw_mutation_features, 
                  canonicalize_mutations,
                  ALIAS_TO_CANONICAL_NAME_MAP, 
                  _is_valid_point_mutations,
                  get_annotation_features,
                  preprocess_annotation_features,
                  MICEData
                 )

In [47]:
import logging
import re
import subprocess
import tempfile
import itertools
import json

In [48]:
REQUIRED_ANNOTATION_COLUMNS = [
    "SIFT_score",
    "SIFT_converted_rankscore",
    "SIFT_pred",
    "SIFT4G_score",
    "SIFT4G_converted_rankscore",
    "SIFT4G_pred",
    "LRT_score",
    "LRT_converted_rankscore",
    "LRT_pred",
    "MutationTaster_score",
    "MutationTaster_converted_rankscore",
    "MutationTaster_pred",
    "MutationAssessor_score",
    "MutationAssessor_rankscore",
    "MutationAssessor_pred",
    "FATHMM_score",
    "FATHMM_converted_rankscore",
    "FATHMM_pred",
    "PROVEAN_score",
    "PROVEAN_converted_rankscore",
    "PROVEAN_pred",
    "MetaSVM_pred",
    "M-CAP_score",
    "M-CAP_rankscore",
    "M-CAP_pred",
    "MVP_score",
    "MVP_rankscore",
    "MPC_score",
    "MPC_rankscore",
    "PrimateAI_score",
    "PrimateAI_rankscore",
    "PrimateAI_pred",
    "DEOGEN2_score",
    "DEOGEN2_rankscore",
    "DEOGEN2_pred",
    "BayesDel_addAF_score",
    "BayesDel_addAF_pred",
    "BayesDel_noAF_score",
    "BayesDel_noAF_rankscore",
    "BayesDel_noAF_pred",
    "ClinPred_score",
    "ClinPred_rankscore",
    "ClinPred_pred",
    "LIST-S2_score",
    "LIST-S2_rankscore",
    "LIST-S2_pred",
    "DANN_score",
    "DANN_rankscore",
    "fathmm-MKL_coding_score",
    "fathmm-MKL_coding_rankscore",
    "fathmm-MKL_coding_pred",
    "fathmm-XF_coding_score",
    "fathmm-XF_coding_rankscore",
    "fathmm-XF_coding_pred",
    "Eigen-raw_coding",
    "Eigen-raw_coding_rankscore",
    "Eigen-PC-raw_coding",
    "Eigen-PC-raw_coding_rankscore",
]
CATEGORICAL_COLUMNS = [
    "sift_pred",
    "sift4g_pred",
    "lrt_pred",
    "mutationtaster_pred",
    "mutationassessor_pred",
    "fathmm_pred",
    "provean_pred",
    "metasvm_pred",
    "m_cap_pred",
    "primateai_pred",
    "deogen2_pred",
    "bayesdel_addaf_pred",
    "bayesdel_noaf_pred",
    "clinpred_pred",
    "list_s2_pred",
    "fathmm_mkl_coding_pred",
    "fathmm_xf_coding_pred",
]

# The thresholds used in PREDICTOR_LAMBDA_MAP are taken from the corresponding
# technique's published paper/web page
PREDICTOR_LAMBDA_MAP = {
    "sift_pred": ("sift_score", lambda v: "D" if v <= 0.05 else "T"),
    "sift4g_pred": ("sift4g_score", lambda v: "D" if v <= 0.05 else "T"),
    "lrt_pred": ("lrt_score", lambda v: "D" if v <= 0.001 else "U"),
    "mutationtaster_pred": (
        "mutationtaster_score",
        lambda v: None,
    ),  # Threshold is not available and couldn't be derived from available values as well
    "mutationassessor_pred": (
        "mutationassessor_score",
        lambda v: "H" if v >= 3.5 else ("M" if v >= 1.94 else "L"),
    ),
    "fathmm_pred": ("fathmm_score", lambda v: "D" if v < 1.5 else "T"),
    "provean_pred": ("provean_score", lambda v: "D" if v <= 2.282 else "N"),
    "metasvm_pred": (
        "metasvm_pred",
        lambda v: None,
    ),  # No corresponding numeric score available for this method
    "m_cap_pred": ("m_cap_score", lambda v: "D" if v >= 0.025 else "T"),
    "primateai_pred": ("primateai_score", lambda v: "D" if v >= 0.803 else "T"),
    "deogen2_pred": ("deogen2_score", lambda v: "D" if v >= 0.45 else "T"),
    "bayesdel_addaf_pred": (
        "bayesdel_addaf_score",
        lambda v: "D" if v >= 0.0692 else "T",
    ),
    "bayesdel_noaf_pred": (
        "bayesdel_noaf_score",
        lambda v: "D" if v >= -0.0570 else "T",
    ),
    "clinpred_pred": ("clinpred_score", lambda v: "D" if v >= 0.5 else "T"),
    "list_s2_pred": ("list_s2_score", lambda v: "D" if v >= 0.85 else "T"),
    "fathmm_mkl_coding_pred": (
        "fathmm_mkl_coding_score",
        lambda v: "D" if v >= 0.5 else "N",
    ),
    "fathmm_xf_coding_pred": (
        "fathmm_xf_coding_score",
        lambda v: "D" if v >= 0.5 else "N",
    ),
}

DELETERIOUS_VALUES = ["D", "A", "H", "M"]

CNV_PATTERN = r"loss|amplification"

GENES_324 = pd.read_csv("../../data/druid_1.3_data/gene2ind.txt", header=None)[
    0
].tolist()

ANNOTATION_SCRIPT_PATH = "../script/goAAtoGv2.sh"
SPECIAL_CASES = r"rearrangement|truncation|fs|del|ins"

In [15]:
def construct_anno_features_xon17_gpd(patient_id, patient_mutations, agg_features=False):
    """
    TODO: Add support for other agg functions (mean, OR, etc) - as of 202209, only
    sum is supported
    Here, the aggregation is done as an average over all variants over all 17 algorithms.
    """
    if agg_features:
        logging.warn(
            """
        Received agg_features=True -> As of now, construct_anno_features only supports sum aggregation.
        Please ensure that the agg used in dataset definition is sum - if it is not sum, please pass
        agg_features=False and perform agg in dataset definition
        """
        )

    if not _is_valid_point_mutations(patient_mutations):
        return None

    anno_features_combined_imputed_df = pd.read_csv(
        "../data/processed/anno_features_combined_imputed.csv"
    )
    logging.info(anno_features_combined_imputed_df.shape)
    anno_features_combined_imputed_df.set_index(["input"], inplace=True)
    anno_features_combined_imputed_df.head()

    canonical_mutations = canonicalize_mutations(patient_mutations)

    mutations_with_missing_annotations = []
    available_mutations = []
    for mutation in canonical_mutations:
        if mutation in anno_features_combined_imputed_df.index:
            available_mutations.append(mutation)
        elif not re.search(CNV_PATTERN, mutation, re.IGNORECASE):
            mutations_with_missing_annotations.append(mutation)

    if available_mutations:
        patient_anno_features = anno_features_combined_imputed_df.loc[
            available_mutations
        ]
        patient_anno_features = patient_anno_features[CATEGORICAL_COLUMNS].copy()
    else:
        patient_anno_features = None

    if len(mutations_with_missing_annotations) != 0:
        logging.info(
            f"Found mutations with missing annotations - {mutations_with_missing_annotations}"
        )
        missing_annotations = get_annotation_features(
            mutations_with_missing_annotations
        )
        if missing_annotations is not None:
            missing_annotations = missing_annotations[
                REQUIRED_ANNOTATION_COLUMNS
            ].copy()
            missing_annotations.reset_index(inplace=True)
            missing_annotations = preprocess_annotation_features(missing_annotations)
            missing_annotations = missing_annotations[~missing_annotations.duplicated()]
            missing_annotations.reset_index(drop=True, inplace=True)
            missing_annotations.set_index("input", inplace=True)

            numeric_columns = list(
                column
                for column in missing_annotations.columns
                if pd.api.types.is_numeric_dtype(missing_annotations[column])
            )
            # Prepare mask by identifying rows that have all na values for numeric_columns
            na_mask = None
            for col in numeric_columns:
                if type(na_mask) == pd.Series:
                    na_mask = na_mask & missing_annotations[col].isna()
                else:
                    na_mask = missing_annotations[col].isna()

            missing_annotations = missing_annotations[~na_mask]
            numeric_df = missing_annotations[numeric_columns].copy()
            logging.info(numeric_df.shape)
            numeric_df.head()
            numeric_df = pd.concat(
                [numeric_df, anno_features_combined_imputed_df[numeric_columns]],
            )

            categorical_columns = [
                column
                for column in missing_annotations.columns
                if column not in numeric_columns
            ]
            categorical_missing_annotations = missing_annotations[
                categorical_columns
            ].copy()
            logging.info(categorical_missing_annotations.shape)
            categorical_missing_annotations.head()

            imp = MICEData(numeric_df)
            # Impute missing values in numeric columns - Expensive!!
            imp.update_all()
            imputed_df = imp.data
            assert numeric_df.shape == imputed_df.shape
            imputed_df.index = numeric_df.index
            imputed_df = imputed_df[
                imputed_df.index.isin(mutations_with_missing_annotations)
            ].copy()
            numeric_imputed_df = pd.concat(
                [categorical_missing_annotations, imputed_df,], axis=1,
            )
            logging.info(numeric_imputed_df.shape)
            for column in CATEGORICAL_COLUMNS:
                logging.info(
                    column,
                    numeric_imputed_df[column].unique(),
                    len(numeric_imputed_df[numeric_imputed_df[column].isna()]),
                )
                col_na_mask = numeric_imputed_df[column].isna()
                numeric_imputed_df.loc[col_na_mask, column] = numeric_imputed_df[
                    col_na_mask
                ][PREDICTOR_LAMBDA_MAP[column][0]].apply(
                    PREDICTOR_LAMBDA_MAP[column][1]
                )

                # logging.info(
                #     column,
                #     numeric_imputed_df[column].unique(),
                #     len(numeric_imputed_df[numeric_imputed_df[column].isna()]),
                # )

            numeric_imputed_df = numeric_imputed_df.dropna()
            logging.info(numeric_imputed_df.shape)
            missing_anno_features_df = numeric_imputed_df[CATEGORICAL_COLUMNS].copy()

            patient_anno_features = pd.concat(
                [missing_anno_features_df, patient_anno_features]
            )
            
    if patient_anno_features is not None:
        for col in patient_anno_features.columns:
            patient_anno_features[col] = patient_anno_features[col].apply(
                lambda v: 1 if v in DELETERIOUS_VALUES else 0
            )

        patient_anno_features.reset_index(inplace=True)
        patient_anno_features["gene"] = patient_anno_features.input.apply(
            lambda gene_mut: gene_mut.split(" ")[0]
        )
    return patient_anno_features

In [16]:
tcga_mutations_filtered

Unnamed: 0,submitter_id,point_mutation
0,TCGA-2E-A9G8,FBXW7 R505G
1,TCGA-2E-A9G8,TP53 E286_E287del
2,TCGA-A5-A1OH,AXL S447S
3,TCGA-A5-A1OH,BRD4 X1340_splice
4,TCGA-A5-A1OH,HRAS E31K
...,...,...
6116,TCGA-XF-AAN7,RB1 S834*
6117,TCGA-XF-AAN7,SMAD4 A406A
6118,TCGA-XF-AAN7,TP53 A161T
6119,TCGA-XF-AAN7,XPO1 F599F


In [18]:
# %%notify

# futures = []
anno_features_combined = pd.DataFrame()
for patient_id in tcga_mutations_filtered.submitter_id.unique():
    filtered_df = tcga_mutations_filtered[
        tcga_mutations_filtered.submitter_id == submitter_id
    ]
    point_mutations = list(filtered_df.point_mutation.values)
    res = construct_anno_features_xon17_gpd(patient_id, point_mutations, False)
    anno_features_combined = pd.concat([anno_features_combined, res], ignore_index = True)
#     future = client.submit(construct_anno_features_xon17_gpd, patient_id, point_mutations, False)
#     futures.append(future)

# anno_features_combined = client.gather(futures, errors="skip")
# client.shutdown()

In [19]:
anno_features_combined

Unnamed: 0,input,sift_pred,sift4g_pred,lrt_pred,mutationtaster_pred,mutationassessor_pred,fathmm_pred,provean_pred,metasvm_pred,m_cap_pred,primateai_pred,deogen2_pred,bayesdel_addaf_pred,bayesdel_noaf_pred,clinpred_pred,list_s2_pred,fathmm_mkl_coding_pred,fathmm_xf_coding_pred,gene
0,ATRX E733K,1,1,1,1,0,1,0,1,1,0,0,1,1,1,1,1,0,ATRX
1,CTCF E376*,0,0,1,1,0,0,0,0,0,0,0,1,1,0,0,1,0,CTCF
2,KRAS G12D,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,KRAS
3,NOTCH1 V2119M,1,0,0,1,0,1,0,1,1,0,0,0,0,1,1,1,1,NOTCH1
4,RB1 S834*,0,0,1,1,0,0,0,0,0,0,0,1,1,0,0,1,0,RB1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4167,KRAS G12D,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,KRAS
4168,NOTCH1 V2119M,1,0,0,1,0,1,0,1,1,0,0,0,0,1,1,1,1,NOTCH1
4169,RB1 S834*,0,0,1,1,0,0,0,0,0,0,0,1,1,0,0,1,0,RB1
4170,TP53 A161T,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,TP53


In [21]:
agg_anno_features_combined_df = anno_features_combined#pd.concat(anno_features_combined).reset_index()
agg_anno_features_combined_df.set_index("input", inplace=True)
agg_anno_features_combined_df

Unnamed: 0_level_0,sift_pred,sift4g_pred,lrt_pred,mutationtaster_pred,mutationassessor_pred,fathmm_pred,provean_pred,metasvm_pred,m_cap_pred,primateai_pred,deogen2_pred,bayesdel_addaf_pred,bayesdel_noaf_pred,clinpred_pred,list_s2_pred,fathmm_mkl_coding_pred,fathmm_xf_coding_pred,gene
input,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
ATRX E733K,1,1,1,1,0,1,0,1,1,0,0,1,1,1,1,1,0,ATRX
CTCF E376*,0,0,1,1,0,0,0,0,0,0,0,1,1,0,0,1,0,CTCF
KRAS G12D,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,KRAS
NOTCH1 V2119M,1,0,0,1,0,1,0,1,1,0,0,0,0,1,1,1,1,NOTCH1
RB1 S834*,0,0,1,1,0,0,0,0,0,0,0,1,1,0,0,1,0,RB1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
KRAS G12D,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,KRAS
NOTCH1 V2119M,1,0,0,1,0,1,0,1,1,0,0,0,0,1,1,1,1,NOTCH1
RB1 S834*,0,0,1,1,0,0,0,0,0,0,0,1,1,0,0,1,0,RB1
TP53 A161T,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,TP53


In [22]:
# HERE!! - Now merge with original and save
merged_1 = pd.merge(tcga_mutations_filtered, agg_anno_features_combined_df.reset_index(), left_on = "point_mutation", right_on="input", how="left")
merged_1

Unnamed: 0,submitter_id,point_mutation,input,sift_pred,sift4g_pred,lrt_pred,mutationtaster_pred,mutationassessor_pred,fathmm_pred,provean_pred,...,m_cap_pred,primateai_pred,deogen2_pred,bayesdel_addaf_pred,bayesdel_noaf_pred,clinpred_pred,list_s2_pred,fathmm_mkl_coding_pred,fathmm_xf_coding_pred,gene
0,TCGA-2E-A9G8,FBXW7 R505G,,,,,,,,,...,,,,,,,,,,
1,TCGA-2E-A9G8,TP53 E286_E287del,,,,,,,,,...,,,,,,,,,,
2,TCGA-A5-A1OH,AXL S447S,,,,,,,,,...,,,,,,,,,,
3,TCGA-A5-A1OH,BRD4 X1340_splice,,,,,,,,,...,,,,,,,,,,
4,TCGA-A5-A1OH,HRAS E31K,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16826,TCGA-XF-AAN7,ZNF217 L231V,ZNF217 L231V,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,ZNF217
16827,TCGA-XF-AAN7,ZNF217 L231V,ZNF217 L231V,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,ZNF217
16828,TCGA-XF-AAN7,ZNF217 L231V,ZNF217 L231V,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,ZNF217
16829,TCGA-XF-AAN7,ZNF217 L231V,ZNF217 L231V,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,ZNF217


In [23]:
merged_1.drop("input", axis = 1, inplace=True)
merged_1.isna().sum()

submitter_id                 0
point_mutation               0
sift_pred                 6103
sift4g_pred               6103
lrt_pred                  6103
mutationtaster_pred       6103
mutationassessor_pred     6103
fathmm_pred               6103
provean_pred              6103
metasvm_pred              6103
m_cap_pred                6103
primateai_pred            6103
deogen2_pred              6103
bayesdel_addaf_pred       6103
bayesdel_noaf_pred        6103
clinpred_pred             6103
list_s2_pred              6103
fathmm_mkl_coding_pred    6103
fathmm_xf_coding_pred     6103
gene                      6103
dtype: int64

In [24]:
# For those point mutations without annotations
merged_1.fillna(0, inplace=True)

In [29]:
merged_1["gene"] = merged_1["point_mutation"].apply(lambda x: x.split(" ")[0])

In [32]:
merged_1["gene"].value_counts()

KRAS      6011
TP53      1507
RB1       1240
ATRX       642
NOTCH1     640
          ... 
BCL2L2       2
RAD51D       1
AURKA        1
CEBPA        1
CCND3        1
Name: gene, Length: 312, dtype: int64

In [33]:
merged_1["1plusxon17_score"] = 1 + merged_1[CATEGORICAL_COLUMNS].sum(axis=1)/17
merged_1

Unnamed: 0,submitter_id,point_mutation,sift_pred,sift4g_pred,lrt_pred,mutationtaster_pred,mutationassessor_pred,fathmm_pred,provean_pred,metasvm_pred,...,primateai_pred,deogen2_pred,bayesdel_addaf_pred,bayesdel_noaf_pred,clinpred_pred,list_s2_pred,fathmm_mkl_coding_pred,fathmm_xf_coding_pred,gene,1plusxon17_score
0,TCGA-2E-A9G8,FBXW7 R505G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,FBXW7,1.000000
1,TCGA-2E-A9G8,TP53 E286_E287del,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TP53,1.000000
2,TCGA-A5-A1OH,AXL S447S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,AXL,1.000000
3,TCGA-A5-A1OH,BRD4 X1340_splice,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BRD4,1.000000
4,TCGA-A5-A1OH,HRAS E31K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,HRAS,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16826,TCGA-XF-AAN7,ZNF217 L231V,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,ZNF217,1.705882
16827,TCGA-XF-AAN7,ZNF217 L231V,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,ZNF217,1.705882
16828,TCGA-XF-AAN7,ZNF217 L231V,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,ZNF217,1.705882
16829,TCGA-XF-AAN7,ZNF217 L231V,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,ZNF217,1.705882


### Output: Xon17 annotation

In [35]:
patient_gene_matrix_xon17 = merged_1.pivot_table(index="submitter_id", columns="gene", values="1plusxon17_score", aggfunc="max")
patient_gene_matrix_xon17.fillna(0, inplace=True)
patient_gene_matrix_xon17

gene,ABL1,ACVR1B,AKT1,AKT2,AKT3,ALK,ALOX12B,APC,AR,ARAF,...,TYRO3,VEGFA,VHL,WHSC1,WHSC1L1,WT1,XPO1,XRCC2,ZNF217,ZNF703
submitter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-05-4384,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
TCGA-05-4390,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
TCGA-05-4398,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
TCGA-05-4402,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
TCGA-05-4427,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-XF-AAN5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.000000,0.0
TCGA-XF-AAN7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.705882,0.0
TCGA-XX-A899,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
TCGA-Z7-A8R5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0


In [37]:
for g in GENES_324:
    if g not in patient_gene_matrix_xon17.columns:
        patient_gene_matrix_xon17[g] = 0
patient_gene_matrix_xon17.shape

(596, 324)

In [39]:
patient_gene_matrix_xon17[GENES_324].reset_index().to_csv("../data/processed/tcga_anno_features_xon17.csv", index=False)

In [30]:
patient_gene_matrix_xon17 = pd.read_csv("../data/processed/tcga_anno_features_xon17.csv", index_col = 0)
patient_gene_matrix_xon17

Unnamed: 0_level_0,ABL1,ACVR1B,AKT1,AKT2,AKT3,ALK,ALOX12B,APC,AR,ARAF,...,U2AF1,VEGFA,VHL,WHSC1,WHSC1L1,WT1,XPO1,XRCC2,ZNF217,ZNF703
submitter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-05-4384,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
TCGA-05-4390,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
TCGA-05-4398,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
TCGA-05-4402,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
TCGA-05-4427,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-XF-AAN5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.000000,0.0
TCGA-XF-AAN7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.705882,0.0
TCGA-XX-A899,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
TCGA-Z7-A8R5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0


### Annovar annotation

In [21]:
annotated_df = pd.DataFrame(columns=['input', 'sift_pred', 'sift4g_pred', 'lrt_pred', 'mutationtaster_pred',
       'mutationassessor_pred', 'fathmm_pred', 'provean_pred', 'metasvm_pred',
       'm_cap_pred', 'primateai_pred', 'deogen2_pred', 'bayesdel_addaf_pred',
       'bayesdel_noaf_pred', 'clinpred_pred', 'list_s2_pred',
       'fathmm_mkl_coding_pred', 'fathmm_xf_coding_pred', 'gene'])
for submitter_id in tcga_mutations_filtered.submitter_id.unique():
    filtered_df = tcga_mutations_filtered[
        tcga_mutations_filtered.submitter_id == submitter_id
    ]
    point_mutations = list(filtered_df.point_mutation.values)
    _, annot_df = construct_anno_features_xon17(submitter_id, point_mutations, False)
    annotated_df = pd.concat([annotated_df, annot_df])

ERROR:root:Encountered error while processing mutation TP53 E286_E287del - No columns to parse from file
ERROR:root:Encountered error while processing mutation BRD4 X1340_splice - No columns to parse from file
ERROR:root:Encountered error while processing mutation ARID1A Y551Lfs*72 - No columns to parse from file
ERROR:root:Encountered error while processing mutation ATR X20_splice - No columns to parse from file
ERROR:root:Encountered error while processing mutation BCORL1 G1178Efs*44 - No columns to parse from file
ERROR:root:Encountered error while processing mutation CHEK1 T226Hfs*14 - No columns to parse from file
ERROR:root:Encountered error while processing mutation FAM46C R286R - No columns to parse from file
ERROR:root:Encountered error while processing mutation JAK1 I143Dfs*9 - No columns to parse from file
ERROR:root:Encountered error while processing mutation MRE11A R366Q - No columns to parse from file
ERROR:root:Encountered error while processing mutation SMAD4 X263_splic

ERROR:root:Encountered error while processing mutation MRE11A E466D - No columns to parse from file
ERROR:root:Encountered error while processing mutation MRE11A R388Q - No columns to parse from file
ERROR:root:Encountered error while processing mutation MRE11A D131N - No columns to parse from file
ERROR:root:Encountered error while processing mutation PARK2 R455R - No columns to parse from file
ERROR:root:Encountered error while processing mutation PARK2 E79* - No columns to parse from file
ERROR:root:Encountered error while processing mutation PDGFRA X854_splice - No columns to parse from file
ERROR:root:Encountered error while processing mutation PIK3R1 X373_splice - No columns to parse from file
ERROR:root:Encountered error while processing mutation RPTOR X169_splice - No columns to parse from file
ERROR:root:Encountered error while processing mutation TSC1 X344_splice - No columns to parse from file
ERROR:root:Encountered error while processing mutation WHSC1L1 I503T - No columns 

ERROR:root:Encountered error while processing mutation TP53 P60Qfs*63 - No columns to parse from file
ERROR:root:Encountered error while processing mutation APC T899Ifs*12 - No columns to parse from file
ERROR:root:Encountered error while processing mutation APC T1556Nfs*3 - No columns to parse from file
ERROR:root:Encountered error while processing mutation ACVR1B X421_splice - No columns to parse from file
ERROR:root:Encountered error while processing mutation BCOR L487Hfs*13 - No columns to parse from file
ERROR:root:Encountered error while processing mutation APC M1383Ifs*3 - No columns to parse from file
ERROR:root:Encountered error while processing mutation TP53 R290Kfs*53 - No columns to parse from file
ERROR:root:Encountered error while processing mutation APC H1490Nfs*25 - No columns to parse from file
ERROR:root:Encountered error while processing mutation TP53 X224_splice - No columns to parse from file
ERROR:root:Encountered error while processing mutation APC D849Ifs*12 - N

ERROR:root:Encountered error while processing mutation WHSC1L1 S709T - No columns to parse from file
ERROR:root:Encountered error while processing mutation CDH1 X522_splice - No columns to parse from file
ERROR:root:Encountered error while processing mutation ERBB2 L755S - No columns to parse from file
ERROR:root:Encountered error while processing mutation CDH1 D764Gfs*5 - No columns to parse from file
ERROR:root:Encountered error while processing mutation GATA3 R329Kfs*21 - No columns to parse from file
ERROR:root:Encountered error while processing mutation GATA3 S410Qfs*97 - No columns to parse from file
ERROR:root:Encountered error while processing mutation MAP3K1 S1077Qfs*5 - No columns to parse from file
ERROR:root:Encountered error while processing mutation CDH1 N431Qfs*14 - No columns to parse from file
ERROR:root:Encountered error while processing mutation MAP3K1 L380Kfs*4 - No columns to parse from file
ERROR:root:Encountered error while processing mutation GATA3 R330Efs*23 - 

ERROR:root:Encountered error while processing mutation TP53 C238Lfs*9 - No columns to parse from file
ERROR:root:Encountered error while processing mutation NOTCH1 S647Pfs*123 - No columns to parse from file
ERROR:root:Encountered error while processing mutation CDKN2A S56Afs*90 - No columns to parse from file
ERROR:root:Encountered error while processing mutation NF2 X525_splice - No columns to parse from file
ERROR:root:Encountered error while processing mutation TGFBR2 X466_splice - No columns to parse from file
ERROR:root:Encountered error while processing mutation CASP8 X137_splice - No columns to parse from file
ERROR:root:Encountered error while processing mutation RB1 L326Rfs*9 - No columns to parse from file
ERROR:root:Encountered error while processing mutation TP53 V272M - No columns to parse from file
ERROR:root:Encountered error while processing mutation TP53 X224_splice - No columns to parse from file
ERROR:root:Encountered error while processing mutation TP53 R290Sfs*56 

ERROR:root:Encountered error while processing mutation MRE11A Y111S - No columns to parse from file
ERROR:root:Encountered error while processing mutation P2RY8 R51Afs*12 - No columns to parse from file
ERROR:root:Encountered error while processing mutation RBM10 X387_splice - No columns to parse from file
ERROR:root:Encountered error while processing mutation STK11 D53Tfs*11 - No columns to parse from file
ERROR:root:Encountered error while processing mutation SMARCA4 X1183_splice - No columns to parse from file
ERROR:root:Encountered error while processing mutation STK11 Q220Pfs*46 - No columns to parse from file
ERROR:root:Encountered error while processing mutation MSH6 K854del - No columns to parse from file
ERROR:root:Encountered error while processing mutation TP53 X126_splice - No columns to parse from file
ERROR:root:Encountered error while processing mutation PPP2R2A X153_splice - No columns to parse from file
ERROR:root:Encountered error while processing mutation WHSC1L1 E31

ERROR:root:Encountered error while processing mutation BCORL1 P1681Qfs*20 - No columns to parse from file
ERROR:root:Encountered error while processing mutation CREBBP N390Tfs*3 - No columns to parse from file
ERROR:root:Encountered error while processing mutation INPP4B R818Efs*4 - No columns to parse from file
ERROR:root:Encountered error while processing mutation JAK1 R1103Sfs*5 - No columns to parse from file
ERROR:root:Encountered error while processing mutation JAK1 K860Nfs*16 - No columns to parse from file
ERROR:root:Encountered error while processing mutation JAK1 K616del - No columns to parse from file
ERROR:root:Encountered error while processing mutation JAK1 N339Ifs*3 - No columns to parse from file
ERROR:root:Encountered error while processing mutation MSH6 F1104Lfs*11 - No columns to parse from file
ERROR:root:Encountered error while processing mutation PALB2 M296* - No columns to parse from file
ERROR:root:Encountered error while processing mutation WHSC1 V264I - No col

ERROR:root:Encountered error while processing mutation ATM T1399Pfs*3 - No columns to parse from file
ERROR:root:Encountered error while processing mutation ETV4 R134Dfs*30 - No columns to parse from file
ERROR:root:Encountered error while processing mutation EZR P119Lfs*34 - No columns to parse from file
ERROR:root:Encountered error while processing mutation FANCL L191Wfs*4 - No columns to parse from file
ERROR:root:Encountered error while processing mutation IKBKE R49Afs*5 - No columns to parse from file
ERROR:root:Encountered error while processing mutation KDM5A D759Ifs*2 - No columns to parse from file
ERROR:root:Encountered error while processing mutation NBN F744Lfs*7 - No columns to parse from file
ERROR:root:Encountered error while processing mutation PBRM1 I709Ffs*5 - No columns to parse from file
ERROR:root:Encountered error while processing mutation SGK1 F372Lfs*5 - No columns to parse from file
ERROR:root:Encountered error while processing mutation TSC2 K34Nfs*12 - No colu

ERROR:root:Encountered error while processing mutation PIK3C2B R287Afs*92 - No columns to parse from file
ERROR:root:Encountered error while processing mutation QKI K134Rfs*14 - No columns to parse from file
ERROR:root:Encountered error while processing mutation RNF43 G659Vfs*41 - No columns to parse from file
ERROR:root:Encountered error while processing mutation ROS1 N1499Tfs*5 - No columns to parse from file
ERROR:root:Encountered error while processing mutation TET2 V1006Gfs*3 - No columns to parse from file
ERROR:root:Encountered error while processing mutation ZNF217 K525Nfs*23 - No columns to parse from file
ERROR:root:Encountered error while processing mutation BCOR P1621Qfs*53 - No columns to parse from file
ERROR:root:Encountered error while processing mutation NBN R466Gfs*18 - No columns to parse from file
ERROR:root:Encountered error while processing mutation PTCH1 N97Tfs*20 - No columns to parse from file
ERROR:root:Encountered error while processing mutation RNF43 E278del

ERROR:root:Encountered error while processing mutation KDM6A A756Qfs*18 - No columns to parse from file
ERROR:root:Encountered error while processing mutation STAG2 X274_splice - No columns to parse from file
ERROR:root:Encountered error while processing mutation PPARG T20Nfs*14 - No columns to parse from file
ERROR:root:Encountered error while processing mutation CDKN1A E108Gfs*21 - No columns to parse from file
ERROR:root:Encountered error while processing mutation EP300 E1334del - No columns to parse from file
ERROR:root:Encountered error while processing mutation WHSC1 L1184L - No columns to parse from file
ERROR:root:Encountered error while processing mutation STAG2 E1187_D1192del - No columns to parse from file
ERROR:root:Encountered error while processing mutation TSC1 X36_splice - No columns to parse from file
ERROR:root:Encountered error while processing mutation CDKN1A F51_D52ins* - No columns to parse from file
ERROR:root:Encountered error while processing mutation CDKN1A G4

In [22]:
annotated_df.head()

Unnamed: 0,input,sift_pred,sift4g_pred,lrt_pred,mutationtaster_pred,mutationassessor_pred,fathmm_pred,provean_pred,metasvm_pred,m_cap_pred,primateai_pred,deogen2_pred,bayesdel_addaf_pred,bayesdel_noaf_pred,clinpred_pred,list_s2_pred,fathmm_mkl_coding_pred,fathmm_xf_coding_pred,gene
0,FBXW7 R505G,1,1,1,1,1,0,1,0,1,1,0,1,1,1,1,1,1,FBXW7
0,HRAS E31K,0,0,1,1,0,0,1,0,1,1,1,1,1,1,0,1,1,HRAS
1,PPP2R1A S256F,1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1,1,PPP2R1A
2,SPOP L282V,1,1,1,1,0,0,1,0,1,1,1,1,1,1,1,1,1,SPOP
0,ERBB3 F219V,0,0,1,1,1,0,1,1,1,0,1,1,1,1,1,1,1,ERBB3


In [23]:
annotated_df.shape

(4532, 19)

In [24]:
# annotated_df.to_csv("../data/processed/anno_features_per_mutation_tcga.csv")

#### Clinvar annotations

In [43]:
# Also get Clinvar annotations for these mutations
annotated_df = pd.read_csv("../data/processed/anno_features_per_mutation_tcga.csv", index_col = 0)
annotated_df

Unnamed: 0,input,sift_pred,sift4g_pred,lrt_pred,mutationtaster_pred,mutationassessor_pred,fathmm_pred,provean_pred,metasvm_pred,m_cap_pred,primateai_pred,deogen2_pred,bayesdel_addaf_pred,bayesdel_noaf_pred,clinpred_pred,list_s2_pred,fathmm_mkl_coding_pred,fathmm_xf_coding_pred,gene
0,FBXW7 R505G,1,1,1,1,1,0,1,0,1,1,0,1,1,1,1,1,1,FBXW7
0,HRAS E31K,0,0,1,1,0,0,1,0,1,1,1,1,1,1,0,1,1,HRAS
1,PPP2R1A S256F,1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1,1,PPP2R1A
2,SPOP L282V,1,1,1,1,0,0,1,0,1,1,1,1,1,1,1,1,1,SPOP
0,ERBB3 F219V,0,0,1,1,1,0,1,1,1,0,1,1,1,1,1,1,1,ERBB3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2,KRAS G12D,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,KRAS
3,NOTCH1 V2119M,1,0,0,1,0,1,0,1,1,0,0,0,0,1,1,1,1,NOTCH1
4,RB1 S834*,0,0,1,1,0,0,0,0,0,0,0,1,1,0,0,1,0,RB1
5,TP53 A161T,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,TP53


In [44]:
ANNOTATION_SCRIPT_PATH_CLINVAR = "../script/goAAtoGv2_clinvar.sh"

In [49]:
results = []
for mutation in list(annotated_df["input"]):
    try:
        # Run annotation script within a temp file and extract features as DataFrame
        with tempfile.TemporaryDirectory() as tmpdirname:
            input_file_path = tmpdirname + "anno_input.txt"
            with open(input_file_path, "w+") as input_file:
                mutation_cleaned = [part for part in mutation.split(" ") if part]
                input_file.write(":p.".join(mutation_cleaned))
                input_file.write("\n")

            # Execute script
            cmd = "bash {0} {1}".format(ANNOTATION_SCRIPT_PATH_CLINVAR, input_file_path)

            logging.info(f"Executing command {cmd}")
            subprocess.call(cmd, shell=True, executable="/bin/bash")
            out_file_path = f"{input_file_path}.annot.hg38_finalannot.txt"
            res = pd.read_table(out_file_path)
    # Some inputs lead to errors, such as "PTEN loss" - ignore and continue processing
    except Exception as e:
        logging.error(
            f"Encountered error while processing mutation {mutation} - {e}"
        )
        continue
    res["input"] = mutation
    res = res[~res.duplicated()]
    results.append(res)
clinvar_annot_df = pd.concat(results)
clinvar_annot_df.set_index(["input"], inplace=True)
clinvar_annot_df.drop(columns=["Otherinfo1"], inplace=True)

In [50]:
clinvar_annot_df

Unnamed: 0_level_0,SIFT_score,SIFT_converted_rankscore,SIFT_pred,SIFT4G_score,SIFT4G_converted_rankscore,SIFT4G_pred,LRT_score,LRT_converted_rankscore,LRT_pred,MutationTaster_score,...,GERP++_RS_rankscore,phastCons100way_vertebrate,phastCons100way_vertebrate_rankscore,phastCons30way_mammalian,phastCons30way_mammalian_rankscore,CLNALLELEID,CLNDN,CLNDISDB,CLNREVSTAT,CLNSIG
input,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
FBXW7 R505G,0.001,0.913,D,0.015,0.616,D,0.0,0.629,D,1,...,0.631,1.0,0.716,1.0,0.863,363302,B-cell_chronic_lymphocytic_leukemia|Transition...,"Human_Phenotype_Ontology:HP:0005550,Human_Phen...",no_assertion_criteria_provided,Likely_pathogenic
HRAS E31K,0.109,0.311,T,0.345,0.177,T,0.0,0.843,D,1,...,0.338,1.0,0.716,0.826,0.344,.,.,.,.,.
PPP2R1A S256F,0.0,0.913,D,0.0,0.928,D,0.001,0.408,D,1.0,...,0.54,1.0,0.716,0.998,0.659,.,.,.,.,.
SPOP L282V,0.001,0.785,D,0.041,0.505,D,0.0,0.843,D,1,...,0.475,1.0,0.716,1.0,0.863,.,.,.,.,.
ERBB3 F219V,0.071,0.352,T,0.116,0.372,T,0.0,0.629,D,1,...,0.883,0.999,0.427,1.0,0.863,.,.,.,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
KRAS G12D,0.01,0.565,D,0.0,0.928,D,0.0,0.843,D,1,...,0.88,1.0,0.716,1.0,0.863,27621,Vascular_Tumors_Including_Pyogenic_Granuloma|A...,".|Human_Phenotype_Ontology:HP:0001914,Human_Ph...","criteria_provided,_multiple_submitters,_no_con...",Pathogenic
NOTCH1 V2119M,0.047,0.403,D,0.062,0.453,T,0.0,0.629,N,1.0,...,0.548,1.0,0.716,0.998,0.659,.,.,.,.,.
RB1 S834*,.,.,.,.,.,.,0.0,0.629,D,1,...,0.894,1.0,0.716,1.0,0.863,420582,Retinoblastoma|Hereditary_cancer-predisposing_...,"Human_Phenotype_Ontology:HP:0009919,MONDO:MOND...","criteria_provided,_multiple_submitters,_no_con...",Pathogenic
TP53 A161T,0.006,0.913,D,0.013,0.631,D,0.0,0.843,D,1.0,...,0.569,0.999,0.427,0.99,0.524,171616,Li-Fraumeni_syndrome_1|Malignant_tumor_of_pros...,"Gene:553989,MONDO:MONDO:0007903,MedGen:C183539...","criteria_provided,_conflicting_interpretations",Conflicting_interpretations_of_pathogenicity


In [51]:
clinvar_annot_df.CLNSIG.value_counts()

.                                               3681
Uncertain_significance                           475
Pathogenic                                       461
Pathogenic/Likely_pathogenic                     151
Conflicting_interpretations_of_pathogenicity     135
Likely_pathogenic                                130
Pathogenic|drug_response|other                    22
Likely_benign                                     21
not_provided                                      11
Benign                                             8
Benign/Likely_benign                               8
Pathogenic/Likely_pathogenic|other                 2
drug_response                                      2
Pathogenic/Likely_pathogenic|drug_response         1
other                                              1
Name: CLNSIG, dtype: int64

In [61]:
clinvar_annot_df.to_csv("../data/processed/clinvar_anno_features_per_mutation_tcga.csv")

### Variant Annotations from GPD

In [9]:
merged_1 = pd.read_csv("../data/processed/tcga_annovar_gpd_annot_per_patient_per_mutation.csv", index_col = 0)
merged_1

Unnamed: 0_level_0,point_mutation,sift_pred,sift4g_pred,lrt_pred,mutationtaster_pred,mutationassessor_pred,fathmm_pred,provean_pred,metasvm_pred,m_cap_pred,primateai_pred,deogen2_pred,bayesdel_addaf_pred,bayesdel_noaf_pred,clinpred_pred,list_s2_pred,fathmm_mkl_coding_pred,fathmm_xf_coding_pred,gene,1plusxon17_score
submitter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
TCGA-2E-A9G8,FBXW7 R505G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,FBXW7,1.000000
TCGA-2E-A9G8,TP53 E286_E287del,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TP53,1.000000
TCGA-A5-A1OH,AXL S447S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,AXL,1.000000
TCGA-A5-A1OH,BRD4 X1340_splice,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BRD4,1.000000
TCGA-A5-A1OH,HRAS E31K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,HRAS,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-XF-AAN7,ZNF217 L231V,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,ZNF217,1.705882
TCGA-XF-AAN7,ZNF217 L231V,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,ZNF217,1.705882
TCGA-XF-AAN7,ZNF217 L231V,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,ZNF217,1.705882
TCGA-XF-AAN7,ZNF217 L231V,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,ZNF217,1.705882


In [10]:
# Load intermediate GPD files for NPC and PC

In [11]:
npc_mutations = pd.read_csv("../data/processed/tcga_gpd_results/tcga_mutation_npc.tsv", sep="\t")
npc_mutations

Unnamed: 0,Hugo_Symbol,Gene,Chromosome,Start_Position,End_Position,Variant_Classification,Variant_Type,HGVSc,HGVSp,agg_sample_id,mut_freq


In [12]:
npc_mutations["HGVSp"].value_counts(dropna=False)

Series([], Name: HGVSp, dtype: int64)

In [13]:
npc_mutations["Variant_Classification"].value_counts(dropna=False)

Series([], Name: Variant_Classification, dtype: int64)

In [14]:
pc_mutations = pd.read_csv("../data/processed/tcga_gpd_results/tcga_mutation_pc_pos.tsv", sep="\t")
pc_mutations

Unnamed: 0,Chromosome,Start_Position,End_Position,Variant_Classification,Variant_Type,Hugo_Symbol,Gene,HGVSc,HGVSp,prot_start_pos,prot_end_pos,agg_sample_id,mut_freq
0,7,140753336,140753336,Missense,SNP,BRAF,ENSG00000157764,c.1799T>A,p.V600E,600,600,TCGA-A6-2672_TCGA-A6-2686_TCGA-A6-3809_TCGA-A6...,259
1,3,179218303,179218303,Missense,SNP,PIK3CA,ENSG00000121879,c.1633G>A,p.E545K,545,545,TCGA-A5-A1OJ_TCGA-AJ-A2QM_TCGA-AP-A0LN_TCGA-AP...,223
2,3,179234297,179234297,Missense,SNP,PIK3CA,ENSG00000121879,c.3140A>G,p.H1047R,1047,1047,TCGA-AJ-A3BK_TCGA-AJ-A3NG_TCGA-AJ-A8CT_TCGA-AJ...,200
3,3,179218294,179218294,Missense,SNP,PIK3CA,ENSG00000121879,c.1624G>A,p.E542K,542,542,TCGA-4E-A92E_TCGA-A5-A0GM_TCGA-A5-A0GX_TCGA-A5...,150
4,12,25245350,25245350,Missense,SNP,KRAS,ENSG00000133703,c.35G>A,p.G12D,12,12,TCGA-A5-A0GP_TCGA-AP-A0LD_TCGA-AP-A0LS_TCGA-AP...,118
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32208,8,37698543,37698543,Missense,SNP,ZNF703,ENSG00000183779,c.1642T>A,p.Y548N,548,548,TCGA-A7-A2KD,1
32209,8,37698553,37698553,Missense,SNP,ZNF703,ENSG00000183779,c.1652G>T,p.S551I,551,551,TCGA-VQ-A8P2,1
32210,8,37698560,37698560,Missense,SNP,ZNF703,ENSG00000183779,c.1659A>T,p.L553F,553,553,TCGA-E6-A1LX,1
32211,8,37698604,37698604,Frameshift,SNP,ZNF703,ENSG00000183779,c.1703delG,p.G568Dfs*14,568,568,TCGA-21-1070,1


In [15]:
pc_mutations["Variant_Classification"].value_counts(dropna=False)

Missense            26557
Nonsense             3231
Frameshift           2047
InFrameDeletion       321
MultiAAMissense        44
InFrameInsertion       13
Name: Variant_Classification, dtype: int64

In [16]:
# To get PIU vs LU, we used the locations in ptm_pfam_combine.csv which is used in the GPD implementation
ptm_pfam_df = pd.read_csv("/data/ajayago/druid/datasets/ptm_pfam_combine.csv", index_col = 0)
ptm_pfam_df

Unnamed: 0,uniprot_accession,start_position,end_position,center_position,unit_name,gene_name,gene_id,unit_label
1,A0A024RBG1,17,144,80,NUDIX,NUDT4,ENSG00000173598,Domain
2,A0A024RBG1,17,144,80,NUDIX,NUDT4B,ENSG00000173598,Domain
3,A0A024RBG1,17,144,80,NUDIX,NUDT4,ENSG00000177144,Domain
4,A0A024RBG1,17,144,80,NUDIX,NUDT4B,ENSG00000177144,Domain
5,A0A075B6H9,25,119,72,V-set,IGLV4-69,ENSG00000211637,Domain
...,...,...,...,...,...,...,...,...
394261,Q9Y6Z7,196,206,201,acety,COLEC10,ENSG00000184374,PTM
394262,Q9Y6Z7,165,272,218,Lectin_C,COLEC10,ENSG00000184374,Domain
394263,S4R3P1,1,24,12,Humanin,MTRNR2L13,ENSG00000270394,Family
394264,S4R3Y5,1,24,12,Humanin,MTRNR2L11,ENSG00000270188,Family


In [17]:
GPD_unit = []
for idx, row in pc_mutations.iterrows():
    subset_ptm = ptm_pfam_df[ptm_pfam_df.gene_id == row["Gene"]]
    x = "LU"
    for idx, r in subset_ptm.iterrows():
        if (row["prot_start_pos"] >= r["start_position"]) & (row["prot_start_pos"] <= r["end_position"]) \
    | (row["prot_end_pos"] >= r["start_position"]) & (row["prot_end_pos"] <= r["end_position"]):
            x = "PIU"
            break
    GPD_unit.append(x)

pc_mutations["GPD_unit"] = GPD_unit

In [18]:
pc_mutations["GPD_unit"].value_counts()

PIU    21007
LU     11206
Name: GPD_unit, dtype: int64

In [19]:
piu_mutations = set(pc_mutations[pc_mutations["GPD_unit"] == "PIU"]["Hugo_Symbol"] + " " + pc_mutations[pc_mutations["GPD_unit"] == "PIU"]["HGVSp"].apply(lambda x: x.split("p.")[1]))
lu_mutations = set(pc_mutations[pc_mutations["GPD_unit"] == "LU"]["Hugo_Symbol"] + " " + pc_mutations[pc_mutations["GPD_unit"] == "LU"]["HGVSp"].apply(lambda x: x.split("p.")[1]))
len(piu_mutations), len(lu_mutations)

(21007, 11206)

In [21]:
# Map each point mutation to PIU/LU or NCU
GPD_unit_merged1 = []
for idx, row in merged_1.iterrows():
    if row["point_mutation"] in (piu_mutations):
        GPD_unit_merged1.append("PIU")
    elif row["point_mutation"] in (lu_mutations):
        GPD_unit_merged1.append("LU")
    else:
        GPD_unit_merged1.append("NCU")

In [22]:
merged_1["GPD_unit"] = GPD_unit_merged1

In [58]:
merged_1

Unnamed: 0_level_0,point_mutation,sift_pred,sift4g_pred,lrt_pred,mutationtaster_pred,mutationassessor_pred,fathmm_pred,provean_pred,metasvm_pred,m_cap_pred,...,deogen2_pred,bayesdel_addaf_pred,bayesdel_noaf_pred,clinpred_pred,list_s2_pred,fathmm_mkl_coding_pred,fathmm_xf_coding_pred,gene,1plusxon17_score,GPD_unit
submitter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-2E-A9G8,FBXW7 R505G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,FBXW7,1.000000,PIU
TCGA-2E-A9G8,TP53 E286_E287del,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TP53,1.000000,PIU
TCGA-A5-A1OH,AXL S447S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,AXL,1.000000,NCU
TCGA-A5-A1OH,BRD4 X1340_splice,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BRD4,1.000000,NCU
TCGA-A5-A1OH,HRAS E31K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,HRAS,1.000000,PIU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-XF-AAN7,ZNF217 L231V,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,...,1.0,1.0,0.0,1.0,1.0,1.0,0.0,ZNF217,1.705882,LU
TCGA-XF-AAN7,ZNF217 L231V,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,...,1.0,1.0,0.0,1.0,1.0,1.0,0.0,ZNF217,1.705882,LU
TCGA-XF-AAN7,ZNF217 L231V,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,...,1.0,1.0,0.0,1.0,1.0,1.0,0.0,ZNF217,1.705882,LU
TCGA-XF-AAN7,ZNF217 L231V,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,...,1.0,1.0,0.0,1.0,1.0,1.0,0.0,ZNF217,1.705882,LU


### Output: GPD + Annovar annotation

In [25]:
patient_gene_matrix_xon17_piu = merged_1[merged_1.GPD_unit == "PIU"].pivot_table(index="submitter_id", columns="gene", values="1plusxon17_score", aggfunc="max")
patient_gene_matrix_xon17_piu.fillna(0, inplace=True)
for g in genes_324:
    if g not in patient_gene_matrix_xon17_piu.columns:
        patient_gene_matrix_xon17_piu[g] = 0

patient_gene_matrix_xon17_piu

gene,ABL1,ACVR1B,AKT1,AKT2,AKT3,ALK,ALOX12B,APC,AR,ARAF,...,PARK2,PTEN,SDC4,SDHC,SOCS1,SRC,TERC,U2AF1,WHSC1,WHSC1L1
submitter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-05-4384,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
TCGA-05-4390,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
TCGA-05-4398,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
TCGA-05-4427,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
TCGA-05-4432,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-XF-AAN5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
TCGA-XF-AAN7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
TCGA-XX-A899,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
TCGA-Z7-A8R5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
patient_gene_matrix_xon17_lu = merged_1[merged_1.GPD_unit == "LU"].pivot_table(index="submitter_id", columns="gene", values="1plusxon17_score", aggfunc="max")
patient_gene_matrix_xon17_lu.fillna(0, inplace=True)
for g in genes_324:
    if g not in patient_gene_matrix_xon17_lu.columns:
        patient_gene_matrix_xon17_lu[g] = 0
patient_gene_matrix_xon17_lu

gene,ABL1,ACVR1B,AKT2,ALK,ALOX12B,APC,AR,ARID1A,ASXL1,ATM,...,SOCS1,SPOP,SUFU,TERC,TGFBR2,U2AF1,VEGFA,VHL,WHSC1,WHSC1L1
submitter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-05-4384,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
TCGA-05-4390,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
TCGA-05-4398,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
TCGA-05-4427,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
TCGA-05-4432,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-XF-AAN3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
TCGA-XF-AAN5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
TCGA-XF-AAN7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
TCGA-Z7-A8R5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
patient_gene_matrix_xon17_ncu = merged_1[merged_1.GPD_unit == "NCU"].pivot_table(index="submitter_id", columns="gene", values="1plusxon17_score", aggfunc="max")
patient_gene_matrix_xon17_ncu.fillna(0, inplace=True)
for g in genes_324:
    if g not in patient_gene_matrix_xon17_ncu.columns:
        patient_gene_matrix_xon17_ncu[g] = 0
patient_gene_matrix_xon17_ncu

gene,ABL1,ACVR1B,AKT1,AKT2,AKT3,ALK,ALOX12B,APC,AR,ARAF,...,RAD54L,SDHB,SDHC,SOCS1,SOX2,SUFU,TERC,U2AF1,VEGFA,ZNF703
submitter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-05-4384,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
TCGA-05-4398,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
TCGA-05-4402,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
TCGA-05-4427,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
TCGA-05-4432,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-XF-AAN3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
TCGA-XF-AAN5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
TCGA-XF-AAN7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
TCGA-XX-A899,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
patient_gene_matrix_xon17_piu.shape, patient_gene_matrix_xon17_lu.shape, patient_gene_matrix_xon17_ncu.shape

((551, 324), (396, 324), (389, 324))

In [31]:
len(set(patient_gene_matrix_xon17.index) - set(patient_gene_matrix_xon17_piu.index))

45

In [32]:
# Add in missing patient IDs in each matrix
patient_gene_matrix_xon17_piu.reset_index(inplace=True)
patient_gene_matrix_xon17_lu.reset_index(inplace=True)
patient_gene_matrix_xon17_ncu.reset_index(inplace=True)
for t in set(patient_gene_matrix_xon17.index) - set(patient_gene_matrix_xon17_piu.submitter_id):
    patient_gene_matrix_xon17_piu = (patient_gene_matrix_xon17_piu.append({"submitter_id": t}, ignore_index=True))
patient_gene_matrix_xon17_piu.set_index("submitter_id", inplace=True)    
for t in set(patient_gene_matrix_xon17.index) - set(patient_gene_matrix_xon17_lu.submitter_id):
    patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
patient_gene_matrix_xon17_lu.set_index("submitter_id", inplace=True)
for t in set(patient_gene_matrix_xon17.index) - set(patient_gene_matrix_xon17_ncu.submitter_id):
    patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
patient_gene_matrix_xon17_ncu.set_index("submitter_id", inplace=True)  

  patient_gene_matrix_xon17_piu = (patient_gene_matrix_xon17_piu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_piu = (patient_gene_matrix_xon17_piu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_piu = (patient_gene_matrix_xon17_piu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_piu = (patient_gene_matrix_xon17_piu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_piu = (patient_gene_matrix_xon17_piu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_piu = (patient_gene_matrix_xon17_piu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_piu = (patient_gene_matrix_xon17_piu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_piu = (patient_gene_matrix_xon17_piu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_piu = (patient_gene_matrix_xon17_piu.append({"submitter_id": t}, ign

  patient_gene_matrix_xon17_piu = (patient_gene_matrix_xon17_piu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_piu = (patient_gene_matrix_xon17_piu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_piu = (patient_gene_matrix_xon17_piu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_piu = (patient_gene_matrix_xon17_piu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_piu = (patient_gene_matrix_xon17_piu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_piu = (patient_gene_matrix_xon17_piu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_piu = (patient_gene_matrix_xon17_piu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_piu = (patient_gene_matrix_xon17_piu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_piu = (patient_gene_matrix_xon17_piu.append({"submitter_id": t}, ign

  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
 

  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
 

  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
 

  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
 

  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_lu = (patient_gene_matrix_xon17_lu.append({"submitter_id": t}, ignore_index=True))
 

  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ign

  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ign

  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ign

  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ign

  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ign

  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ignore_index=True))
  patient_gene_matrix_xon17_ncu = (patient_gene_matrix_xon17_ncu.append({"submitter_id": t}, ign

In [33]:
patient_gene_matrix_xon17_piu.shape, patient_gene_matrix_xon17_lu.shape, patient_gene_matrix_xon17_ncu.shape

((596, 324), (596, 324), (596, 324))

In [36]:
patient_gene_matrix_xon17_piu.fillna(0, inplace=True)
patient_gene_matrix_xon17_lu.fillna(0, inplace=True)
patient_gene_matrix_xon17_ncu.fillna(0, inplace=True)

In [38]:
patient_gene_matrix_xon17_piu.loc[patient_gene_matrix_xon17.index][genes_324].to_csv("../data/processed/xon17_gpd_annotations/tcga_piu_annotated_df.csv")

In [39]:
patient_gene_matrix_xon17_lu.loc[patient_gene_matrix_xon17.index][genes_324].to_csv("../data/processed/xon17_gpd_annotations/tcga_lu_annotated_df.csv")

In [40]:
patient_gene_matrix_xon17_ncu.loc[patient_gene_matrix_xon17.index][genes_324].to_csv("../data/processed/xon17_gpd_annotations/tcga_ncu_annotated_df.csv")

In [63]:
merged_1.to_csv("../data/processed/tcga_annovar_gpd_annot_per_patient_per_mutation.csv",)

In [64]:
merged_1

Unnamed: 0_level_0,point_mutation,sift_pred,sift4g_pred,lrt_pred,mutationtaster_pred,mutationassessor_pred,fathmm_pred,provean_pred,metasvm_pred,m_cap_pred,...,deogen2_pred,bayesdel_addaf_pred,bayesdel_noaf_pred,clinpred_pred,list_s2_pred,fathmm_mkl_coding_pred,fathmm_xf_coding_pred,gene,1plusxon17_score,GPD_unit
submitter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-2E-A9G8,FBXW7 R505G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,FBXW7,1.000000,PIU
TCGA-2E-A9G8,TP53 E286_E287del,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TP53,1.000000,PIU
TCGA-A5-A1OH,AXL S447S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,AXL,1.000000,NCU
TCGA-A5-A1OH,BRD4 X1340_splice,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BRD4,1.000000,NCU
TCGA-A5-A1OH,HRAS E31K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,HRAS,1.000000,PIU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-XF-AAN7,ZNF217 L231V,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,...,1.0,1.0,0.0,1.0,1.0,1.0,0.0,ZNF217,1.705882,LU
TCGA-XF-AAN7,ZNF217 L231V,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,...,1.0,1.0,0.0,1.0,1.0,1.0,0.0,ZNF217,1.705882,LU
TCGA-XF-AAN7,ZNF217 L231V,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,...,1.0,1.0,0.0,1.0,1.0,1.0,0.0,ZNF217,1.705882,LU
TCGA-XF-AAN7,ZNF217 L231V,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,...,1.0,1.0,0.0,1.0,1.0,1.0,0.0,ZNF217,1.705882,LU


#### Combine Clinvar annotations, GPD and Annovar annotations

In [59]:
merged_clinvar_df = pd.merge(merged_1, clinvar_annot_df.reset_index(), left_on="point_mutation", right_on="input").groupby("input").aggregate(max)
merged_clinvar_df

Unnamed: 0_level_0,point_mutation,sift_pred,sift4g_pred,lrt_pred,mutationtaster_pred,mutationassessor_pred,fathmm_pred,provean_pred,metasvm_pred,m_cap_pred,...,GERP++_RS_rankscore,phastCons100way_vertebrate,phastCons100way_vertebrate_rankscore,phastCons30way_mammalian,phastCons30way_mammalian_rankscore,CLNALLELEID,CLNDN,CLNDISDB,CLNREVSTAT,CLNSIG
input,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ABL1 E189G,ABL1 E189G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.978,1.0,0.716,0.996,0.595,.,.,.,.,.
ABL1 E197*,ABL1 E197*,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.978,1.0,0.716,0.991,0.532,.,.,.,.,.
ABL1 H246Y,ABL1 H246Y,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.978,1.0,0.716,1.0,0.863,.,.,.,.,.
ABL1 Q447P,ABL1 Q447P,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.706,1.0,0.716,0.995,0.577,.,.,.,.,.
ABL1 R533S,ABL1 R533S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.049,0.819,0.3,0.948,0.416,.,.,.,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZNF217 V45A,ZNF217 V45A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.231,0.013,0.188,0.49,0.271,.,.,.,.,.
ZNF703 L63F,ZNF703 L63F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.433,1.0,0.716,0.995,0.577,.,.,.,.,.
ZNF703 Q44*,ZNF703 Q44*,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.441,1.0,0.716,1.0,0.863,.,.,.,.,.
ZNF703 S583L,ZNF703 S583L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.395,1.0,0.716,0.978,0.47,.,.,.,.,.


In [60]:
merged_clinvar_df.groupby(["GPD_unit", "CLNSIG"]).aggregate("count")

Unnamed: 0_level_0,Unnamed: 1_level_0,point_mutation,sift_pred,sift4g_pred,lrt_pred,mutationtaster_pred,mutationassessor_pred,fathmm_pred,provean_pred,metasvm_pred,m_cap_pred,...,GERP++_RS,GERP++_RS_rankscore,phastCons100way_vertebrate,phastCons100way_vertebrate_rankscore,phastCons30way_mammalian,phastCons30way_mammalian_rankscore,CLNALLELEID,CLNDN,CLNDISDB,CLNREVSTAT
GPD_unit,CLNSIG,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
LU,.,1008,1008,1008,1008,1008,1008,1008,1008,1008,1008,...,1008,1008,1008,1008,1008,1008,1008,1008,1008,1008
LU,Benign,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
LU,Benign/Likely_benign,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
LU,Conflicting_interpretations_of_pathogenicity,26,26,26,26,26,26,26,26,26,26,...,26,26,26,26,26,26,26,26,26,26
LU,Likely_benign,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
LU,Likely_pathogenic,18,18,18,18,18,18,18,18,18,18,...,18,18,18,18,18,18,18,18,18,18
LU,Pathogenic,63,63,63,63,63,63,63,63,63,63,...,63,63,63,63,63,63,63,63,63,63
LU,Pathogenic/Likely_pathogenic,12,12,12,12,12,12,12,12,12,12,...,12,12,12,12,12,12,12,12,12,12
LU,Uncertain_significance,131,131,131,131,131,131,131,131,131,131,...,131,131,131,131,131,131,131,131,131,131
LU,not_provided,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
