In [None]:
import sys

sys.path.append("../../vae_zinb_reprn/")
sys.path.append("../src/")

In [None]:
import numpy as np
import pandas as pd

import datetime
import logging
import os
import time
from functools import cached_property

from sklearn.metrics import average_precision_score, ndcg_score, roc_auc_score

In [None]:
from data import (
    construct_anno_features,
    construct_raw_mutation_features,
    construct_raw_cnv_features,
    canonicalize_mutations,
    construct_anno_features_xon17
)

In [None]:
tcga_mutations_df = pd.read_csv("../data/raw/tcga_point_mutations_incl_alive.csv")
tcga_mutations_df

In [None]:
# Load the TCGA drug-response table and rename its columns in one chained step:
# the textual response becomes "response_description" and the categorical
# "response_cat" takes over the name "response".
tcga_response = pd.read_csv(
    "../data/processed/TCGA_drug_response_010222.csv"
).rename(
    columns={
        "patient.arr": "submitter_id",
        "drug": "drug_name",
        "response": "response_description",
        "response_cat": "response",
    }
)
tcga_response

In [None]:
# Keep only the mutations of patients that also appear in the drug-response table.
tcga_mutations_df = tcga_mutations_df.loc[
    tcga_mutations_df["submitter_id"].isin(tcga_response["submitter_id"].unique())
].copy()
tcga_mutations_df.shape

## Variant Annotations with ANNOVAR/TransVar

In [None]:
# gene2ind.txt has no header row; the gene symbols are read from its first column
# (label 0) and kept as a plain Python list.
genes_324 = (
    pd.read_csv("../../data/druid_1.3_data/gene2ind.txt", header=None)
    .loc[:, 0]
    .to_list()
)

In [None]:
# Extract point mutations for the 324 F1 genes.
#
# A mutation is kept when its gene symbol -- the text before the first space,
# which is also how the gene is recovered from `point_mutation` later in this
# notebook -- is one of `genes_324`. The previous substring test
# (`f"{gene} " in point_mutation`) additionally matched any longer symbol that
# merely ends with a panel gene's name, and scanned every gene for every mutation.
gene_rank = {gene: rank for rank, gene in enumerate(genes_324)}

tcga_mutations = []
for submitter_id, patient_df in tcga_mutations_df.groupby("submitter_id", sort=False):
    point_mutations = canonicalize_mutations(patient_df["input"].values)

    patient_hits = []
    for point_mutation in point_mutations:
        # Require "<GENE> <something>": a symbol followed by a space, as before.
        gene, separator, _ = point_mutation.lstrip().partition(" ")
        if separator and gene in gene_rank:
            patient_hits.append((gene_rank[gene], point_mutation.strip()))

    # Stable sort by the gene's position in `genes_324` keeps the row order the
    # original gene-major loop produced.
    patient_hits.sort(key=lambda hit: hit[0])
    tcga_mutations.extend((submitter_id, mutation) for _, mutation in patient_hits)

tcga_mutations_filtered = pd.DataFrame(
    tcga_mutations, columns=["submitter_id", "point_mutation"]
)
tcga_mutations_filtered

In [None]:
# import dask

# from tqdm import tqdm
# from dask.distributed import Client


# client = Client()
# client.cluster.scale(30)

In [None]:
# %load_ext jupyterlab_notify

In [None]:
# %%notify

# futures = []
# for submitter_id in tcga_mutations_filtered.submitter_id.unique():
#     filtered_df = tcga_mutations_filtered[
#         tcga_mutations_filtered.submitter_id == submitter_id
#     ]
#     point_mutations = list(filtered_df.point_mutation.values)
#     future = client.submit(construct_anno_features_xon17, submitter_id, point_mutations, False)
#     futures.append(future)

# anno_features_combined = client.gather(futures, errors="skip")
# client.shutdown()

In [None]:
# variant annotation on these point mutations
import sys
sys.path.append("../src/")
from data import (construct_anno_features, 
                  construct_raw_cnv_features, 
                  construct_raw_mutation_features, 
                  canonicalize_mutations,
                  ALIAS_TO_CANONICAL_NAME_MAP, 
                  _is_valid_point_mutations,
                  get_annotation_features,
                  preprocess_annotation_features,
                  MICEData
                 )

In [None]:
import logging
import re
import subprocess
import tempfile
import itertools
import json

In [None]:
# Columns selected from the output of `get_annotation_features` for mutations that
# are missing from the pre-computed annotation table (see
# construct_anno_features_xon17_gpd). Names use the annotation tool's original
# casing; the lower-case/underscore names in CATEGORICAL_COLUMNS are what the rest
# of the notebook uses after `preprocess_annotation_features` has run.
REQUIRED_ANNOTATION_COLUMNS = [
    "SIFT_score",
    "SIFT_converted_rankscore",
    "SIFT_pred",
    "SIFT4G_score",
    "SIFT4G_converted_rankscore",
    "SIFT4G_pred",
    "LRT_score",
    "LRT_converted_rankscore",
    "LRT_pred",
    "MutationTaster_score",
    "MutationTaster_converted_rankscore",
    "MutationTaster_pred",
    "MutationAssessor_score",
    "MutationAssessor_rankscore",
    "MutationAssessor_pred",
    "FATHMM_score",
    "FATHMM_converted_rankscore",
    "FATHMM_pred",
    "PROVEAN_score",
    "PROVEAN_converted_rankscore",
    "PROVEAN_pred",
    "MetaSVM_pred",
    "M-CAP_score",
    "M-CAP_rankscore",
    "M-CAP_pred",
    "MVP_score",
    "MVP_rankscore",
    "MPC_score",
    "MPC_rankscore",
    "PrimateAI_score",
    "PrimateAI_rankscore",
    "PrimateAI_pred",
    "DEOGEN2_score",
    "DEOGEN2_rankscore",
    "DEOGEN2_pred",
    "BayesDel_addAF_score",
    "BayesDel_addAF_pred",
    "BayesDel_noAF_score",
    "BayesDel_noAF_rankscore",
    "BayesDel_noAF_pred",
    "ClinPred_score",
    "ClinPred_rankscore",
    "ClinPred_pred",
    "LIST-S2_score",
    "LIST-S2_rankscore",
    "LIST-S2_pred",
    "DANN_score",
    "DANN_rankscore",
    "fathmm-MKL_coding_score",
    "fathmm-MKL_coding_rankscore",
    "fathmm-MKL_coding_pred",
    "fathmm-XF_coding_score",
    "fathmm-XF_coding_rankscore",
    "fathmm-XF_coding_pred",
    "Eigen-raw_coding",
    "Eigen-raw_coding_rankscore",
    "Eigen-PC-raw_coding",
    "Eigen-PC-raw_coding_rankscore",
]
# The 17 categorical predictor columns. Each is later converted to a 0/1
# deleteriousness flag, and their row-wise sum divided by 17 gives the
# "1plusxon17_score" computed further down in this notebook.
CATEGORICAL_COLUMNS = [
    "sift_pred",
    "sift4g_pred",
    "lrt_pred",
    "mutationtaster_pred",
    "mutationassessor_pred",
    "fathmm_pred",
    "provean_pred",
    "metasvm_pred",
    "m_cap_pred",
    "primateai_pred",
    "deogen2_pred",
    "bayesdel_addaf_pred",
    "bayesdel_noaf_pred",
    "clinpred_pred",
    "list_s2_pred",
    "fathmm_mkl_coding_pred",
    "fathmm_xf_coding_pred",
]

# The thresholds used in PREDICTOR_LAMBDA_MAP are taken from the corresponding
# technique's published paper/web page
#
# Maps each categorical prediction column to a pair
# (numeric score column, function turning one score into a category label).
# It is used to fill in a missing categorical prediction from the (imputed)
# numeric score. A function returning None leaves the prediction missing.
#
# NOTE(review): the FATHMM (< 1.5) and PROVEAN (<= 2.282) cutoffs are positive here,
# whereas the cutoffs usually quoted for these tools are negative -- presumably the
# scores are transformed upstream; confirm against preprocess_annotation_features.
PREDICTOR_LAMBDA_MAP = {
    "sift_pred": ("sift_score", lambda v: "D" if v <= 0.05 else "T"),
    "sift4g_pred": ("sift4g_score", lambda v: "D" if v <= 0.05 else "T"),
    "lrt_pred": ("lrt_score", lambda v: "D" if v <= 0.001 else "U"),
    "mutationtaster_pred": (
        "mutationtaster_score",
        lambda v: None,
    ),  # Threshold is not available and couldn't be derived from available values as well
    "mutationassessor_pred": (
        "mutationassessor_score",
        lambda v: "H" if v >= 3.5 else ("M" if v >= 1.94 else "L"),
    ),
    "fathmm_pred": ("fathmm_score", lambda v: "D" if v < 1.5 else "T"),
    "provean_pred": ("provean_score", lambda v: "D" if v <= 2.282 else "N"),
    "metasvm_pred": (
        "metasvm_pred",
        lambda v: None,
    ),  # No corresponding numeric score available for this method
    "m_cap_pred": ("m_cap_score", lambda v: "D" if v >= 0.025 else "T"),
    "primateai_pred": ("primateai_score", lambda v: "D" if v >= 0.803 else "T"),
    "deogen2_pred": ("deogen2_score", lambda v: "D" if v >= 0.45 else "T"),
    "bayesdel_addaf_pred": (
        "bayesdel_addaf_score",
        lambda v: "D" if v >= 0.0692 else "T",
    ),
    "bayesdel_noaf_pred": (
        "bayesdel_noaf_score",
        lambda v: "D" if v >= -0.0570 else "T",
    ),
    "clinpred_pred": ("clinpred_score", lambda v: "D" if v >= 0.5 else "T"),
    "list_s2_pred": ("list_s2_score", lambda v: "D" if v >= 0.85 else "T"),
    "fathmm_mkl_coding_pred": (
        "fathmm_mkl_coding_score",
        lambda v: "D" if v >= 0.5 else "N",
    ),
    "fathmm_xf_coding_pred": (
        "fathmm_xf_coding_score",
        lambda v: "D" if v >= 0.5 else "N",
    ),
}

# Category labels that are scored as 1 (deleterious) when the categorical
# predictions are binarised; every other value is scored as 0.
DELETERIOUS_VALUES = ["D", "A", "H", "M"]

# Case-insensitive pattern for copy-number events; mutations matching it are not
# sent for point-mutation annotation.
CNV_PATTERN = r"loss|amplification"

# Same file and column as `genes_324` defined earlier in this notebook.
GENES_324 = pd.read_csv("../../data/druid_1.3_data/gene2ind.txt", header=None)[
    0
].tolist()

# NOTE(review): ANNOTATION_SCRIPT_PATH and SPECIAL_CASES are not referenced by any
# code visible in this notebook -- presumably mirrored from src/data; confirm
# before removing.
ANNOTATION_SCRIPT_PATH = "../script/goAAtoGv2.sh"
SPECIAL_CASES = r"rearrangement|truncation|fs|del|ins"

In [None]:
def construct_anno_features_xon17_gpd(patient_id, patient_mutations, agg_features=False):
    """
    Build per-mutation 0/1 deleteriousness flags for one patient's point mutations.

    Each canonicalised mutation is looked up in the pre-computed, imputed
    annotation table (``../data/processed/anno_features_combined_imputed.csv``).
    Mutations that are absent from that table and do not match ``CNV_PATTERN`` are
    annotated on the fly with ``get_annotation_features``: their numeric scores are
    imputed with ``MICEData`` (fitted together with the pre-computed table) and any
    categorical prediction that is still missing is derived from its score via
    ``PREDICTOR_LAMBDA_MAP``. Rows that still contain missing values are dropped.

    No aggregation happens here: the result has one row per annotated mutation.

    TODO: Add support for agg functions (sum, mean, OR, etc) - as of 202209 the
    ``agg_features`` flag only triggers a warning.

    Parameters
    ----------
    patient_id
        Not used by this function; kept for signature compatibility with callers.
    patient_mutations
        The patient's mutation strings, as accepted by
        ``_is_valid_point_mutations`` and ``canonicalize_mutations``.
    agg_features : bool, default False
        If True a warning is logged; the returned data is unaffected.

    Returns
    -------
    pandas.DataFrame or None
        ``None`` if ``_is_valid_point_mutations`` rejects the input or no mutation
        could be annotated. Otherwise a frame with an ``input`` column (the
        mutation string), one 0/1 column per entry of ``CATEGORICAL_COLUMNS``
        (1 when the prediction is in ``DELETERIOUS_VALUES``) and a ``gene`` column
        (the text before the first space of ``input``).
    """
    if agg_features:
        # logging.warn is a deprecated alias of logging.warning.
        logging.warning(
            """
        Received agg_features=True -> As of now, construct_anno_features only supports sum aggregation.
        Please ensure that the agg used in dataset definition is sum - if it is not sum, please pass
        agg_features=False and perform agg in dataset definition
        """
        )

    if not _is_valid_point_mutations(patient_mutations):
        return None

    # NOTE(review): this table is re-read from disk on every call; callers that
    # loop over many patients pay that cost each time.
    anno_features_combined_imputed_df = pd.read_csv(
        "../data/processed/anno_features_combined_imputed.csv"
    )
    logging.info(anno_features_combined_imputed_df.shape)
    anno_features_combined_imputed_df.set_index(["input"], inplace=True)

    canonical_mutations = canonicalize_mutations(patient_mutations)

    # Split mutations into those already annotated and those needing annotation.
    # CNV-like entries that are not in the table are skipped altogether.
    mutations_with_missing_annotations = []
    available_mutations = []
    for mutation in canonical_mutations:
        if mutation in anno_features_combined_imputed_df.index:
            available_mutations.append(mutation)
        elif not re.search(CNV_PATTERN, mutation, re.IGNORECASE):
            mutations_with_missing_annotations.append(mutation)

    if available_mutations:
        patient_anno_features = anno_features_combined_imputed_df.loc[
            available_mutations
        ][CATEGORICAL_COLUMNS].copy()
    else:
        patient_anno_features = None

    if mutations_with_missing_annotations:
        logging.info(
            f"Found mutations with missing annotations - {mutations_with_missing_annotations}"
        )
        missing_annotations = get_annotation_features(
            mutations_with_missing_annotations
        )
        if missing_annotations is not None:
            missing_annotations = missing_annotations[
                REQUIRED_ANNOTATION_COLUMNS
            ].copy()
            missing_annotations.reset_index(inplace=True)
            missing_annotations = preprocess_annotation_features(missing_annotations)
            missing_annotations = missing_annotations[~missing_annotations.duplicated()]
            missing_annotations.reset_index(drop=True, inplace=True)
            missing_annotations.set_index("input", inplace=True)

            numeric_columns = [
                column
                for column in missing_annotations.columns
                if pd.api.types.is_numeric_dtype(missing_annotations[column])
            ]
            # Drop rows whose numeric scores are ALL missing. Skipped when there
            # are no numeric columns: the original code raised TypeError there, and
            # an unguarded `.all(axis=1)` would be vacuously True for every row.
            if numeric_columns:
                all_scores_missing = (
                    missing_annotations[numeric_columns].isna().all(axis=1)
                )
                missing_annotations = missing_annotations[~all_scores_missing]

            numeric_df = missing_annotations[numeric_columns].copy()
            logging.info(numeric_df.shape)
            # Impute the new rows together with the pre-computed table.
            numeric_df = pd.concat(
                [numeric_df, anno_features_combined_imputed_df[numeric_columns]],
            )

            categorical_columns = [
                column
                for column in missing_annotations.columns
                if column not in numeric_columns
            ]
            categorical_missing_annotations = missing_annotations[
                categorical_columns
            ].copy()
            logging.info(categorical_missing_annotations.shape)

            imp = MICEData(numeric_df)
            # Impute missing values in numeric columns - Expensive!!
            imp.update_all()
            imputed_df = imp.data
            assert numeric_df.shape == imputed_df.shape
            imputed_df.index = numeric_df.index
            # Keep only the rows of the newly annotated mutations.
            imputed_df = imputed_df[
                imputed_df.index.isin(mutations_with_missing_annotations)
            ].copy()
            numeric_imputed_df = pd.concat(
                [categorical_missing_annotations, imputed_df], axis=1,
            )
            logging.info(numeric_imputed_df.shape)
            for column in CATEGORICAL_COLUMNS:
                col_na_mask = numeric_imputed_df[column].isna()
                # logging takes a format string plus arguments; passing the values
                # positionally without placeholders breaks message formatting.
                logging.info(
                    "%s %s %s",
                    column,
                    numeric_imputed_df[column].unique(),
                    int(col_na_mask.sum()),
                )
                # Derive a missing prediction from the matching numeric score.
                score_column, score_to_label = PREDICTOR_LAMBDA_MAP[column]
                numeric_imputed_df.loc[col_na_mask, column] = numeric_imputed_df[
                    col_na_mask
                ][score_column].apply(score_to_label)

            numeric_imputed_df = numeric_imputed_df.dropna()
            logging.info(numeric_imputed_df.shape)
            missing_anno_features_df = numeric_imputed_df[CATEGORICAL_COLUMNS].copy()

            # patient_anno_features is None when nothing was found in the table.
            patient_anno_features = pd.concat(
                [
                    frame
                    for frame in (missing_anno_features_df, patient_anno_features)
                    if frame is not None
                ]
            )

    if patient_anno_features is not None:
        # Binarise every prediction: 1 for a deleterious label, 0 otherwise.
        for col in patient_anno_features.columns:
            patient_anno_features[col] = patient_anno_features[col].apply(
                lambda v: 1 if v in DELETERIOUS_VALUES else 0
            )

        patient_anno_features.reset_index(inplace=True)
        patient_anno_features["gene"] = patient_anno_features.input.apply(
            lambda gene_mut: gene_mut.split(" ")[0]
        )
    return patient_anno_features

In [None]:
tcga_mutations_filtered

In [None]:
# %%notify

# Annotate each patient's point mutations and stack the per-patient results.
# futures = []
per_patient_features = []
for patient_id in tcga_mutations_filtered.submitter_id.unique():
    # Filter on the loop variable. The previous code compared against
    # `submitter_id`, a stale global left over from an earlier cell, so every
    # patient was annotated with the same patient's mutations.
    filtered_df = tcga_mutations_filtered[
        tcga_mutations_filtered.submitter_id == patient_id
    ]
    point_mutations = list(filtered_df.point_mutation.values)
    res = construct_anno_features_xon17_gpd(patient_id, point_mutations, False)
    # The function returns None when nothing could be annotated.
    if res is not None:
        per_patient_features.append(res)
#     future = client.submit(construct_anno_features_xon17_gpd, patient_id, point_mutations, False)
#     futures.append(future)

# Concatenate once instead of growing a DataFrame inside the loop (quadratic).
anno_features_combined = (
    pd.concat(per_patient_features, ignore_index=True)
    if per_patient_features
    else pd.DataFrame()
)

# anno_features_combined = client.gather(futures, errors="skip")
# client.shutdown()

In [None]:
anno_features_combined

In [None]:
# Index the combined annotations by mutation string. `set_index` returns a new
# frame here: the previous alias + `inplace=True` also mutated
# `anno_features_combined`, so re-running this cell raised KeyError("input").
agg_anno_features_combined_df = anno_features_combined.set_index("input")
agg_anno_features_combined_df

In [None]:
# Attach the per-mutation annotations to every (patient, mutation) row.
# The annotation frame holds one row per patient occurrence of a mutation but has
# no patient column, so joining on the mutation string alone is many-to-many and
# multiplies rows for mutations shared between patients. Dropping exact duplicate
# annotation rows first removes that blow-up; the "max" aggregations applied later
# give the same result either way.
merged_1 = pd.merge(
    tcga_mutations_filtered,
    agg_anno_features_combined_df.reset_index().drop_duplicates(),
    left_on="point_mutation",
    right_on="input",
    how="left",
)
merged_1

In [None]:
# "input" was only the right-hand join key; remove it, then report how many
# values are missing in each remaining column.
merged_1 = merged_1.drop(columns=["input"])
merged_1.isna().sum()

In [None]:
# Point mutations without annotations get 0 in every annotation column.
merged_1 = merged_1.fillna(0)

In [None]:
# The gene symbol is the text before the first space of the mutation string.
merged_1["gene"] = [
    mutation.split(" ")[0] for mutation in merged_1["point_mutation"]
]

In [None]:
merged_1["gene"].value_counts()

In [None]:
# Score = 1 + (number of predictors flagging the mutation as deleterious) / 17.
merged_1["1plusxon17_score"] = (
    merged_1[CATEGORICAL_COLUMNS].sum(axis=1).div(17).add(1)
)
merged_1

### Output: Xon17 annotation

In [None]:
# Patient x gene matrix holding the highest score among a patient's mutations in
# each gene; patient/gene pairs with no mutation are set to 0.
patient_gene_matrix_xon17 = pd.pivot_table(
    merged_1,
    index="submitter_id",
    columns="gene",
    values="1plusxon17_score",
    aggfunc="max",
).fillna(0)
patient_gene_matrix_xon17

In [None]:
# Add an all-zero column for every panel gene that no patient has a mutation in.
for g in (
    gene for gene in GENES_324 if gene not in patient_gene_matrix_xon17.columns
):
    patient_gene_matrix_xon17[g] = 0
patient_gene_matrix_xon17.shape

In [None]:
patient_gene_matrix_xon17[GENES_324].reset_index().to_csv("../data/processed/tcga_anno_features_xon17.csv", index=False)

In [None]:
patient_gene_matrix_xon17 = pd.read_csv("../data/processed/tcga_anno_features_xon17.csv", index_col = 0)
patient_gene_matrix_xon17

### ANNOVAR annotation

In [None]:
# Collect the per-mutation annotation frame of every patient and concatenate once;
# growing `annotated_df` with pd.concat inside the loop copied the accumulated
# frame on every iteration.
# The empty seed frame fixes the column order: "input", the 17 prediction columns
# (identical to CATEGORICAL_COLUMNS, in the same order) and "gene".
annot_frames = [pd.DataFrame(columns=["input", *CATEGORICAL_COLUMNS, "gene"])]
for submitter_id in tcga_mutations_filtered.submitter_id.unique():
    filtered_df = tcga_mutations_filtered[
        tcga_mutations_filtered.submitter_id == submitter_id
    ]
    point_mutations = list(filtered_df.point_mutation.values)
    # Only the second element of the returned pair is used here.
    _, annot_df = construct_anno_features_xon17(submitter_id, point_mutations, False)
    annot_frames.append(annot_df)
annotated_df = pd.concat(annot_frames)

In [None]:
annotated_df.head()

In [None]:
annotated_df.shape

In [None]:
# annotated_df.to_csv("../data/processed/anno_features_per_mutation_tcga.csv")

#### ClinVar annotations

In [None]:
# Also get Clinvar annotations for these mutations
annotated_df = pd.read_csv("../data/processed/anno_features_per_mutation_tcga.csv", index_col = 0)
annotated_df

In [None]:
ANNOTATION_SCRIPT_PATH_CLINVAR = "../script/goAAtoGv2_clinvar.sh"

In [None]:
# Run the ClinVar annotation script once per mutation and stack the outputs.
# NOTE(review): `annotated_df["input"]` may repeat a mutation (one row per patient
# occurrence), in which case the script is re-run for each repeat.
results = []
for mutation in list(annotated_df["input"]):
    try:
        # Run annotation script within a temp directory and extract features as DataFrame
        with tempfile.TemporaryDirectory() as tmpdirname:
            # Join with a separator so the input (and the script's output next to
            # it) lives INSIDE the temp directory and is removed with it. Plain
            # string concatenation created a sibling file that was never cleaned up.
            input_file_path = os.path.join(tmpdirname, "anno_input.txt")
            with open(input_file_path, "w") as input_file:
                # "GENE  V600E" -> "GENE:p.V600E" (empty parts from repeated spaces dropped)
                mutation_cleaned = [part for part in mutation.split(" ") if part]
                input_file.write(":p.".join(mutation_cleaned))
                input_file.write("\n")

            # Execute script; an argument list avoids building a shell command string.
            cmd = ["bash", ANNOTATION_SCRIPT_PATH_CLINVAR, input_file_path]

            logging.info(f"Executing command {' '.join(cmd)}")
            subprocess.call(cmd)
            out_file_path = f"{input_file_path}.annot.hg38_finalannot.txt"
            res = pd.read_table(out_file_path)
    # Some inputs lead to errors, such as "PTEN loss" - ignore and continue processing
    except Exception as e:
        logging.error(
            f"Encountered error while processing mutation {mutation} - {e}"
        )
        continue
    res["input"] = mutation
    res = res[~res.duplicated()]
    results.append(res)

if not results:
    # pd.concat([]) fails with an unhelpful "No objects to concatenate".
    raise ValueError("ClinVar annotation produced no results for any mutation")

clinvar_annot_df = pd.concat(results)
clinvar_annot_df.set_index(["input"], inplace=True)
clinvar_annot_df.drop(columns=["Otherinfo1"], inplace=True)

In [None]:
clinvar_annot_df

In [None]:
clinvar_annot_df.CLNSIG.value_counts()

In [None]:
clinvar_annot_df.to_csv("../data/processed/clinvar_anno_features_per_mutation_tcga.csv")

### Variant Annotations from GPD

In [None]:
merged_1 = pd.read_csv("../data/processed/tcga_annovar_gpd_annot_per_patient_per_mutation.csv", index_col = 0)
merged_1

In [None]:
# Load intermediate GPD files for NPC and PC

In [None]:
npc_mutations = pd.read_csv("../data/processed/tcga_gpd_results/tcga_mutation_npc.tsv", sep="\t")
npc_mutations

In [None]:
npc_mutations["HGVSp"].value_counts(dropna=False)

In [None]:
npc_mutations["Variant_Classification"].value_counts(dropna=False)

In [None]:
pc_mutations = pd.read_csv("../data/processed/tcga_gpd_results/tcga_mutation_pc_pos.tsv", sep="\t")
pc_mutations

In [None]:
pc_mutations["Variant_Classification"].value_counts(dropna=False)

In [None]:
# To get PIU vs LU, we used the locations in ptm_pfam_combine.csv which is used in the GPD implementation
# NOTE(review): unlike every other input in this notebook, this is an absolute,
# machine-specific path -- the cell will not run elsewhere until the file is placed
# under the project's data directory and the path made relative.
ptm_pfam_df = pd.read_csv("/data/ajayago/druid/datasets/ptm_pfam_combine.csv", index_col = 0)
ptm_pfam_df

In [None]:
# Label each protein-coding mutation "PIU" when its start OR its end position lies
# inside any [start_position, end_position] interval listed for the same gene in
# `ptm_pfam_df`; otherwise label it "LU".
#
# The intervals are grouped by gene once up front. The previous version re-filtered
# the whole table for every mutation and reused `idx` for both nested loops.
domains_by_gene = {
    gene_id: (
        gene_domains["start_position"].to_numpy(),
        gene_domains["end_position"].to_numpy(),
    )
    for gene_id, gene_domains in ptm_pfam_df.groupby("gene_id")
}

GPD_unit = []
for gene, prot_start, prot_end in zip(
    pc_mutations["Gene"],
    pc_mutations["prot_start_pos"],
    pc_mutations["prot_end_pos"],
):
    unit = "LU"
    bounds = domains_by_gene.get(gene)
    if bounds is not None:
        domain_starts, domain_ends = bounds
        start_inside = (prot_start >= domain_starts) & (prot_start <= domain_ends)
        end_inside = (prot_end >= domain_starts) & (prot_end <= domain_ends)
        if (start_inside | end_inside).any():
            unit = "PIU"
    GPD_unit.append(unit)

pc_mutations["GPD_unit"] = GPD_unit

In [None]:
pc_mutations["GPD_unit"].value_counts()

In [None]:
# Build "<Hugo_Symbol> <protein change>" keys (the text after "p." in HGVSp) for
# the PIU and LU mutations, matching the format of `point_mutation`.
piu_mutations = set(
    pc_mutations.loc[pc_mutations["GPD_unit"] == "PIU", "Hugo_Symbol"]
    + " "
    + pc_mutations.loc[pc_mutations["GPD_unit"] == "PIU", "HGVSp"].map(
        lambda hgvsp: hgvsp.split("p.")[1]
    )
)
lu_mutations = set(
    pc_mutations.loc[pc_mutations["GPD_unit"] == "LU", "Hugo_Symbol"]
    + " "
    + pc_mutations.loc[pc_mutations["GPD_unit"] == "LU", "HGVSp"].map(
        lambda hgvsp: hgvsp.split("p.")[1]
    )
)
len(piu_mutations), len(lu_mutations)

In [None]:
# Map each point mutation to PIU/LU or NCU: PIU takes precedence over LU, and
# anything found in neither set is labelled NCU.
GPD_unit_merged1 = [
    "PIU" if mutation in piu_mutations
    else "LU" if mutation in lu_mutations
    else "NCU"
    for mutation in merged_1["point_mutation"]
]

In [None]:
merged_1["GPD_unit"] = GPD_unit_merged1

In [None]:
merged_1

### Output: GPD + ANNOVAR annotation

In [None]:
# Patient x gene matrix of the highest score, restricted to PIU mutations.
# Genes of the panel without any PIU mutation get an all-zero column.
patient_gene_matrix_xon17_piu = pd.pivot_table(
    merged_1.loc[merged_1["GPD_unit"] == "PIU"],
    index="submitter_id",
    columns="gene",
    values="1plusxon17_score",
    aggfunc="max",
).fillna(0)
for g in (
    gene for gene in genes_324 if gene not in patient_gene_matrix_xon17_piu.columns
):
    patient_gene_matrix_xon17_piu[g] = 0

patient_gene_matrix_xon17_piu

In [None]:
# Patient x gene matrix of the highest score, restricted to LU mutations.
# Genes of the panel without any LU mutation get an all-zero column.
patient_gene_matrix_xon17_lu = pd.pivot_table(
    merged_1.loc[merged_1["GPD_unit"] == "LU"],
    index="submitter_id",
    columns="gene",
    values="1plusxon17_score",
    aggfunc="max",
).fillna(0)
for g in (
    gene for gene in genes_324 if gene not in patient_gene_matrix_xon17_lu.columns
):
    patient_gene_matrix_xon17_lu[g] = 0
patient_gene_matrix_xon17_lu

In [None]:
# Patient x gene matrix of the highest score, restricted to NCU mutations.
# Genes of the panel without any NCU mutation get an all-zero column.
patient_gene_matrix_xon17_ncu = pd.pivot_table(
    merged_1.loc[merged_1["GPD_unit"] == "NCU"],
    index="submitter_id",
    columns="gene",
    values="1plusxon17_score",
    aggfunc="max",
).fillna(0)
for g in (
    gene for gene in genes_324 if gene not in patient_gene_matrix_xon17_ncu.columns
):
    patient_gene_matrix_xon17_ncu[g] = 0
patient_gene_matrix_xon17_ncu

In [None]:
patient_gene_matrix_xon17_piu.shape, patient_gene_matrix_xon17_lu.shape, patient_gene_matrix_xon17_ncu.shape

In [None]:
len(set(patient_gene_matrix_xon17.index) - set(patient_gene_matrix_xon17_piu.index))

In [None]:
# Add in missing patient IDs in each matrix.
# Patients present in the full matrix but absent from a per-unit matrix get a row
# of NaN, which the fillna(0) step that follows turns into zeros.
# `DataFrame.append` was removed in pandas 2.0; `reindex` adds all missing rows in
# one step and leaves the frame indexed by "submitter_id", so the cell can be
# re-run safely. `sort=False` keeps existing rows first and new ones after them.
all_patient_ids = patient_gene_matrix_xon17.index

patient_gene_matrix_xon17_piu = patient_gene_matrix_xon17_piu.reindex(
    patient_gene_matrix_xon17_piu.index.union(all_patient_ids, sort=False)
).rename_axis("submitter_id")

patient_gene_matrix_xon17_lu = patient_gene_matrix_xon17_lu.reindex(
    patient_gene_matrix_xon17_lu.index.union(all_patient_ids, sort=False)
).rename_axis("submitter_id")

patient_gene_matrix_xon17_ncu = patient_gene_matrix_xon17_ncu.reindex(
    patient_gene_matrix_xon17_ncu.index.union(all_patient_ids, sort=False)
).rename_axis("submitter_id")

In [None]:
patient_gene_matrix_xon17_piu.shape, patient_gene_matrix_xon17_lu.shape, patient_gene_matrix_xon17_ncu.shape

In [None]:
# Patients added without any mutation of the given unit get a score of 0 everywhere.
patient_gene_matrix_xon17_piu = patient_gene_matrix_xon17_piu.fillna(0)
patient_gene_matrix_xon17_lu = patient_gene_matrix_xon17_lu.fillna(0)
patient_gene_matrix_xon17_ncu = patient_gene_matrix_xon17_ncu.fillna(0)

In [None]:
patient_gene_matrix_xon17_piu.loc[patient_gene_matrix_xon17.index][genes_324].to_csv("../data/processed/xon17_gpd_annotations/tcga_piu_annotated_df.csv")

In [None]:
patient_gene_matrix_xon17_lu.loc[patient_gene_matrix_xon17.index][genes_324].to_csv("../data/processed/xon17_gpd_annotations/tcga_lu_annotated_df.csv")

In [None]:
patient_gene_matrix_xon17_ncu.loc[patient_gene_matrix_xon17.index][genes_324].to_csv("../data/processed/xon17_gpd_annotations/tcga_ncu_annotated_df.csv")

In [None]:
merged_1.to_csv("../data/processed/tcga_annovar_gpd_annot_per_patient_per_mutation.csv",)

In [None]:
merged_1

#### Combine ClinVar, GPD and ANNOVAR annotations

In [None]:
# Join the per-patient mutation table with the ClinVar annotations (inner join on
# the mutation string) and collapse to one row per mutation, keeping the
# column-wise maximum. The aggregation is named by string: passing the Python
# builtin `max` is deprecated in recent pandas and resolves to the same operation.
merged_clinvar_df = (
    pd.merge(
        merged_1,
        clinvar_annot_df.reset_index(),
        left_on="point_mutation",
        right_on="input",
    )
    .groupby("input")
    .aggregate("max")
)
merged_clinvar_df

In [None]:
merged_clinvar_df.groupby(["GPD_unit", "CLNSIG"]).aggregate("count")