# Setup

In [191]:
from IPython.display import Markdown, display

def print_md(string):
    """Render ``string`` as Markdown in the notebook output.

    Before rendering, three plain-text layout markers are translated to
    HTML so they survive Markdown rendering:

    * a backslash followed by a space becomes ``&nbsp;``
    * a tab becomes ``&emsp;``
    * a newline becomes ``<br>``

    Parameters
    ----------
    string : str
        Text (Markdown/HTML allowed) to display.
    """
    # Raw string: '\ ' is not a valid escape sequence, so the non-raw form
    # emits a DeprecationWarning/SyntaxWarning on recent Python versions.
    string = string.replace(r'\ ', '&nbsp;')
    string = string.replace('\t', '&emsp;')
    string = string.replace('\n', '<br>')
    display(Markdown(string))

In [192]:
conda_env = !echo $CONDA_DEFAULT_ENV
conda_env = conda_env[0]

try:
    if conda_env == 'impact-annotator_env':
        raise Exception("⚠️ <span style='color:red'>Please activate the **impact-annotator_env** conda environment to work with this notebook:</span>\n"
                        "\t\t\ \ <span style='color:blue'>$ source activate impact-annotator_env</span>\n"
                        "\t\t\ \ current environment: " + conda_env)
except Exception as e:
    print_md(str(e))

⚠️ <span style='color:red'>Please activate the **impact-annotator_env** conda environment to work with this notebook:</span><br>&emsp;&emsp;&nbsp;&nbsp;<span style='color:blue'>$ source activate impact-annotator_env</span><br>&emsp;&emsp;&nbsp;&nbsp;current environment: impact-annotator_env

In [195]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
# Apply seaborn's default theme to the matplotlib figures drawn afterwards.
seaborn.set()

In [196]:
# Raise pandas' display limits so wide tables and long cell values are
# shown in full instead of being truncated with "...".
pd.options.display.max_rows = 200
pd.options.display.max_columns = 200
pd.options.display.max_colwidth = 1000

# Reproduce `first_analysis.ipynb`

In [200]:
# Load the tab-separated `all_IMPACT_mutations_180508` table.
# low_memory=False makes pandas process the file in one pass rather than in
# chunks, so each column's dtype is inferred from all of its values.
impact = pd.read_csv(
    '../../data/all_IMPACT_mutations_180508.txt',
    sep='\t',
    low_memory=False,
)

In [204]:
# (number of rows, number of columns) of the table just loaded.
impact.shape

(588547, 35)

In [213]:
# Number of distinct values per column; columns showing 0 or 1 below carry
# no information that distinguishes one row from another.
impact.nunique()

Hugo_Symbol                       615
Entrez_Gene_Id                      1
Center                              1
NCBI_Build                          1
Chromosome                         23
Start_Position                 372687
End_Position                   370062
Strand                              1
Consequence                        20
Variant_Type                        5
Reference_Allele                 8780
Tumor_Seq_Allele1                8780
Tumor_Seq_Allele2                1910
dbSNP_RS                            1
Tumor_Sample_Barcode            25181
Matched_Norm_Sample_Barcode         0
Match_Norm_Seq_Allele1           8780
Match_Norm_Seq_Allele2           8780
cDNA_change                    213411
HGVSp_Short                    117245
t_depth                          3427
t_vaf                           61490
t_alt_count                      1923
n_depth                          1964
n_vaf                            5354
n_alt_count                       308
t_ref_plus_c

In [18]:
# NOTE(review): this rebinds `impact`, replacing the frame loaded earlier from
# all_IMPACT_mutations_180508.txt; the cells below operate on this file.
impact = pd.read_csv(
    '../../data/final_IMPACT_mutations_180508.txt',
    sep='\t',
)

In [36]:
# Preview the first five rows of the table.
impact.head()

Unnamed: 0,Hugo_Symbol,Chromosome,Start_Position,End_Position,Consequence,Variant_Type,Reference_Allele,Tumor_Seq_Allele2,Tumor_Sample_Barcode,cDNA_change,HGVSp_Short,t_depth,t_vaf,t_alt_count,n_depth,n_vaf,n_alt_count,t_ref_plus_count,t_ref_neg_count,t_alt_plus_count,t_alt_neg_count,confidence_class,sample_coverage,mut_key,VAG_VT,VAG_GENE,VAG_cDNA_CHANGE,VAG_PROTEIN_CHANGE,VAG_EFFECT,VEP_Consequence,VEP_SYMBOL,VEP_HGVSc,VEP_HGVSp,VEP_Amino_acids,VEP_VARIANT_CLASS,VEP_EXON,VEP_INTRON,VEP_IMPACT,VEP_CLIN_SIG,VEP_COSMIC_CNT,VEP_gnomAD_AF,sample_mut_key,patient_key,frequency_in_normals,VEP_SIFT_class,VEP_SIFT_score,VEP_PolyPhen_class,VEP_PolyPhen_score,VEP_in_dbSNP,VEP_gnomAD_total_AF_AFR,VEP_gnomAD_total_AF_AMR,VEP_gnomAD_total_AF_ASJ,VEP_gnomAD_total_AF_EAS,VEP_gnomAD_total_AF_FIN,VEP_gnomAD_total_AF_NFE,VEP_gnomAD_total_AF_OTH,VEP_gnomAD_total_AF_max,VEP_gnomAD_total_AF,Kaviar_AF,is_a_hotspot,is_a_3d_hotspot,oncogenic,gene_type
0,TP53,17,7577515,7577515,nonsynonymous_SNV,SNP,T,G,P-0000012-T02-IM3,c.766A>C,p.T256P,227,0.5022,114,569,0.0,0,59,54,58,56,AUTO_OK,344,17_7577515_T_G,Sub,TP53,c.766A>C,p.T256P,non_synonymous_codon,missense_variant,TP53,c.766A>C,p.T256P,T/P,SNV,7|11,,MODERATE,unknown,1,0.0,P-0000012-T02-IM3_17_7577515_T_G,P-0000012,0.0,deleterious,0.0,probably_damaging,0.999,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,unknown,unknown,Likely Oncogenic,tsg
1,PIK3R3,1,46521514,46521514,nonsynonymous_SNV,SNP,G,C,P-0000012-T03-IM3,c.894C>G,p.I298M,733,0.17599,129,1243,0.0,0,288,316,61,68,AUTO_OK,428,1_46521514_G_C,Sub,PIK3R3,c.1032C>G,p.I344M,non_synonymous_codon,missense_variant,PIK3R3,c.894C>G,p.I298M,I/M,SNV,7|10,,MODERATE,unknown,0,0.0,P-0000012-T03-IM3_1_46521514_G_C,P-0000012,0.0,deleterious,0.0,benign,0.277,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,unknown,unknown,Unknown,tsg
2,ATR,3,142178126,142178126,nonsynonymous_SNV,SNP,C,A,P-0000012-T03-IM3,c.7292G>T,p.R2431M,482,0.17427,84,581,0.00172,1,221,177,46,38,AUTO_OK,428,3_142178126_C_A,Sub,ATR,c.7292G>T,p.R2431M,non_synonymous_codon,missense_variant,ATR,c.7292G>T,p.R2431M,R/M,SNV,43|47,,MODERATE,unknown,0,4e-06,P-0000012-T03-IM3_3_142178126_C_A,P-0000012,0.0,deleterious,0.0,probably_damaging,0.997,True,0.0,0.0,0.0,0.0,0.0,9e-06,0.0,9e-06,5e-06,7e-06,unknown,unknown,Unknown,tsg
3,PDGFRA,4,55139732,55139732,nonsynonymous_SNV,SNP,T,A,P-0000012-T03-IM3,c.1393T>A,p.L465M,570,0.20351,116,811,0.0,0,252,202,66,50,AUTO_OK,428,4_55139732_T_A,Sub,PDGFRA,c.1393T>A,p.L465M,non_synonymous_codon,missense_variant,PDGFRA,c.1393T>A,p.L465M,L/M,SNV,10|23,,MODERATE,unknown,0,0.0,P-0000012-T03-IM3_4_55139732_T_A,P-0000012,0.0,deleterious,0.01,probably_damaging,0.965,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,unknown,unknown,Unknown,oncogene
4,FBXW7,4,153249542,153249542,splicing,SNP,C,A,P-0000012-T03-IM3,c.1237-1G>T,,333,0.25526,85,458,0.0,0,69,179,24,61,AUTO_OK,428,4_153249542_C_A,Sub,FBXW7,c.1237-1G>T,p.?,splice_site_variant,splice_acceptor_variant,FBXW7,c.1237-1G>T,unknown,unknown,SNV,,8|11,HIGH,unknown,0,0.0,P-0000012-T03-IM3_4_153249542_C_A,P-0000012,0.0,unknown,,unknown,,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,unknown,unknown,Likely Oncogenic,tsg


In [None]:
# TODO: `na_values` was left here as a bare, undefined name, which raises a
# NameError on Restart & Run All. It is presumably a reminder to pass
# `na_values=...` to `pd.read_csv` when loading the data; confirm and apply
# it there, or delete this cell.

In [40]:
# Per column: True when it contains at least one missing value.
# `.any()` already yields booleans, so no comparison with False is needed.
impact.isna().any()

Hugo_Symbol                False
Chromosome                 False
Start_Position             False
End_Position               False
Consequence                False
Variant_Type               False
Reference_Allele           False
Tumor_Seq_Allele2          False
Tumor_Sample_Barcode       False
cDNA_change                 True
HGVSp_Short                 True
t_depth                    False
t_vaf                      False
t_alt_count                False
n_depth                    False
n_vaf                      False
n_alt_count                False
t_ref_plus_count           False
t_ref_neg_count            False
t_alt_plus_count           False
t_alt_neg_count            False
confidence_class           False
sample_coverage            False
mut_key                    False
VAG_VT                     False
VAG_GENE                   False
VAG_cDNA_CHANGE            False
VAG_PROTEIN_CHANGE         False
VAG_EFFECT                 False
VEP_Consequence            False
VEP_SYMBOL

In [27]:
# Count of missing values in each column (`isna` is an alias of `isnull`).
impact.isna().sum()

Hugo_Symbol                     0
Chromosome                      0
Start_Position                  0
End_Position                    0
Consequence                     0
Variant_Type                    0
Reference_Allele                0
Tumor_Seq_Allele2               0
Tumor_Sample_Barcode            0
cDNA_change                    95
HGVSp_Short                  2680
t_depth                         0
t_vaf                           0
t_alt_count                     0
n_depth                         0
n_vaf                           0
n_alt_count                     0
t_ref_plus_count                0
t_ref_neg_count                 0
t_alt_plus_count                0
t_alt_neg_count                 0
confidence_class                0
sample_coverage                 0
mut_key                         0
VAG_VT                          0
VAG_GENE                        0
VAG_cDNA_CHANGE                 0
VAG_PROTEIN_CHANGE              0
VAG_EFFECT                      0
VEP_Consequenc