In [1]:
import pandas as pd
from dbnsfp_annotation.dbnsfp_annotation import annotate_maf_dbnsfp, annotate_vcf_dbnsfp
from dbnsfp_annotation.utils import *

To use the dbNSFP database, you need to download dbNSFP from the official site and process it according to the instructions in the `db_utils` folder.

In [2]:
?annotate_maf_dbnsfp

[31mSignature:[39m
annotate_maf_dbnsfp(
    maf: [<[38;5;28;01mclass[39;00m [33m'str'[39m>, <[38;5;28;01mclass[39;00m [33m'pandas.core.frame.DataFrame'[39m>],
    genome_version: str = [33m'hg38'[39m,
    database_path: str = [33m'/uftp/Databases/dbNSFP/dbNSFP4.5a/dbNSFPv4.5a_custombuild.gz'[39m,
    gene_base_path: str = [33m'/uftp/Databases/dbNSFP/dbNSFP4.5a/dbNSFP4.5_gene.complete.gz'[39m,
    n_jobs: int = [32m6[39m,
    host: str = [33m'0.0.0.0'[39m,
) -> pandas.core.frame.DataFrame
[31mDocstring:[39m
Annotates MAF-file SNPs using dbNSFP. DB reference version must be compatible with MAF reference version.
:param database_path: path to dbNSFP joined tabix indexed and GZ-compressed database file, string
:gene_base_path: path to dbNSFP gene_complete annotation file
:param maf: path to tab-separated MAF-file, string
:param genome_version: 'hg19' or 'hg38' version of genome
:param n_jobs: number of concurrent workers for annotation
:param host: ip adress of machin

The next two cells are for demonstration purposes only.

In [3]:
# MAF files are tab-separated; read_table uses '\t' as its default separator.
maf = pd.read_table('TCGA-A1-A0SB.maf')

In [4]:
# Preview the MAF table that was just read from 'TCGA-A1-A0SB.maf' (demonstration only).
maf

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Variant_Classification,Variant_Type,...,ExAC_AF_NFE,ExAC_AF_OTH,ExAC_AF_SAS,GENE_PHENO,FILTER,COSMIC,CENTERS,CONTEXT,DBVS,NCALLERS
0,ABLIM1,0,.,GRCh37,chr10,116247760,116247760,+,Missense_Mutation,SNP,...,.,.,.,.,PASS,NONE,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS,CGGGATGCCAA,.,5
1,ADAMTS20,0,.,GRCh37,chr12,43944926,43944926,+,Missense_Mutation,SNP,...,.,.,.,.,PASS,NONE,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS,AGCGATAGTGG,.,5
2,KMT2D,0,.,GRCh37,chr12,49431403,49431404,+,Frame_Shift_Ins,INS,...,.,.,.,.,PASS,NONE,INDELOCATOR*|VARSCANI*|PINDEL,CAGGGGTAACT,.,3
3,SPTB,0,.,GRCh37,chr14,65266493,65266493,+,Missense_Mutation,SNP,...,.,.,.,.,PASS,NONE,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS,GTAGGTGCTGA,.,5
4,PIEZO1,0,.,GRCh37,chr16,88790292,88790292,+,Missense_Mutation,SNP,...,.,.,.,.,PASS,NONE,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS,CACTCTGTGCC,.,5
5,KAT2A,0,.,GRCh37,chr17,40272381,40272381,+,Silent,SNP,...,.,.,.,.,PASS,NONE,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS,GATACGTGGTC,by1000G,5
6,ZNF574,0,.,GRCh37,chr19,42585066,42585066,+,Missense_Mutation,SNP,...,.,.,.,.,PASS,NONE,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS,GTCCAGACTGT,.,5
7,ADSS,0,.,GRCh37,chr1,244583577,244583577,+,Missense_Mutation,SNP,...,.,.,.,.,StrandBias,NONE,MUTECT|MUSE,CATTGGTTTAA,.,2
8,SLC6A9,0,.,GRCh37,chr1,44476442,44476442,+,Missense_Mutation,SNP,...,.,.,.,.,PASS,NONE,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS,CATTGCCCAGG,.,5
9,OTOR,0,.,GRCh37,chr20,16730581,16730581,+,Missense_Mutation,SNP,...,.,.,.,.,PASS,NONE,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS,GAGTCGTGGGT,byFrequency,5


In [18]:
# Both files are produced by running the scripts from the db_utils folder.
# Replace the placeholders below with the locations on your machine.

# Passed as `database_path`: the custom-build dbNSFP database file.
DATABASE_PATH = "/path/to/dbNSFPv4.5a_custombuild.gz"

# Passed as `gene_base_path`: the dbNSFP gene.complete annotation file.
GENE_BASE_PATH = "/path/to/dbNSFP4.5_gene.complete.gz"

In [5]:
%%time
annotated_maf = annotate_maf_dbnsfp(
    maf=maf,
    genome_version="hg19",
    database_path=DATABASE_PATH,
    gene_base_path=GENE_BASE_PATH,
    n_jobs=6,
    host="0.0.0.0",
)

[32m2024-02-21 13:24:51.931[0m | [1mINFO    [0m | [36mdbnsfp_annotation.dbnsfp_annotation[0m:[36mannotate_maf_dbnsfp[0m:[36m54[0m - [1mThe quality of annotation improves if the following columns are present in maf file:                RS_ID (rs_dbsnp), ensembl transcript ID, HGVSp (any version) and HGVSc.[0m
2024-02-21 13:24:55,319	INFO worker.py:1612 -- Started a local Ray instance. View the dashboard at [1m[32m10.51.4.19:8265 [39m[22m


CPU times: user 3.15 s, sys: 273 ms, total: 3.42 s
Wall time: 10.1 s


In [6]:
# Display the table returned by annotate_maf_dbnsfp.
annotated_maf

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Variant_Classification,Variant_Type,...,Essential_gene_CRISPR2,Essential_gene_gene-trap,Gene_indispensability_score,Gene_indispensability_pred,MGI_mouse_gene,MGI_mouse_phenotype,ZFIN_zebrafish_gene,ZFIN_zebrafish_structure,ZFIN_zebrafish_phenotype_quality,ZFIN_zebrafish_phenotype_tag
0,ADSS,0,.,GRCh38,chr1,244420275,244420275,+,Missense_Mutation,SNP,...,S,E,0.954791819295224,E,Adss,.,adssl,trunk,necrotic,abnormal
1,SLC6A9,0,.,GRCh38,chr1,44010770,44010770,+,Missense_Mutation,SNP,...,N,H,0.860167521807743,E,Slc6a9,homeostasis/metabolism phenotype; muscle pheno...,slc6a9,post-vent region,movement quality,abnormal
2,ABLIM1,0,.,GRCh38,chr10,114488001,114488001,+,Missense_Mutation,SNP,...,N,N,0.798943039951219,E,Ablim1,normal phenotype;,.,.,.,.
3,ADAMTS20,0,.,GRCh38,chr12,43551123,43551123,+,Missense_Mutation,SNP,...,N,N,0.0458108729353191,N,Adamts20,limbs/digits/tail phenotype; nervous system ph...,.,.,.,.
4,KMT2D,0,.,GRCh38,chr12,49037620,49037621,+,Frame_Shift_Ins,INS,...,.,N,.,.,Kmt2d,behavior/neurological phenotype (the observabl...,kmt2d,chondrocyte,position,abnormal
5,SPTB,0,.,GRCh38,chr14,64799775,64799775,+,Missense_Mutation,SNP,...,N,N,0.852864495617268,E,Sptb,reproductive system phenotype; mortality/aging...,sptb,nucleate erythrocyte,teardrop-shaped,abnormal
6,PIEZO1,0,.,GRCh38,chr16,88723884,88723884,+,Missense_Mutation,SNP,...,N,N,0.114371359811310,N,Piezo1,homeostasis/metabolism phenotype; growth/size/...,piezo1,nucleate erythrocyte,decreased amount,abnormal
7,KAT2A,0,.,GRCh38,chr17,42120363,42120363,+,Silent,SNP,...,E,N,0.999999999389872,E,Kat2a,nervous system phenotype (the observable morph...,kat2a,paired fin,decreased size,abnormal
8,ZNF574,0,.,GRCh38,chr19,42080914,42080914,+,Missense_Mutation,SNP,...,E,E,0.257979511127052,N,Zfp574,.,znf574,whole organism,dead,abnormal
9,DTNB,0,.,GRCh38,chr2,25455430,25455430,+,Missense_Mutation,SNP,...,N,N,0.478434940235949,N,Dtnb,nervous system phenotype (the observable morph...,.,.,.,.


In [7]:
# Inspect a single column of the annotated table.
annotated_maf.loc[:, 'BayesDel_addAF_score']

0       0.276577
1       0.442426
2       0.371071
3       0.247473
4            NaN
5     -0.0845586
6     -0.0502235
7            NaN
8      -0.202877
9     -0.0659124
10     -0.107127
11           NaN
12           NaN
13           NaN
14    -0.0892863
15     -0.155635
16           NaN
17      -0.41136
18           NaN
Name: BayesDel_addAF_score, dtype: object

A `NaN` value is the result of either a wrong position or the mutation not being an SNP.