# Scope of the notebook

I use pybiomart to retrieve the associated gene for each of the variants

In [1]:
import pybiomart
import pandas as pd

## Load the variants and Ensembl dataset

In [3]:
#read txt file
variant_list = pd.read_csv('variant_list.txt',  header=0)
variant_list.columns = ['variant']  
variant_list['variant']=variant_list['variant'].astype(str)
variant_list.head()

Unnamed: 0,variant
0,rs61751443
1,rs28935168
2,rs61751449
3,rs63750264
4,rs11136000


In [4]:
# Select the database
server = pybiomart.Server(host='http://www.ensembl.org')
server.list_marts()

Unnamed: 0,name,display_name
0,ENSEMBL_MART_ENSEMBL,Ensembl Genes 98
1,ENSEMBL_MART_MOUSE,Mouse strains 98
2,ENSEMBL_MART_SEQUENCE,Sequence
3,ENSEMBL_MART_ONTOLOGY,Ontology
4,ENSEMBL_MART_GENOMIC,Genomic features 98
5,ENSEMBL_MART_SNP,Ensembl Variation 98
6,ENSEMBL_MART_FUNCGEN,Ensembl Regulation 98


In [5]:
# Select the dataset
mart = server['ENSEMBL_MART_SNP']
mart.list_datasets()

Unnamed: 0,name,display_name
0,cfamiliaris_snp,Dog Short Variants (SNPs and indels excluding ...
1,drerio_structvar,Zebrafish Structural Variants (GRCz11)
2,ecaballus_structvar,Horse Structural Variants (EquCab3.0)
3,ptroglodytes_snp,Chimpanzee Short Variants (SNPs and indels exc...
4,cfamiliaris_structvar,Dog Structural Variants (CanFam3.1)
5,mgallopavo_snp,Turkey Short Variants (SNPs and indels excludi...
6,mmusculus_structvar,Mouse Structural Variants (GRCm38.p6)
7,nleucogenys_snp,Gibbon Short Variants (SNPs and indels excludi...
8,fcatus_snp,Cat Short Variants (SNPs and indels excluding ...
9,hsapiens_structvar,Human Structural Variants (GRCh38.p13)


In [7]:
mart.list_datasets()[mart.list_datasets()['name']=='hsapiens_snp']['display_name']

10    Human Short Variants (SNPs and indels excludin...
Name: display_name, dtype: object

In [9]:
 dataset = mart['hsapiens_snp']

In [24]:
# Attributes
dataset.attributes
dataset.list_attributes()[20:30]

Unnamed: 0,name,display_name,description
20,study_type,Study type,
21,study_external_ref,Study External Reference,
22,study_description,Study Description,
23,source_name,Source name,
24,associated_gene,Associated gene with phenotype,
25,phenotype_name,Phenotype name,
26,phenotype_description,Phenotype description,
27,associated_variant_risk_allele,Associated variant risk allele,
28,p_value,P value,
29,set_name,Variant Set Name,


In [22]:
# Attributes for the gene
attr=pd.DataFrame(dataset.list_attributes())
attr[attr['display_name'].str.contains("gene")]

Unnamed: 0,name,display_name,description
24,associated_gene,Associated gene with phenotype,


In [136]:
# Select filters
dataset.filters
dataset.list_filters()

Unnamed: 0,name,type,description
0,link_so_mini_closure,list,
1,link_so_regulation_closure,list,
2,link_so_motif_closure,list,
3,chr_name,text,
4,start,text,
5,end,text,
6,band_start,drop_down_basic_filter,
7,band_end,drop_down_basic_filter,
8,marker_start,drop_down_basic_filter,
9,marker_end,,


In [13]:
filters=pd.DataFrame( dataset.list_filters())
filters[filters['name'].str.contains("snp")]

Unnamed: 0,name,type,description
13,snp_filter,list,
15,snp_synonym_filter,list,


In [29]:
# Get the selected attributes through a query where the condition is the variant name

#r=dataset.query(attributes=['refsnp_id','chr_name','ensembl_gene_stable_id', 'refsnp_source','chrom_start','chrom_end','p_value','minor_allele_freq','distance_to_transcript','set_name','associated_gene'], filters={'snp_filter':variant_list['variant'].values.tolist()})
result=dataset.query(attributes=['refsnp_id','chr_name','ensembl_gene_stable_id', 'refsnp_source','chrom_start','chrom_end','associated_gene'], filters={'snp_filter':variant_list['variant'].values.tolist()})

In [30]:
result.head()

Unnamed: 0,Variant name,Chromosome/scaffold name,Gene stable ID,Variant source,Chromosome/scaffold position start (bp),Chromosome/scaffold position end (bp),Associated gene with phenotype
0,rs11136000,8,ENSG00000120885,dbSNP,27607002,27607002,CLU
1,rs139237860,14,ENSG00000176165,dbSNP,28768377,28768377,
2,rs141088742,14,ENSG00000176165,dbSNP,28767873,28767873,FOXG1
3,rs143223844,14,ENSG00000176165,dbSNP,28768437,28768437,FOXG1
4,rs147154860,14,ENSG00000176165,dbSNP,28768440,28768440,FOXG1


In [31]:
writer = pd.ExcelWriter('gene_associated_SNP.xlsx')
result.to_excel(writer,"gene_list.xlsx")
writer.save()