In [1]:
from biomart import BiomartServer
import csv
import datatable
import pandas as pd

def save_file(file_path, object):
    """Write each item of ``object`` to ``file_path`` as one UTF-8 text line.

    Args:
        file_path: Destination path; an existing file is overwritten.
        object: Iterable of ``str`` lines without trailing newlines. (The name
            shadows the builtin but is kept so keyword callers still work.)

    Raises:
        TypeError: If an item cannot be concatenated with ``str``.
    """
    # Pin the encoding: the default is locale-dependent, so non-ASCII text
    # could fail or be written in another encoding on non-UTF-8 platforms.
    with open(file_path, "w", encoding="utf-8") as file:
        file.writelines(line + "\n" for line in object)

In [4]:
# Connect to the Ensembl BioMart service and pick the 'hsapiens_gene_ensembl' dataset.
server = BiomartServer("http://ensembl.org/biomart") # alternative host: grch37.ensembl.org/biomart (presumably the GRCh37 assembly; confirm)
mart = server.datasets['hsapiens_gene_ensembl']

In [5]:
# List the marts (databases) exposed by the server, with their release labels.
server.show_databases()

{'ENSEMBL_MART_ENSEMBL': Ensembl Genes 112,
 'ENSEMBL_MART_FUNCGEN': Ensembl Regulation 112,
 'ENSEMBL_MART_GENOMIC': Genomic features 112,
 'ENSEMBL_MART_MOUSE': Mouse strains 112,
 'ENSEMBL_MART_ONTOLOGY': Ontology,
 'ENSEMBL_MART_SEQUENCE': Sequence,
 'ENSEMBL_MART_SNP': Ensembl Variation 112}


In [6]:
# List the datasets available on the server (long output: gene and sequence datasets per species).
server.show_datasets()

{'abrachyrhynchus_gene_ensembl': Pink-footed goose genes (ASM259213v1),
 'abrachyrhynchus_genomic_sequence': Pink-footed goose sequences (ASM259213v1),
 'acalliptera_gene_ensembl': Eastern happy genes (fAstCal1.3),
 'acalliptera_genomic_sequence': Eastern happy sequences (fAstCal1.3),
 'acarolinensis_gene_ensembl': Green anole genes (AnoCar2.0v2),
 'acarolinensis_genomic_sequence': Green anole sequences (AnoCar2.0v2),
 'acchrysaetos_gene_ensembl': Golden eagle genes (bAquChr1.2),
 'acchrysaetos_genomic_sequence': Golden eagle sequences (bAquChr1.2),
 'acitrinellus_gene_ensembl': Midas cichlid genes (Midas_v5),
 'acitrinellus_genomic_sequence': Midas cichlid sequences (Midas_v5),
 'amelanoleuca_gene_ensembl': Giant panda genes (ASM200744v2),
 'amelanoleuca_genomic_sequence': Giant panda sequences (ASM200744v2),
 'amexicanus_gene_ensembl': Mexican tetra genes (Astyanax_mexicanus-2.0),
 'amexicanus_genomic_sequence': Mexican tetra sequences (Astyanax_mexicanus-2.0),
 'anancymaae_gene_ense

In [7]:
# Show the filters offered by the selected dataset (name, description, type).
mart.show_filters()

{'affy_hc_g110': 'AFFY HC G110 probe ID(s) [e.g. 737_at]' (type: id_list, values: []),
 'affy_hg_focus': 'AFFY HG Focus probe ID(s) [e.g. 220771_at]' (type: id_list, values: []),
 'affy_hg_u133_plus_2': 'AFFY HG U133 Plus 2 probe ID(s) [e.g. 1553551_s_at]' (type: id_list, values: []),
 'affy_hg_u133a_2': 'AFFY HG U133A 2 probe ID(s) [e.g. 211600_at]' (type: id_list, values: []),
 'affy_hg_u133b': 'AFFY HG U133B probe ID(s) [e.g. 224372_at]' (type: id_list, values: []),
 'affy_hg_u95a': 'AFFY HG U95A probe ID(s) [e.g. 35984_at]' (type: id_list, values: []),
 'affy_hg_u95av2': 'AFFY HG U95Av2 probe ID(s) [e.g. 35984_at]' (type: id_list, values: []),
 'affy_hg_u95b': 'AFFY HG U95B probe ID(s) [e.g. 47566_at]' (type: id_list, values: []),
 'affy_hg_u95c': 'AFFY HG U95C probe ID(s) [e.g. 48789_at]' (type: id_list, values: []),
 'affy_hg_u95d': 'AFFY HG U95D probe ID(s) [e.g. 73422_at]' (type: id_list, values: []),
 'affy_hg_u95e': 'AFFY HG U95E probe ID(s) [e.g. 88289_at]' (type: id_list, v

In [8]:
# Show the attributes that can be requested from the selected dataset.
mart.show_attributes()

{'3_utr_end': '3' UTR end' (default: False),
 '3_utr_start': '3' UTR start' (default: False),
 '3utr': '3' UTR' (default: False),
 '5_utr_end': '5' UTR end' (default: False),
 '5_utr_start': '5' UTR start' (default: False),
 '5utr': '5' UTR' (default: False),
 'abrachyrhynchus_homolog_associated_gene_name': 'Pink-footed goose gene name' (default: False),
 'abrachyrhynchus_homolog_canonical_transcript_protein': 'Query protein or transcript ID' (default: False),
 'abrachyrhynchus_homolog_chrom_end': 'Pink-footed goose chromosome/scaffold end (bp)' (default: False),
 'abrachyrhynchus_homolog_chrom_start': 'Pink-footed goose chromosome/scaffold start (bp)' (default: False),
 'abrachyrhynchus_homolog_chromosome': 'Pink-footed goose chromosome/scaffold name' (default: False),
 'abrachyrhynchus_homolog_ensembl_gene': 'Pink-footed goose gene stable ID' (default: False),
 'abrachyrhynchus_homolog_ensembl_peptide': 'Pink-footed goose protein or transcript stable ID' (default: False),
 'abrachyrh

In [9]:
# Attribute names requested from the mart; the same list later labels the
# columns of the loaded table, so order matters.
attributes = [
    "ensembl_gene_id",
    "external_gene_name",
    "chromosome_name",
    "affy_huex_1_0_st_v2",
    "hgnc_symbol",
]

In [10]:
# Request the chosen attributes from the mart. Only the 'attributes' key is
# passed (no 'filters' entry), so this call does not restrict the query.
response = mart.search({
    'attributes': attributes
})


In [11]:
# Decode each raw line of the streamed response into UTF-8 text.
dataset = [raw_line.decode('utf-8') for raw_line in response.iter_lines()]

In [12]:
# Persist the decoded response lines, one per line, to a local text file.
save_file("affy_huex_1_0_st_v2_hg38.txt", dataset)

In [13]:
# Reload the saved tab-separated file as an all-string table (it is read with
# no header row), then label the columns with the requested attribute names.
dataset = pd.read_csv(
    "affy_huex_1_0_st_v2_hg38.txt",
    sep="\t",
    header=None,
    dtype="str",
).set_axis(attributes, axis="columns")

In [14]:
# Preview up to the first 60 rows of the labelled table.
dataset.head(60)

Unnamed: 0,ensembl_gene_id,external_gene_name,chromosome_name,affy_huex_1_0_st_v2,hgnc_symbol
0,ENSG00000210049,MT-TF,MT,4037576,MT-TF
1,ENSG00000210049,MT-TF,MT,4037584,MT-TF
2,ENSG00000210049,MT-TF,MT,3362718,MT-TF
3,ENSG00000210049,MT-TF,MT,4037580,MT-TF
4,ENSG00000210049,MT-TF,MT,3029896,MT-TF
5,ENSG00000210049,MT-TF,MT,4037574,MT-TF
6,ENSG00000210049,MT-TF,MT,3525200,MT-TF
7,ENSG00000210049,MT-TF,MT,2864632,MT-TF
8,ENSG00000210049,MT-TF,MT,4037572,MT-TF
9,ENSG00000210049,MT-TF,MT,4037578,MT-TF
