# JASPAR DB PARSING

- The JASPAR SQL dump was downloaded from http://jaspar.genereg.net/downloads/.

- I built the database on my computer using HeidiSQL (a MySQL-style DB).

- I used PyMySQL and Biopython to access my local copy of the database and pull the relevant information, which I compiled into a dictionary.

- I saved this dictionary as a JSON file for future use.


In [28]:
import sys
# Run pip through the kernel's own interpreter (sys.executable) so the MySQL
# drivers are installed into the environment this notebook is running in.
!{sys.executable} -m pip install mysqlclient pymysql
#anaconda is too good



In [29]:
import os
from getpass import getpass

from Bio.motifs.jaspar.db import JASPAR5

# Connection settings for the local JASPAR MySQL copy.
# The database name and password are deliberately not hard-coded: they are read
# from the JASPAR_DB_NAME / JASPAR_DB_PASS environment variables, and the
# notebook prompts for them when a variable is not set. Set the environment
# variables when running non-interactively (prompts need a live kernel).
JASPAR_DB_HOST = "127.0.0.1" #provided it's on your computer
JASPAR_DB_NAME = os.environ.get("JASPAR_DB_NAME") or input("JASPAR database name: ")
JASPAR_DB_USER = "root"
JASPAR_DB_PASS = os.environ.get("JASPAR_DB_PASS")
if JASPAR_DB_PASS is None:
    # getpass keeps the password out of the notebook source and its output.
    JASPAR_DB_PASS = getpass("JASPAR database password: ")

jdb = JASPAR5(
     host=JASPAR_DB_HOST,
     name=JASPAR_DB_NAME,
     user=JASPAR_DB_USER,
     password=JASPAR_DB_PASS
)

# With no selection criteria, fetch_motifs() uses Biopython's default filters.
motifs=jdb.fetch_motifs()

- The dictionary created below has the format:
`{protein_name: {'class': class, 'family': family, 'species': species, 'acc': UniProt ID(s), 'pubmed': reference, 'motif': motif in PFM format}}`
Thus, each protein name is a key in the overall dictionary, and its value is a dictionary containing that protein's attributes.

In [30]:
# Map each motif's name to a flat record of the attributes we want to keep.
# NOTE(review): records are keyed by motif.name, so if two motifs share a name
# the later one replaces the earlier one -- confirm that this is acceptable.
jaspardict = {}

for motif in motifs:
    record = {
        'class': motif.tf_class,
        'family': motif.tf_family,
        'species': motif.species,
        'acc': motif.acc,
        'pubmed': motif.medline,
        'motif': motif.format("pfm"),  # the matrix rendered as PFM text
    }
    jaspardict[motif.name] = record



In [31]:
import pprint
from itertools import islice

# Preview only the first few entries: printing the whole dictionary floods the
# notebook output with every motif matrix and buries the narrative.
preview_size = 3
pprint.pprint(dict(islice(jaspardict.items(), preview_size)))


{'ABF1': {'acc': ['Q9M7Q5'],
          'class': 'Basic leucine zipper factors (bZIP)',
          'family': None,
          'motif': '  0.00   8.00  19.00   0.00  43.00   0.00   0.00   0.00   '
                   '0.00   0.00   0.00  17.00   6.00  13.00  20.00  12.00  '
                   '13.00  10.00  13.00\n'
                   '  4.00   2.00  16.00  35.00   0.00  44.00   0.00   0.00   '
                   '0.00   0.00  44.00  14.00  24.00   6.00  11.00  19.00  '
                   '12.00   6.00   9.00\n'
                   ' 32.00  28.00   0.00   8.00   1.00   0.00  44.00   0.00  '
                   '44.00  38.00   0.00   9.00   9.00  21.00  11.00  13.00  '
                   '17.00  23.00  20.00\n'
                   '  8.00   6.00   9.00   1.00   0.00   0.00   0.00  44.00   '
                   '0.00   6.00   0.00   4.00   5.00   4.00   2.00   0.00   '
                   '2.00   5.00   2.00\n',
          'pubmed': '10636868',
          'species': ['3702']},
 'ABF2': {'acc': ['Q9M

In [32]:
# Install BeautifulSoup into the kernel's environment. This relies on `sys`
# having been imported by an earlier cell.
!{sys.executable} -m pip install beautifulsoup4



- Biopython returns numeric taxonomy IDs rather than species names, so for future use I look up each ID on UniProt and append the species name to the `species` value of each TF.

In [33]:
import sys
import requests
from bs4 import BeautifulSoup as soup

query_start = 'https://www.uniprot.org/taxonomy/'

# Collect the distinct taxonomy IDs in first-seen order. Only the first ID of
# each record is used, and records without a species are skipped entirely, so
# an ID is never tested before one has actually been read from a record.
species_ids = []
for tf in jaspardict.values():
    if not tf['species']:
        continue
    spec_id = tf['species'][0]
    if spec_id not in species_ids:
        species_ids.append(spec_id)

# Resolve each taxonomy ID to a display name by reading the <title> of its
# UniProt taxonomy page.
# NOTE(review): this depends on the page title holding the species name --
# confirm that the site still serves it in the static HTML.
id_to_species = {}
for spec_id in species_ids:
    # A timeout stops a stalled connection from hanging the notebook forever.
    r = requests.get(query_start+spec_id+"/", timeout=30)
    # Fail loudly on HTTP errors rather than storing an error page's title.
    r.raise_for_status()
    s = soup(r.text,'html.parser')
    title = s.title.string if s.title is not None else None
    if not title:
        raise ValueError("No page title found for taxonomy ID " + spec_id)
    id_to_species[spec_id]=title
print (id_to_species)
        


{'10090': 'Mus musculus (Mouse)', '7227': 'Drosophila melanogaster (Fruit fly)', '10116': 'Rattus norvegicus (Rat)', '4577': 'Zea mays (Maize)', '9606': 'Homo sapiens (Human)', '4513': 'Hordeum vulgare (Barley)', '3888': 'Pisum sativum (Garden pea)', '4102': 'Petunia hybrida (Petunia)', '4151': 'Antirrhinum majus (Garden snapdragon)', '9031': 'Gallus gallus (Chicken)', '9986': 'Oryctolagus cuniculus (Rabbit)', '7729': 'Halocynthia roretzi (Sea squirt) (Cynthia roretzi)', '3702': 'Arabidopsis thaliana (Mouse-ear cress)', '4565': 'Triticum aestivum (Wheat)', '4094': 'Nicotiana sp. (Tobacco)', '6239': 'Caenorhabditis elegans', '4932': "Saccharomyces cerevisiae (Baker's yeast)", '8355': 'Xenopus laevis (African clawed frog)', '9103': 'Meleagris gallopavo (Wild turkey)', '6238': 'Caenorhabditis briggsae', '559292': "Saccharomyces cerevisiae (strain ATCC 204508 / S288c) (Baker's yeast)", '5141': 'Neurospora crassa', '3694': 'Populus trichocarpa (Western balsam poplar) (Populus balsamifera su

In [34]:
# Append the species name to each record's taxonomy-ID list, turning the value
# into a string such as "['3702'] Arabidopsis thaliana (Mouse-ear cress)".
for tf in jaspardict.values():
    species = tf['species']
    # Skip empty values and values that are already strings: once a record has
    # been annotated, species[0] would be the character '[' and the lookup
    # would raise KeyError, so this guard makes re-running the cell safe.
    if species and not isinstance(species, str):
        tf['species'] = str(species)+" "+id_to_species[species[0]]


In [35]:
from itertools import islice

# Preview only the first few entries: printing the whole dictionary floods the
# notebook output with every motif matrix and buries the narrative.
# `pprint` is already imported by an earlier cell.
preview_size = 3
pprint.pprint(dict(islice(jaspardict.items(), preview_size)))

{'ABF1': {'acc': ['Q9M7Q5'],
          'class': 'Basic leucine zipper factors (bZIP)',
          'family': None,
          'motif': '  0.00   8.00  19.00   0.00  43.00   0.00   0.00   0.00   '
                   '0.00   0.00   0.00  17.00   6.00  13.00  20.00  12.00  '
                   '13.00  10.00  13.00\n'
                   '  4.00   2.00  16.00  35.00   0.00  44.00   0.00   0.00   '
                   '0.00   0.00  44.00  14.00  24.00   6.00  11.00  19.00  '
                   '12.00   6.00   9.00\n'
                   ' 32.00  28.00   0.00   8.00   1.00   0.00  44.00   0.00  '
                   '44.00  38.00   0.00   9.00   9.00  21.00  11.00  13.00  '
                   '17.00  23.00  20.00\n'
                   '  8.00   6.00   9.00   1.00   0.00   0.00   0.00  44.00   '
                   '0.00   6.00   0.00   4.00   5.00   4.00   2.00   0.00   '
                   '2.00   5.00   2.00\n',
          'pubmed': '10636868',
          'species': "['3702'] Arabidopsis thaliana (M

In [36]:
import json

# Serialize the finished dictionary and save it for later use.
h = json.dumps(jaspardict)
# The with-block closes the file even if the write fails part-way.
with open('jaspar_final.json','w') as f:
    f.write(h)