# SNPS to Networks 1: KEGG

> Downloading *Zea mays* data from the KEGG database.

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
import os
from tqdm import tqdm
import re
import time
import sys
import requests
import numpy as np
import pandas as pd

## Identify genes to be downloaded:

In [None]:
#| export

"Iteratively check for and create directories to store output. Ideally this would just be os.mkdirs() but that function is not available in this version of python"
def ensure_dir_path_exists(dir_path = '../ext_data'):
    """
    Create `dir_path`, including any missing parent directories.

    Nothing happens when the directory already exists or when `dir_path` is empty.
    Works for any relative or absolute path (e.g. `a/b/c`, or a single component
    such as `out`), not only for paths whose first component already exists.
    """
    import os

    # An empty path has nothing to create, and `os.makedirs('')` would raise.
    if not dir_path:
        return
    # `exist_ok = True` keeps the call idempotent and tolerant of another process
    # creating the same directory between the check and the creation.
    os.makedirs(dir_path, exist_ok = True)
        

In [None]:
#| export

# Get all the genes for zea mays
# Some gene entries don't begin with a chromosome number. These include plastid (Pltd) and mitochondrial (MT) genes.

def get_kegg_species_list(species = 'zma'):
    """
    Return the raw text of the KEGG gene list for `species`.

    The text is cached at `../ext_data/<species>/kegg/<species>_list.txt`. KEGG
    (`https://rest.kegg.jp/list/<species>`) is only queried when no cached copy exists.

    Raises `requests.HTTPError` when KEGG answers with an error status; nothing is
    written to the cache in that case, so an error page is never mistaken for data.
    """
    # Imported locally (as `os` already was): the notebook's import cell carries no
    # `#| export` directive, so module-level imports cannot be relied on here.
    import os
    import requests

    # make sure the directory exists to hold these data
    dir_path = '../ext_data/'+species+'/kegg'
    ensure_dir_path_exists(dir_path = dir_path)
    file_path = dir_path+'/'+species+'_list.txt'

    # only retrieve if there isn't a local copy
    if os.path.exists(file_path):
        with open(file_path, 'r') as f:
            return f.read()

    # A timeout keeps a stalled connection from hanging the notebook indefinitely.
    r = requests.get('https://rest.kegg.jp/list/'+species, timeout = 60)
    r.raise_for_status()
    r_text = r.text
    # 'w' rather than 'a': the cache file must hold exactly one copy of the response.
    with open(file_path, 'w') as f:
        f.write(r_text)
    return r_text

In [None]:
#| export
def mkdf_kegg_species_list(kegg_species_list = None):
    """
    Parse the text of a KEGG gene list into a DataFrame.

    `kegg_species_list` is the tab-separated text returned by `get_kegg_species_list()`,
    one gene per line. When it is `None` (the default) the *Zea mays* list is fetched
    with `get_kegg_species_list(species = 'zma')`.

    Returns a DataFrame whose first four columns are named `gene`, `seq_type`,
    `chromosome_positon` and `gene_type`. Rows without a third field (for example
    the empty line after a trailing newline) are dropped; the original row index is kept.
    """
    import pandas as pd

    # Resolve the default lazily. A call placed in the signature is evaluated when the
    # function is defined, i.e. it would read the cache or hit the network on import.
    if kegg_species_list is None:
        kegg_species_list = get_kegg_species_list(species = 'zma')

    column_names = ['gene', 'seq_type', 'chromosome_positon', 'gene_type']
    kegg_list = pd.DataFrame([e.split('\t') for e in kegg_species_list.split('\n')])
    # clean up names: positional columns 0..3 get descriptive labels
    kegg_list = kegg_list.rename(columns = dict(enumerate(column_names)))

    # Input with fewer than four fields per line (e.g. an empty string) yields fewer
    # columns; add the missing ones so the filter below never hits a missing column.
    for name in column_names:
        if name not in kegg_list.columns:
            kegg_list[name] = None

    kegg_list = kegg_list.loc[kegg_list.chromosome_positon.notna(), ]
    return kegg_list

The KEGG list contains a gene id that can be used to download further information. The chromosomal position will be key to match up collected SNPs to genes.

In [None]:
# Gene table for maize, built from the locally cached (or freshly downloaded) KEGG list.
kegg_list_zma = mkdf_kegg_species_list(get_kegg_species_list('zma'))

There are also entries that don't start with a chromosome number. These are non-nuclear sequences from the plastid (Pltd) and mitochondrion (MT), plus entries whose position is listed as `Unknown`.

In [None]:
# Raw string: in a plain string literal `\D` is an invalid escape sequence, which newer
# Python versions warn about. The pattern itself is unchanged (starts with a non-digit).
non_digit_start = re.compile(r'\D.+')
# Unique position strings that do not begin with a chromosome number; sorted so the
# list is the same on every run.
no_chrm = sorted({e for e in kegg_list_zma['chromosome_positon'] if non_digit_start.match(e)})
kegg_list_zma.loc[kegg_list_zma.chromosome_positon.isin(no_chrm), ]

Unnamed: 0,gene,seq_type,chromosome_positon,gene_type
35639,zma:845199,CDS,Pltd:complement(89..1150),"psbA, ZemaCp002; photosystem II protein D1"
35640,zma:845256,tRNA,Pltd:complement(1386..3946),"trnK, ZemaCt121; tRNA-Lys"
35641,zma:845178,CDS,Pltd:complement(1674..3215),"matK, ZemaCp003; maturase K"
35642,zma:845232,CDS,Pltd:complement(4491..5604),"rps16, ZemaCp004; ribosomal protein S16"
35643,zma:845265,tRNA,Pltd:complement(6773..6844),"trnQ, ZemaCt122; tRNA-Gln"
...,...,...,...,...
37707,zma:118475995,rRNA,Unknown,28S ribosomal RNA
37708,zma:118475996,rRNA,Unknown,5.8S ribosomal RNA
37709,zma:5951366,CDS,MT,pBMSmt19_00005; hypothetical protein
37710,zma:5951368,CDS,MT,pBMSmt19_00010; hypothetical protein


In [None]:
#| export

def download_kegg_gene(kegg_gene = 'zma:103644366', **kwargs):
    """
    Downloads a KEGG gene entry if it does not exist locally.

    The entry is written to `../ext_data/<species>/kegg/gene_entries/<name>.txt`, where
    `<species>` is the part of `kegg_gene` before the first `:` and `<name>` is
    `kegg_gene` with `:` replaced by `_`.

    Optional keyword arguments:
    - `sleep_for`: numeric number of seconds to sleep *before* sending the request.
      The sleep only happens when a request is actually made, which is useful for
      controlling the rate of requests sent to the API.
    - `timeout`: seconds passed to `requests.get` (default 60).

    Raises `requests.HTTPError` when KEGG answers with an error status. Nothing is
    written in that case, so the entry is retried on a later call instead of an
    error page being cached as if it were the gene entry.
    """
    # Imported locally: the notebook's import cell carries no `#| export` directive,
    # so module-level imports cannot be relied on here.
    import os
    import time
    import requests

    species = kegg_gene.split(':')[0]
    dir_path = '../ext_data/'+species+'/kegg/gene_entries/'
    ensure_dir_path_exists(dir_path = dir_path)

    kegg_gene_safename = kegg_gene.replace(':', '_') # name that's safe for file names
    file_path = dir_path+kegg_gene_safename+'.txt'

    # only download if the file doesn't already exist
    if os.path.exists(file_path):
        return

    # option to sleep for a given amount of time so that the api isn't accessed too much
    # sleeping here means that we only sleep if there will be a request to download
    if 'sleep_for' in kwargs:
        time.sleep(kwargs['sleep_for'])

    r = requests.get('https://rest.kegg.jp/get/'+kegg_gene, timeout = kwargs.get('timeout', 60))
    r.raise_for_status()
    # 'w' rather than 'a': the cache file must hold exactly one copy of the entry.
    with open(file_path, 'w') as f:
        f.write(r.text)

In [None]:
#| export
def read_kegg_gene(kegg_gene = 'zma:103644366'):
    """
    Return the text of a KEGG gene entry from the local cache.

    If no cached file exists for `kegg_gene`, `download_kegg_gene()` is called first
    and the file is read afterwards.
    """
    import os

    # The species code is everything before the first ':' of the gene id.
    organism = kegg_gene.partition(':')[0]
    cache_dir = '../ext_data/'+organism+'/kegg/gene_entries/'
    ensure_dir_path_exists(dir_path = cache_dir)

    # ':' is swapped for '_' to get a name that is safe to use as a file name.
    entry_path = cache_dir+kegg_gene.replace(':', '_')+'.txt'
    if not os.path.exists(entry_path):
        download_kegg_gene(kegg_gene)

    with open(entry_path, 'r') as handle:
        return handle.read()

Example usage for one gene. The species is inferred from the prefix of the gene id; if the entry is not already cached under `ext_data`, it is downloaded automatically with `download_kegg_gene()`.

In [None]:
# `print` renders the entry's line breaks, unlike the escaped repr of a bare string.
print(read_kegg_gene('zma:103644366'))

ENTRY       103644366         CDS       T01088
NAME        (RefSeq) uncharacterized protein LOC103644366
ORTHOLOGY   K15032  mTERF domain-containing protein, mitochondrial
ORGANISM    zma  Zea mays (maize)
BRITE       KEGG Orthology (KO) [BR:zma00001]
             09180 Brite Hierarchies
              09182 Protein families: genetic information processing
               03012 Translation factors [BR:zma03012]
                103644366
               03029 Mitochondrial biogenesis [BR:zma03029]
                103644366
            Translation factors [BR:zma03012]
             Eukaryotic type
              Release factors
               103644366
            Mitochondrial biogenesis [BR:zma03029]
             Mitochondrial DNA transcription, translation, and replication factors
              Mitochondrial transcription and translation factors
               Mitochondrial translation factors
                103644366
POSITION    1:34607..40208
MOTIF       Pfam: mTERF
DBLINKS     NCBI-Ge

To cache all genes on KEGG for an organism, loop over all entries in the KEGG list for that species.

In [None]:
for kegg_gene in tqdm(kegg_list_zma.gene):
    try:
        # A random 0.5-1.5 s pause is requested before each real download;
        # genes that are already cached are skipped without sleeping.
        download_kegg_gene(kegg_gene = kegg_gene,
                           sleep_for = np.random.uniform(0.5, 1.5))
    except Exception as err:
        # `Exception` instead of a bare `except` so that interrupting the kernel still
        # stops the loop; the error itself is reported so failures can be diagnosed.
        print('Problem with '+kegg_gene+': '+repr(err))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 37712/37712 [00:37<00:00, 1009.32it/s]


In [None]:
#| hide
# Runs nbdev's export step. Presumably this writes the cells marked `#| export` to the
# module named by `#| default_exp` at the top of the notebook (confirm against the nbdev docs).
import nbdev; nbdev.nbdev_export()