In [None]:
import os
import requests
import pandas as pd
import numpy as np
import ftplib
import json
from ftplib import FTP
import gzip
from io import StringIO,BytesIO

In [None]:
# Locate the Dropbox folder through Dropbox's info.json (Windows 10 layout) and
# make it the working directory, so that the relative './Ontology_Info/...'
# paths used by the following cells resolve inside the Dropbox folder.
dropbox_path_file = ''.join(['c:/Users/',os.getlogin(),'/AppData/Local/Dropbox/info.json'])
try:
    with open(dropbox_path_file) as data_file:
        data = json.load(data_file)
except (OSError, ValueError):
    # OSError: info.json is missing or unreadable for the logged-in user.
    # ValueError: the file is not valid JSON (json.JSONDecodeError subclasses it).
    # Only these expected failures trigger the fallback to the hard-coded
    # profile below; anything else (e.g. KeyboardInterrupt) now propagates
    # instead of being silently swallowed by a bare `except:`.
    with open('C:/Users/Surbhi/AppData/Local/Dropbox/info.json') as data_file:
        data = json.load(data_file)
# info.json stores the folder of the personal account under personal -> path.
dropbox_path = data['personal']['path']
os.chdir(dropbox_path)

In [None]:
# Gene Ontology: download gene2go.gz from NCBI into ./Ontology_Info/Gene2GO.gz.
# Full URL of the file, kept for reference; the transfer below uses ftplib.
gene2go_url = 'ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz'
# Both the FTP session and the output file are context-managed so they are
# closed (and the file flushed) even if the transfer fails part-way.
with FTP('ftp.ncbi.nlm.nih.gov') as ftp:   # connect to host, default port
    ftp.login()                            # anonymous login
    ftp.cwd('./gene/DATA/')                # move to target directory
    with open('./Ontology_Info/Gene2GO.gz', 'wb') as gene2go_file:
        ftp.retrbinary('RETR gene2go.gz', gene2go_file.write)
print('Downloaded gene2go.gz')

In [None]:
# Pathways: download bsid2info.gz and biosystems_gene.gz from NCBI BioSystems,
# keep only the human and mouse pathways (NCBI taxids 9606 and 10090), attach
# the genes of each pathway and save the result as ./Ontology_Info/bsid2info.txt.
def _fetch_gz_rows(ftp_conn, filename):
    """Download a gzipped tab-separated file over FTP and return its rows.

    Parameters
    ----------
    ftp_conn : ftplib.FTP
        Logged-in connection already positioned in the directory of `filename`.
    filename : str
        Name of the .gz file to retrieve from the current FTP directory.

    Returns
    -------
    list of list of str
        One list of tab-separated fields per non-empty line of the file.
    """
    buffer = BytesIO()
    ftp_conn.retrbinary('RETR ' + filename, buffer.write)
    buffer.seek(0)
    with gzip.open(buffer) as gz_file:
        # Non-ASCII bytes are dropped rather than raising a decode error.
        text = gz_file.read().decode('ascii', 'ignore')
    # Skip empty lines: the trailing newline of the file would otherwise
    # produce a junk one-field row at the end of the table.
    return [line.split('\t') for line in text.split('\n') if line]


# The FTP session is context-managed so it is closed even if a download fails.
with ftplib.FTP('ftp.ncbi.nih.gov') as ftp:
    ftp.login()
    ftp.cwd("/pub/biosystems/CURRENT")
    bsid2info = pd.DataFrame(_fetch_gz_rows(ftp, 'bsid2info.gz'))
    bsid2info.columns = ['bsid','source','accession','pathway_name','type of biosystem',
                    'taxonomic scope of biosystem','NCBI taxid','description of biosystem']
    # All fields are strings at this point, hence the string taxids.
    bsid2info = bsid2info[bsid2info['NCBI taxid'].isin(['9606','10090'])]
    print('Downloaded and processed bsid2info.gz')
    # download and parse biosystems_gene.gz
    bsid2gene = pd.DataFrame(_fetch_gz_rows(ftp, 'biosystems_gene.gz'))
    print('Downloaded biosystems_gene.gz')
bsid2gene.columns = ['bsid','geneid','score']
print('Merging bsid2info with bsid2gene...')
bsid2gene_human_mouse = bsid2info.merge(bsid2gene,on='bsid',how='inner')
# Positional selection (`.ix` was removed from pandas): columns 0, 1, 3, 6, 8
# of the merged frame are bsid, source, pathway_name, NCBI taxid and geneid.
bsid2gene_human_mouse.iloc[:,[0,1,3,6,8]].to_csv('./Ontology_Info/bsid2info.txt',sep = '\t',index = False)
print('Downloaded biosystems_gene.gz, extracted human and mouse pathways, and saved as bsid2info.txt')

In [None]:
# # Mouse Phenotype: get mouse phenotype data
# mp_obo_url = 'http://www.informatics.jax.org/downloads/reports/MPheno_OBO.ontology'
# mp_g2m_url = 'http://www.informatics.jax.org/downloads/reports/MGI_PhenoGenoMP.rpt'
# for i in [mp_obo_url,mp_g2m_url]:
#     file_content = requests.get(i).text
#     filename = i.split('/')[-1]
#     with open(''.join(['./Ontology_Info/',filename]),'wb') as f:
#         file_content = file_content.encode('utf-8')
#         f.write(file_content)
#     print('Downloaded ' + filename)

In [None]:
# This code is used to process MP, but it is currently problematic and needs to be fixed later.
# TODO::
# parse MPheno_OBO
# parsed_MP = pd.DataFrame(columns = ['name'])
# with open('./Ontology_Info/MPheno_OBO.ontology','rb') as f:
#     fdata = f.readlines()
# fdata = [x.strip().decode() for x in fdata]
# for idx in range(len(fdata)):
#     if ('id:' in fdata[idx]) and ('alt id:' not in fdata[idx]):
#         if 'name: ' in fdata[idx+1]:
#             mp_name = fdata[idx+1].replace('name: ','')
#             mp_id = fdata[idx].replace('id: ','')
#             parsed_MP.ix[mp_id,0] = mp_name
            
# # Parse MGI_PhenoGenoMP into a MP to MGI table
# mgi2entrez = pd.read_table('./Ontology_Info/MGI2H_entrez.txt',index_col=0)
# gene2mp = pd.read_table('./Ontology_Info/MGI_PhenoGenoMP.rpt',index_col = -1,header=None)
# gene2mp = pd.DataFrame(gene2mp.ix[:,3])
# gene2mp.columns = ['MP_id']
# gene2mp['MGI'] = gene2mp.index
# gene2mp = gene2mp.groupby('MP_id').agg(lambda x: ','.join(x))
# gene2mp_unstacked = pd.DataFrame(columns=['MGI_id'])
# for idx in gene2mp.index:
#     mgi_ids = gene2mp.ix[idx,0].split(',')
#     tmp_df = pd.DataFrame(mgi_ids,index = [idx]*len(mgi_ids),columns=gene2mp_unstacked.columns)
#     gene2mp_unstacked = gene2mp_unstacked.append(tmp_df)
# Hgene2mp = gene2mp_unstacked.merge(mgi2entrez,left_on='MGI_id',right_index=True)
# Hgene2mp['MP'] = Hgene2mp.index
# Hgene2mp.set_index('Human homologue entrez',inplace = True)

In [None]:
# Fetch the ToppGene libraries (GO biological process, GO cellular component
# and mouse phenotype), save them locally, and compile them together with the
# pathway table into two files: gene -> term (Gene2Term.txt) and
# term id -> term name (TermID2TermName.txt).

# ToppGene category name -> short label stored in the 'Term_category' column.
toppgene_categories = {
    'GeneOntologyBiologicalProcess': 'BP',
    'GeneOntologyCellularComponent': 'CC',
    'MousePheno': 'MP',
}

# Download each library and save it locally.
for lib in toppgene_categories:
    response = requests.post("https://toppgene.cchmc.org/table.jsp", data={"category":lib})
    # Fail loudly on an HTTP error instead of saving an error page as data.
    response.raise_for_status()
    # Written as UTF-8 explicitly: pandas reads the file back as UTF-8 below,
    # whereas the platform default encoding may differ (e.g. on Windows).
    with open(''.join(['./Ontology_Info/',lib,'.txt']), 'w', encoding='utf-8') as f:
        f.write(response.text)

# Use ToppGene's MP library instead of parsing the MGI files directly.
# Frames are collected in a list and concatenated once (`DataFrame.append`
# was removed from pandas).
term_frames = []
for lib, category in toppgene_categories.items():
    tmp = pd.read_table(''.join(['./Ontology_Info/',lib,'.txt']),index_col = 0)
    # Positional selection (`.ix` was removed from pandas); the three picked
    # columns are renamed to the common term layout just below.
    tmp = tmp.iloc[:,[1,3,2]].copy()
    tmp.columns = ['Term_id','Term_category','Term_name']
    tmp['Term_category'] = category
    term_frames.append(tmp)
gene2go = pd.concat(term_frames)

# Compile the gene-to-term library of GO BP, GO CC, MP and Pathway terms,
# plus the term id to term name file.
# bsid2info.txt is indexed by its 5th column (geneid).
pathways = pd.read_table('./Ontology_Info/bsid2info.txt',index_col = 4)
# Keep only taxid 9606 rows; .copy() so the column added below is set on an
# independent frame rather than on a slice of `pathways`.
gene2pathwayid = pathways.loc[pathways['NCBI taxid'] == 9606, ['bsid','pathway_name']].copy()
gene2pathwayid['Category'] = 'Pathway'
gene2pathwayid = gene2pathwayid[['bsid','Category','pathway_name']]
gene2pathwayid.columns = ['Term_id','Term_category','Term_name']
gene2term = pd.concat([gene2go,gene2pathwayid])
# First two columns (Term_id, Term_category); the index is written as well.
gene2term.iloc[:,:2].to_csv('./Ontology_Info/Gene2Term.txt',sep = '\t')
# After set_index('Term_id') the remaining columns are Term_category and
# Term_name, so position 1 is Term_name.
termid2termname = gene2term.drop_duplicates().set_index('Term_id').iloc[:,1]
termid2termname.to_csv('./Ontology_Info/TermID2TermName.txt',sep = '\t')

In [None]:
# Build a gene symbol conversion table with human symbol, Entrez id and
# Ensembl id (from HGNC) plus mouse symbol and Entrez id (from MGI).
# Actual address: ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt
# The FTP session and the output file are context-managed so both are closed
# (and the file flushed) before it is read back below.
with FTP('ftp.ebi.ac.uk') as ftp:   # connect to host, default port
    ftp.login()                     # anonymous login
    ftp.cwd('./pub/databases/genenames/new/tsv/')  # move to target directory
    with open('./Ontology_Info/HGNC_complete.txt', 'wb') as hgnc_file:
        ftp.retrbinary('RETR hgnc_complete_set.txt', hgnc_file.write)
print('Downloaded HGNC_complete.txt from HGNC')

with open('./Ontology_Info/HGNC_complete.txt','rb') as f:
    hgnc_text = f.read().decode('ascii','ignore')  # non-ASCII bytes are dropped
# Skip empty lines: the trailing newline of the file would otherwise produce
# a junk row at the end of the table.
hgnc_rows = [line.split('\t') for line in hgnc_text.split('\n') if line]
df_data = pd.DataFrame(hgnc_rows)
# The first row of the file holds the column names.
df_data.columns = df_data.iloc[0]
df_data = df_data.set_index('symbol')
# Drop the header row (position 0) and keep the id columns. `.ix` was removed
# from pandas; .copy() avoids assigning a new column on a slice of df_data.
hgnc2mgi = df_data.iloc[1:][['hgnc_id','entrez_id','mgd_id','ensembl_gene_id']].copy()
hgnc2mgi['h_symbol'] = hgnc2mgi.index
hgnc2mgi.columns = ['hgnc_id','h_entrez_id','mgd_id','h_ensembl_gene_id','h_symbol']

# Download the MGI id to Entrez gene report.
mgi_response = requests.get('http://www.informatics.jax.org/downloads/reports/MGI_EntrezGene.rpt')
# Fail loudly on an HTTP error instead of parsing an error page as data.
mgi_response.raise_for_status()
print('Downloaded MGI_EntrezGene.rpt from MGI')
mgi2entrez = pd.read_table(StringIO(mgi_response.text), header=None, index_col = 0,dtype = str)
# With the first file column used as the index, positions 0 and 7 of the
# remaining columns are taken as the mouse symbol and mouse Entrez id.
mgi2entrez = mgi2entrez.iloc[:,[0,7]]
mgi2entrez.columns = ['mouse_symbol','m_entrez_id']
# Outer join keeps genes present in only one of the two sources.
hgnc2mgi = hgnc2mgi.merge(mgi2entrez,left_on='mgd_id',right_index=True,how='outer')
# Directory spelled 'Ontology_Info' like every other path in this notebook;
# the previous lower-case spelling only resolved on case-insensitive file systems.
hgnc2mgi.to_csv('./Ontology_Info/Master Gene Conversion Table.csv')
print('Gene symbol conversion complete and saved as ./Ontology_Info/Master Gene Conversion Table.csv')