
Commit

Add files via upload
HarshitGupta11 committed Jul 24, 2019
1 parent 17aca57 commit 9efb04e
Showing 19 changed files with 1,729 additions and 0 deletions.
154 changes: 154 additions & 0 deletions access_data_rest.py
@@ -0,0 +1,154 @@
import json
import requests
import progressbar
import sys

def update_protein(gene_seq,gene):
    # Fetch the protein sequence for a single gene ID from the Ensembl REST API,
    # retrying up to twice; on failure the gene is recorded with an empty sequence.
    t=0
    while t!=2:
        try:
            server = "https://rest.ensembl.org"
            ext = "/sequence/id/"+str(gene)+"?type=protein;multiple_sequences=1"

            r = requests.get(server+ext, headers={"Content-Type": "application/json"})

            if not r.ok:
                r.raise_for_status()
                sys.exit()
            r=r.json()
            if len(r)==1:
                r=dict(r[0])
                gene_seq[gene]=str(r["seq"])
                return
            else:
                # Several sequences returned: keep the longest one.
                maxi=0
                maxlen=0
                for i in range(len(r)):
                    m=dict(r[i])
                    if len(m["seq"])>maxlen:
                        maxi=i
                        maxlen=len(m["seq"])
                r=dict(r[maxi])
                gene_seq[gene]=str(r["seq"])
                return
        except Exception:
            t+=1
            continue
    gene_seq[gene]=""

def update_rest_protein(data):
    # Gene IDs whose protein sequences were not found locally are listed in processed/not_found.json.
    gids={}
    with open("processed/not_found.json","r") as file:
        gids=dict(json.load(file))

    gids=list(gids.keys())

    geneseq={}

    server = "https://rest.ensembl.org"
    ext = "/sequence/id?type=protein"
    headers={"Content-Type": "application/json", "Accept": "application/json"}

    # Batch-POST the IDs in chunks of 50; any remainder is picked up by the per-gene fallback below.
    for i in progressbar.progressbar(range(0,len(gids)-50,50)):
        ids=dict(ids=list(gids[i:i+50]))
        while True:
            # Retry the batch until it succeeds.
            try:
                r = requests.post(server+ext, headers=headers, data=json.dumps(ids))
                if not r.ok:
                    r.raise_for_status()
                gs=r.json()
                tgs={}
                for g in gs:
                    tgs[g["query"]]=g["seq"]
                geneseq.update(tgs)
                break
            except Exception as e:
                print("Error:",e)
                continue

    data.update(geneseq)
    # Fetch anything the batched requests missed one gene at a time.
    for gene in gids:
        if gene not in data:
            print(gene)
            update_protein(data,gene)

    print("Gene Sequences Updated Successfully")
    return data

def update(gene_seq,gene):
    # Fetch the CDS sequence for a single gene ID from the Ensembl REST API,
    # retrying up to twice; on failure the gene is recorded with an empty sequence.
    t=0
    while t!=2:
        try:
            server = "https://rest.ensembl.org"
            ext = "/sequence/id/"+str(gene)+"?type=cds;multiple_sequences=1"

            r = requests.get(server+ext, headers={"Content-Type": "application/json"})

            if not r.ok:
                r.raise_for_status()
                sys.exit()
            r=r.json()
            if len(r)==1:
                r=dict(r[0])
                gene_seq[gene]=str(r["seq"])
                return
            else:
                # Several sequences returned: keep the longest one.
                maxi=0
                maxlen=0
                for i in range(len(r)):
                    m=dict(r[i])
                    if len(m["seq"])>maxlen:
                        maxi=i
                        maxlen=len(m["seq"])
                r=dict(r[maxi])
                gene_seq[gene]=str(r["seq"])
                return
        except Exception:
            t+=1
            continue
    gene_seq[gene]=""

def update_rest(data,fname):
    # Gene IDs whose CDS sequences were not found locally are listed in processed/not_found_<fname>.json.
    gids={}
    with open("processed/not_found_"+fname+".json","r") as file:
        gids=dict(json.load(file))

    gids=list(gids.keys())

    geneseq={}

    server = "https://rest.ensembl.org"
    ext = "/sequence/id?type=cds"
    headers={"Content-Type": "application/json", "Accept": "application/json"}

    # Batch-POST the IDs in chunks of 50; any remainder is picked up by the per-gene fallback below.
    for i in progressbar.progressbar(range(0,len(gids)-50,50)):
        ids=dict(ids=list(gids[i:i+50]))
        while True:
            # Retry the batch until it succeeds.
            try:
                r = requests.post(server+ext, headers=headers, data=json.dumps(ids))
                if not r.ok:
                    r.raise_for_status()
                gs=r.json()
                tgs={}
                for g in gs:
                    tgs[g["query"]]=g["seq"]
                geneseq.update(tgs)
                break
            except Exception as e:
                print("Error:",e)
                continue

    data.update(geneseq)
    # Fetch anything the batched requests missed one gene at a time.
    for gene in gids:
        if gene not in data:
            print(gene)
            update(data,gene)

    print("Gene Sequences Updated Successfully")
    return data
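
A minimal usage sketch for these helpers (the species/file names below are placeholders, not part of the commit): update_rest fills in missing CDS sequences for the IDs listed in processed/not_found_<fname>.json, update_rest_protein does the same for protein sequences from processed/not_found.json, and both fall back to per-gene GET requests for anything the batched POSTs miss.

from access_data_rest import update_rest, update_rest_protein

gene_seq = {}                                        # gene_stable_id -> sequence
gene_seq = update_rest(gene_seq, "homo_sapiens")     # hypothetical fname; reads processed/not_found_homo_sapiens.json
gene_seq = update_rest_protein(gene_seq)             # reads processed/not_found.json
print(len(gene_seq), "sequences fetched")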
15 changes: 15 additions & 0 deletions create_genome_maps.py
@@ -0,0 +1,15 @@
import pandas as pd
import requests
import sys
import pickle
from get_data import get_data_genome

dir_g="data"
cmap,cimap,ld,ldg,a,d=get_data_genome(dir_g)

data=dict(cmap=cmap,cimap=cimap,ld=ld,ldg=ldg,a=a,d=d)

with open("genome_maps","wb") as file:
    pickle.dump(data,file)

print("Genome Maps Created Successfully.")
207 changes: 207 additions & 0 deletions dist_matrix

Large diffs are not rendered by default.

96 changes: 96 additions & 0 deletions finalize_dataset.py
@@ -0,0 +1,96 @@
import pandas as pd
import numpy as np
import json
import gc
import pickle
import os
import sys
from tree_data import create_tree_data
from process_negative import read_database_txt
from select_data import read_db_homology

def read_data_homology(dirname,nfname):
    lf=os.listdir(dirname)
    if len(lf)==0:
        print("No Files in the Directory!!!!!!!")
        sys.exit(1)
    a_h=[]
    d_h=[]
    for x in lf:
        df,n=read_db_homology(dirname,x)
        n=n.split()[0]
        try:
            indexes=np.load("processed/synteny_matrices/"+n+"_indexes.npy")
        except Exception:
            print("Incomplete data for:",n)
            continue
        df=df.loc[indexes]
        a_h.append(df)
        d_h.append(n)
    # Read the negative (non-homolog) dataset.
    df=read_database_txt(nfname)
    indexes=np.load("processed/synteny_matrices/"+nfname.split(".")[0]+"_indexes.npy")
    df=df.loc[indexes]
    a_h.append(df)
    d_h.append(nfname.split(".")[0])
    return a_h,d_h

def prepare_features(a_h,d_h,sptree,label):
    rows=[]
    smg_name="_synteny_matrices_global.npy"
    sml_name="_synteny_matrices_local.npy"
    smi_name="_indexes.npy"
    dir_name="processed/synteny_matrices/"
    for i in range(len(a_h)):
        df=a_h[i]
        n=d_h[i]
        try:
            smg=np.load(dir_name+n+smg_name)
            sml=np.load(dir_name+n+sml_name)
            indexes=np.load(dir_name+n+smi_name)
        except Exception:
            print("Incomplete data for:",n)
            continue
        df=df.loc[indexes]

        branch_length_species,branch_length_homology_species,distance,dist_p_s,dist_p_hs=create_tree_data(sptree,df)
        assert len(branch_length_species)==len(df)
        assert len(sml)==len(distance)

        # Use a separate index for the inner loop so the outer loop variable is not shadowed.
        for j in range(len(df)):
            index=indexes[j]
            row=df.loc[index]
            r={}
            r["species"]=row["species"]
            r["homology_species"]=row["homology_species"]
            r["gene_stable_id"]=row["gene_stable_id"]
            r["homology_gene_stable_id"]=row["homology_gene_stable_id"]
            r["label"]=label[row["homology_type"]]
            r["global_alignment_matrix"]=smg[j]
            r["local_alignment_matrix"]=sml[j]
            r["index_homology_dataset"]=index
            r["bls"]=branch_length_species[j]
            r["blhs"]=branch_length_homology_species[j]
            r["dis"]=distance[j]
            r["dps"]=dist_p_s[j]
            r["dphs"]=dist_p_hs[j]
            rows.append(r)
    return rows

def main():
    arg=sys.argv
    nfname=arg[-1]
    a_h,d_h=read_data_homology("data_homology",nfname)
    labels=dict(ortholog_one2one=1,
                other_paralog=0,
                non_homolog=2,
                ortholog_one2many=1,
                ortholog_many2many=1,
                within_species_paralog=0,
                gene_split=4)
    rows=prepare_features(a_h,d_h,"species_tree.tree",labels)
    with open("dataset","wb") as file:
        pickle.dump(rows,file)
    print("Dataset_Finalized")

if __name__=="__main__":
    main()
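
A short sketch of inspecting the pickled dataset written by main; the keys follow the r dictionary assembled in prepare_features, and the alignment matrices are assumed to be NumPy arrays since they come from np.load:

import pickle

with open("dataset","rb") as file:
    rows = pickle.load(file)

print(len(rows), "examples")
example = rows[0]
print(example["species"], example["homology_species"], example["label"])
print("global alignment matrix shape:", example["global_alignment_matrix"].shape)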
26 changes: 26 additions & 0 deletions get_data.py
@@ -0,0 +1,26 @@
import sys
import os
from read_data import read_data_genome,read_data_homology
from process_data import list_dict_genomes,create_chromosome_maps

def get_data_genome(dir):
    a=[]
    d={}
    ld=[]
    ldg=[]
    a,d=read_data_genome(dir,a,d)
    assert len(a)==len(d)
    print("Creating Maps:")
    ld,ldg=list_dict_genomes(a,d)
    cmap,cimap=create_chromosome_maps(a,d)
    assert len(ld)==len(ldg)
    for i in range(len(ld)):
        assert len(ld[i])==len(ldg[i])
    return cmap,cimap,ld,ldg,a,d

def get_data_homology(dir):
    a_h=[]
    d_h={}
    a_h,d_h=read_data_homology(dir)
    assert len(a_h)==len(d_h)
    return a_h,d_h
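
A minimal sketch of calling these loaders directly, assuming genome files under data and homology files under data_homology (the directory names follow the other scripts in this commit):

from get_data import get_data_genome, get_data_homology

cmap, cimap, ld, ldg, a, d = get_data_genome("data")
a_h, d_h = get_data_homology("data_homology")
print(len(a), "genomes and", len(a_h), "homology tables loaded")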
53 changes: 53 additions & 0 deletions neighbor_genes.py
@@ -0,0 +1,53 @@
import sys
import numpy as np
import pandas as pd
import json
import os
import gc
import pickle
from select_data import read_db_homology
from process_data import create_data_homology_ls

def read_genome_maps():
    data={}
    with open("genome_maps","rb") as file:
        data=pickle.load(file)
    cmap=data["cmap"]
    cimap=data["cimap"]
    ld=data["ld"]
    ldg=data["ldg"]
    a=data["a"]
    d=data["d"]
    return a,d,ld,ldg,cmap,cimap

def read_data_homology(dirname):
    lf=os.listdir(dirname)
    if len(lf)==0:
        print("No Files in the Directory!!!!!!!")
        sys.exit(1)
    a_h=[]
    d_h=[]
    for x in lf:
        df,n=read_db_homology(dirname,x)
        n=n.split()[0]
        try:
            indexes=np.load("processed/"+n+"_selected_indexes.npy")
        except Exception:
            print("Incomplete data for:",n)
            continue
        df=df.loc[indexes]
        print(len(df))
        a_h.append(df)
        d_h.append(n)
    return a_h,d_h

def main():
    a,d,ld,ldg,cmap,cimap=read_genome_maps()
    print("Genome Maps Loaded.")
    a_h,d_h=read_data_homology("data_homology")
    print("Data Read.")
    n=3
    _=create_data_homology_ls(a_h,d_h,n,a,d,ld,ldg,cmap,cimap,1)
    print("Neighbor Genes Found and Saved Successfully:)")

if __name__=="__main__":
    main()
