Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
17aca57
commit 9efb04e
Showing
19 changed files
with
1,729 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,154 @@ | ||
import json | ||
import requests | ||
import progressbar | ||
import sys | ||
|
||
def update_protein(gene_seq,gene):
    """Fetch the protein sequence of ``gene`` from Ensembl into ``gene_seq``.

    Queries ``/sequence/id/<gene>?type=protein;multiple_sequences=1`` on
    ``rest.ensembl.org``. When several sequences come back, the longest one
    is kept (the first one wins on ties).

    Args:
        gene_seq: Mutable mapping updated in place; ``gene_seq[gene]`` is set
            to the chosen sequence, or to ``""`` when both attempts fail.
        gene: Identifier interpolated into the request URL via ``str()``.

    Returns:
        None. The result is communicated through ``gene_seq``.
    """
    server = "https://rest.ensembl.org"
    ext = "/sequence/id/" + str(gene) + "?type=protein;multiple_sequences=1"
    headers = {"Content-Type": "application/json"}

    # Two attempts in total, matching the original retry budget.
    for _attempt in range(2):
        try:
            response = requests.get(server + ext, headers=headers)
            # Raises requests.HTTPError for any 4xx/5xx answer.
            response.raise_for_status()
            records = response.json()
            # max() raises ValueError on an empty reply, which counts as a
            # failed attempt just like any other malformed payload.
            longest = max(records, key=lambda record: len(record["seq"]))
            gene_seq[gene] = str(longest["seq"])
            return
        except (requests.RequestException, ValueError, KeyError,
                IndexError, TypeError):
            # Network/HTTP failure or unexpected payload shape: try again.
            continue

    # Best effort: mark the gene as looked-up-but-unavailable.
    gene_seq[gene] = ""
|
||
def update_rest_protein(data):
    """Fill ``data`` with protein sequences for the IDs in ``not_found.json``.

    The keys of ``processed/not_found.json`` are posted to the Ensembl
    ``/sequence/id?type=protein`` endpoint in batches of 50. Every ID still
    absent from ``data`` afterwards is fetched individually through
    ``update_protein``.

    Args:
        data: Mutable mapping of identifier -> sequence, updated in place.

    Returns:
        The same ``data`` object, for convenience.
    """
    with open("processed/not_found.json", "r") as file:
        gids = list(dict(json.load(file)).keys())

    server = "https://rest.ensembl.org"
    ext = "/sequence/id?type=protein"
    headers = {"Content-Type": "application/json", "Accept": "application/json"}
    batch_size = 50
    # A permanently failing batch must not spin forever; after this many
    # attempts its IDs are left to the per-ID fallback below.
    max_attempts = 5

    geneseq = {}
    # Step through the whole list so the final (possibly partial) batch is
    # also sent in bulk instead of being fetched one ID at a time.
    for start in progressbar.progressbar(range(0, len(gids), batch_size)):
        payload = json.dumps({"ids": gids[start:start + batch_size]})
        for _attempt in range(max_attempts):
            try:
                r = requests.post(server + ext, headers=headers, data=payload)
                r.raise_for_status()
                geneseq.update({g["query"]: g["seq"] for g in r.json()})
                break
            except (requests.RequestException, ValueError, KeyError,
                    TypeError) as e:
                print("Error:", e)

    data.update(geneseq)
    for gene in gids:
        if gene not in data:
            # Not returned by any bulk request: fall back to a single lookup.
            print(gene)
            update_protein(data, gene)

    print("Gene Sequences Updated Successfully")
    return data
|
||
def update(gene_seq,gene):
    """Fetch the CDS sequence of ``gene`` from Ensembl into ``gene_seq``.

    Queries ``/sequence/id/<gene>?type=cds;multiple_sequences=1`` on
    ``rest.ensembl.org``. When several sequences come back, the longest one
    is kept (the first one wins on ties).

    Args:
        gene_seq: Mutable mapping updated in place; ``gene_seq[gene]`` is set
            to the chosen sequence, or to ``""`` when both attempts fail.
        gene: Identifier interpolated into the request URL via ``str()``.

    Returns:
        None. The result is communicated through ``gene_seq``.
    """
    server = "https://rest.ensembl.org"
    ext = "/sequence/id/" + str(gene) + "?type=cds;multiple_sequences=1"
    headers = {"Content-Type": "application/json"}

    # Two attempts in total, matching the original retry budget.
    for _attempt in range(2):
        try:
            response = requests.get(server + ext, headers=headers)
            # Raises requests.HTTPError for any 4xx/5xx answer.
            response.raise_for_status()
            records = response.json()
            # max() raises ValueError on an empty reply, which counts as a
            # failed attempt just like any other malformed payload.
            longest = max(records, key=lambda record: len(record["seq"]))
            gene_seq[gene] = str(longest["seq"])
            return
        except (requests.RequestException, ValueError, KeyError,
                IndexError, TypeError):
            # Network/HTTP failure or unexpected payload shape: try again.
            continue

    # Best effort: mark the gene as looked-up-but-unavailable.
    gene_seq[gene] = ""
|
||
def update_rest(data,fname):
    """Fill ``data`` with CDS sequences for the IDs in a ``not_found`` file.

    The keys of ``processed/not_found_<fname>.json`` are posted to the
    Ensembl ``/sequence/id?type=cds`` endpoint in batches of 50. Every ID
    still absent from ``data`` afterwards is fetched individually through
    ``update``.

    Args:
        data: Mutable mapping of identifier -> sequence, updated in place.
        fname: Suffix inserted into the ``not_found_<fname>.json`` file name.

    Returns:
        The same ``data`` object, for convenience.
    """
    with open("processed/not_found_" + fname + ".json", "r") as file:
        gids = list(dict(json.load(file)).keys())

    server = "https://rest.ensembl.org"
    ext = "/sequence/id?type=cds"
    headers = {"Content-Type": "application/json", "Accept": "application/json"}
    batch_size = 50
    # A permanently failing batch must not spin forever; after this many
    # attempts its IDs are left to the per-ID fallback below.
    max_attempts = 5

    geneseq = {}
    # Step through the whole list so the final (possibly partial) batch is
    # also sent in bulk instead of being fetched one ID at a time.
    for start in progressbar.progressbar(range(0, len(gids), batch_size)):
        payload = json.dumps({"ids": gids[start:start + batch_size]})
        for _attempt in range(max_attempts):
            try:
                r = requests.post(server + ext, headers=headers, data=payload)
                r.raise_for_status()
                geneseq.update({g["query"]: g["seq"] for g in r.json()})
                break
            except (requests.RequestException, ValueError, KeyError,
                    TypeError) as e:
                print("Error:", e)

    data.update(geneseq)
    for gene in gids:
        if gene not in data:
            # Not returned by any bulk request: fall back to a single lookup.
            print(gene)
            update(data, gene)

    print("Gene Sequences Updated Successfully")
    return data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
import pandas as pd | ||
import requests | ||
import sys | ||
import pickle | ||
from get_data import get_data_genome | ||
|
||
# Directory handed to get_data_genome as the source of the genome files.
dir_g = "data"

# get_data_genome returns the six structures in exactly this order.
cmap, cimap, ld, ldg, a, d = get_data_genome(dir_g)

# Bundle everything under stable keys so readers can unpack it by name.
data = {
    "cmap": cmap,
    "cimap": cimap,
    "ld": ld,
    "ldg": ldg,
    "a": a,
    "d": d,
}

# Persist the bundle as a pickle named "genome_maps" in the working directory.
with open("genome_maps", "wb") as file:
    pickle.dump(data, file)

print("Genome Maps Created Successfully.")
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
import pandas as pd | ||
import numpy as np | ||
import json | ||
import gc | ||
import pickle | ||
import os | ||
import sys | ||
from tree_data import create_tree_data | ||
from process_negative import read_database_txt | ||
from select_data import read_db_homology | ||
|
||
def read_data_homology(dirname,nfname):
    """Load the homology tables, each restricted to its saved row indexes.

    Every file in ``dirname`` is read with ``read_db_homology`` and filtered
    with ``processed/synteny_matrices/<name>_indexes.npy``. A file whose
    index array cannot be loaded is reported and skipped. The negative
    dataset ``nfname`` is appended last and must have its index file.

    Args:
        dirname: Directory whose entries are passed to ``read_db_homology``.
        nfname: File name of the negative dataset; the part before the first
            ``.`` is used as its dataset name.

    Returns:
        ``(a_h, d_h)``: the filtered tables and their dataset names, in the
        same order.

    Raises:
        SystemExit: With code 1 when ``dirname`` contains no entries.
    """
    lf = os.listdir(dirname)
    if not lf:
        print("No Files in the Directory!!!!!!!")
        sys.exit(1)

    matrices_dir = "processed/synteny_matrices/"
    a_h = []
    d_h = []
    for x in lf:
        df, n = read_db_homology(dirname, x)
        n = n.split()[0]
        try:
            indexes = np.load(matrices_dir + n + "_indexes.npy")
        except (OSError, ValueError, EOFError):
            print("Incomplete data for:", n)
            # Skip this file: without its own index array the filter below
            # would hit an undefined name or reuse the previous file's rows.
            continue
        a_h.append(df.loc[indexes])
        d_h.append(n)

    # Read the negative dataset; a missing index file here is a hard error.
    negative_name = nfname.split(".")[0]
    df = read_database_txt(nfname)
    indexes = np.load(matrices_dir + negative_name + "_indexes.npy")
    a_h.append(df.loc[indexes])
    d_h.append(negative_name)
    return a_h, d_h
|
||
def prepare_features(a_h,d_h,sptree,label):
    """Build one feature record per homology row across all datasets.

    For each dataset the global/local synteny matrices and the row indexes
    are loaded from ``processed/synteny_matrices/``. A dataset with a
    missing or unreadable array is reported and skipped. Tree-derived
    values come from ``create_tree_data(sptree, df)``.

    Args:
        a_h: Sequence of tables, one per dataset (indexed with ``.loc``).
        d_h: Dataset names parallel to ``a_h``; used as file-name prefixes.
        sptree: Passed through unchanged to ``create_tree_data``.
        label: Mapping from a row's ``homology_type`` to its label value.

    Returns:
        A list of dicts, one per row, in dataset order then row order.

    Raises:
        AssertionError: When the tree data or matrices do not line up with
            the number of selected rows.
        KeyError: When a row's ``homology_type`` is not a key of ``label``.
    """
    dir_name = "processed/synteny_matrices/"
    smg_name = "_synteny_matrices_global.npy"
    sml_name = "_synteny_matrices_local.npy"
    smi_name = "_indexes.npy"

    rows = []
    for df, n in zip(a_h, d_h):
        try:
            smg = np.load(dir_name + n + smg_name)
            sml = np.load(dir_name + n + sml_name)
            indexes = np.load(dir_name + n + smi_name)
        except (OSError, ValueError, EOFError):
            # Only load failures mean "incomplete"; anything else should
            # surface instead of being silently swallowed.
            print("Incomplete data for:", n)
            continue
        df = df.loc[indexes]

        (branch_length_species,
         branch_length_homology_species,
         distance,
         dist_p_s,
         dist_p_hs) = create_tree_data(sptree, df)
        assert len(branch_length_species) == len(df)
        assert len(sml) == len(distance)

        # ``pos`` is the position inside this dataset's arrays; a distinct
        # name avoids clobbering any outer loop variable.
        for pos in range(len(df)):
            index = indexes[pos]
            row = df.loc[index]
            rows.append({
                "species": row["species"],
                "homology_species": row["homology_species"],
                "gene_stable_id": row["gene_stable_id"],
                "homology_gene_stable_id": row["homology_gene_stable_id"],
                "label": label[row["homology_type"]],
                "global_alignment_matrix": smg[pos],
                "local_alignment_matrix": sml[pos],
                "index_homology_dataset": index,
                "bls": branch_length_species[pos],
                "blhs": branch_length_homology_species[pos],
                "dis": distance[pos],
                "dps": dist_p_s[pos],
                "dphs": dist_p_hs[pos],
            })
    return rows
|
||
def main():
    """Assemble the final dataset and pickle it to the file ``dataset``.

    The last command-line argument is taken as the negative-dataset file
    name. Homology tables are read from ``data_homology``, converted to
    feature rows with ``species_tree.tree``, and dumped with ``pickle``.
    """
    negative_file = sys.argv[-1]
    a_h, d_h = read_data_homology("data_homology", negative_file)

    # Homology type -> label value used in the feature rows.
    labels = {
        "ortholog_one2one": 1,
        "other_paralog": 0,
        "non_homolog": 2,
        "ortholog_one2many": 1,
        "ortholog_many2many": 1,
        "within_species_paralog": 0,
        "gene_split": 4,
    }

    rows = prepare_features(a_h, d_h, "species_tree.tree", labels)
    with open("dataset", "wb") as out:
        pickle.dump(rows, out)
    print("Dataset_Finalized")


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
import sys | ||
import os | ||
from read_data import read_data_genome,read_data_homology | ||
from process_data import list_dict_genomes,create_chromosome_maps | ||
|
||
def get_data_genome(dir):
    """Read the genome data under ``dir`` and derive its lookup structures.

    Calls ``read_data_genome`` with a fresh list and dict as accumulators,
    then ``list_dict_genomes`` and ``create_chromosome_maps`` on the result.
    Length consistency of the results is checked with assertions.

    Returns:
        ``(cmap, cimap, ld, ldg, a, d)``.
    """
    a, d = read_data_genome(dir, [], {})
    assert len(a) == len(d)

    print("Creating Maps:")
    ld, ldg = list_dict_genomes(a, d)
    cmap, cimap = create_chromosome_maps(a, d)

    # Both outputs must have the same length, overall and entry by entry.
    assert len(ld) == len(ldg)
    assert all(len(ld[i]) == len(ldg[i]) for i in range(len(ld)))
    return cmap, cimap, ld, ldg, a, d
|
||
def get_data_homology(dir):
    """Return ``(a_h, d_h)`` from ``read_data_homology(dir)``.

    Asserts that both returned collections have the same length.
    """
    loaded = read_data_homology(dir)
    a_h, d_h = loaded
    assert len(a_h) == len(d_h)
    return a_h, d_h
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
import sys | ||
import numpy as np | ||
import pandas as pd | ||
import json | ||
import os | ||
import gc | ||
import pickle | ||
from select_data import read_db_homology | ||
from process_data import create_data_homology_ls | ||
|
||
def read_genome_maps():
    """Load the pickled ``genome_maps`` file from the working directory.

    Returns:
        ``(a, d, ld, ldg, cmap, cimap)`` taken from the stored mapping's
        keys of the same names.
    """
    with open("genome_maps", "rb") as handle:
        maps = pickle.load(handle)

    # Look the keys up in the stored order, then hand them back in the
    # order callers unpack them.
    cmap, cimap, ld, ldg, a, d = (
        maps[key] for key in ("cmap", "cimap", "ld", "ldg", "a", "d")
    )
    return a, d, ld, ldg, cmap, cimap
|
||
def read_data_homology(dirname):
    """Load the homology tables, each restricted to its selected indexes.

    Every file in ``dirname`` is read with ``read_db_homology`` and filtered
    with ``processed/<name>_selected_indexes.npy``. A file whose index array
    cannot be loaded is reported and skipped. The row count of each kept
    table is printed.

    Args:
        dirname: Directory whose entries are passed to ``read_db_homology``.

    Returns:
        ``(a_h, d_h)``: the filtered tables and their dataset names, in the
        same order.

    Raises:
        SystemExit: With code 1 when ``dirname`` contains no entries.
    """
    lf = os.listdir(dirname)
    if not lf:
        print("No Files in the Directory!!!!!!!")
        sys.exit(1)

    a_h = []
    d_h = []
    for x in lf:
        df, n = read_db_homology(dirname, x)
        n = n.split()[0]
        try:
            indexes = np.load("processed/" + n + "_selected_indexes.npy")
        except (OSError, ValueError, EOFError):
            print("Incomplete data for:", n)
            # Skip this file: without its own index array the filter below
            # would hit an undefined name or reuse the previous file's rows.
            continue
        df = df.loc[indexes]
        print(len(df))
        a_h.append(df)
        d_h.append(n)
    return a_h, d_h
|
||
def main():
    """Load the genome maps and homology tables, then run the neighbour step.

    Delegates the actual work to ``create_data_homology_ls``; its return
    value is discarded.
    """
    a, d, ld, ldg, cmap, cimap = read_genome_maps()
    print("Genome Maps Loaded.")

    a_h, d_h = read_data_homology("data_homology")
    print("Data Read.")

    # NOTE(review): presumably the neighbourhood size — confirm against
    # create_data_homology_ls; the trailing 1 is passed through as-is.
    neighbours = 3
    create_data_homology_ls(a_h, d_h, neighbours, a, d, ld, ldg, cmap, cimap, 1)
    print("Neighbor Genes Found and Saved Successfully:)")


if __name__ == "__main__":
    main()
Oops, something went wrong.