In [1]:
import re
import time
import os

import pandas as pd
import pylab as plt
import numpy as np
import seaborn as sns

import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

import Bio
from Bio import Phylo, SeqIO, Entrez
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

import networkx as nx
import igraph
from ete3 import NCBITaxa

In [2]:
# Contact address reported to NCBI Entrez by Biopython.
# Set the ENTREZ_EMAIL environment variable to use your own address;
# when it is not set, the address originally used in this notebook is kept.
Entrez.email = os.environ.get('ENTREZ_EMAIL', 'burankova.y@gmail.com')

We use a truncated Tree of Life (TOL, https://itol.embl.de/), keeping only the Opisthokonta.

# Truncate TOL

In [177]:
# Read the Newick tree stored in the TOL data directory and print it as ASCII art.
tree = Phylo.read(file='../data/TOL/normal_newick.txt', format="newick")
Phylo.draw_ascii(tree=tree)

                     __________ Nanoarchaeum_equitans
                   _|
                  | |  _______ Pyrobaculum_aerophilum
                  | |_|
                  |   | _____ Aeropyrum_pernix
                  |   ||
                  |    |     __ Sulfolobus_solfataricus
                  |    |____|
              ____|         |_ Sulfolobus_tokodaii
             |    |
             |    |          , Thermoplasma_volcanium
             |    |  ________|
             |    | |        | Thermoplasma_acidophilum
             |    | |
             |    | |  ____ Archaeoglobus_fulgidus
             |    | |,|
             |    |_|||  _______ Halobacterium_sp._NRC-1
             |      |||_|
             |      ||  |   , Methanosarcina_acetivorans
             |      ||  |___|
             |      ||      | Methanosarcina_mazei
             |      ||
             |       |    , Pyrococcus_furiosus
  ___________|       | ___|
 |           |       ||   |, Pyrococcus_horikoshii
 |      

In [61]:
# List the internal (non-leaf) clades; their names are the targets that can be
# passed to common_ancestor below. "preorder" is the library's default order.
tree.get_nonterminals(order="preorder")

[Clade(),
 Clade(branch_length=0.88776),
 Clade(branch_length=0.43325, comment='100', name='Archaea'),
 Clade(branch_length=0.15444, comment='100', name='Archaea'),
 Clade(branch_length=0.12018, comment='100', name='Thermoprotei'),
 Clade(branch_length=0.09462, comment='94', name='Thermoprotei'),
 Clade(branch_length=0.3372, comment='100', name='Sulfolobus'),
 Clade(branch_length=0.11833, comment='99', name='Euryarchaeota'),
 Clade(branch_length=0.66151, comment='100', name='Thermoplasma'),
 Clade(branch_length=0.06815, comment='62', name='Euryarchaeota'),
 Clade(branch_length=0.10395, comment='100', name='Euryarchaeota'),
 Clade(branch_length=0.12801, comment='100', name='Euryarchaeota'),
 Clade(branch_length=0.30588, comment='100', name='Methanosarcina'),
 Clade(branch_length=0.04469, comment='51', name='Euryarchaeota'),
 Clade(branch_length=0.3622, comment='100', name='Pyrococcus'),
 Clade(branch_length=0.02239, comment='100', name='Pyrococcus'),
 Clade(branch_length=0.06284, commen

In [73]:
# Exploratory step: subtree rooted at the common ancestor of the clades named
# 'Eukaryota' and 'Archaea' (targets given as a single list).
small_tree = tree.common_ancestor(['Eukaryota', 'Archaea'])
Phylo.draw_ascii(tree=small_tree)

                         ____________ Nanoarchaeum_equitans
                       _|
                      | |  ________ Pyrobaculum_aerophilum
                      | |_|
                      |   |  ______ Aeropyrum_pernix
                      |   |_|
                      |     |     __ Sulfolobus_solfataricus
                      |     |____|
                ______|          |__ Sulfolobus_tokodaii
               |      |
               |      |            _ Thermoplasma_volcanium
               |      |  _________|
               |      | |         |_ Thermoplasma_acidophilum
               |      | |
               |      | |  _____ Archaeoglobus_fulgidus
               |      | |,|
               |      |_|||   ________ Halobacterium_sp._NRC-1
               |        |||__|
               |        ||   |   , Methanosarcina_acetivorans
               |        ||   |___|
               |        ||       | Methanosarcina_mazei
               |        ||
               |         

In [76]:
# Keep only the subtree of the clade named 'Eukaryota' and print it as ASCII art.
small_tree = tree.common_ancestor(['Eukaryota'])
Phylo.draw_ascii(tree=small_tree)

                        ____________________ Giardia_lamblia
                       |
                       |   _________________ Leishmania_major
_______________________|  |
                       |  |    ____________ Thalassiosira_pseudonana
                       |  |  ,|
                       |__|  ||      __________ Plasmodium_falciparum
                          |  ||_____|
                          |  |      |________ Cryptosporidium_hominis
                          |  |
                          |__|    ______________ Cyanidioschyzon_merolae
                             | __|
                             ||  |      __ Oryza_sativa
                             ||  |_____|
                             ||        |___ Arabidopsis_thaliana
                             ||
                              | ____________ Dictyostelium_discoideum
                              ||
                              ||       _________ Schizosaccharomyces_pombe
                              ||  

In [77]:
# Store the truncated tree on disk in Newick format.
# The call is the last expression of the cell, so its return value is displayed.
Phylo.write(trees=small_tree, file="../data/TOL/Eukaryota_tree.nwk", format="newick")

1

In [78]:
# Collect the leaf names of the truncated tree, with underscores turned into
# spaces, and write them to a text file, one name per line.
with open("../data/TOL/Eukaryota_tree_nodes.txt", 'w') as nodes_file:
    nodes_list = [leaf.name.replace('_', ' ') for leaf in small_tree.get_terminals()]
    nodes_file.writelines(f'{node_name}\n' for node_name in nodes_list)

# Download S. cerevisiae genes 

In [286]:
# Tab-separated yeast table; the Tf and Tg columns are used in the next cell.
yeast_data = pd.read_csv(
    '../data/yeast.txt',
    delimiter='\t',
)

In [288]:
# Every gene that occurs in either the Tf or the Tg column of the yeast table.
genes = set(yeast_data.Tf)
genes.update(yeast_data.Tg)

# Write the names in sorted order: a set of strings iterates in an order that can
# change between kernel sessions, which would make the saved file differ from
# run to run even though its content is the same.
with open("../data/genes_lists/yeast/S_cerevisiae_genes.txt", 'w') as f:
    f.write("\n".join(sorted(genes)))

In [289]:
# Map every gene name to None for now; the values are filled in later by the
# locus-ID lookup loop.
genes_dict = {gene_name: None for gene_name in genes}
# Total number of genes, used as the denominator of the progress output.
yeasts_genes = len(genes_dict)

In [290]:
# Names of the genes for which fetch() has issued a search request.
posted_genes = list()

In [213]:
YG_URL = "https://www.yeastgenome.org/search"


def fetch(gene_name, timeout=30):
    """Run a quick search for one gene on yeastgenome.org and return the raw page.

    Parameters
    ----------
    gene_name : str
        A key of ``genes_dict``.
    timeout : float, optional
        Seconds to wait for the server before the request is abandoned.

    Returns
    -------
    tuple
        ``(None, None)`` if ``genes_dict`` already holds a value for the gene,
        ``(None, gene_name)`` if the request failed,
        otherwise ``(response_body, gene_name)``.
    """
    # Skip genes that were resolved by a previous run of this cell.
    if genes_dict[gene_name]:
        return None, None

    posted_genes.append(gene_name)

    params = {
        'q': gene_name,
        'is_quick': 'true',
    }

    try:
        # Without a timeout a stalled connection would block this worker forever.
        page = requests.post(YG_URL, params=params, timeout=timeout).content
    except requests.RequestException:
        # A network error for one gene must not abort the whole batch; the gene
        # keeps its empty value in genes_dict and can be retried by re-running.
        page = None

    # Pause after each request so every worker throttles its own request rate.
    time.sleep(3)
    return page, gene_name


i = 1

# The context manager shuts the worker threads down once the loop is finished.
with ThreadPoolExecutor(max_workers=10) as pool:
    for page, gene_name in pool.map(fetch, genes_dict):
        try:
            if page is None:
                # Already resolved or failed request: nothing to parse.
                continue

            soup = BeautifulSoup(page, 'lxml')
            # The locus ID is read from the text of the first <script> element.
            locusID = re.search(r'(?<=locusId: ").*(?=",\n)', soup.find('script').text).group(0)
            genes_dict[gene_name] = locusID

        except (IndexError, TypeError, AttributeError):
            # Page without the expected script / locusId: leave the gene unresolved.
            continue

        finally:
            # Progress is reported for every gene, including skipped ones.
            print(f'{i} / {yeasts_genes} [{i/yeasts_genes}]')
            i += 1

0 / 4441 [0.0]
1 / 4441 [0.00022517451024544022]
2 / 4441 [0.00045034902049088043]
3 / 4441 [0.0006755235307363207]
4 / 4441 [0.0009006980409817609]
5 / 4441 [0.001125872551227201]
6 / 4441 [0.0013510470614726414]
7 / 4441 [0.0015762215717180816]
8 / 4441 [0.0018013960819635217]
9 / 4441 [0.002026570592208962]
10 / 4441 [0.002251745102454402]
11 / 4441 [0.0024769196126998424]
12 / 4441 [0.002702094122945283]
13 / 4441 [0.0029272686331907227]
14 / 4441 [0.003152443143436163]
15 / 4441 [0.003377617653681603]
16 / 4441 [0.0036027921639270434]
17 / 4441 [0.003827966674172484]
18 / 4441 [0.004053141184417924]
19 / 4441 [0.004278315694663364]
20 / 4441 [0.004503490204908804]
21 / 4441 [0.0047286647151542445]
22 / 4441 [0.004953839225399685]
23 / 4441 [0.005179013735645125]
24 / 4441 [0.005404188245890566]
25 / 4441 [0.005629362756136005]
26 / 4441 [0.0058545372663814455]
27 / 4441 [0.006079711776626886]
28 / 4441 [0.006304886286872326]
29 / 4441 [0.006530060797117767]
30 / 4441 [0.0067552353

356 / 4441 [0.08016212564737672]
357 / 4441 [0.08038730015762216]
358 / 4441 [0.0806124746678676]
359 / 4441 [0.08083764917811304]
360 / 4441 [0.08106282368835847]
361 / 4441 [0.08128799819860391]
362 / 4441 [0.08151317270884936]
363 / 4441 [0.0817383472190948]
364 / 4441 [0.08196352172934024]
365 / 4441 [0.08218869623958568]
366 / 4441 [0.08241387074983111]
367 / 4441 [0.08263904526007657]
368 / 4441 [0.082864219770322]
369 / 4441 [0.08308939428056744]
370 / 4441 [0.08331456879081288]
371 / 4441 [0.08353974330105832]
372 / 4441 [0.08376491781130375]
373 / 4441 [0.0839900923215492]
374 / 4441 [0.08421526683179464]
375 / 4441 [0.08444044134204008]
376 / 4441 [0.08466561585228552]
377 / 4441 [0.08489079036253096]
378 / 4441 [0.0851159648727764]
379 / 4441 [0.08534113938302185]
380 / 4441 [0.08556631389326728]
381 / 4441 [0.08579148840351272]
382 / 4441 [0.08601666291375816]
383 / 4441 [0.0862418374240036]
384 / 4441 [0.08646701193424905]
385 / 4441 [0.08669218644449449]
386 / 4441 [0.086

606 / 4441 [0.13645575320873676]
607 / 4441 [0.13668092771898221]
608 / 4441 [0.13690610222922764]
609 / 4441 [0.1371312767394731]
610 / 4441 [0.13735645124971854]
611 / 4441 [0.13758162575996397]
612 / 4441 [0.13780680027020942]
613 / 4441 [0.13803197478045484]
614 / 4441 [0.1382571492907003]
615 / 4441 [0.13848232380094574]
616 / 4441 [0.13870749831119117]
617 / 4441 [0.13893267282143662]
618 / 4441 [0.13915784733168204]
619 / 4441 [0.1393830218419275]
620 / 4441 [0.13960819635217295]
621 / 4441 [0.13983337086241837]
622 / 4441 [0.14005854537266382]
623 / 4441 [0.14028371988290925]
624 / 4441 [0.1405088943931547]
625 / 4441 [0.14073406890340012]
626 / 4441 [0.14095924341364557]
627 / 4441 [0.14118441792389103]
628 / 4441 [0.14140959243413645]
629 / 4441 [0.1416347669443819]
630 / 4441 [0.14185994145462733]
631 / 4441 [0.14208511596487278]
632 / 4441 [0.14231029047511823]
633 / 4441 [0.14253546498536365]
634 / 4441 [0.1427606394956091]
635 / 4441 [0.14298581400585453]
636 / 4441 [0.14

905 / 4441 [0.2037829317721234]
906 / 4441 [0.20400810628236885]
907 / 4441 [0.20423328079261427]
908 / 4441 [0.20445845530285972]
909 / 4441 [0.20468362981310514]
910 / 4441 [0.2049088043233506]
911 / 4441 [0.20513397883359605]
912 / 4441 [0.20535915334384147]
913 / 4441 [0.20558432785408692]
914 / 4441 [0.20580950236433235]
915 / 4441 [0.2060346768745778]
916 / 4441 [0.20625985138482325]
917 / 4441 [0.20648502589506867]
918 / 4441 [0.20671020040531413]
919 / 4441 [0.20693537491555955]
920 / 4441 [0.207160549425805]
921 / 4441 [0.20738572393605043]
922 / 4441 [0.20761089844629588]
923 / 4441 [0.20783607295654133]
924 / 4441 [0.20806124746678675]
925 / 4441 [0.2082864219770322]
926 / 4441 [0.20851159648727763]
927 / 4441 [0.20873677099752308]
928 / 4441 [0.20896194550776853]
929 / 4441 [0.20918712001801396]
930 / 4441 [0.2094122945282594]
931 / 4441 [0.20963746903850483]
932 / 4441 [0.20986264354875028]
933 / 4441 [0.21008781805899573]
934 / 4441 [0.21031299256924116]
935 / 4441 [0.210

1152 / 4441 [0.25940103580274715]
1153 / 4441 [0.25962621031299254]
1154 / 4441 [0.259851384823238]
1155 / 4441 [0.26007655933348345]
1156 / 4441 [0.2603017338437289]
1157 / 4441 [0.26052690835397435]
1158 / 4441 [0.26075208286421975]
1159 / 4441 [0.2609772573744652]
1160 / 4441 [0.26120243188471065]
1161 / 4441 [0.2614276063949561]
1162 / 4441 [0.26165278090520155]
1163 / 4441 [0.26187795541544695]
1164 / 4441 [0.2621031299256924]
1165 / 4441 [0.26232830443593785]
1166 / 4441 [0.2625534789461833]
1167 / 4441 [0.26277865345642876]
1168 / 4441 [0.26300382796667415]
1169 / 4441 [0.2632290024769196]
1170 / 4441 [0.26345417698716506]
1171 / 4441 [0.2636793514974105]
1172 / 4441 [0.26390452600765596]
1173 / 4441 [0.26412970051790136]
1174 / 4441 [0.2643548750281468]
1175 / 4441 [0.26458004953839226]
1176 / 4441 [0.2648052240486377]
1177 / 4441 [0.2650303985588831]
1178 / 4441 [0.26525557306912856]
1179 / 4441 [0.265480747579374]
1180 / 4441 [0.26570592208961946]
1181 / 4441 [0.2659310965998

1418 / 4441 [0.31929745552803424]
1419 / 4441 [0.3195226300382797]
1420 / 4441 [0.3197478045485251]
1421 / 4441 [0.31997297905877053]
1422 / 4441 [0.320198153569016]
1423 / 4441 [0.32042332807926144]
1424 / 4441 [0.3206485025895069]
1425 / 4441 [0.3208736770997523]
1426 / 4441 [0.32109885160999774]
1427 / 4441 [0.3213240261202432]
1428 / 4441 [0.32154920063048864]
1429 / 4441 [0.3217743751407341]
1430 / 4441 [0.3219995496509795]
1431 / 4441 [0.32222472416122494]
1432 / 4441 [0.3224498986714704]
1433 / 4441 [0.32267507318171584]
1434 / 4441 [0.3229002476919613]
1435 / 4441 [0.3231254222022067]
1436 / 4441 [0.32335059671245214]
1437 / 4441 [0.3235757712226976]
1438 / 4441 [0.32380094573294305]
1439 / 4441 [0.3240261202431885]
1440 / 4441 [0.3242512947534339]
1441 / 4441 [0.32447646926367935]
1442 / 4441 [0.3247016437739248]
1443 / 4441 [0.32492681828417025]
1444 / 4441 [0.32515199279441565]
1445 / 4441 [0.3253771673046611]
1446 / 4441 [0.32560234181490655]
1447 / 4441 [0.325827516325152]

1845 / 4441 [0.4154469714028372]
1846 / 4441 [0.41567214591308266]
1847 / 4441 [0.41589732042332805]
1848 / 4441 [0.4161224949335735]
1849 / 4441 [0.41634766944381896]
1850 / 4441 [0.4165728439540644]
1851 / 4441 [0.41679801846430986]
1852 / 4441 [0.41702319297455526]
1853 / 4441 [0.4172483674848007]
1854 / 4441 [0.41747354199504616]
1855 / 4441 [0.4176987165052916]
1856 / 4441 [0.41792389101553706]
1857 / 4441 [0.41814906552578246]
1858 / 4441 [0.4183742400360279]
1859 / 4441 [0.41859941454627336]
1860 / 4441 [0.4188245890565188]
1861 / 4441 [0.41904976356676427]
1862 / 4441 [0.41927493807700966]
1863 / 4441 [0.4195001125872551]
1864 / 4441 [0.41972528709750057]
1865 / 4441 [0.419950461607746]
1866 / 4441 [0.42017563611799147]
1867 / 4441 [0.42040081062823687]
1868 / 4441 [0.4206259851384823]
1869 / 4441 [0.42085115964872777]
1870 / 4441 [0.4210763341589732]
1871 / 4441 [0.42130150866921867]
1872 / 4441 [0.42152668317946407]
1873 / 4441 [0.4217518576897095]
1874 / 4441 [0.421977032199

2110 / 4441 [0.47511821661787884]
2111 / 4441 [0.4753433911281243]
2112 / 4441 [0.47556856563836974]
2113 / 4441 [0.4757937401486152]
2114 / 4441 [0.4760189146588606]
2115 / 4441 [0.47624408916910604]
2116 / 4441 [0.4764692636793515]
2117 / 4441 [0.47669443818959695]
2118 / 4441 [0.4769196126998424]
2119 / 4441 [0.4771447872100878]
2120 / 4441 [0.47736996172033325]
2121 / 4441 [0.4775951362305787]
2122 / 4441 [0.47782031074082415]
2123 / 4441 [0.4780454852510696]
2124 / 4441 [0.478270659761315]
2125 / 4441 [0.47849583427156045]
2126 / 4441 [0.4787210087818059]
2127 / 4441 [0.47894618329205135]
2128 / 4441 [0.4791713578022968]
2129 / 4441 [0.4793965323125422]
2130 / 4441 [0.47962170682278765]
2131 / 4441 [0.4798468813330331]
2132 / 4441 [0.48007205584327856]
2133 / 4441 [0.480297230353524]
2134 / 4441 [0.4805224048637694]
2135 / 4441 [0.48074757937401486]
2136 / 4441 [0.4809727538842603]
2137 / 4441 [0.48119792839450576]
2138 / 4441 [0.48142310290475115]
2139 / 4441 [0.4816482774149966]

2358 / 4441 [0.530961495158748]
2359 / 4441 [0.5311866696689934]
2360 / 4441 [0.5314118441792389]
2361 / 4441 [0.5316370186894843]
2362 / 4441 [0.5318621931997298]
2363 / 4441 [0.5320873677099752]
2364 / 4441 [0.5323125422202206]
2365 / 4441 [0.5325377167304661]
2366 / 4441 [0.5327628912407115]
2367 / 4441 [0.532988065750957]
2368 / 4441 [0.5332132402612024]
2369 / 4441 [0.5334384147714478]
2370 / 4441 [0.5336635892816933]
2371 / 4441 [0.5338887637919387]
2372 / 4441 [0.5341139383021842]
2373 / 4441 [0.5343391128124296]
2374 / 4441 [0.534564287322675]
2375 / 4441 [0.5347894618329205]
2376 / 4441 [0.5350146363431659]
2377 / 4441 [0.5352398108534114]
2378 / 4441 [0.5354649853636568]
2379 / 4441 [0.5356901598739022]
2380 / 4441 [0.5359153343841477]
2381 / 4441 [0.5361405088943931]
2382 / 4441 [0.5363656834046386]
2383 / 4441 [0.536590857914884]
2384 / 4441 [0.5368160324251294]
2385 / 4441 [0.5370412069353749]
2386 / 4441 [0.5372663814456203]
2387 / 4441 [0.5374915559558658]
2388 / 4441 [0

2608 / 4441 [0.587255122720108]
2609 / 4441 [0.5874802972303536]
2610 / 4441 [0.587705471740599]
2611 / 4441 [0.5879306462508445]
2612 / 4441 [0.5881558207610899]
2613 / 4441 [0.5883809952713352]
2614 / 4441 [0.5886061697815808]
2615 / 4441 [0.5888313442918262]
2616 / 4441 [0.5890565188020717]
2617 / 4441 [0.5892816933123171]
2618 / 4441 [0.5895068678225625]
2619 / 4441 [0.589732042332808]
2620 / 4441 [0.5899572168430534]
2621 / 4441 [0.5901823913532988]
2622 / 4441 [0.5904075658635443]
2623 / 4441 [0.5906327403737897]
2624 / 4441 [0.5908579148840352]
2625 / 4441 [0.5910830893942806]
2626 / 4441 [0.591308263904526]
2627 / 4441 [0.5915334384147715]
2628 / 4441 [0.5917586129250169]
2629 / 4441 [0.5919837874352624]
2630 / 4441 [0.5922089619455078]
2631 / 4441 [0.5924341364557532]
2632 / 4441 [0.5926593109659987]
2633 / 4441 [0.5928844854762441]
2634 / 4441 [0.5931096599864896]
2635 / 4441 [0.593334834496735]
2636 / 4441 [0.5935600090069804]
2637 / 4441 [0.5937851835172259]
2638 / 4441 [0.

2858 / 4441 [0.6435487502814682]
2859 / 4441 [0.6437739247917136]
2860 / 4441 [0.643999099301959]
2861 / 4441 [0.6442242738122045]
2862 / 4441 [0.6444494483224499]
2863 / 4441 [0.6446746228326954]
2864 / 4441 [0.6448997973429408]
2865 / 4441 [0.6451249718531862]
2866 / 4441 [0.6453501463634317]
2867 / 4441 [0.6455753208736771]
2868 / 4441 [0.6458004953839226]
2869 / 4441 [0.646025669894168]
2870 / 4441 [0.6462508444044134]
2871 / 4441 [0.6464760189146589]
2872 / 4441 [0.6467011934249043]
2873 / 4441 [0.6469263679351498]
2874 / 4441 [0.6471515424453952]
2875 / 4441 [0.6473767169556406]
2876 / 4441 [0.6476018914658861]
2877 / 4441 [0.6478270659761315]
2878 / 4441 [0.648052240486377]
2879 / 4441 [0.6482774149966224]
2880 / 4441 [0.6485025895068678]
2881 / 4441 [0.6487277640171133]
2882 / 4441 [0.6489529385273587]
2883 / 4441 [0.6491781130376042]
2884 / 4441 [0.6494032875478496]
2885 / 4441 [0.649628462058095]
2886 / 4441 [0.6498536365683405]
2887 / 4441 [0.6500788110785859]
2888 / 4441 [0

3110 / 4441 [0.7002927268633191]
3111 / 4441 [0.7005179013735645]
3112 / 4441 [0.7007430758838099]
3113 / 4441 [0.7009682503940554]
3114 / 4441 [0.7011934249043008]
3115 / 4441 [0.7014185994145463]
3116 / 4441 [0.7016437739247917]
3117 / 4441 [0.7018689484350371]
3118 / 4441 [0.7020941229452826]
3119 / 4441 [0.702319297455528]
3120 / 4441 [0.7025444719657735]
3121 / 4441 [0.7027696464760189]
3122 / 4441 [0.7029948209862643]
3123 / 4441 [0.7032199954965098]
3124 / 4441 [0.7034451700067552]
3125 / 4441 [0.7036703445170007]
3126 / 4441 [0.7038955190272461]
3127 / 4441 [0.7041206935374915]
3128 / 4441 [0.704345868047737]
3129 / 4441 [0.7045710425579824]
3130 / 4441 [0.7047962170682279]
3131 / 4441 [0.7050213915784733]
3132 / 4441 [0.7052465660887187]
3133 / 4441 [0.7054717405989642]
3134 / 4441 [0.7056969151092096]
3135 / 4441 [0.7059220896194551]
3136 / 4441 [0.7061472641297005]
3137 / 4441 [0.7063724386399459]
3138 / 4441 [0.7065976131501914]
3139 / 4441 [0.7068227876604368]
3140 / 4441 

3360 / 4441 [0.7565863544246791]
3361 / 4441 [0.7568115289349245]
3362 / 4441 [0.75703670344517]
3363 / 4441 [0.7572618779554154]
3364 / 4441 [0.7574870524656608]
3365 / 4441 [0.7577122269759063]
3366 / 4441 [0.7579374014861517]
3367 / 4441 [0.7581625759963972]
3368 / 4441 [0.7583877505066426]
3369 / 4441 [0.758612925016888]
3370 / 4441 [0.7588380995271335]
3371 / 4441 [0.7590632740373789]
3372 / 4441 [0.7592884485476245]
3373 / 4441 [0.7595136230578698]
3374 / 4441 [0.7597387975681152]
3375 / 4441 [0.7599639720783607]
3376 / 4441 [0.7601891465886061]
3377 / 4441 [0.7604143210988517]
3378 / 4441 [0.760639495609097]
3379 / 4441 [0.7608646701193424]
3380 / 4441 [0.761089844629588]
3381 / 4441 [0.7613150191398333]
3382 / 4441 [0.7615401936500789]
3383 / 4441 [0.7617653681603243]
3384 / 4441 [0.7619905426705696]
3385 / 4441 [0.7622157171808152]
3386 / 4441 [0.7624408916910606]
3387 / 4441 [0.7626660662013061]
3388 / 4441 [0.7628912407115515]
3389 / 4441 [0.7631164152217969]
3390 / 4441 [0.

3610 / 4441 [0.8128799819860392]
3611 / 4441 [0.8131051564962847]
3612 / 4441 [0.8133303310065301]
3613 / 4441 [0.8135555055167755]
3614 / 4441 [0.813780680027021]
3615 / 4441 [0.8140058545372664]
3616 / 4441 [0.8142310290475118]
3617 / 4441 [0.8144562035577573]
3618 / 4441 [0.8146813780680027]
3619 / 4441 [0.8149065525782482]
3620 / 4441 [0.8151317270884936]
3621 / 4441 [0.815356901598739]
3622 / 4441 [0.8155820761089845]
3623 / 4441 [0.8158072506192299]
3624 / 4441 [0.8160324251294754]
3625 / 4441 [0.8162575996397208]
3626 / 4441 [0.8164827741499662]
3627 / 4441 [0.8167079486602117]
3628 / 4441 [0.8169331231704571]
3629 / 4441 [0.8171582976807026]
3630 / 4441 [0.817383472190948]
3631 / 4441 [0.8176086467011934]
3632 / 4441 [0.8178338212114389]
3633 / 4441 [0.8180589957216843]
3634 / 4441 [0.8182841702319298]
3635 / 4441 [0.8185093447421752]
3636 / 4441 [0.8187345192524206]
3637 / 4441 [0.8189596937626661]
3638 / 4441 [0.8191848682729115]
3639 / 4441 [0.819410042783157]
3640 / 4441 [0

3861 / 4441 [0.8693987840576447]
3862 / 4441 [0.8696239585678901]
3863 / 4441 [0.8698491330781356]
3864 / 4441 [0.870074307588381]
3865 / 4441 [0.8702994820986264]
3866 / 4441 [0.8705246566088719]
3867 / 4441 [0.8707498311191173]
3868 / 4441 [0.8709750056293628]
3869 / 4441 [0.8712001801396082]
3870 / 4441 [0.8714253546498536]
3871 / 4441 [0.8716505291600991]
3872 / 4441 [0.8718757036703445]
3873 / 4441 [0.87210087818059]
3874 / 4441 [0.8723260526908354]
3875 / 4441 [0.8725512272010808]
3876 / 4441 [0.8727764017113263]
3877 / 4441 [0.8730015762215717]
3878 / 4441 [0.8732267507318171]
3879 / 4441 [0.8734519252420626]
3880 / 4441 [0.873677099752308]
3881 / 4441 [0.8739022742625535]
3882 / 4441 [0.8741274487727989]
3883 / 4441 [0.8743526232830443]
3884 / 4441 [0.8745777977932898]
3885 / 4441 [0.8748029723035352]
3886 / 4441 [0.8750281468137807]
3887 / 4441 [0.8752533213240261]
3888 / 4441 [0.8754784958342715]
3889 / 4441 [0.875703670344517]
3890 / 4441 [0.8759288448547624]
3891 / 4441 [0.

4111 / 4441 [0.9256924116190047]
4112 / 4441 [0.9259175861292501]
4113 / 4441 [0.9261427606394956]
4114 / 4441 [0.926367935149741]
4115 / 4441 [0.9265931096599865]
4116 / 4441 [0.9268182841702319]
4117 / 4441 [0.9270434586804773]
4118 / 4441 [0.9272686331907228]
4119 / 4441 [0.9274938077009682]
4120 / 4441 [0.9277189822112137]
4121 / 4441 [0.9279441567214591]
4122 / 4441 [0.9281693312317045]
4123 / 4441 [0.92839450574195]
4124 / 4441 [0.9286196802521954]
4125 / 4441 [0.9288448547624409]
4126 / 4441 [0.9290700292726863]
4127 / 4441 [0.9292952037829317]
4128 / 4441 [0.9295203782931772]
4129 / 4441 [0.9297455528034226]
4130 / 4441 [0.9299707273136681]
4131 / 4441 [0.9301959018239135]
4132 / 4441 [0.9304210763341589]
4133 / 4441 [0.9306462508444044]
4134 / 4441 [0.9308714253546498]
4135 / 4441 [0.9310965998648953]
4136 / 4441 [0.9313217743751407]
4137 / 4441 [0.9315469488853861]
4138 / 4441 [0.9317721233956316]
4139 / 4441 [0.931997297905877]
4140 / 4441 [0.9322224724161225]
4141 / 4441 [0

4363 / 4441 [0.9824363882008557]
4364 / 4441 [0.982661562711101]
4365 / 4441 [0.9828867372213466]
4366 / 4441 [0.983111911731592]
4367 / 4441 [0.9833370862418375]
4368 / 4441 [0.9835622607520829]
4369 / 4441 [0.9837874352623283]
4370 / 4441 [0.9840126097725738]
4371 / 4441 [0.9842377842828192]
4372 / 4441 [0.9844629587930647]
4373 / 4441 [0.9846881333033101]
4374 / 4441 [0.9849133078135555]
4375 / 4441 [0.985138482323801]
4376 / 4441 [0.9853636568340464]
4377 / 4441 [0.9855888313442919]
4378 / 4441 [0.9858140058545373]
4379 / 4441 [0.9860391803647827]
4380 / 4441 [0.9862643548750282]
4381 / 4441 [0.9864895293852736]
4382 / 4441 [0.9867147038955191]
4383 / 4441 [0.9869398784057645]
4384 / 4441 [0.9871650529160099]
4385 / 4441 [0.9873902274262554]
4386 / 4441 [0.9876154019365008]
4387 / 4441 [0.9878405764467463]
4388 / 4441 [0.9880657509569917]
4389 / 4441 [0.9882909254672371]
4390 / 4441 [0.9885160999774826]
4391 / 4441 [0.988741274487728]
4392 / 4441 [0.9889664489979735]
4393 / 4441 [0

In [216]:
# Number of genes that still have no locus ID (empty value in genes_dict).
i = sum(1 for locus_id in genes_dict.values() if not locus_id)
i

211

In [247]:
# search_genes: genes that fetch() has processed (requested, or skipped for lack of a locus ID).
# downloaded_genes: genes whose sequence has been appended to the FASTA file.
search_genes, downloaded_genes = [], []

In [250]:
# NOTE(review): this redefines fetch() from the locus-ID cell above; the name is
# kept so the cell still behaves the same when run in notebook order.
def fetch(gene_name, timeout=30):
    """Download the ``sequence_details`` document of one gene from yeastgenome.org.

    Parameters
    ----------
    gene_name : str
        A key of ``genes_dict``; its value is used as the locus ID in the URL.
    timeout : float, optional
        Seconds to wait for the server before the request is abandoned.

    Returns
    -------
    tuple
        ``(None, None)`` if the gene is already in ``downloaded_genes`` or has no
        locus ID, ``(None, gene_name)`` if the request failed,
        otherwise ``(response_body, gene_name)``.
    """
    if gene_name in downloaded_genes:
        return None, None

    if not genes_dict[gene_name]:
        # No locus ID was found for this gene, so there is nothing to request.
        search_genes.append(gene_name)
        return None, None

    # Build the URL with plain string formatting: os.path.join uses the OS path
    # separator and would produce backslashes on Windows.
    url = f'https://www.yeastgenome.org/backend/locus/{genes_dict[gene_name]}/sequence_details'

    try:
        # Without a timeout a stalled connection would block this worker forever.
        page = requests.get(url, timeout=timeout).content
        search_genes.append(gene_name)
    except requests.RequestException:
        # A network error for one gene must not abort the whole batch; the gene
        # stays out of downloaded_genes and is retried when the cell is re-run.
        page = None

    # Pause after each request so every worker throttles its own request rate.
    time.sleep(3)

    return page, gene_name


i = 1

# The context manager shuts the worker threads down once the loop is finished.
with ThreadPoolExecutor(max_workers=10) as pool:
    for page, gene_name in pool.map(fetch, genes_dict):
        try:
            if page is None:
                # Skipped gene or failed request: nothing to parse.
                continue

            soup = BeautifulSoup(page, 'lxml')

            # Text-based extraction: take what follows the first 'coding_dna'
            # marker and read the first '"residues": "' value after it.
            # NOTE(review): the body is presumably JSON - confirm before switching
            # to a JSON parser.
            dnas = soup.find('p').text.split('coding_dna')[1]
            dna = dnas.split('"residues": "')[1].split('",')[0]

            # Append mode: the records of all genes accumulate in one FASTA file.
            with open("../data/genes_lists/S_cerevisiae_genes_seq.fa", "a") as output_handle:
                SeqIO.write(SeqRecord(Seq(dna), id = gene_name, description='S.cerevisiae'), output_handle, "fasta")

            downloaded_genes.append(gene_name)

        except (IndexError, TypeError, AttributeError):
            # Response without the expected markers: leave the gene undownloaded.
            continue

        finally:
            # Progress is reported for every gene, including skipped ones.
            print(f'{i} / {yeasts_genes} [{i/yeasts_genes}]')
            i += 1

0 / 4441 [0.0]
1 / 4441 [0.00022517451024544022]
2 / 4441 [0.00045034902049088043]
3 / 4441 [0.0006755235307363207]
4 / 4441 [0.0009006980409817609]
5 / 4441 [0.001125872551227201]
6 / 4441 [0.0013510470614726414]
7 / 4441 [0.0015762215717180816]
8 / 4441 [0.0018013960819635217]
9 / 4441 [0.002026570592208962]
10 / 4441 [0.002251745102454402]
11 / 4441 [0.0024769196126998424]
12 / 4441 [0.002702094122945283]
13 / 4441 [0.0029272686331907227]
14 / 4441 [0.003152443143436163]
15 / 4441 [0.003377617653681603]
16 / 4441 [0.0036027921639270434]
17 / 4441 [0.003827966674172484]
18 / 4441 [0.004053141184417924]
19 / 4441 [0.004278315694663364]
20 / 4441 [0.004503490204908804]
21 / 4441 [0.0047286647151542445]
22 / 4441 [0.004953839225399685]
23 / 4441 [0.005179013735645125]
24 / 4441 [0.005404188245890566]
25 / 4441 [0.005629362756136005]
26 / 4441 [0.0058545372663814455]
27 / 4441 [0.006079711776626886]
28 / 4441 [0.006304886286872326]
29 / 4441 [0.006530060797117767]
30 / 4441 [0.0067552353

250 / 4441 [0.056293627561360055]
251 / 4441 [0.05651880207160549]
252 / 4441 [0.05674397658185094]
253 / 4441 [0.056969151092096375]
254 / 4441 [0.05719432560234181]
255 / 4441 [0.05741950011258726]
256 / 4441 [0.057644674622832695]
257 / 4441 [0.05786984913307813]
258 / 4441 [0.05809502364332358]
259 / 4441 [0.058320198153569015]
260 / 4441 [0.05854537266381445]
261 / 4441 [0.0587705471740599]
262 / 4441 [0.058995721684305336]
263 / 4441 [0.05922089619455078]
264 / 4441 [0.05944607070479622]
265 / 4441 [0.059671245215041656]
266 / 4441 [0.0598964197252871]
267 / 4441 [0.06012159423553254]
268 / 4441 [0.060346768745777976]
269 / 4441 [0.06057194325602342]
270 / 4441 [0.06079711776626886]
271 / 4441 [0.061022292276514296]
272 / 4441 [0.06124746678675974]
273 / 4441 [0.06147264129700518]
274 / 4441 [0.06169781580725062]
275 / 4441 [0.06192299031749606]
276 / 4441 [0.0621481648277415]
277 / 4441 [0.06237333933798694]
278 / 4441 [0.06259851384823238]
279 / 4441 [0.06282368835847782]
280 /

500 / 4441 [0.11258725512272011]
501 / 4441 [0.11281242963296555]
502 / 4441 [0.11303760414321098]
503 / 4441 [0.11326277865345642]
504 / 4441 [0.11348795316370187]
505 / 4441 [0.11371312767394731]
506 / 4441 [0.11393830218419275]
507 / 4441 [0.11416347669443819]
508 / 4441 [0.11438865120468363]
509 / 4441 [0.11461382571492908]
510 / 4441 [0.11483900022517451]
511 / 4441 [0.11506417473541995]
512 / 4441 [0.11528934924566539]
513 / 4441 [0.11551452375591083]
514 / 4441 [0.11573969826615627]
515 / 4441 [0.11596487277640172]
516 / 4441 [0.11619004728664716]
517 / 4441 [0.11641522179689259]
518 / 4441 [0.11664039630713803]
519 / 4441 [0.11686557081738347]
520 / 4441 [0.1170907453276289]
521 / 4441 [0.11731591983787436]
522 / 4441 [0.1175410943481198]
523 / 4441 [0.11776626885836523]
524 / 4441 [0.11799144336861067]
525 / 4441 [0.11821661787885611]
526 / 4441 [0.11844179238910156]
527 / 4441 [0.118666966899347]
528 / 4441 [0.11889214140959244]
529 / 4441 [0.11911731591983787]
530 / 4441 [0.

751 / 4441 [0.16910605719432562]
752 / 4441 [0.16933123170457104]
753 / 4441 [0.1695564062148165]
754 / 4441 [0.16978158072506191]
755 / 4441 [0.17000675523530737]
756 / 4441 [0.1702319297455528]
757 / 4441 [0.17045710425579824]
758 / 4441 [0.1706822787660437]
759 / 4441 [0.17090745327628912]
760 / 4441 [0.17113262778653457]
761 / 4441 [0.17135780229678]
762 / 4441 [0.17158297680702544]
763 / 4441 [0.1718081513172709]
764 / 4441 [0.17203332582751632]
765 / 4441 [0.17225850033776177]
766 / 4441 [0.1724836748480072]
767 / 4441 [0.17270884935825265]
768 / 4441 [0.1729340238684981]
769 / 4441 [0.17315919837874352]
770 / 4441 [0.17338437288898897]
771 / 4441 [0.1736095473992344]
772 / 4441 [0.17383472190947985]
773 / 4441 [0.17405989641972527]
774 / 4441 [0.17428507092997073]
775 / 4441 [0.17451024544021618]
776 / 4441 [0.1747354199504616]
777 / 4441 [0.17496059446070705]
778 / 4441 [0.17518576897095248]
779 / 4441 [0.17541094348119793]
780 / 4441 [0.17563611799144338]
781 / 4441 [0.1758612

1003 / 4441 [0.22585003377617655]
1004 / 4441 [0.22607520828642197]
1005 / 4441 [0.22630038279666742]
1006 / 4441 [0.22652555730691284]
1007 / 4441 [0.2267507318171583]
1008 / 4441 [0.22697590632740375]
1009 / 4441 [0.22720108083764917]
1010 / 4441 [0.22742625534789462]
1011 / 4441 [0.22765142985814005]
1012 / 4441 [0.2278766043683855]
1013 / 4441 [0.22810177887863095]
1014 / 4441 [0.22832695338887637]
1015 / 4441 [0.22855212789912183]
1016 / 4441 [0.22877730240936725]
1017 / 4441 [0.2290024769196127]
1018 / 4441 [0.22922765142985815]
1019 / 4441 [0.22945282594010358]
1020 / 4441 [0.22967800045034903]
1021 / 4441 [0.22990317496059445]
1022 / 4441 [0.2301283494708399]
1023 / 4441 [0.23035352398108533]
1024 / 4441 [0.23057869849133078]
1025 / 4441 [0.23080387300157623]
1026 / 4441 [0.23102904751182166]
1027 / 4441 [0.2312542220220671]
1028 / 4441 [0.23147939653231253]
1029 / 4441 [0.23170457104255798]
1030 / 4441 [0.23192974555280343]
1031 / 4441 [0.23215492006304886]
1032 / 4441 [0.2323

1249 / 4441 [0.28124296329655485]
1250 / 4441 [0.28146813780680024]
1251 / 4441 [0.2816933123170457]
1252 / 4441 [0.28191848682729115]
1253 / 4441 [0.2821436613375366]
1254 / 4441 [0.28236883584778205]
1255 / 4441 [0.28259401035802745]
1256 / 4441 [0.2828191848682729]
1257 / 4441 [0.28304435937851835]
1258 / 4441 [0.2832695338887638]
1259 / 4441 [0.28349470839900925]
1260 / 4441 [0.28371988290925465]
1261 / 4441 [0.2839450574195001]
1262 / 4441 [0.28417023192974555]
1263 / 4441 [0.284395406439991]
1264 / 4441 [0.28462058095023646]
1265 / 4441 [0.28484575546048185]
1266 / 4441 [0.2850709299707273]
1267 / 4441 [0.28529610448097276]
1268 / 4441 [0.2855212789912182]
1269 / 4441 [0.28574645350146366]
1270 / 4441 [0.28597162801170906]
1271 / 4441 [0.2861968025219545]
1272 / 4441 [0.28642197703219996]
1273 / 4441 [0.2866471515424454]
1274 / 4441 [0.28687232605269086]
1275 / 4441 [0.28709750056293626]
1276 / 4441 [0.2873226750731817]
1277 / 4441 [0.28754784958342716]
1278 / 4441 [0.28777302409

1496 / 4441 [0.3368610673271786]
1497 / 4441 [0.337086241837424]
1498 / 4441 [0.3373114163476694]
1499 / 4441 [0.3375365908579149]
1500 / 4441 [0.3377617653681603]
1501 / 4441 [0.3379869398784058]
1502 / 4441 [0.33821211438865123]
1503 / 4441 [0.3384372888988966]
1504 / 4441 [0.3386624634091421]
1505 / 4441 [0.33888763791938753]
1506 / 4441 [0.339112812429633]
1507 / 4441 [0.33933798693987843]
1508 / 4441 [0.33956316145012383]
1509 / 4441 [0.3397883359603693]
1510 / 4441 [0.34001351047061473]
1511 / 4441 [0.3402386849808602]
1512 / 4441 [0.3404638594911056]
1513 / 4441 [0.34068903400135103]
1514 / 4441 [0.3409142085115965]
1515 / 4441 [0.34113938302184194]
1516 / 4441 [0.3413645575320874]
1517 / 4441 [0.3415897320423328]
1518 / 4441 [0.34181490655257823]
1519 / 4441 [0.3420400810628237]
1520 / 4441 [0.34226525557306914]
1521 / 4441 [0.3424904300833146]
1522 / 4441 [0.34271560459356]
1523 / 4441 [0.34294077910380544]
1524 / 4441 [0.3431659536140509]
1525 / 4441 [0.34339112812429634]
152

1743 / 4441 [0.3924791713578023]
1744 / 4441 [0.39270434586804776]
1745 / 4441 [0.39292952037829315]
1746 / 4441 [0.3931546948885386]
1747 / 4441 [0.39337986939878405]
1748 / 4441 [0.3936050439090295]
1749 / 4441 [0.39383021841927496]
1750 / 4441 [0.39405539292952035]
1751 / 4441 [0.3942805674397658]
1752 / 4441 [0.39450574195001126]
1753 / 4441 [0.3947309164602567]
1754 / 4441 [0.39495609097050216]
1755 / 4441 [0.39518126548074756]
1756 / 4441 [0.395406439990993]
1757 / 4441 [0.39563161450123846]
1758 / 4441 [0.3958567890114839]
1759 / 4441 [0.39608196352172936]
1760 / 4441 [0.39630713803197476]
1761 / 4441 [0.3965323125422202]
1762 / 4441 [0.39675748705246566]
1763 / 4441 [0.3969826615627111]
1764 / 4441 [0.39720783607295657]
1765 / 4441 [0.39743301058320196]
1766 / 4441 [0.3976581850934474]
1767 / 4441 [0.39788335960369287]
1768 / 4441 [0.3981085341139383]
1769 / 4441 [0.39833370862418377]
1770 / 4441 [0.39855888313442916]
1771 / 4441 [0.3987840576446746]
1772 / 4441 [0.399009232154

1990 / 4441 [0.44809727538842603]
1991 / 4441 [0.4483224498986715]
1992 / 4441 [0.44854762440891693]
1993 / 4441 [0.44877279891916233]
1994 / 4441 [0.4489979734294078]
1995 / 4441 [0.44922314793965323]
1996 / 4441 [0.4494483224498987]
1997 / 4441 [0.44967349696014414]
1998 / 4441 [0.44989867147038953]
1999 / 4441 [0.450123845980635]
2000 / 4441 [0.45034902049088044]
2001 / 4441 [0.4505741950011259]
2002 / 4441 [0.45079936951137134]
2003 / 4441 [0.45102454402161674]
2004 / 4441 [0.4512497185318622]
2005 / 4441 [0.45147489304210764]
2006 / 4441 [0.4517000675523531]
2007 / 4441 [0.4519252420625985]
2008 / 4441 [0.45215041657284394]
2009 / 4441 [0.4523755910830894]
2010 / 4441 [0.45260076559333484]
2011 / 4441 [0.4528259401035803]
2012 / 4441 [0.4530511146138257]
2013 / 4441 [0.45327628912407114]
2014 / 4441 [0.4535014636343166]
2015 / 4441 [0.45372663814456204]
2016 / 4441 [0.4539518126548075]
2017 / 4441 [0.4541769871650529]
2018 / 4441 [0.45440216167529834]
2019 / 4441 [0.45462733618554

2236 / 4441 [0.5034902049088044]
2237 / 4441 [0.5037153794190498]
2238 / 4441 [0.5039405539292952]
2239 / 4441 [0.5041657284395407]
2240 / 4441 [0.5043909029497861]
2241 / 4441 [0.5046160774600316]
2242 / 4441 [0.504841251970277]
2243 / 4441 [0.5050664264805224]
2244 / 4441 [0.5052916009907679]
2245 / 4441 [0.5055167755010133]
2246 / 4441 [0.5057419500112588]
2247 / 4441 [0.5059671245215042]
2248 / 4441 [0.5061922990317496]
2249 / 4441 [0.5064174735419951]
2250 / 4441 [0.5066426480522405]
2251 / 4441 [0.506867822562486]
2252 / 4441 [0.5070929970727314]
2253 / 4441 [0.5073181715829768]
2254 / 4441 [0.5075433460932223]
2255 / 4441 [0.5077685206034677]
2256 / 4441 [0.5079936951137132]
2257 / 4441 [0.5082188696239586]
2258 / 4441 [0.508444044134204]
2259 / 4441 [0.5086692186444495]
2260 / 4441 [0.5088943931546949]
2261 / 4441 [0.5091195676649404]
2262 / 4441 [0.5093447421751858]
2263 / 4441 [0.5095699166854312]
2264 / 4441 [0.5097950911956767]
2265 / 4441 [0.5100202657059221]
2266 / 4441 [

2487 / 4441 [0.5600090069804098]
2488 / 4441 [0.5602341814906553]
2489 / 4441 [0.5604593560009007]
2490 / 4441 [0.5606845305111461]
2491 / 4441 [0.5609097050213916]
2492 / 4441 [0.561134879531637]
2493 / 4441 [0.5613600540418825]
2494 / 4441 [0.5615852285521279]
2495 / 4441 [0.5618104030623733]
2496 / 4441 [0.5620355775726188]
2497 / 4441 [0.5622607520828642]
2498 / 4441 [0.5624859265931097]
2499 / 4441 [0.5627111011033551]
2500 / 4441 [0.5629362756136005]
2501 / 4441 [0.563161450123846]
2502 / 4441 [0.5633866246340914]
2503 / 4441 [0.5636117991443369]
2504 / 4441 [0.5638369736545823]
2505 / 4441 [0.5640621481648277]
2506 / 4441 [0.5642873226750732]
2507 / 4441 [0.5645124971853186]
2508 / 4441 [0.5647376716955641]
2509 / 4441 [0.5649628462058095]
2510 / 4441 [0.5651880207160549]
2511 / 4441 [0.5654131952263004]
2512 / 4441 [0.5656383697365458]
2513 / 4441 [0.5658635442467913]
2514 / 4441 [0.5660887187570367]
2515 / 4441 [0.5663138932672821]
2516 / 4441 [0.5665390677775276]
2517 / 4441 

2739 / 4441 [0.6167529835622607]
2740 / 4441 [0.6169781580725062]
2741 / 4441 [0.6172033325827516]
2742 / 4441 [0.6174285070929971]
2743 / 4441 [0.6176536816032425]
2744 / 4441 [0.6178788561134879]
2745 / 4441 [0.6181040306237334]
2746 / 4441 [0.6183292051339788]
2747 / 4441 [0.6185543796442243]
2748 / 4441 [0.6187795541544697]
2749 / 4441 [0.6190047286647151]
2750 / 4441 [0.6192299031749606]
2751 / 4441 [0.619455077685206]
2752 / 4441 [0.6196802521954514]
2753 / 4441 [0.6199054267056969]
2754 / 4441 [0.6201306012159423]
2755 / 4441 [0.6203557757261878]
2756 / 4441 [0.6205809502364332]
2757 / 4441 [0.6208061247466786]
2758 / 4441 [0.6210312992569241]
2759 / 4441 [0.6212564737671695]
2760 / 4441 [0.621481648277415]
2761 / 4441 [0.6217068227876604]
2762 / 4441 [0.6219319972979058]
2763 / 4441 [0.6221571718081513]
2764 / 4441 [0.6223823463183967]
2765 / 4441 [0.6226075208286422]
2766 / 4441 [0.6228326953388876]
2767 / 4441 [0.623057869849133]
2768 / 4441 [0.6232830443593785]
2769 / 4441 [

2990 / 4441 [0.6732717856338662]
2991 / 4441 [0.6734969601441116]
2992 / 4441 [0.6737221346543572]
2993 / 4441 [0.6739473091646025]
2994 / 4441 [0.674172483674848]
2995 / 4441 [0.6743976581850935]
2996 / 4441 [0.6746228326953388]
2997 / 4441 [0.6748480072055844]
2998 / 4441 [0.6750731817158298]
2999 / 4441 [0.6752983562260753]
3000 / 4441 [0.6755235307363207]
3001 / 4441 [0.675748705246566]
3002 / 4441 [0.6759738797568116]
3003 / 4441 [0.676199054267057]
3004 / 4441 [0.6764242287773025]
3005 / 4441 [0.6766494032875479]
3006 / 4441 [0.6768745777977933]
3007 / 4441 [0.6770997523080388]
3008 / 4441 [0.6773249268182842]
3009 / 4441 [0.6775501013285297]
3010 / 4441 [0.6777752758387751]
3011 / 4441 [0.6780004503490205]
3012 / 4441 [0.678225624859266]
3013 / 4441 [0.6784507993695114]
3014 / 4441 [0.6786759738797569]
3015 / 4441 [0.6789011483900023]
3016 / 4441 [0.6791263229002477]
3017 / 4441 [0.6793514974104932]
3018 / 4441 [0.6795766719207386]
3019 / 4441 [0.679801846430984]
3020 / 4441 [0.

3240 / 4441 [0.7295654131952263]
3241 / 4441 [0.7297905877054718]
3242 / 4441 [0.7300157622157172]
3243 / 4441 [0.7302409367259626]
3244 / 4441 [0.7304661112362081]
3245 / 4441 [0.7306912857464535]
3246 / 4441 [0.730916460256699]
3247 / 4441 [0.7311416347669444]
3248 / 4441 [0.7313668092771898]
3249 / 4441 [0.7315919837874353]
3250 / 4441 [0.7318171582976807]
3251 / 4441 [0.7320423328079262]
3252 / 4441 [0.7322675073181716]
3253 / 4441 [0.732492681828417]
3254 / 4441 [0.7327178563386625]
3255 / 4441 [0.7329430308489079]
3256 / 4441 [0.7331682053591534]
3257 / 4441 [0.7333933798693988]
3258 / 4441 [0.7336185543796442]
3259 / 4441 [0.7338437288898897]
3260 / 4441 [0.7340689034001351]
3261 / 4441 [0.7342940779103806]
3262 / 4441 [0.734519252420626]
3263 / 4441 [0.7347444269308714]
3264 / 4441 [0.7349696014411169]
3265 / 4441 [0.7351947759513623]
3266 / 4441 [0.7354199504616078]
3267 / 4441 [0.7356451249718532]
3268 / 4441 [0.7358702994820986]
3269 / 4441 [0.7360954739923441]
3270 / 4441 [

3490 / 4441 [0.7858590407565863]
3491 / 4441 [0.7860842152668318]
3492 / 4441 [0.7863093897770772]
3493 / 4441 [0.7865345642873227]
3494 / 4441 [0.7867597387975681]
3495 / 4441 [0.7869849133078135]
3496 / 4441 [0.787210087818059]
3497 / 4441 [0.7874352623283044]
3498 / 4441 [0.7876604368385499]
3499 / 4441 [0.7878856113487953]
3500 / 4441 [0.7881107858590407]
3501 / 4441 [0.7883359603692862]
3502 / 4441 [0.7885611348795316]
3503 / 4441 [0.7887863093897771]
3504 / 4441 [0.7890114839000225]
3505 / 4441 [0.7892366584102679]
3506 / 4441 [0.7894618329205134]
3507 / 4441 [0.7896870074307588]
3508 / 4441 [0.7899121819410043]
3509 / 4441 [0.7901373564512497]
3510 / 4441 [0.7903625309614951]
3511 / 4441 [0.7905877054717406]
3512 / 4441 [0.790812879981986]
3513 / 4441 [0.7910380544922315]
3514 / 4441 [0.7912632290024769]
3515 / 4441 [0.7914884035127223]
3516 / 4441 [0.7917135780229678]
3517 / 4441 [0.7919387525332132]
3518 / 4441 [0.7921639270434587]
3519 / 4441 [0.7923891015537041]
3520 / 4441 

3740 / 4441 [0.8421526683179464]
3741 / 4441 [0.8423778428281918]
3742 / 4441 [0.8426030173384373]
3743 / 4441 [0.8428281918486827]
3744 / 4441 [0.8430533663589281]
3745 / 4441 [0.8432785408691736]
3746 / 4441 [0.843503715379419]
3747 / 4441 [0.8437288898896644]
3748 / 4441 [0.8439540643999099]
3749 / 4441 [0.8441792389101553]
3750 / 4441 [0.8444044134204008]
3751 / 4441 [0.8446295879306462]
3752 / 4441 [0.8448547624408916]
3753 / 4441 [0.8450799369511371]
3754 / 4441 [0.8453051114613825]
3755 / 4441 [0.845530285971628]
3756 / 4441 [0.8457554604818734]
3757 / 4441 [0.8459806349921188]
3758 / 4441 [0.8462058095023643]
3759 / 4441 [0.8464309840126097]
3760 / 4441 [0.8466561585228553]
3761 / 4441 [0.8468813330331006]
3762 / 4441 [0.847106507543346]
3763 / 4441 [0.8473316820535916]
3764 / 4441 [0.847556856563837]
3765 / 4441 [0.8477820310740825]
3766 / 4441 [0.8480072055843279]
3767 / 4441 [0.8482323800945732]
3768 / 4441 [0.8484575546048188]
3769 / 4441 [0.8486827291150641]
3770 / 4441 [0

3992 / 4441 [0.8988966448997974]
3993 / 4441 [0.8991218194100428]
3994 / 4441 [0.8993469939202883]
3995 / 4441 [0.8995721684305337]
3996 / 4441 [0.8997973429407791]
3997 / 4441 [0.9000225174510246]
3998 / 4441 [0.90024769196127]
3999 / 4441 [0.9004728664715155]
4000 / 4441 [0.9006980409817609]
4001 / 4441 [0.9009232154920063]
4002 / 4441 [0.9011483900022518]
4003 / 4441 [0.9013735645124972]
4004 / 4441 [0.9015987390227427]
4005 / 4441 [0.9018239135329881]
4006 / 4441 [0.9020490880432335]
4007 / 4441 [0.902274262553479]
4008 / 4441 [0.9024994370637244]
4009 / 4441 [0.9027246115739699]
4010 / 4441 [0.9029497860842153]
4011 / 4441 [0.9031749605944607]
4012 / 4441 [0.9034001351047062]
4013 / 4441 [0.9036253096149516]
4014 / 4441 [0.903850484125197]
4015 / 4441 [0.9040756586354425]
4016 / 4441 [0.9043008331456879]
4017 / 4441 [0.9045260076559334]
4018 / 4441 [0.9047511821661788]
4019 / 4441 [0.9049763566764242]
4020 / 4441 [0.9052015311866697]
4021 / 4441 [0.9054267056969151]
4022 / 4441 [0

4242 / 4441 [0.9551902724611574]
4243 / 4441 [0.9554154469714028]
4244 / 4441 [0.9556406214816483]
4245 / 4441 [0.9558657959918937]
4246 / 4441 [0.9560909705021392]
4247 / 4441 [0.9563161450123846]
4248 / 4441 [0.95654131952263]
4249 / 4441 [0.9567664940328755]
4250 / 4441 [0.9569916685431209]
4251 / 4441 [0.9572168430533664]
4252 / 4441 [0.9574420175636118]
4253 / 4441 [0.9576671920738572]
4254 / 4441 [0.9578923665841027]
4255 / 4441 [0.9581175410943481]
4256 / 4441 [0.9583427156045936]
4257 / 4441 [0.958567890114839]
4258 / 4441 [0.9587930646250844]
4259 / 4441 [0.9590182391353299]
4260 / 4441 [0.9592434136455753]
4261 / 4441 [0.9594685881558208]
4262 / 4441 [0.9596937626660662]
4263 / 4441 [0.9599189371763116]
4264 / 4441 [0.9601441116865571]
4265 / 4441 [0.9603692861968025]
4266 / 4441 [0.960594460707048]
4267 / 4441 [0.9608196352172934]
4268 / 4441 [0.9610448097275388]
4269 / 4441 [0.9612699842377843]
4270 / 4441 [0.9614951587480297]
4271 / 4441 [0.9617203332582752]
4272 / 4441 [0

# Download blast seq IDs

for the truncated BLAST DB

In [145]:
# Read the tree-leaf names, one per line. `nodes` keeps the raw lines because
# a later cell iterates over it; `nodes_list` holds the stripped names.
with open('../data/TOL/Eukaryota_tree_nodes.txt', 'r') as file:
    nodes = file.readlines()

# Build the list from scratch rather than assigning by index into a
# `nodes_list` that this cell never creates (it only worked when another cell
# had already defined a list of the right length).
nodes_list = [node.strip() for node in nodes]

Create dict with taxids:

In [146]:
# Create the ete3 NCBI taxonomy helper and translate the names in `nodes_list`
# into taxids; the result maps each name to a list of taxids.
ncbi = NCBITaxa()
name2taxid = ncbi.get_name_translator(nodes_list)

In [147]:
# Show the name -> taxid mapping built in the previous cell.
print(name2taxid)

{'Anopheles gambiae': [7165], 'Arabidopsis thaliana': [3702], 'Caenorhabditis briggsae': [6238], 'Caenorhabditis elegans': [6239], 'Cryptosporidium hominis': [237895], 'Cyanidioschyzon merolae': [45157], 'Danio rerio': [7955], 'Dictyostelium discoideum': [44689], 'Drosophila melanogaster': [7227], 'Eremothecium gossypii': [33169], 'Gallus gallus': [9031], 'Homo sapiens': [9606], 'Leishmania major': [5664], 'Mus musculus': [10090], 'Oryza sativa': [4530], 'Pan troglodytes': [9598], 'Plasmodium falciparum': [5833], 'Rattus norvegicus': [10116], 'Saccharomyces cerevisiae': [4932], 'Schizosaccharomyces pombe': [4896], 'Takifugu rubripes': [31033], 'Thalassiosira pseudonana': [35128], 'Giardia lamblia': [5741]}


In [273]:
# Dump the name -> taxid mapping, one taxon per line: name, tab, taxid(s),
# joined by single spaces (so the tab is surrounded by a space on each side).
with open('../data/TOL/Eukaryota_tree_nodes_taxID.txt', 'w') as f:
    for taxa, taxids in name2taxid.items():
        fields = [str(taxa), '\t', *map(str, taxids)]
        f.write(' '.join(fields) + '\n')

Search the Entrez Protein database:

In [312]:
def protein_entrez(taxid):
    """Return the Entrez Protein UIDs recorded for one NCBI taxon.

    Two esearch queries are issued: the first only to learn the total hit
    count, the second with ``retmax`` set to that count so the whole ID list
    comes back in one response.

    Parameters
    ----------
    taxid : str or int
        NCBI taxonomy ID; it is embedded in the query as ``txid<taxid>[ORGN]``.

    Returns
    -------
    list or None
        The ``IdList`` of the second query, or ``None`` when that query is cut
        short by ``http.client.IncompleteRead`` (the hit count is printed).

    NOTE(review): NCBI documents an upper limit on ``retmax``; for very large
    taxa confirm that the returned list is as long as the reported count.
    """
    # Imported locally: the notebook's import cell never brings this name in,
    # so the `except` clause below used to fail with NameError instead of
    # handling the error.
    from http.client import IncompleteRead

    # f-string instead of ''.join so an int taxid works as well as a str.
    term = f'txid{taxid}[ORGN]'

    # First query: only the total number of hits is used.
    handle = Entrez.esearch(db="protein", term=term)
    try:
        record = Entrez.read(handle)
    finally:
        handle.close()
    count = record['Count']

    try:
        # Second query: request all `count` UIDs at once.
        handle = Entrez.esearch(db="protein", term=term, retmax=count)
        try:
            record = Entrez.read(handle)
        finally:
            handle.close()
        return record['IdList']

    except IncompleteRead:
        print('count: ', count)
        return None

In [314]:
# Fetch the protein UIDs of every taxon and store them as <taxid>.gi, one UID
# per line. Taxa whose lookup returned nothing (None or an empty list) get no file.
for taxa in name2taxid:
    taxid = str(name2taxid[taxa][0])
    protein_ids = protein_entrez(taxid)

    if not protein_ids:
        continue

    path = os.path.join('../data/for_blastdbcmd/GI_proteins/', taxid + '.gi')
    with open(path, 'w') as f:
        f.write('\n'.join(protein_ids))

Search for Accession numbers (long run)

In [330]:
# Translate the stored protein UIDs of every taxon into accession captions and
# write them to <taxid>.txt, one per line.

# Number of UIDs sent in one esummary request. Asking for many summaries per
# call replaces the one-HTTP-request-per-UID loop that made this cell run for
# hours.
ESUMMARY_BATCH = 200

for i, taxa in enumerate(name2taxid, start=1):
    taxid = str(name2taxid[taxa][0])
    from_path = os.path.join('../data/for_blastdbcmd/GI_proteins/', taxid + '.gi')
    to_path = os.path.join('../data/for_blastdbcmd/accessions', taxid + '.txt')

    # The download cell writes no .gi file when the lookup returned nothing,
    # so a missing file is expected and must not abort the whole run.
    if not os.path.exists(from_path):
        print(i, 'skipped, no GI file:', from_path)
        continue

    # Blank lines would turn into requests with an empty id, so drop them.
    with open(from_path, 'r') as fromfile:
        gi_ids = [line.strip() for line in fromfile if line.strip()]

    tofile_list = []
    for start in range(0, len(gi_ids), ESUMMARY_BATCH):
        batch = gi_ids[start:start + ESUMMARY_BATCH]
        handle = Entrez.esummary(db="protein", id=','.join(batch))
        try:
            summaries = Entrez.read(handle)
        finally:
            handle.close()
        tofile_list.extend(summary['Caption'] for summary in summaries)

    with open(to_path, 'w') as tofile:
        tofile.write('\n'.join(tofile_list))

    # Progress: number of taxa processed so far.
    print(i)
    

KeyboardInterrupt: 

In [None]:
%%bash
# all to one file
# One bash script instead of separate `!` lines: each `!` line runs in its own
# subshell, so a `! cd ...` never affected the loop on the following line.
set -e
cd ../data/for_blastdbcmd/accessions
# Start from a clean output file: it matches *.txt itself and `>>` would
# otherwise keep appending duplicates on every re-run.
rm -f concatenated.txt
# `echo` supplies the newline missing at the end of each accession file.
for file in *.txt; do (cat "${file}"; echo) >> concatenated.txt; done

# Prepare DB

DB https://ftp.ncbi.nih.gov/blast/db/FASTA/   --  nr

In [None]:
# Build a protein BLAST database (format version 5, parsed sequence IDs) from ./blast_db/test_nr.fasta.
! makeblastdb -in ./blast_db/test_nr.fasta  -dbtype prot -parse_seqids -blastdb_version 5 

Extract part from DB

In [None]:
# index big fasta DB
! makeblastdb -in nr.fasta -dbtype prot -parse_seqids

# extract the entries listed in concatenated.txt
! blastdbcmd -db nr.fasta -dbtype prot -entry_batch concatenated.txt -out ../red_db/reduced_db.fa

# delete duplicates in the swiss db
# seqkit rmdup blast_db/red_swiss/swiss_red.fa > blast_db/red_swiss/swiss_red_uniq.fa

# prepare new db
# `! cd` runs in a throw-away subshell and does not change the directory for
# the next `!` line, so the cd and makeblastdb are chained in one command.
! cd ../red_db/ && makeblastdb -in reduced_db.fa -dbtype prot -parse_seqids  -blastdb_version 5

# BLAST

https://www.biostars.org/p/208772/

{'Anopheles gambiae': [7165], \
'Arabidopsis thaliana': [3702], \
'Caenorhabditis briggsae': [6238], \
'Caenorhabditis elegans': [6239], \
'Cryptosporidium hominis': [237895], 'Cyanidioschyzon merolae': [45157], 'Danio rerio': [7955], 'Dictyostelium discoideum': [44689], 'Drosophila melanogaster': [7227], 'Eremothecium gossypii': [33169], 'Gallus gallus': [9031], 'Homo sapiens': [9606], 'Leishmania major': [5664], 'Mus musculus': [10090], 'Oryza sativa': [4530], 'Pan troglodytes': [9598], 'Plasmodium falciparum': [5833], 'Rattus norvegicus': [10116], 'Saccharomyces cerevisiae': [4932], 'Schizosaccharomyces pombe': [4896], 'Takifugu rubripes': [31033], 'Thalassiosira pseudonana': [35128], 'Giardia lamblia': [5741]}

In [331]:
# Create the taxid list file: one line per taxon holding its taxid(s),
# separated by single spaces.
with open('../data/TOL/Eukaryota_taxidlist.txids', 'w') as f:
    for taxids in name2taxid.values():
        f.write(' '.join(map(str, taxids)) + '\n')

In [None]:
# blastx of the yeast gene sequences against the local test_nr database,
# restricted to the taxids listed in Eukaryota_taxidlist.txids.
! /home/yuliya/soft/ncbi-blast-2.13.0+/bin/blastx -db ./blast_db/test_nr.fasta -query ./genes_lists/S_cerevisiae_genes_seq.fa -taxidlist ./TOL/Eukaryota_taxidlist.txids  -out ./blast/blast_res/blast_result_local.table

In [None]:
# blastx of the yeast gene sequences against the local swiss database.
# The leading `!` is required: without it the cell is not sent to the shell.
! /home/yuliya/soft/ncbi-blast-2.13.0+/bin/blastx -db ./blast_db/swiss/data -query ./genes_lists/S_cerevisiae_genes_seq.fa -num_threads 2 -evalue 1e-3 -out ./blast/blast_res/blast_result_local.table


Local run after reducing DB:
- format output https://www.metagenomics.wiki/tools/blast/blastn-output-format-6

In [None]:
# run on reduced db (first 1000 seq)
# Output is tabular (format 6) with the columns
# qseqid, sseqid, stitle, pident, evalue, bitscore; hits are kept up to e-value 1e-3.
! /home/yuliya/soft/ncbi-blast-2.13.0+/bin/blastx -db blast_db/reduced/reduced.fa -query ./genes_lists/S_cerevisiae_genes_seq.fa -num_threads 2 -evalue 1e-3 -out ./blast/blast_res/blast_eukaryotes_local_3.table -outfmt "6 qseqid sseqid stitle pident evalue bitscore"

In [None]:
# run on swiss db
# Same tabular layout as the run above (qseqid, sseqid, stitle, pident, evalue, bitscore).
# NOTE(review): this writes blast_swiss_local.table, while the analysis section
# reads blast_swiss_red_local.table; confirm which file is the intended input.
! /home/yuliya/soft/ncbi-blast-2.13.0+/bin/blastx -db blast_db/red_swiss/swiss_red_uniq.fa -query ./genes_lists/S_cerevisiae_genes_seq.fa -num_threads 2 -evalue 1e-3 -out ./blast/blast_res/blast_swiss_local.table -outfmt "6 qseqid sseqid stitle pident evalue bitscore"

### remote

In [343]:
# Strip surrounding whitespace from every raw node line ...
new_nodes = [raw_name.strip() for raw_name in nodes]

# ... and chain the names into one Entrez organism filter:
# every name gets an [ORGN] tag and the terms are combined with OR.
'[ORGN] OR '.join(new_nodes) + '[ORGN]'

'Giardia lamblia[ORGN] OR Leishmania major[ORGN] OR Thalassiosira pseudonana[ORGN] OR Plasmodium falciparum[ORGN] OR Cryptosporidium hominis[ORGN] OR Cyanidioschyzon merolae[ORGN] OR Oryza sativa[ORGN] OR Arabidopsis thaliana[ORGN] OR Dictyostelium discoideum[ORGN] OR Schizosaccharomyces pombe[ORGN] OR Saccharomyces cerevisiae[ORGN] OR Eremothecium gossypii[ORGN] OR Caenorhabditis elegans[ORGN] OR Caenorhabditis briggsae[ORGN] OR Drosophila melanogaster[ORGN] OR Anopheles gambiae[ORGN] OR Danio rerio[ORGN] OR Takifugu rubripes[ORGN] OR Gallus gallus[ORGN] OR Homo sapiens[ORGN] OR Pan troglodytes[ORGN] OR Rattus norvegicus[ORGN] OR Mus musculus[ORGN]'

CMD run:


```/home/yuliya/soft/ncbi-blast-2.13.0+/bin/blastx -db nr -query ./genes_lists/S_cerevisiae_genes_seq.fa -entrez_query "Giardia lamblia[ORGN] OR Leishmania major[ORGN] OR Thalassiosira pseudonana[ORGN] OR Plasmodium falciparum[ORGN] OR Cryptosporidium hominis[ORGN] OR Cyanidioschyzon merolae[ORGN] OR Oryza sativa[ORGN] OR Arabidopsis thaliana[ORGN] OR Dictyostelium discoideum[ORGN] OR Schizosaccharomyces pombe[ORGN] OR Saccharomyces cerevisiae[ORGN] OR Eremothecium gossypii[ORGN] OR Caenorhabditis elegans[ORGN] OR Caenorhabditis briggsae[ORGN] OR Drosophila melanogaster[ORGN] OR Anopheles gambiae[ORGN] OR Danio rerio[ORGN] OR Takifugu rubripes[ORGN] OR Gallus gallus[ORGN] OR Homo sapiens[ORGN] OR Pan troglodytes[ORGN] OR Rattus norvegicus[ORGN] OR Mus musculus [ORGN]"  -out ./blast/blast_res/blast_result.table -remote```



In [342]:
# First taxid of every taxon, as strings, printed comma-separated.
id_list = [str(taxids[0]) for taxids in name2taxid.values()]

print(', '.join(id_list))

7165, 3702, 6238, 6239, 237895, 45157, 7955, 44689, 7227, 33169, 9031, 9606, 5664, 10090, 4530, 9598, 5833, 10116, 4932, 4896, 31033, 35128, 5741


# Results analysis

In [84]:
# Load the tab-separated BLAST hit table. The file has no header row, so the
# six columns are named here.
doc = '../data/blast/blast_res/blast_swiss_red_local.table'
result_table = pd.read_csv(
    doc,
    sep='\t',
    header=None,
    names=[
        'yeast_gene',
        'seq_id',
        'sub_title',
        'p_ident',
        'e_value',
        'bit_score',
    ],
)
result_table

Unnamed: 0,yeast_gene,seq_id,sub_title,p_ident,e_value,bit_score
0,YDL003W,sp|Q12158.1|,RecName: Full=Sister chromatid cohesion protei...,100.000,0.000000e+00,1131.0
1,YDL003W,sp|P30776.1|,RecName: Full=Cohesin subunit rad21; AltName: ...,30.233,9.990000e-11,66.2
2,YDL003W,sp|A2AU37.2|,RecName: Full=Double-strand-break repair prote...,34.783,3.890000e-09,61.2
3,YDL003W,sp|O60216.2|,RecName: Full=Double-strand-break repair prote...,32.967,4.800000e-09,60.8
4,YDL003W,sp|Q61550.3|,RecName: Full=Double-strand-break repair prote...,32.967,5.070000e-09,60.8
...,...,...,...,...,...,...
259993,YDL007W,sp|Q5T9A4.1|,RecName: Full=ATPase family AAA domain-contain...,25.166,3.680000e-05,48.1
259994,YDL007W,sp|F4K7F6.1|,RecName: Full=CLP protease regulatory subunit ...,36.275,6.670000e-05,47.0
259995,YDL007W,sp|Q86XH1.1|,RecName: Full=Dynein regulatory complex protei...,25.444,2.610000e-04,45.4
259996,YDL007W,sp|P38126.2|,RecName: Full=Pachytene checkpoint protein 2 [...,30.539,3.040000e-04,45.1


In [85]:
# Collect every "[...]" group of a hit title (the species names), skipping a
# bracket that directly follows "Full=".
# The pattern is a raw string: written as a plain literal, `\=`, `\[` and `\]`
# are invalid string escapes that Python warns about; the compiled regex is
# unchanged. It is compiled once instead of on every row.
species_pattern = re.compile(r'(?<!Full\=)\[(.*?)\]')
result_table['species'] = result_table['sub_title'].apply(species_pattern.findall)
result_table

Unnamed: 0,yeast_gene,seq_id,sub_title,p_ident,e_value,bit_score,species
0,YDL003W,sp|Q12158.1|,RecName: Full=Sister chromatid cohesion protei...,100.000,0.000000e+00,1131.0,[Saccharomyces cerevisiae S288C]
1,YDL003W,sp|P30776.1|,RecName: Full=Cohesin subunit rad21; AltName: ...,30.233,9.990000e-11,66.2,[Schizosaccharomyces pombe 972h-]
2,YDL003W,sp|A2AU37.2|,RecName: Full=Double-strand-break repair prote...,34.783,3.890000e-09,61.2,[Mus musculus]
3,YDL003W,sp|O60216.2|,RecName: Full=Double-strand-break repair prote...,32.967,4.800000e-09,60.8,[Homo sapiens]
4,YDL003W,sp|Q61550.3|,RecName: Full=Double-strand-break repair prote...,32.967,5.070000e-09,60.8,[Mus musculus]
...,...,...,...,...,...,...,...
259993,YDL007W,sp|Q5T9A4.1|,RecName: Full=ATPase family AAA domain-contain...,25.166,3.680000e-05,48.1,[Homo sapiens]
259994,YDL007W,sp|F4K7F6.1|,RecName: Full=CLP protease regulatory subunit ...,36.275,6.670000e-05,47.0,[Arabidopsis thaliana]
259995,YDL007W,sp|Q86XH1.1|,RecName: Full=Dynein regulatory complex protei...,25.444,2.610000e-04,45.4,[Homo sapiens]
259996,YDL007W,sp|P38126.2|,RecName: Full=Pachytene checkpoint protein 2 [...,30.539,3.040000e-04,45.1,[Saccharomyces cerevisiae S288C]


In [264]:
# Map every yeast gene to the set of species named in any of its hits.
# Genes are inserted in order of first appearance in the table.
protein_species = {}

for gene, hit_species in zip(result_table['yeast_gene'], result_table['species']):
    protein_species.setdefault(gene, set()).update(hit_species)

In [265]:
# Re-read the tree-leaf names in underscore form ("Homo_sapiens").
# Each name is built from the lines read in THIS cell; the previous version
# looped over the global `nodes` left behind by an earlier cell, so it failed
# or used stale data unless that cell had been run first.
with open('../data/TOL/Eukaryota_tree_nodes.txt', 'r') as file:
    nodes_list = ['_'.join(line.split()) for line in file]

In [266]:
# Reduce every species name to "Genus_species" (first two words) and keep
# only the ones that are leaves of the tree (`nodes_list`).
known_nodes = set(nodes_list)

for protein, species_names in protein_species.items():
    kept = set()
    for species in species_names:
        # Underscores are treated like spaces so that names already normalised
        # by an earlier run of this cell ("Homo_sapiens") still split into
        # two words. Without this, running the cell twice emptied every set.
        words = species.replace('_', ' ').split()
        if len(words) > 1:
            binomial = '_'.join(words[:2])
            if binomial in known_nodes:
                kept.add(binomial)

    protein_species[protein] = kept

In [267]:
# Save results with all ages:
with open('../data/genes_lists/yeast/yeas_protein_species.txt', 'w') as f:
    for protein in protein_species:
        print(protein, '\t', protein_species[protein], file=f, end='\n')

In [268]:
# Lineage of every tree node, keyed by the underscore form of its name.
# Nodes without a taxid entry keep the initial value None.
get_linage = dict.fromkeys(nodes_list)

for taxa, taxids in name2taxid.items():
    # One taxonomy record per taxon; its 'Lineage' field is a '; '-separated string.
    handle = Entrez.efetch(db='taxonomy', id=taxids, retmode='xml')
    record = Entrez.read(handle, validate=False)
    lineage = record[0]['Lineage'].split('; ')
    get_linage['_'.join(taxa.split())] = lineage
    handle.close()

In [269]:
# Rank of every level in the yeast lineage: 0 for the first (broadest) entry,
# increasing towards the genus.
age_level = {
    level: rank
    for rank, level in enumerate(dict.fromkeys(get_linage['Saccharomyces_cerevisiae']))
}

age_level

{'cellular organisms': 0,
 'Eukaryota': 1,
 'Opisthokonta': 2,
 'Fungi': 3,
 'Dikarya': 4,
 'Ascomycota': 5,
 'saccharomyceta': 6,
 'Saccharomycotina': 7,
 'Saccharomycetes': 8,
 'Saccharomycetales': 9,
 'Saccharomycetaceae': 10,
 'Saccharomyces': 11}

In [270]:
# For every tree node, find the deepest lineage level it shares with
# Saccharomyces_cerevisiae, i.e. the last entry of the common lineage prefix.
reference_lineage = get_linage['Saccharomyces_cerevisiae']
ages_for_node = dict.fromkeys(nodes_list)

for taxa, lineage in get_linage.items():
    # Reset per taxon so a node that shares nothing with the reference does not
    # silently inherit the value computed for the previous node.
    mca = None
    # zip stops at the shorter lineage, so a lineage longer than the reference
    # cannot index past its end; `or ()` covers nodes whose lineage is None.
    for level, reference_level in zip(lineage or (), reference_lineage):
        if level != reference_level:
            break
        mca = level
    ages_for_node[taxa] = mca

In [271]:
# Display, per tree node, the deepest lineage level shared with Saccharomyces_cerevisiae.
ages_for_node

{'Giardia_lamblia': 'Eukaryota',
 'Leishmania_major': 'Eukaryota',
 'Thalassiosira_pseudonana': 'Eukaryota',
 'Plasmodium_falciparum': 'Eukaryota',
 'Cryptosporidium_hominis': 'Eukaryota',
 'Cyanidioschyzon_merolae': 'Eukaryota',
 'Oryza_sativa': 'Eukaryota',
 'Arabidopsis_thaliana': 'Eukaryota',
 'Dictyostelium_discoideum': 'Eukaryota',
 'Schizosaccharomyces_pombe': 'Ascomycota',
 'Saccharomyces_cerevisiae': 'Saccharomyces',
 'Eremothecium_gossypii': 'Saccharomycetaceae',
 'Caenorhabditis_elegans': 'Opisthokonta',
 'Caenorhabditis_briggsae': 'Opisthokonta',
 'Drosophila_melanogaster': 'Opisthokonta',
 'Anopheles_gambiae': 'Opisthokonta',
 'Danio_rerio': 'Opisthokonta',
 'Takifugu_rubripes': 'Opisthokonta',
 'Gallus_gallus': 'Opisthokonta',
 'Homo_sapiens': 'Opisthokonta',
 'Pan_troglodytes': 'Opisthokonta',
 'Rattus_norvegicus': 'Opisthokonta',
 'Mus_musculus': 'Opisthokonta'}

In [272]:
# yest_gene_ages_lists: gene -> shared lineage level of every species it hit.
# yest_gene_ages:       gene -> the broadest of those levels (lowest rank in
#                       `age_level`), or None when no species was retained.
yest_gene_ages_lists = {k: [] for k in result_table['yeast_gene']}
yest_gene_ages = {k: None for k in result_table['yeast_gene']}

for protein, species_set in protein_species.items():
    levels = [ages_for_node[species] for species in species_set]
    yest_gene_ages_lists[protein] = levels

    # A gene whose hits were all filtered out has nothing to take a minimum
    # of; `min([])` would raise ValueError, so it keeps the None default.
    if levels:
        oldest_rank = min(age_level[level] for level in levels)
        yest_gene_ages[protein] = get_linage['Saccharomyces_cerevisiae'][oldest_rank]

In [281]:
# Per-gene hit counts: one row per gene (indexed by gene ID), one column per
# level of the yeast lineage except its first entry.
# The zero counters are derived from the lineage itself rather than a
# hard-coded run of eleven zeros, and the frame is built once from a list of
# records instead of being grown row by row with `.loc`.
level_columns = list(get_linage['Saccharomyces_cerevisiae'][1:])

hit_rows = []
for gene, levels in yest_gene_ages_lists.items():
    counts = dict.fromkeys(level_columns, 0)
    for level in levels:
        # An unknown level raises KeyError, as the `.loc` version did.
        counts[level] += 1
    hit_rows.append({'Gene_ID': gene, **counts})

yeast_age_hits = pd.DataFrame(
    hit_rows,
    index=list(yest_gene_ages_lists),
    columns=['Gene_ID', *level_columns],
)

In [282]:
# Display the per-gene hit-count table.
yeast_age_hits

Unnamed: 0,Gene_ID,Eukaryota,Opisthokonta,Fungi,Dikarya,Ascomycota,saccharomyceta,Saccharomycotina,Saccharomycetes,Saccharomycetales,Saccharomycetaceae,Saccharomyces
YDL003W,YDL003W,1,3,0,0,1,0,0,0,0,0,1
YKR060W,YKR060W,0,2,0,0,1,0,0,0,0,0,1
YMR254C,YMR254C,0,0,0,0,0,0,0,0,0,0,1
YEL059W,YEL059W,0,0,0,0,0,0,0,0,0,0,1
YHR030C,YHR030C,4,8,0,0,1,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
YKL037W,YKL037W,0,0,0,0,0,0,0,0,0,0,1
YOR209C,YOR209C,1,1,0,0,1,0,0,0,0,0,1
YHR005C,YHR005C,3,7,0,0,1,0,0,0,0,0,1
YPR189W,YPR189W,1,2,0,0,1,0,0,0,0,1,1


In [297]:
# Per gene: the gene ID followed by the level of every hit, space separated.
with open('../data/genes_lists/yeast/yest_gene_ages_lists.txt', 'w') as f:
    for gene, levels in yest_gene_ages_lists.items():
        f.write(' '.join(map(str, [gene, *levels])) + '\n')

# Per gene: gene ID, tab, the single age assigned to the gene.
with open('../data/genes_lists/yeast/yest_gene_ages.txt', 'w') as f:
    for gene, age in yest_gene_ages.items():
        f.write(f'{gene}\t{age}' + '\n')

# Hit-count table as CSV, without the index column.
yeast_age_hits.to_csv('../data/genes_lists/yeast/yeast_age_hits.csv', index=False)

In [298]:
# Display the Saccharomyces_cerevisiae lineage, which defines the age levels used above.
get_linage['Saccharomyces_cerevisiae']

['cellular organisms',
 'Eukaryota',
 'Opisthokonta',
 'Fungi',
 'Dikarya',
 'Ascomycota',
 'saccharomyceta',
 'Saccharomycotina',
 'Saccharomycetes',
 'Saccharomycetales',
 'Saccharomycetaceae',
 'Saccharomyces']