In [1]:
import numpy as np
import pandas as pd
import scrapy
import requests
from lxml import etree
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from io import StringIO
from scrapy.crawler import CrawlerProcess
from scrapy.pipelines.files import FilesPipeline
from urllib.parse import urlparse
from pathlib import Path
import time
import datetime

# Constants

In [2]:
PATH_DATA = Path.cwd().parent.parent / 'data'
PATH_DATA.exists()

True

In [3]:
PATH_SCRAPED = PATH_DATA / 'raw' / 'scraped'
PATH_SCRAPED.exists()

True

In [4]:
PATH_GENOMIC_FNA = PATH_DATA / 'raw' / 'genomes' \
    / 'archaea' / 'genomic_fna'
PATH_GENOMIC_FNA.exists()

True

In [5]:
PATH_TEMP_GENOMIC_FNA = PATH_DATA / 'raw' / 'genomes' \
    / 'archaea' / 'temp_genomic_fna'
PATH_TEMP_GENOMIC_FNA.exists()

True

In [6]:
PATH_GENOMIC_BACTERIA_FNA = PATH_DATA / 'raw' / 'genomes' \
    / 'bacteria' / 'genomic_fna'
PATH_GENOMIC_BACTERIA_FNA.exists()

True

In [7]:
PATH_TEMP_BACTERIA_GENOMIC_FNA = PATH_DATA / 'raw' / 'genomes' \
    / 'bacteria' / 'temp_genomic_fna'
PATH_TEMP_BACTERIA_GENOMIC_FNA.exists()

True

In [8]:
PATH_RESULTS_BACTERIA = (PATH_DATA / "results" / "bacteria")
PATH_RESULTS_BACTERIA.exists()

True

# Data Recovery

## archaea

In [6]:
df_archaea = pd.read_csv(PATH_DATA / 'raw' / \
    'scraped' / 'archaea_name_url.csv')
df_archaea

Unnamed: 0,name,url
0,ANME-2_cluster_archaeon_HR1,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...
1,Acidianus_ambivalens,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...
2,Acidianus_brierleyi,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...
3,Acidianus_hospitalis,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...
4,Acidianus_infernus,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...
...,...,...
1232,uncultured_Nitrososphaera_sp.,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...
1233,uncultured_archaeon_A07HB70,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...
1234,uncultured_archaeon_A07HN63,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...
1235,uncultured_archaeon_A07HR60,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...


In [7]:
urls_archaea_organisms = df_archaea['url'].values
print(len(urls_archaea_organisms))
urls_archaea_organisms

1237


array(['https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/ANME-2_cluster_archaeon_HR1/',
       'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_ambivalens/',
       'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_brierleyi/',
       ...,
       'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/uncultured_archaeon_A07HN63/',
       'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/uncultured_archaeon_A07HR60/',
       'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/uncultured_archaeon_A07HR67/'],
      dtype=object)

## bacteria

In [9]:
df_bacteria = pd.read_csv(PATH_DATA / 'raw' / \
    'scraped' / 'bacteria_name_url.csv')
df_bacteria

Unnamed: 0,name,url
0,Abditibacterium_utsteinense,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ba...
1,Abiotrophia_defectiva,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ba...
2,Abiotrophia_sp.,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ba...
3,Abiotrophia_sp._HMSC24B09,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ba...
4,Absicoccus_porci,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ba...
...,...,...
52904,zeta_proteobacterium_SCGC_AB-137-J06,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ba...
52905,zeta_proteobacterium_SCGC_AB-602-C20,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ba...
52906,zeta_proteobacterium_SCGC_AB-602-E04,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ba...
52907,zeta_proteobacterium_SCGC_AB-604-B04,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ba...


In [10]:
urls_bacteria_organisms = df_bacteria['url'].values
print(len(urls_bacteria_organisms))
urls_bacteria_organisms

52909


array(['https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Abditibacterium_utsteinense/',
       'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Abiotrophia_defectiva/',
       'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Abiotrophia_sp./',
       ...,
       'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/zeta_proteobacterium_SCGC_AB-602-E04/',
       'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/zeta_proteobacterium_SCGC_AB-604-B04/',
       'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/zeta_proteobacterium_SCGC_AB-604-O16/'],
      dtype=object)

In [11]:
df_mfa_bacteria = pd.read_csv(PATH_RESULTS_BACTERIA / 'mfa_bacteria_35000_52909_pow7.csv', sep=';')
df_mfa_bacteria

Unnamed: 0,Organism,path,seq_length,GC_content,Q,Tau(Q),D(Q),r_squared,Delta_Dq
0,pseudodesulfovibrio_indicus,/home/diegoflm/Documents/UV/courses_and_subjec...,3966573,0634883563216913,-200,-5822372404009182,277255828762342,0998439748513986,12478087067971309
1,pseudodesulfovibrio_indicus,/home/diegoflm/Documents/UV/courses_and_subjec...,3966573,0634883563216913,-190,-5531755996878647,27658779984393234,09984547716999835,12478087067971309
2,pseudodesulfovibrio_indicus,/home/diegoflm/Documents/UV/courses_and_subjec...,3966573,0634883563216913,-180,-5241184580682713,27585182003593225,09984714685532762,12478087067971309
3,pseudodesulfovibrio_indicus,/home/diegoflm/Documents/UV/courses_and_subjec...,3966573,0634883563216913,-170,-49506696847686705,2750372047093706,09984901100821286,12478087067971309
4,pseudodesulfovibrio_indicus,/home/diegoflm/Documents/UV/courses_and_subjec...,3966573,0634883563216913,-160,-46602267713291056,27413098654877093,09985110304927113,12478087067971309
...,...,...,...,...,...,...,...,...,...
3417591,zymomonas_sp._segment_3,/home/diegoflm/Documents/UV/courses_and_subjec...,640617,045795381639887794,160,24041505273994705,16027670182663136,09983998418704588,12856818438370474
3417592,zymomonas_sp._segment_3,/home/diegoflm/Documents/UV/courses_and_subjec...,640617,045795381639887794,170,25564688499910517,15977930312444073,09983302561291089,12856818438370474
3417593,zymomonas_sp._segment_3,/home/diegoflm/Documents/UV/courses_and_subjec...,640617,045795381639887794,180,2708653137280582,15933253748709306,09982699578190471,12856818438370474
3417594,zymomonas_sp._segment_3,/home/diegoflm/Documents/UV/courses_and_subjec...,640617,045795381639887794,190,28607184008911975,15892880004951098,09982175466913437,12856818438370474


In [12]:
df_mfa_bacteria['Organism'].unique()[-15:]

array(['zwartia_vadi_segment_1', 'zwartia_vadi_segment_2',
       'zwartia_vadi_segment_3', 'zymobacter_palmae',
       'zymobacter_palmae_segment_1', 'zymobacter_palmae_segment_2',
       'zymobacter_palmae_segment_3', 'zymomonas_mobilis',
       'zymomonas_mobilis_segment_1', 'zymomonas_mobilis_segment_2',
       'zymomonas_mobilis_segment_3', 'zymomonas_sp.',
       'zymomonas_sp._segment_1', 'zymomonas_sp._segment_2',
       'zymomonas_sp._segment_3'], dtype=object)

# Genome Scrapers

## Requests

In [8]:
# ANME-2_cluster_archaeon_HR1

try:
    request_archaea_0 = requests.get(urls_archaea_organisms[0])
    print(request_archaea_0.status_code)
except Exception as e:
    print(e)

200


In [9]:
soup_archaea_0 = BeautifulSoup(request_archaea_0.text, 'html.parser')
type(soup_archaea_0)

bs4.BeautifulSoup

In [10]:
soup_archaea_0

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">

<html>
<head>
<title>Index of /genomes/refseq/archaea/ANME-2_cluster_archaeon_HR1</title>
</head>
<body>
<h1>Index of /genomes/refseq/archaea/ANME-2_cluster_archaeon_HR1</h1>
<pre>Name                            Last modified      Size  <hr/><a href="/genomes/refseq/archaea/">Parent Directory</a>                                     -   
<a href="all_assembly_versions/">all_assembly_versions/</a>          2024-03-18 00:38    -   
<a href="annotation_hashes.txt">annotation_hashes.txt</a>           2024-03-16 19:37  410   
<a href="assembly_summary_historical.txt">assembly_summary_historical.txt</a> 2024-03-16 19:08  1.1K  
<hr/></pre>
<a href="https://www.hhs.gov/vulnerability-disclosure-policy/index.html">HHS Vulnerability Disclosure</a>
</body></html>

In [11]:
result_set_archaea_0 = soup_archaea_0.find('pre').find_all('a')
result_set_archaea_0

[<a href="/genomes/refseq/archaea/">Parent Directory</a>,
 <a href="all_assembly_versions/">all_assembly_versions/</a>,
 <a href="annotation_hashes.txt">annotation_hashes.txt</a>,
 <a href="assembly_summary_historical.txt">assembly_summary_historical.txt</a>]

In [12]:
items = []
for i in result_set_archaea_0[1:]:
    items.append( i.get('href') )
items

['all_assembly_versions/',
 'annotation_hashes.txt',
 'assembly_summary_historical.txt']

In [13]:
urls_archaea_0 = []
for i in result_set_archaea_0[1:]:
    urls_archaea_0.append( urljoin( request_archaea_0.url, i.get('href') ) )
urls_archaea_0

['https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/ANME-2_cluster_archaeon_HR1/all_assembly_versions/',
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/ANME-2_cluster_archaeon_HR1/annotation_hashes.txt',
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/ANME-2_cluster_archaeon_HR1/assembly_summary_historical.txt']

In [14]:
# aav := all_assembly_versions
url_aav_archaea_0 = [url for url \
    in urls_archaea_0 if 'all_assembly_versions' in url]
print(url_aav_archaea_0)

try:
    request_aav_archaea_0 =  requests.get(urls_archaea_0[0])
    print(request_archaea_0.status_code)
except Exception as e:
    print(e)

['https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/ANME-2_cluster_archaeon_HR1/all_assembly_versions/']
200


In [15]:
urls_archaea_organisms[:5]

array(['https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/ANME-2_cluster_archaeon_HR1/',
       'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_ambivalens/',
       'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_brierleyi/',
       'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_hospitalis/',
       'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_infernus/'],
      dtype=object)

In [16]:
for url in urls_archaea_organisms[:5]:
    print(urljoin(url, 'all_assembly_versions/'))

https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/ANME-2_cluster_archaeon_HR1/all_assembly_versions/
https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_ambivalens/all_assembly_versions/
https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_brierleyi/all_assembly_versions/
https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_hospitalis/all_assembly_versions/
https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_infernus/all_assembly_versions/


In [None]:
url_aav_ok = 0
url_aav_not_ok = 0
for url in urls_archaea_organisms[:5]:
    try:
        response = requests.get(urljoin(url, 'all_assembly_versions/'))
        print( response.url )
        # access elements of the response.text using xpath


        # Assuming 'response' is the requests response object
        html_parser = etree.HTMLParser()
        parsed_html = etree.HTML(response.text, parser=html_parser)

        # Use XPath expressions to select elements
        elements = parsed_html.xpath('//pre/a')  
        # print(elements)

        urls_aav = [urljoin(url, "all_assembly_versions/") for url in urls_archaea_organisms[:5]]
        print("urls_aav", urls_aav)

        # Process the selected elements
        for element in elements:
            # Access element properties or extract text
            print("-----ELEMENT-----")
            property_value = element.get('href')  # Replace 'property_name' with the desired property name
            print("href: ", property_value)
            element_text = element.text
            print(element_text)
            # Perform further processing on the selected elements
        


        url_aav_ok += 1
        # print(request.status_code)
    except Exception as e:
        print(e)
        url_aav_not_ok += 1

# last runtime: 14m 11.9s

print(url_aav_ok, url_aav_not_ok)

## Item and Custom Pipeline

In [13]:
class genomeNameItem(scrapy.Item):
    file_urls = scrapy.Field()
    genome_file_name = scrapy.Field()

In [14]:
class CustomFilesPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        # This method is called for each item and must return the Request objects to download.
        # Here, we customize it to use our item's `file_urls` field.
        return [scrapy.Request(x, meta={'genome_file_name': item['genome_file_name']}) for x in item.get('file_urls', [])]

    def file_path(self, request, response=None, info=None, *, item=None):
        # This method determines the path (including the filename) where the file will be saved.
        # Here, we use the filename from the request's meta, which we set in get_media_requests.
        return request.meta['genome_file_name']
    
    # def file_path(self, request, response=None, info=None, *, item=None):
    #     # Extract the desired filename from the item
    #     # This assumes you have a 'genome_file_name' field in your item
    #     filename = item.get('genome_file_name')
    #     # Return the filename
    #     return filename


## Archaea

### Spider

In [21]:
class SpiderGenomesArchaea(scrapy.Spider):
    name = 'spider_genomes'
    allowed_domains = ['ftp.ncbi.nlm.nih.gov']
    custom_settings = {
        'ITEM_PIPELINES': {'__main__.CustomFilesPipeline': 1},
        # 'ITEM_PIPELINES': {'scrapy.pipelines.files.FilesPipeline': 1},
        'FILES_STORE': str( PATH_GENOMIC_FNA ),
    }

    start_urls = list(urls_archaea_organisms)  # Assuming this is defined elsewhere

    def parse(self, response):
        url_aav = urljoin(response.url, 'all_assembly_versions/')
        
        yield scrapy.Request(
            url=url_aav,
            callback=self.parse_through,
            meta={
                'base_url': response.url,
                'organism': response.url.split('/')[-2]
                }
        )

        
    def parse_through(self, response):
        # avs =: assembly_versions
        hrefs_avs = response.xpath('//pre/a/@href').getall()
        hrefs_avs = hrefs_avs[1:]
        href_1 = hrefs_avs[0]

        if ( href_1 == "suppressed/" ):
            yield scrapy.Request(
                url= urljoin(response.url, href_1),
                callback=self.parse_through,
                meta={
                    'base_url': response.url,
                    'organism': response.meta['organism']
                    }
            )
        else:
            yield scrapy.Request(
                url= urljoin(response.url, href_1),
                callback=self.download_files,
                meta={
                    'base_url': response.url,
                    'organism': response.meta['organism'],
                    'genome_version': href_1.split('/')[-2]
                    }
            )

        
    def download_files(self, response):
        genome_file_name = str(response.meta['genome_version']) + '_genomic.fna.gz'
        urls_download_genome = [urljoin(response.url, genome_file_name)]

        item = genomeNameItem()
        item['file_urls'] = urls_download_genome
        item['genome_file_name'] = str(response.meta['organism']).lower() \
            + '/' + str(genome_file_name)
        
        print("Downloading genome for: ", response.meta['organism'])

        yield item

In [22]:
print(list(urls_archaea_organisms[:5]))
print(urls_archaea_organisms[:5])

['https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/ANME-2_cluster_archaeon_HR1/', 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_ambivalens/', 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_brierleyi/', 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_hospitalis/', 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_infernus/']
['https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/ANME-2_cluster_archaeon_HR1/'
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_ambivalens/'
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_brierleyi/'
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_hospitalis/'
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_infernus/']


### Crawler

In [24]:
len(urls_archaea_organisms)

1237

In [None]:
process = CrawlerProcess()
process.crawl(SpiderGenomesArchaea)
process.start()

# last runtime (2024-03-19): 2m 54.6s
# 1193 genomes downloaded from 1237 links

## Bacteria

### Spider

In [17]:
class SpiderGenomesBacteria(scrapy.Spider):
    name = 'spider_genomes_bacteria'
    allowed_domains = ['ftp.ncbi.nlm.nih.gov']
    custom_settings = {
        'ITEM_PIPELINES': {'__main__.CustomFilesPipeline': 1},
        # 'ITEM_PIPELINES': {'scrapy.pipelines.files.FilesPipeline': 1},
        'FILES_STORE': str( PATH_GENOMIC_BACTERIA_FNA ),
    }

    start_urls = list(urls_bacteria_organisms[50000:])  # This is defined elsewhere

    def parse(self, response):
        url_aav = urljoin(response.url, 'all_assembly_versions/')
        
        yield scrapy.Request(
            url=url_aav,
            callback=self.parse_through,
            meta={
                'base_url': response.url,
                'organism': response.url.split('/')[-2]
                }
        )

        
    def parse_through(self, response):
        # avs =: assembly_versions
        hrefs_avs = response.xpath('//pre/a/@href').getall()
        hrefs_avs = hrefs_avs[1:]
        href_1 = hrefs_avs[0]

        if ( href_1 == "suppressed/" ):
            yield scrapy.Request(
                url= urljoin(response.url, href_1),
                callback=self.parse_through,
                meta={
                    'base_url': response.url,
                    'organism': response.meta['organism']
                    }
            )
        else:
            yield scrapy.Request(
                url= urljoin(response.url, href_1),
                callback=self.download_files,
                meta={
                    'base_url': response.url,
                    'organism': response.meta['organism'],
                    'genome_version': href_1.split('/')[-2]
                    }
            )


    def download_files(self, response):
        genome_file_name = str(response.meta['genome_version']) + '_genomic.fna.gz'
        urls_download_genome = [urljoin(response.url, genome_file_name)]

        item = genomeNameItem()
        item['file_urls'] = urls_download_genome
        item['genome_file_name'] = str(response.meta['organism']).lower() \
            + '/' + str(genome_file_name)
        
        print("Downloading genome for: ", response.meta['organism'])

        yield item

In [15]:
print(len(urls_bacteria_organisms))

url_last_genome_batch = 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Zymomonas_sp./'
# url_last_genome_batch = 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Sterolibacterium_denitrificans/'
# url_last_genome_batch = 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Romboutsia_weinsteinii/'
# url_last_genome_batch = 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Pseudodesulfovibrio_hydrargyri/'
# url_last_genome_batch = 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Oleisolibacter_albus/'
# url_last_genome_batch = 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Mesorhizobium_sp._STM_4661/'
# url_last_genome_batch = 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Hydrogenoanaerobacterium_sp./'
# url_last_genome_batch = 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Elioraea_tepida/'
# url_last_genome_batch = 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Cellulophaga_tyrosinoxydans/'
# url_last_genome_batch = 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Bacillus_cereus_group_sp._TH243-1LC/'
# index of the url_last_genome_batch in the numpy array
# url_last_genome_batch
 
idx_url_last_genome_batch = np.where(urls_bacteria_organisms == url_last_genome_batch)
print(idx_url_last_genome_batch)

print(urls_bacteria_organisms[52005:52015])

52909
(array([52006]),)
['https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Zymomonas_mobilis/'
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Zymomonas_sp./'
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/_Acholeplasma_multilocale/'
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/_Acidovorax_ebreus/'
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/_Actinobacillus_rossii/'
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/_Actinomadura_parvosata/'
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/_Bacillus_caldolyticus/'
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/_Bacillus_enclensis/'
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/_Bacillus_sp._KCTC_13219/'
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/_Bacteroides_pectinophilus/']


In [16]:
print(urls_bacteria_organisms[44997:45002])

['https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Steroidobacter_gossypii/'
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Steroidobacter_sp./'
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Sterolibacterium_denitrificans/'
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Stieleria_maiorica/'
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Stieleria_neptunia/']


In [None]:
process = CrawlerProcess()
process.crawl(SpiderGenomesBacteria)
process.start()



# last runtime (2024-03-19): 12m 36.7s
#  genomes downloaded from 5000 links

# last runtime (2024-07-15): 17m 17.0s
#  genomes downloaded from 5000 links

# last runtime (2024-07-27): 14m 18.3s
#  genomes downloaded from 5000 links

# last runtime (2024-08-01): 12m 1s
#  genomes downloaded from 5000 links

# last runtime (2024-08-04): 24m 28.2s
#  genomes downloaded from 5000 links