In [1]:
import numpy as np
import pandas as pd
import scrapy
import requests
from lxml import etree
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from io import StringIO
from scrapy.crawler import CrawlerProcess
from pathlib import Path
import time
import datetime

# Constants

In [2]:
PATH_DATA = Path.cwd().parent.parent / 'data'
PATH_DATA.exists()

True

In [3]:
PATH_SCRAPED = PATH_DATA / 'raw' / 'scraped'
PATH_SCRAPED.exists()

True

In [4]:
PATH_GENOMIC_FNA = PATH_DATA / 'raw' / 'genomes' \
    / 'archaea' / 'genomic_fna'
PATH_GENOMIC_FNA.exists()

True

In [5]:
PATH_TEMP_GENOMIC_FNA = PATH_DATA / 'raw' / 'genomes' \
    / 'archaea' / 'temp_genomic_fna'
PATH_TEMP_GENOMIC_FNA.exists()

True

# Data Recovery

In [6]:
df_archaea = pd.read_csv(PATH_DATA / 'raw' / \
    'scraped' / 'archaea_name_url.csv')
df_archaea

Unnamed: 0,name,url
0,ANME-2_cluster_archaeon_HR1,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...
1,Acidianus_ambivalens,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...
2,Acidianus_brierleyi,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...
3,Acidianus_hospitalis,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...
4,Acidianus_infernus,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...
...,...,...
1232,uncultured_Nitrososphaera_sp.,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...
1233,uncultured_archaeon_A07HB70,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...
1234,uncultured_archaeon_A07HN63,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...
1235,uncultured_archaeon_A07HR60,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...


In [7]:
urls_archaea_organisms = df_archaea['url'].values
print(len(urls_archaea_organisms))
urls_archaea_organisms

1237


array(['https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/ANME-2_cluster_archaeon_HR1/',
       'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_ambivalens/',
       'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_brierleyi/',
       ...,
       'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/uncultured_archaeon_A07HN63/',
       'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/uncultured_archaeon_A07HR60/',
       'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/uncultured_archaeon_A07HR67/'],
      dtype=object)

# Genome Scrapers

## Requests

In [8]:
# ANME-2_cluster_archaeon_HR1

try:
    request_archaea_0 = requests.get(urls_archaea_organisms[0])
    print(request_archaea_0.status_code)
except Exception as e:
    print(e)

200


In [9]:
soup_archaea_0 = BeautifulSoup(request_archaea_0.text, 'html.parser')
type(soup_archaea_0)

bs4.BeautifulSoup

In [10]:
soup_archaea_0

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">

<html>
<head>
<title>Index of /genomes/refseq/archaea/ANME-2_cluster_archaeon_HR1</title>
</head>
<body>
<h1>Index of /genomes/refseq/archaea/ANME-2_cluster_archaeon_HR1</h1>
<pre>Name                            Last modified      Size  <hr/><a href="/genomes/refseq/archaea/">Parent Directory</a>                                     -   
<a href="all_assembly_versions/">all_assembly_versions/</a>          2024-03-18 00:38    -   
<a href="annotation_hashes.txt">annotation_hashes.txt</a>           2024-03-16 19:37  410   
<a href="assembly_summary_historical.txt">assembly_summary_historical.txt</a> 2024-03-16 19:08  1.1K  
<hr/></pre>
<a href="https://www.hhs.gov/vulnerability-disclosure-policy/index.html">HHS Vulnerability Disclosure</a>
</body></html>

In [11]:
result_set_archaea_0 = soup_archaea_0.find('pre').find_all('a')
result_set_archaea_0

[<a href="/genomes/refseq/archaea/">Parent Directory</a>,
 <a href="all_assembly_versions/">all_assembly_versions/</a>,
 <a href="annotation_hashes.txt">annotation_hashes.txt</a>,
 <a href="assembly_summary_historical.txt">assembly_summary_historical.txt</a>]

In [12]:
items = []
for i in result_set_archaea_0[1:]:
    items.append( i.get('href') )
items

['all_assembly_versions/',
 'annotation_hashes.txt',
 'assembly_summary_historical.txt']

In [13]:
urls_archaea_0 = []
for i in result_set_archaea_0[1:]:
    urls_archaea_0.append( urljoin( request_archaea_0.url, i.get('href') ) )
urls_archaea_0

['https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/ANME-2_cluster_archaeon_HR1/all_assembly_versions/',
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/ANME-2_cluster_archaeon_HR1/annotation_hashes.txt',
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/ANME-2_cluster_archaeon_HR1/assembly_summary_historical.txt']

In [14]:
# aav := all_assembly_versions
url_aav_archaea_0 = [url for url \
    in urls_archaea_0 if 'all_assembly_versions' in url]
print(url_aav_archaea_0)

try:
    request_aav_archaea_0 =  requests.get(urls_archaea_0[0])
    print(request_archaea_0.status_code)
except Exception as e:
    print(e)

['https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/ANME-2_cluster_archaeon_HR1/all_assembly_versions/']


200


In [15]:
urls_archaea_organisms[:5]

array(['https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/ANME-2_cluster_archaeon_HR1/',
       'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_ambivalens/',
       'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_brierleyi/',
       'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_hospitalis/',
       'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_infernus/'],
      dtype=object)

In [16]:
for url in urls_archaea_organisms[:5]:
    print(urljoin(url, 'all_assembly_versions/'))

https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/ANME-2_cluster_archaeon_HR1/all_assembly_versions/
https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_ambivalens/all_assembly_versions/
https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_brierleyi/all_assembly_versions/
https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_hospitalis/all_assembly_versions/
https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_infernus/all_assembly_versions/


In [17]:
url_aav_ok = 0
url_aav_not_ok = 0
for url in urls_archaea_organisms[:5]:
    try:
        response = requests.get(urljoin(url, 'all_assembly_versions/'))
        print( response.url )
        # access elements of the response.text using xpath


        # Assuming 'response' is the requests response object
        html_parser = etree.HTMLParser()
        parsed_html = etree.HTML(response.text, parser=html_parser)

        # Use XPath expressions to select elements
        elements = parsed_html.xpath('//pre/a')  
        # print(elements)

        urls_aav = [urljoin(url, "all_assembly_versions/") for url in urls_archaea_organisms[:5]]
        print("urls_aav", urls_aav)

        # Process the selected elements
        for element in elements:
            # Access element properties or extract text
            print("-----ELEMENT-----")
            property_value = element.get('href')  # Replace 'property_name' with the desired property name
            print("href: ", property_value)
            element_text = element.text
            print(element_text)
            # Perform further processing on the selected elements
        


        url_aav_ok += 1
        # print(request.status_code)
    except Exception as e:
        print(e)
        url_aav_not_ok += 1

# last runtime: 14m 11.9s

print(url_aav_ok, url_aav_not_ok)

https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/ANME-2_cluster_archaeon_HR1/all_assembly_versions/
urls_aav ['https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/ANME-2_cluster_archaeon_HR1/all_assembly_versions/', 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_ambivalens/all_assembly_versions/', 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_brierleyi/all_assembly_versions/', 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_hospitalis/all_assembly_versions/', 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_infernus/all_assembly_versions/']
-----ELEMENT-----
href:  /genomes/refseq/archaea/ANME-2_cluster_archaeon_HR1/
Parent Directory
-----ELEMENT-----
href:  suppressed/
suppressed/
https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_ambivalens/all_assembly_versions/
urls_aav ['https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/ANME-2_cluster_archaeon_HR1/all_assembly_versions/', 'https://ftp.ncbi.nlm.nih.gov/ge

## Archaea

### Item and Custom Pipeline

In [18]:
from scrapy.pipelines.files import FilesPipeline
from urllib.parse import urlparse

In [19]:
class genomeNameItem(scrapy.Item):
    file_urls = scrapy.Field()
    genome_file_name = scrapy.Field()

In [20]:
class CustomFilesPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        # This method is called for each item and must return the Request objects to download.
        # Here, we customize it to use our item's `file_urls` field.
        return [scrapy.Request(x, meta={'genome_file_name': item['genome_file_name']}) for x in item.get('file_urls', [])]

    def file_path(self, request, response=None, info=None, *, item=None):
        # This method determines the path (including the filename) where the file will be saved.
        # Here, we use the filename from the request's meta, which we set in get_media_requests.
        return request.meta['genome_file_name']
    
    # def file_path(self, request, response=None, info=None, *, item=None):
    #     # Extract the desired filename from the item
    #     # This assumes you have a 'genome_file_name' field in your item
    #     filename = item.get('genome_file_name')
    #     # Return the filename
    #     return filename


### Spider

In [21]:
class SpiderGenomesArchaea(scrapy.Spider):
    name = 'spider_genomes'
    allowed_domains = ['ftp.ncbi.nlm.nih.gov']
    custom_settings = {
        'ITEM_PIPELINES': {'__main__.CustomFilesPipeline': 1},
        # 'ITEM_PIPELINES': {'scrapy.pipelines.files.FilesPipeline': 1},
        'FILES_STORE': str( PATH_TEMP_GENOMIC_FNA ),
    }

    start_urls = list(urls_archaea_organisms[:5])  # Assuming this is defined elsewhere

    def parse(self, response):
        url_aav = urljoin(response.url, 'all_assembly_versions/')
        
        yield scrapy.Request(
            url=url_aav,
            callback=self.parse_through,
            meta={
                'base_url': response.url,
                'organism': response.url.split('/')[-2]
                }
        )

        
    def parse_through(self, response):
        # avs =: assembly_versions
        hrefs_avs = response.xpath('//pre/a/@href').getall()
        hrefs_avs = hrefs_avs[1:]
        href_1 = hrefs_avs[0]

        if ( href_1 == "suppressed/" ):
            yield scrapy.Request(
                url= urljoin(response.url, href_1),
                callback=self.parse_through,
                meta={
                    'base_url': response.url,
                    'organism': response.meta['organism']
                    }
            )
        else:
            yield scrapy.Request(
                url= urljoin(response.url, href_1),
                callback=self.download_files,
                meta={
                    'base_url': response.url,
                    'organism': response.meta['organism'],
                    'genome_version': href_1.split('/')[-2]
                    }
            )

        
    def download_files(self, response):
        genome_file_name = str(response.meta['genome_version']) + '_genomic.fna.gz'
        urls_download_genome = [urljoin(response.url, genome_file_name)]

        item = genomeNameItem()
        item['file_urls'] = urls_download_genome
        item['genome_file_name'] = str(response.meta['organism']).lower() \
            + '/' + str(genome_file_name)

        yield item

In [22]:
print(list(urls_archaea_organisms[:5]))
print(urls_archaea_organisms[:5])

['https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/ANME-2_cluster_archaeon_HR1/', 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_ambivalens/', 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_brierleyi/', 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_hospitalis/', 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_infernus/']
['https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/ANME-2_cluster_archaeon_HR1/'
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_ambivalens/'
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_brierleyi/'
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_hospitalis/'
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_infernus/']


### Crawler

In [23]:

process = CrawlerProcess()
process.crawl(SpiderGenomesArchaea)
process.start()


2024-03-19 12:23:22 [scrapy.utils.log] INFO: Scrapy 2.10.0 started (bot: scrapybot)
2024-03-19 12:23:22 [scrapy.utils.log] INFO: Versions: lxml 4.9.2.0, libxml2 2.10.4, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.2, Twisted 22.10.0, Python 3.11.3 | packaged by conda-forge | (main, Apr  6 2023, 08:57:19) [GCC 11.3.0], pyOpenSSL 23.1.1 (OpenSSL 3.1.2 1 Aug 2023), cryptography 40.0.2, Platform Linux-6.5.0-26-generic-x86_64-with-glibc2.35
2024-03-19 12:23:22 [scrapy.addons] INFO: Enabled addons:
[]
2024-03-19 12:23:22 [scrapy.crawler] INFO: Overridden settings:
{}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2024-03-19 12:23:22 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2024-03-19 12:23:22 [scrapy.extensions.telnet] INFO: Telnet Password: de6da8d118fef94f
2024-03-19 12:23:22 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.coresta