In [20]:
import numpy as np
import pandas as pd
import scrapy
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin


# SpiderRefSeq

In [3]:
class SpiderRefSeq(scrapy.Spider):
    """Spider over the archaea and bacteria RefSeq directory listings.

    ``parse`` reads each start listing and follows its subdirectory links;
    ``parse_dir`` emits one item per followed directory. Items are exported
    as JSON to ``refseq.json`` through ``custom_settings``.
    """

    name = 'spider_refseq'
    # allowed_domains expects bare host names. A full URL entry is ignored by
    # Scrapy's offsite filtering, which would leave no usable domain at all.
    allowed_domains = ['ftp.ncbi.nlm.nih.gov']
    custom_settings = {
        # NOTE(review): FEED_FORMAT / FEED_URI are deprecated in recent Scrapy
        # releases in favour of the FEEDS setting; kept as-is because the
        # installed Scrapy version is not visible here -- confirm and migrate.
        'FEED_FORMAT': 'json',
        'FEED_URI': 'refseq.json',
    }

    start_urls = ['https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/',
                  'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/']

    def parse(self, response):
        """Follow every subdirectory link found in a start listing.

        Yields one request per followed link, handled by ``parse_dir``.
        """
        for href in response.css('a::attr(href)').getall():
            # Only descend into subdirectories: skip plain files, and skip
            # absolute / parent links that lead back up the tree.
            if not href.endswith('/') or href.startswith(('/', '..')):
                continue
            yield response.follow(href, callback=self.parse_dir)

    def parse_dir(self, response):
        """Emit one item describing a followed directory listing.

        The original class referenced this callback without defining it.
        NOTE(review): the item shape (URL plus raw hrefs) is a minimal
        placeholder -- adjust to whatever the export is meant to contain.
        """
        yield {
            'url': response.url,
            'entries': response.css('a::attr(href)').getall(),
        }


# Requests

## 4 requests

In [41]:
# Root of the RefSeq genomes directory listing.
url = 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/'

try:
    # requests has no default timeout; without one a stalled server would
    # block this cell indefinitely.
    refseq = requests.get(url, timeout=30)
    print(refseq.status_code)
except requests.RequestException as e:
    # Only network/HTTP-level failures are reported here; programming errors
    # (e.g. a NameError) are no longer silently swallowed.
    print(e)

200


In [5]:
# Dump the decoded response body (the HTML of the directory listing).
print(refseq.text)

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<html>
 <head>
  <title>Index of /genomes/refseq</title>
 </head>
 <body>
<h1>Index of /genomes/refseq</h1>
<pre>Name                                   Last modified      Size  <hr><a href="/genomes/">Parent Directory</a>                                            -   
<a href="archaea/">archaea/</a>                               2023-08-26 19:25    -   
<a href="bacteria/">bacteria/</a>                              2023-08-26 19:25    -   
<a href="fungi/">fungi/</a>                                 2023-08-27 11:33    -   
<a href="invertebrate/">invertebrate/</a>                          2023-08-27 11:33    -   
<a href="metagenomes/">metagenomes/</a>                           2023-02-07 00:44    -   
<a href="mitochondrion/">mitochondrion/</a>                         2023-07-13 20:33    -   
<a href="plant/">plant/</a>                                 2023-08-27 11:33    -   
<a href="plasmid/">plasmid/</a>                      

In [6]:
# Response headers: what the server sent back with the listing.
refseq.headers

{'Date': 'Sun, 27 Aug 2023 15:37:48 GMT', 'Server': 'Apache', 'Vary': 'Accept-Encoding', 'Content-Encoding': 'gzip', 'Strict-Transport-Security': 'max-age=31536000; includeSubDomains; preload', 'Access-Control-Allow-Origin': '*', 'Access-Control-Allow-Methods': 'GET,POST,PUT,OPTIONS', 'Access-Control-Allow-Headers': 'RANGE, Cache-control, If-None-Match, Content-Type', 'Access-Control-Expose-Headers': 'Content-Length, Content-Range, Content-Type', 'Content-Length': '609', 'Keep-Alive': 'timeout=5, max=1000', 'Connection': 'Keep-Alive', 'Content-Type': 'text/html;charset=UTF-8'}

In [7]:
# Request headers: what the client sent with the GET request.
refseq.request.headers

{'User-Agent': 'python-requests/2.31.0', 'Accept-Encoding': 'gzip, deflate, br', 'Accept': '*/*', 'Connection': 'keep-alive'}

In [8]:
# URL recorded on the request object attached to the response.
refseq.request.url

'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/'

## bs4

In [29]:
# Parse the listing HTML using the 'html.parser' backend.
soup = BeautifulSoup(refseq.text, 'html.parser')
# Show the type of the parsed object.
type(soup)

bs4.BeautifulSoup

In [30]:
# Print the parsed tree re-indented, to make the tag structure easier to read.
print(soup.prettify())

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<html>
 <head>
  <title>
   Index of /genomes/refseq
  </title>
 </head>
 <body>
  <h1>
   Index of /genomes/refseq
  </h1>
  <pre>Name                                   Last modified      Size  <hr/><a href="/genomes/">Parent Directory</a>                                            -   
<a href="archaea/">archaea/</a>                               2023-08-26 19:25    -   
<a href="bacteria/">bacteria/</a>                              2023-08-26 19:25    -   
<a href="fungi/">fungi/</a>                                 2023-08-27 11:33    -   
<a href="invertebrate/">invertebrate/</a>                          2023-08-27 11:33    -   
<a href="metagenomes/">metagenomes/</a>                           2023-02-07 00:44    -   
<a href="mitochondrion/">mitochondrion/</a>                         2023-07-13 20:33    -   
<a href="plant/">plant/</a>                                 2023-08-27 11:33    -   
<a href="plasmid/">plasmid/</a>   

In [31]:
# find() returns only the first <a> tag in the document.
soup.find('a')

<a href="/genomes/">Parent Directory</a>

In [32]:
# Collect every <a> tag inside the first <pre> element, which holds the
# directory listing entries.
result_set = soup.find('pre').find_all('a')
result_set

[<a href="/genomes/">Parent Directory</a>,
 <a href="archaea/">archaea/</a>,
 <a href="bacteria/">bacteria/</a>,
 <a href="fungi/">fungi/</a>,
 <a href="invertebrate/">invertebrate/</a>,
 <a href="metagenomes/">metagenomes/</a>,
 <a href="mitochondrion/">mitochondrion/</a>,
 <a href="plant/">plant/</a>,
 <a href="plasmid/">plasmid/</a>,
 <a href="plastid/">plastid/</a>,
 <a href="protozoa/">protozoa/</a>,
 <a href="unknown/">unknown/</a>,
 <a href="vertebrate_mammalian/">vertebrate_mammalian/</a>,
 <a href="vertebrate_other/">vertebrate_other/</a>,
 <a href="viral/">viral/</a>,
 <a href="README.txt">README.txt</a>,
 <a href="assembly_summary_refseq.txt">assembly_summary_refseq.txt</a>,
 <a href="assembly_summary_refseq_historical.txt">assembly_summary_refseq_historical.txt</a>]

In [33]:
# Visible link text of every listing entry, in page order.
names = [anchor.get_text() for anchor in result_set]
names

['Parent Directory',
 'archaea/',
 'bacteria/',
 'fungi/',
 'invertebrate/',
 'metagenomes/',
 'mitochondrion/',
 'plant/',
 'plasmid/',
 'plastid/',
 'protozoa/',
 'unknown/',
 'vertebrate_mammalian/',
 'vertebrate_other/',
 'viral/',
 'README.txt',
 'assembly_summary_refseq.txt',
 'assembly_summary_refseq_historical.txt']

In [34]:
# Absolute URL of every listing entry: each href resolved against the
# listing's own URL.
urls = [urljoin(url, anchor.get('href')) for anchor in result_set]
urls

['https://ftp.ncbi.nlm.nih.gov/genomes/',
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/',
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/',
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/fungi/',
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/invertebrate/',
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/metagenomes/',
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/mitochondrion/',
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/plant/',
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/plasmid/',
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/plastid/',
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/protozoa/',
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/unknown/',
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/',
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_other/',
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/viral/',
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/README.txt',
 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/assembly_summary_refse

In [35]:
# Index 0 is the "Parent Directory" link, so index 1 is the first real
# subdirectory of the listing (archaea/).
urls[1]

'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/'

In [39]:
try:
    # Fetch the archaea listing (urls[1]). requests has no default timeout,
    # so one is set to keep a stalled server from blocking this cell.
    archaea = requests.get(urls[1], timeout=30)
    print(archaea.status_code)
except requests.RequestException as e:
    # Report network/HTTP-level failures only; other errors should surface.
    print(e)

200
