Scraping Websites and Extracting Data

Blueprint: Downloading and Interpreting robots.txt

In [3]:
import urllib.robotparser

rp= urllib.robotparser.RobotFileParser()
rp.set_url("https://web.pharmacyboardkenya.org/robots.txt")
rp.read()
rp.can_fetch("*", "https://web.pharmacyboardkenya.org/sitemap.xml")

True

Blueprint: Finding URLs from sitemap.xml

In [5]:
import xmltodict #this converts the sitemap.xml file to a dictionary
import requests 

sitemap = xmltodict.parse(requests.get("https://web.pharmacyboardkenya.org/sitemap.xml").text)

In [9]:
print(sitemap)

{'sitemapindex': {'@xmlns': 'http://www.sitemaps.org/schemas/sitemap/0.9', 'sitemap': [{'loc': 'https://web.pharmacyboardkenya.org/post-sitemap.xml', 'lastmod': '2024-09-15T18:51:07+00:00'}, {'loc': 'https://web.pharmacyboardkenya.org/page-sitemap.xml', 'lastmod': '2024-08-27T10:23:56+00:00'}, {'loc': 'https://web.pharmacyboardkenya.org/wpdmpro-sitemap.xml', 'lastmod': '2024-08-16T12:40:36+00:00'}, {'loc': 'https://web.pharmacyboardkenya.org/ot_header_builders-sitemap.xml', 'lastmod': '2024-04-12T10:22:07+00:00'}, {'loc': 'https://web.pharmacyboardkenya.org/ot_footer_builders-sitemap.xml', 'lastmod': '2024-03-08T09:52:24+00:00'}, {'loc': 'https://web.pharmacyboardkenya.org/post-archive-sitemap.xml', 'lastmod': '2024-08-16T12:40:36+00:00'}, {'loc': 'https://web.pharmacyboardkenya.org/category-sitemap.xml', 'lastmod': '2024-09-15T18:51:07+00:00'}, {'loc': 'https://web.pharmacyboardkenya.org/wpdmcategory-sitemap.xml', 'lastmod': '2024-08-16T12:40:36+00:00'}]}}


In [10]:
#check what is in the dict before downloading the files:

if 'urlset' in sitemap and 'url' in sitemap['urlset']:
    urls = [url['loc'] for url in sitemap['urlset']['url']]
else:
    print("Unexpected sitemap structure:", sitemap.keys())


#this gives us an idea of our sitemap structure that is used in the website. Instead of
# 'urlset', our uses sitemap index first as shown. 

Unexpected sitemap structure: dict_keys(['sitemapindex'])


In [11]:
sitemap_urls = [sitemap['loc'] for sitemap in sitemap['sitemapindex']['sitemap']]

In [12]:
def fetch_sitemap(url):
    response = requests.get(url)
    return xmltodict.parse(response.content)

all_urls = []

for sitemap_url in sitemap_urls:
    individual_sitemap = fetch_sitemap(sitemap_url)
    if 'urlset' in individual_sitemap and 'url' in individual_sitemap['urlset']:
        urls = [url['loc'] for url in individual_sitemap['urlset']['url']]
        all_urls.extend(urls) 
    else:
        print(f'Unexpected structure is sitemap {sitemap_url}')

print("\n".join(all_urls[:3]))


https://web.pharmacyboardkenya.org/blog-2/
https://web.pharmacyboardkenya.org/%f0%9d%90%8f%f0%9d%90%8f%f0%9d%90%81-%f0%9d%90%9a%f0%9d%90%a7%f0%9d%90%9d-%f0%9d%90%8a%f0%9d%90%80%f0%9d%90%8f%f0%9d%90%88-%f0%9d%90%8c%f0%9d%90%9a%f0%9d%90%ab%f0%9d%90%a4-%f0%9d%90%96%f0%9d%90%a8/
https://web.pharmacyboardkenya.org/%f0%9d%90%80%f0%9d%90%94-%f0%9d%9f%91%f0%9d%90%92-%f0%9d%90%92%f0%9d%90%ad%f0%9d%90%9e%f0%9d%90%9e%f0%9d%90%ab%f0%9d%90%a2%f0%9d%90%a7%f0%9d%90%a0-%f0%9d%90%82%f0%9d%90%a8%f0%9d%90%a6%f0%9d%90%a6/


Blueprint: Finding URLs from RSS

Often (and
sometimes more easily) this can be found by taking a look at the source code of the
corresponding webpage and searching for RSS.

In [8]:
import feedparser

feed = feedparser.parse('https://web.pharmacyboardkenya.org/feed/')

[(e.title, e.link) for e in feed.entries] 

[('𝐏𝐏𝐁 𝐚𝐧𝐝 𝐊𝐀𝐏𝐈 𝐌𝐚𝐫𝐤 𝐖𝐨𝐫𝐥𝐝 𝐏𝐚𝐭𝐢𝐞𝐧𝐭 𝐒𝐚𝐟𝐞𝐭𝐲 𝐃𝐚𝐲 𝐰𝐢𝐭𝐡 𝐅𝐨𝐜𝐮𝐬 𝐨𝐧 𝐈𝐦𝐩𝐫𝐨𝐯𝐢𝐧𝐠 𝐃𝐢𝐚𝐠𝐧𝐨𝐬𝐢𝐬 – 13th September 2024',
  'https://web.pharmacyboardkenya.org/%f0%9d%90%8f%f0%9d%90%8f%f0%9d%90%81-%f0%9d%90%9a%f0%9d%90%a7%f0%9d%90%9d-%f0%9d%90%8a%f0%9d%90%80%f0%9d%90%8f%f0%9d%90%88-%f0%9d%90%8c%f0%9d%90%9a%f0%9d%90%ab%f0%9d%90%a4-%f0%9d%90%96%f0%9d%90%a8/?utm_source=rss&utm_medium=rss&utm_campaign=%25f0%259d%2590%258f%25f0%259d%2590%258f%25f0%259d%2590%2581-%25f0%259d%2590%259a%25f0%259d%2590%25a7%25f0%259d%2590%259d-%25f0%259d%2590%258a%25f0%259d%2590%2580%25f0%259d%2590%258f%25f0%259d%2590%2588-%25f0%259d%2590%258c%25f0%259d%2590%259a%25f0%259d%2590%25ab%25f0%259d%2590%25a4-%25f0%259d%2590%2596%25f0%259d%2590%25a8'),
 ('𝐀𝐔-𝟑𝐒 𝐒𝐭𝐞𝐞𝐫𝐢𝐧𝐠 𝐂𝐨𝐦𝐦𝐢𝐭𝐭𝐞𝐞 𝐌𝐞𝐞𝐭𝐢𝐧𝐠 𝐄𝐧𝐝𝐬 𝐰𝐢𝐭𝐡 𝐂𝐨𝐦𝐦𝐢𝐭𝐦𝐞𝐧𝐭 𝐭𝐨 𝐁𝐨𝐨𝐬𝐭 𝐌𝐞𝐝𝐢𝐜𝐚𝐥 𝐏𝐫𝐨𝐝𝐮𝐜𝐭 𝐒𝐚𝐟𝐞𝐭𝐲 𝐢𝐧 𝐀𝐟𝐫𝐢𝐜𝐚\xa0 – 13th September 2024',
  'https://web.pharmacyboardkenya.org/%f0%9d%90%80%f0%9d%90%94-%f0%9d%9f%91%f0%9d%90%92-%f0%9d%90%92%f0%9d%90%ad%f0%9d%90%9e%f0%9d%90%9e%f0%9d%90%ab%f0%9d%90%a2%f

In [13]:
urls = [e.id for e in feed.entries] 

In [14]:
urls

['https://web.pharmacyboardkenya.org/?p=10573',
 'https://web.pharmacyboardkenya.org/?p=10570',
 'https://web.pharmacyboardkenya.org/?p=10562',
 'https://web.pharmacyboardkenya.org/?p=10558',
 'https://web.pharmacyboardkenya.org/?p=10164',
 'https://web.pharmacyboardkenya.org/?p=10158',
 'https://web.pharmacyboardkenya.org/?p=10155',
 'https://web.pharmacyboardkenya.org/?p=9958',
 'https://web.pharmacyboardkenya.org/?p=9955',
 'https://web.pharmacyboardkenya.org/?p=9946']

This is an alternative way to get a list of URLs when no sitemap.xml is available. Atom feeds offer the same information as RSS in a different format. 

Blueprint: Downloading HTML pages with Python

In [17]:
import requests
import time
import re 

start_time = time.time()

s = requests.Session()

#func to sanitize urls
def sanitize_filename(filename):
    return re.sub(r'[^\w\-_\.]','_', filename)

for url in urls[0:10]:
    #get the part after the last / in the url and use as a filename
    filename = url.rsplit('/')[-1]
    filename = sanitize_filename(filename)

    r = s.get(url)
    if r.ok:
        with open(filename, 'w+b') as f:
            f.write(r.text.encode('utf-8'))
    else:
        print(f'error with URL: {url}')

end_time = time.time()
execution_time = end_time - start_time
print(f'execution time: {execution_time:.2f} seconds')

execution time: 14.15 seconds


Blueprint: Downloading HTML Pages with wget(a command line tool)

wget supports lists of URLs for downloads and HTTP keep-alive. 

The -nc option of wget will check
whether files have already been downloaded. This way, we can avoid downloading
content twice. We can now stop the process at any time and restart without losing
data, which is important if a web server blocks us, our Internet connection goes
down,

In [18]:
with open('urls.txt','w+b') as f:
    f.write('\n'.join(urls).encode('utf-8'))

Go to your command line or open a terminal on Jupyter. Then run the following command:

wget -nc -i urls.txt

Make sure to install wget and add it to your system path, then read documentation if any problems arise. Ensure to disable the inbuilt wget alias and specify the installed wget path if any issue arises. 

The i option tells wget the lit of urls to downlaod. wget skips the exisitng file due to the -nc option(it means no clobber). wget can also be used for recursively downloading websites with the option -r. Combine it with -l to specify the depth/level of the recursion.

-----------------------------------------------------------------------------------------------


Extracting Semistructured Data

Blueprint: Extracting Data with Regular Expressions (we already did this in the code above)

In [22]:
import requests 
import re 

def sanitize_file(file):
    return re.sub(r'[^\w\-_\.]','_', file)

url = 'https://web.pharmacyboardkenya.org/pharmacy-and-poisons-board-adopts-new-technology-to-ensure-drug-quality-29th-july-2024/?utm_source=rss&utm_medium=rss&utm_campaign=pharmacy-and-poisons-board-adopts-new-technology-to-ensure-drug-quality-29th-july-2024'

s= requests.Session()
#use the part after the lat / as the filename
file = url.split('/')[-1] + '.html'
file = sanitize_file(file)
r = s.get(url)

if r.ok:
    with open(file, 'w+b') as f:
        f.write(r.text.encode('utf-8'))

else:
    print(f'Error with URL: {url}')

In [23]:
import re

with open(file, 'r') as f:
    html=f.read()
    g = re.search(r'<title>(.*)</title>', html, re.MULTILINE | re.DOTALL)
    if g:
        print(g.groups()[0])  #prints the title

Pharmacy and Poisons Board Adopts New Technology to Ensure Drug Quality, 29th July 2024 - Pharmacy and Poisons Board


The re library is not fully integrated into Python string handling thus cannot be invoked as methods of string.

As our HTML documents consist of many
lines, we have to use re.MULTILINE|re.DOTALL. Sometimes cascaded calls to
re.search are necessary, thought they make the code harder to read.

Use re.search and not re.match in Py. The latter tries to match the whole string, and
as there is data before < title> and after </ title>, it fails

------------------------------------------------------------------------------------------------

Blueprint: Using an HTML Parser for Extraction (using Beautiful Soup)

extracting the title/headline

In [24]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')
soup.select('h4.entry-title')

[<h4 class="entry-title">Pharmacy and Poisons Board Adopts New Technology to Ensure Drug Quality, 29th July 2024</h4>]

In [26]:
#use the tag names directly 
soup.h4

<h4 class="entry-title">Pharmacy and Poisons Board Adopts New Technology to Ensure Drug Quality, 29th July 2024</h4>

In [28]:
#extract text without the HTML clutter around it 
soup.h4.text

'Pharmacy and Poisons Board Adopts New Technology to Ensure Drug Quality, 29th July 2024'

Note that in contrast to the regular expression solution, unnecessary whitespaces
have been stripped by Beautiful Soup.
Unfortunately, that does not work as well for the title, see the -ppb suffix:

In [29]:
soup.title.text

'Pharmacy and Poisons Board Adopts New Technology to Ensure Drug Quality, 29th July 2024 - Pharmacy and Poisons Board'

extracting the article text

In [30]:
soup.select_one('div.entry-summary').text

'\n\xa0\n\xa0 \nKenya – The Pharmacy and Poisons Board (PPB) has acquired advanced Near Infrared (NIR) technology, the Pillscan, to strengthen its efforts in safeguarding drug quality. Provided by the Mission of Essential Medicines and Supplies, this new tool will be used for on-site screening of medical products at PPB regional offices and key entry points.\nComplementing the PPB’s existing quality control measures, the NIR technology will enhance the detection of sub-standard and falsified medical products. To maximize the impact of this new tool, a six-day training was conducted for PPB staff, county pharmacists, and KEMSA personnel.\nBy incorporating NIR technology into its operations, the PPB aims to bolster its surveillance of the Kenyan pharmaceutical market and protect public health.\nThe Pharmacy and Poisons Board commends the Global Fund for supporting its mission to ensure safe and quality medicines for Kenyans. This investment in advanced technology is a significant step fo

extracting image captions

In [35]:
soup.select('div.entry-summary img')

[<img alt="" class="alignnone size-medium wp-image-9960" decoding="async" fetchpriority="high" height="225" sizes="(max-width: 300px) 100vw, 300px" src="https://web.pharmacyboardkenya.org/wp-content/uploads/2024/07/PHOTO-2024-07-29-16-45-54-300x225.jpg" srcset="https://web.pharmacyboardkenya.org/wp-content/uploads/2024/07/PHOTO-2024-07-29-16-45-54-300x225.jpg 300w, https://web.pharmacyboardkenya.org/wp-content/uploads/2024/07/PHOTO-2024-07-29-16-45-54.jpg 640w" width="300"/>,
 <img alt="" class="alignnone size-medium wp-image-9961" decoding="async" height="300" sizes="(max-width: 225px) 100vw, 225px" src="https://web.pharmacyboardkenya.org/wp-content/uploads/2024/07/PHOTO-2024-07-29-16-45-53-225x300.jpg" srcset="https://web.pharmacyboardkenya.org/wp-content/uploads/2024/07/PHOTO-2024-07-29-16-45-53-225x300.jpg 225w, https://web.pharmacyboardkenya.org/wp-content/uploads/2024/07/PHOTO-2024-07-29-16-45-53-768x1024.jpg 768w, https://web.pharmacyboardkenya.org/wp-content/uploads/2024/07/PHO

In [37]:
#to get the image caption
soup.select('div.entry-summary figcaption')  #our image has no caption 

[]

extracting the url 

When downloading many HTML files, it is often difficult to find the original URLs of
the files if they have not been saved separately, or the URLs might have changed.

HTML tag called < link rel="canonical">  can be used for this purpose. The tag
is not mandatory, but it is extremely common, as it is also taken into account by
search engines and contributes to a good ranking:


In [39]:
soup.find('link',{'rel':'canonical'})['href']

'https://web.pharmacyboardkenya.org/pharmacy-and-poisons-board-adopts-new-technology-to-ensure-drug-quality-29th-july-2024/'

extracting list information (authors)

In [69]:
url = 'https://health-data-commons.pharmaccess.org/'

In [71]:
soup.find ("meta", {'name': 'author'})['content']

'Daniel Kapitan, Julie Fleischer, Chris Ihure, Rob Wiegman, Iris Bokkes, Mark van der Graaf'

The above code returns only one author. If there is another author,
which is unfortunately not contained in the meta-information of the page,
it can be extracted again by selecting the elements in the browser and using the CSS
selector:

In [73]:
#url = 'https://health-data-commons.pharmaccess.org/'

#sel = " "  #replace this link with the html class where the authors are stored
#soup.select(sel)

In [74]:
#assuming we got the name of the authors above and wanted to extract the name of the author in pure text:

# [a.text for a in soup.select(sel)]

semantic and nonsemantic content

the sel selector is not semantic. Selection is performed
based on layout-like classes.This works well for the moment but is likely to break if the layout is changed.

Therefore, it’s a good idea to avoid these kinds of selections
if the code is likely to be executed not only once or in a batch but should also
run in the future.

extracting text of links

In [78]:
soup.select_one('div.quarto-title p').text

'Demonstrating fair sharing and reuse of health data in sub-Saharan Africa'

extracting reading time

In [59]:
# soup.select_one("p.ByLineBar_reading-time").text 
# 
# #our article has no reading time in the source code 

extracting attributes (IDs)

In [61]:
import requests 
import re 

def sanitize_file(file):
    return re.sub(r'[^\w\-_\.]','_', file)

url = 'https://health-data-commons.pharmaccess.org/' 

s= requests.Session()
#use the part after the lat / as the filename
file = url.split('/')[-1] + '.html'
file = sanitize_file(file)
r = s.get(url)

if r.ok:
    with open(file, 'w+b') as f:
        f.write(r.text.encode('utf-8'))

else:
    print(f'Error with URL: {url}')

In [62]:

with open(file, 'r') as f:
    html=f.read()
    g = re.search(r'<title>(.*)</title>', html, re.MULTILINE | re.DOTALL)
    if g:
        print(g.groups()[0]) 

Towards a health data commons in LMICs


In [63]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')
soup.select('h2.unnumbered.anchored')

[<h2 class="unnumbered anchored" data-anchor-id="data-commons-as-a-catalyst-for-achieving-uhc">Data commons as a catalyst for achieving UHC</h2>,
 <h2 class="unnumbered anchored" data-anchor-id="building-on-the-openhie-framework">Building on the openHIE framework</h2>,
 <h2 class="unnumbered anchored" data-anchor-id="demonstrators-built-for-momcare-project">Demonstrators built for MomCare project</h2>]

In [68]:
# soup.select_one('div.quarto-title')['id']

#our article has no ID that identifies it 

extract attribution

Apart from the authors, the article carries more attributions. They can be found at the
end of the text and reside in a special container

In [83]:
soup.select_one('div#refs.references.csl-bib-body.hanging-indent').text 

'\n\nAyaz, Muhammad, Muhammad F Pasha, Mohammed Y Alzahrani, Rahmat Budiarto, and Deris Stiawan. 2021. â\x80\x9cThe Fast Health Interoperability Resources (FHIR) Standard: Systematic Literature Review of Implementations, Applications, Challenges and Opportunities.â\x80\x9d JMIR Medical Informatics 9 (7): e21929.\n\n\nBeck, Micah. 2019. â\x80\x9cOn the Hourglass Model.â\x80\x9d Communications of the ACM 62 (7): 48â\x80\x9357.\n\n\nDuda, Stephany N, Nan Kennedy, Douglas Conway, Alex C Cheng, Viet Nguyen, Teresa Zayas-CabÃ¡n, and Paul A Harris. 2022. â\x80\x9cHL7 FHIR-based Tools and Initiatives to Support Clinical Research: A Scoping Review.â\x80\x9d Journal of the American Medical Informatics Association 29 (9): 1642â\x80\x9353.\n\n\nGebreslassie, Tesfit Gebremeskel, Mirjam van Reisen, Samson Yohannes Amare, Getu Tadele Taye, and Ruduan Plug. 2023. â\x80\x9cFHIR4FAIR: Leveraging FHIR in Health Data FAIRfication Process: In the Case of VODAN-A.â\x80\x9d FAIR Connect 1 (1): 49â\x80\x9354.

extracting timestamp

it is crucial to know the time that the article was posted.
This is mentioned next to the section, but unfortunately it is constructed to be
human-readable (like “3 days ago”). This can be parsed but is tedious. Knowing the
real publishing time, the correct element can be found in the HTML head element

In [86]:
# ptime = soup.find('meta', {'property': 'article:published_time'})['content'] 

#our article has no published time

In [87]:
#convert the string time output to a datetime object

# from dateutil import parser

# parser.parse(ptime)

Comments:
Use regular expressions only for crude extraction. An HTML parser is slower but
much easier to use and more stable

It often makes sense to take a look at the semantic structure of the documents and
use HTML tags that have semantic class names to find the value of structural elements.
These tags have the advantage that they are the same over a large class of web
pages. Extraction of their content therefore has to be implemented only once and can
be reused.


----------------------------------------------------------------------------------

Blueprint: Spidering

In [2]:
import requests
from bs4 import BeautifulSoup
import os.path
from dateutil import parser 

#1. Define how many pages of the archive should be downloaded
def download_archive (page):
    filename = 'page-%06d.html' % page #filename = 'page-000001.html'
    if not os.path.isfile(filename):
        url = "https://www.reuters.com/news/archive/" + \
              "?view=page&page=%d&pageSize=10" % page 
        r = requests.get(url)
        with open(filename, 'w+') as f:
            f.write(r.text)

#2. Download each page of the archive into a file and for each page extract the links to the articles
def parse_archive_page(page_file):
    with open(page_file, 'r') as f:
        html = f.read()
    
    soup = BeautifulSoup(html, 'html.parser')
    hrefs = ['https://www.reuters.com' + a['href']
             for a in soup.select('article.story div.story-content a')]
    
    return hrefs
#4. For each article url download it to a html file, if the article file is present, skip this step
def download_article(url):
    #check if article is already present
    filename = url.split('/')[-1] + '.html'
    if not os.path.isfile(filename):
        r = requests.get(url)
        with open(filename, 'w+') as f:
            f.write(r.text)

#5. For each article file, extract the content into a dict and combine the dicts to a dataframe
def parse_article(article_file):
    def find_obfuscated_class(soup, klass):
        return soup.find_all(lambda tag: tag.has_attr('class') and (klass in " ".join(tag['class'])))
    with open (article_file, 'r') as f:
        html = f.read()
    r = {}  #initate an empty dict
    soup = BeautifulSoup(html, 'html.parser')
    r['id'] = soup.select_one('div.StandardArticle_inner-container')['id']
    r['url'] = soup.find('link', {'rel': 'canonical'})['href']
    r['headline'] = soup.h1.text
    r['section'] = find_obfuscated_class(soup, "ArticleHeader-channel")[0].text
    
    r['text'] = "\n".join([t.text for t in find_obfuscated_class(soup, "Paragraph-paragraph")])
    r['authors'] = find_obfuscated_class(soup, "Attribution-attribution")[0].text
    r['time'] = soup.find('meta', {'property':
                                   'og:article:published_time'})['content']
    
    return r 


In [3]:
#download 10 pages of the archive
for p in range(1, 10):
    download_archive(p)

In [4]:
#pass archive and add to article_urls
import glob 

article_urls = []
for page_file in glob.glob('page-*.html'):
    article_urls += parse_archive_page(page_file)

In [5]:
#download articles 
for url in article_urls:
    download_article(url)

In [15]:
#arrange in pandas DataFrame
import pandas as pd
import glob 

df = pd.DataFrame() 
for article_file in glob.glob("*-id???????????.html"):
    df = df.append(parse_article(article_file), ignore_index=True)  # Fixed indentation

if 'time' in df.columns:
    df['time'] = pd.to_datetime(df['time'])
else:
    print("The 'time' column is not present in the DataFrame.")


The 'time' column is not present in the DataFrame.


Density-Based Text Extraction

In [25]:
from readability import Document 
doc = Document('https://web.pharmacyboardkenya.org/feed/')  #replace the url with a html page, use requests lib to get the html
doc.title()

'[no-title]'

In [20]:
#doc.short_title()

#finding the title or summary of the page 

In [None]:
#doc.summary()

In [24]:
#to extract the body part of the article

#density_soup = BeautifulSoup(html, 'html.parser')
#density_soup.body.text

In most of the cases, python-readability works
reasonably well and removes the need to implement too many special cases. However,
the cost of using this library is uncertainty.

 Will it always work in the expected way
with the impossibility of extracting structured data such as timestamps, authors, and
so on (although there might be other heuristics for that)?

----------------------------------------------------------------------------------

Blueprint: Scrapy

Unfortunately, the code for scrapy cannot be changed easily. One more argument for using up to date separate libraries. In the version used here, it still collects the titles of the articles but not more.

In [27]:
import scrapy
import logging 

class ReutersArchiveSpider(scrapy.Spider):
    name = 'reuters_archive'

    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'FEED_FORMAT': 'json',
        'FEED_URL': 'reuters_archive.json'
    }

    start_urls = ['https://www.reuters.com/news/archive/']

    def parse(self, response):
        for article in response.css('article.story div.story-content a'):
            yield response.follow(article.css('a::attr(href)').extract_first(), self.parse_article)

        next_page_url = response.css('a.control-nav-next::attr(href)').extract_first()
        if (next_page_url is not None) & ('page=2' not in next_page_url):
            yield response.follow(next_page_url, self.parse)

    def parse_article(self, response):
        yield {
            'title': response.css('h1::text').extract_first().strip(),
        }


Scrapy works in an object-oriented way. For each so-called spider, a class needs to be
implemented that is derived from scrapy.Spider. Scrapy adds a lot of debug output,
which we reduce by logging.WARNING. 

The base class automatically
calls the parse function with the start_urls. This function extracts the
links to the article and invokes yield with the function parse_article as a parameter.

This function in turn extracts some attributes from the articles and yields them
in a dict. Finally, the next page link is crawled, but we stop here before getting the
second page.

yield has a double functionality in Scrapy. If a dict is yielded, it is added to the
results. If a Request object is yielded, the object is fetched and gets parsed.

In [28]:
#this can be run only once from a Jupyter notebook due to Twisted dependencies which are ancient 
from scrapy.crawler import CrawlerProcess 
process = CrawlerProcess()

process.crawl(ReutersArchiveSpider)
process.start()


2024-09-30 13:06:00 [scrapy.utils.log] INFO: Scrapy 2.11.1 started (bot: scrapybot)
2024-09-30 13:06:00 [scrapy.utils.log] INFO: Versions: lxml 5.2.1.0, libxml2 2.10.4, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.2, Twisted 23.10.0, Python 3.12.4 | packaged by Anaconda, Inc. | (main, Jun 18 2024, 15:03:56) [MSC v.1929 64 bit (AMD64)], pyOpenSSL 24.0.0 (OpenSSL 3.0.14 4 Jun 2024), cryptography 42.0.5, Platform Windows-11-10.0.22631-SP0


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)



In [29]:
glob.glob('*.json') #return a list of paths matching a pathname pattern 

[]

In [32]:
!cat 'reuters_archive.json'  #return the content of a file - in our case, it's nothing since an empty list was returned 

'cat' is not recognized as an internal or external command,
operable program or batch file.


NOTES:

1. As most of the coding is spent in extracting data in the articles, this code has to
change frequently. For this, spidering has to be restarted (and if you are running
the script in Jupyter, you also have to start the Jupyter notebook server), which
tremendously increases turnaround times.

2. It’s nice that JSON can directly be produced. Be careful as the JSON file is
appended, which can result in an invalid JSON if you don’t delete the file before
starting the spidering process. This can be solved by using the so-called jl format
(JSON lines), but it is a workaround

3. Scrapy has some nice ideas. In our day-to-day work, we do not use it, mainly
because debugging is hard. If persistence of the HTML files is needed (which is
strongly recommended), it loses lots of advantages. The object-oriented approach is useful
and can be implemented outside of Scrapy without too much effort

4. For some
websites, ready-made Scrapy spiders might already be available and can be reused