In [2]:
!pip install scrapy

Collecting scrapy
  Downloading Scrapy-2.12.0-py2.py3-none-any.whl.metadata (5.3 kB)
Collecting Twisted>=21.7.0 (from scrapy)
  Downloading twisted-24.11.0-py3-none-any.whl.metadata (20 kB)
Collecting cssselect>=0.9.1 (from scrapy)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting itemloaders>=1.0.1 (from scrapy)
  Downloading itemloaders-1.3.2-py3-none-any.whl.metadata (3.9 kB)
Collecting parsel>=1.5.0 (from scrapy)
  Downloading parsel-1.10.0-py2.py3-none-any.whl.metadata (11 kB)
Collecting queuelib>=1.4.2 (from scrapy)
  Downloading queuelib-1.7.0-py2.py3-none-any.whl.metadata (5.7 kB)
Collecting service-identity>=18.1.0 (from scrapy)
  Downloading service_identity-24.2.0-py3-none-any.whl.metadata (5.1 kB)
Collecting w3lib>=1.17.0 (from scrapy)
  Downloading w3lib-2.3.1-py3-none-any.whl.metadata (2.3 kB)
Collecting zope.interface>=5.1.0 (from scrapy)
  Downloading zope.interface-7.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_6

In [7]:
import scrapy
from scrapy.crawler import CrawlerProcess

# Define the data model for storing Nobel Prize winners' information
class NWinnerItem(scrapy.Item):
    country = scrapy.Field()  # Country name
    name = scrapy.Field()  # Nobel laureate's name
    link_text = scrapy.Field()  # Full text from the list item

# Define the Scrapy Spider to scrape data from Wikipedia
class NWinnerSpider(scrapy.Spider):
    name = 'nwinners_list'  # Unique name for the spider
    allowed_domains = ['en.wikipedia.org']  # Allowed domain to scrape
    start_urls = [
        "https://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country"
    ]  # List of URLs to start crawling from

    def parse(self, response):
        h3s = response.xpath('//h3')  # Find all <h3> headers (country names)

        for h3 in h3s:
            country = h3.xpath('./text()').get()  # Extract the country name
            if country:
                # Find the ordered list <ol> that comes immediately after the <h3>
                winners = h3.xpath('../following-sibling::ol[1]/li')
                for w in winners:
                    text = w.xpath('.//text()').getall()  # Extract all text from the list item
                    yield NWinnerItem(
                        country=country.strip(),  # Clean country name
                        name=text[0].strip(),  # Get the first text element (winner's name)
                        link_text=' '.join(text).strip()  # Join all text elements into a single string
                    )
# Use CrawlerProcess to run Scrapy within a script (useful for Google Colab)
process = CrawlerProcess(settings={
    "FEEDS": {  # Define the output file and format
        "nobel_winners.json": {"format": "json"},
    },
})

process.crawl(NWinnerSpider)  # Start the spider
process.start()  # Run the Scrapy process


[1;30;43mSe han truncado las últimas 5000 líneas del flujo de salida.[0m
DEBUG:scrapy.core.scraper:Scraped from <200 https://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country>
{'country': 'Switzerland',
 'link_text': 'Heinrich Rohrer , Physics, 1986',
 'name': 'Heinrich Rohrer'}
2025-02-03 01:46:33 [scrapy.core.scraper] DEBUG: Scraped from <200 https://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country>
{'country': 'Switzerland',
 'link_text': 'Heinrich Rohrer , Physics, 1986',
 'name': 'Heinrich Rohrer'}
DEBUG:scrapy.core.scraper:Scraped from <200 https://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country>
{'country': 'Switzerland',
 'link_text': 'Werner Arber , Physiology or Medicine, 1978',
 'name': 'Werner Arber'}
2025-02-03 01:46:33 [scrapy.core.scraper] DEBUG: Scraped from <200 https://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country>
{'country': 'Switzerland',
 'link_text': 'Werner Arber , Physiology or Medicine, 1978',
 'name': 'Werner Arber'}
DEBUG

In [8]:
import json

with open("nobel_winners.json", "r") as file:
    data = json.load(file)

print(json.dumps(data[:5], indent=2))  # Show the first five results


[
  {
    "country": "Algeria",
    "name": "Claude Cohen-Tannoudji",
    "link_text": "Claude Cohen-Tannoudji *, Physics, 1997"
  },
  {
    "country": "Algeria",
    "name": "Albert Camus",
    "link_text": "Albert Camus *, Literature, 1957"
  },
  {
    "country": "Argentina",
    "name": "C\u00e9sar Milstein",
    "link_text": "C\u00e9sar Milstein *, Physiology or Medicine, 1984"
  },
  {
    "country": "Argentina",
    "name": "Adolfo P\u00e9rez Esquivel",
    "link_text": "Adolfo P\u00e9rez Esquivel , Peace, 1980"
  },
  {
    "country": "Argentina",
    "name": "Luis Federico Leloir",
    "link_text": "Luis Federico Leloir ,  born in France , Chemistry, 1970"
  }
]
