In [1]:
import scrapy 

In [2]:
class Spider12(scrapy.Spider):
    """Spider that collects articles from several pagina12.com.ar sections.

    Each section listing page is handled by ``parse``, which follows the
    featured article, every article in the listing, and the link to the next
    listing page. Each article page is handled by ``parse_nota``, which yields
    one item (a dict) per article.
    """

    name = 'spider12'
    # Domains the spider is allowed to crawl.
    allowed_domains = ['pagina12.com.ar']
    # Output file/format and crawl depth cap.
    # NOTE(review): FEED_FORMAT / FEED_URI are deprecated in favour of the
    # FEEDS setting in recent Scrapy releases; kept as-is to preserve the
    # current output behaviour -- consider migrating.
    custom_settings = {'FEED_FORMAT':'json',
                       'FEED_URI': 'resultados.json',
                       'DEPTH_LIMIT': 2}

    # Section listing pages used as crawl seeds. The attribute must be named
    # ``start_urls`` (not ``starts_urls``) for Scrapy to pick it up.
    start_urls = ['https://www.pagina12.com.ar/secciones/el-pais',
                  'https://www.pagina12.com.ar/secciones/economia',
                  'https://www.pagina12.com.ar/secciones/sociedad',
                  'https://www.pagina12.com.ar/suplementos/cultura-y-espectaculos',
                  'https://www.pagina12.com.ar/secciones/el-mundo',
                  'https://www.pagina12.com.ar/secciones/deportes',
                  'https://www.pagina12.com.ar/secciones/contratapa',
                  'https://www.pagina12.com.ar/secciones/audiovisuales']

    def parse(self, response):
        """Handle a section listing page.

        Yields requests for the featured article (if any), for every article
        link in the listing, and for the next listing page (if any).
        """
        # Featured article at the top of the listing.
        nota_promocionada = response.xpath('//div[@class="featured-article__container"]/h2/a/@href').get()
        if nota_promocionada is not None:
            yield response.follow(nota_promocionada, callback=self.parse_nota)

        # Regular article list.
        notas = response.xpath('//ul[@class="article-list"]//li//a/@href').getall()
        for nota in notas:
            yield response.follow(nota, callback=self.parse_nota)

        # Link to the next listing page. ``.get()`` is required: without it
        # the value is a SelectorList, which is never None and cannot be
        # passed to ``response.follow``.
        next_page = response.xpath('//a[@class="pagination-btn-next"]/@href').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

    def parse_nota(self, response):
        """Handle an article page and yield a single item dict.

        Fields missing from the page are emitted as ``None`` (``content`` is
        an empty list when no text nodes match).
        """
        date = response.xpath('//div[@class="time"]/span/@datetime').get()
        prefix = response.xpath('//h2[@class="article-prefix"]/text()').get()
        title = response.xpath('//h1[@class="article-titles"]/text()').get()
        summary = response.xpath('//div[@class="article-summary"]/text()').get()
        content = response.xpath('//div[@class="article-text"]/text()').getall()
        # Keep the last matching image source; articles without a main image
        # yield None instead of raising IndexError and losing the whole item.
        images = response.xpath('//div[@class="article-main-media-image__container"]/img/@src').getall()
        image = images[-1] if images else None
        yield {'url': response.url,
               'date': date,
               'prefix': prefix,
               'title': title,
               'summary': summary,
               'content': content,
               'image': image}


In [3]:
from scrapy.crawler import CrawlerProcess

In [4]:
# Create a crawler process, schedule Spider12 on it and start the crawl.
# NOTE(review): the Twisted reactor generally cannot be restarted inside the
# same kernel, so re-running this cell likely requires a kernel restart -- confirm.
process = CrawlerProcess()
process.crawl(Spider12)
process.start()

2023-07-21 22:17:11 [scrapy.utils.log] INFO: Scrapy 2.9.0 started (bot: scrapybot)
2023-07-21 22:17:11 [scrapy.utils.log] INFO: Versions: lxml 4.9.3.0, libxml2 2.10.3, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.1, Twisted 22.10.0, Python 3.10.8 | packaged by conda-forge | (main, Nov 22 2022, 08:25:13) [Clang 14.0.6 ], pyOpenSSL 23.0.0 (OpenSSL 1.1.1t  7 Feb 2023), cryptography 39.0.1, Platform macOS-13.4.1-arm64-arm-64bit
2023-07-21 22:17:11 [scrapy.crawler] INFO: Overridden settings:
{'DEPTH_LIMIT': 2}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2023-07-21 22:17:11 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2023-07-21 22:17:11 [scrapy.extensions.telnet] INFO: Telnet Password: 05773beca4a0ddbd
  exporter = cls(crawler)

2023-07-21 22:17:11 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.exte