In [1]:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from itemloaders.processors import MapCompose, TakeFirst
from scrapy.loader import ItemLoader
import pandas as pd
from openpyxl import Workbook, load_workbook
import os
import re
import logging

---

# SCRAPY ITEMS 
Using Scrapy Item as shown below. We can get the data in a class BookItem

```
class BookItem(scrapy.Item):
    link_c = scrapy.Field()
    link_x = scrapy.Field()
    title_c = scrapy.Field()
    price_c = scrapy.Field()
    title_x = scrapy.Field()
    price_x = scrapy.Field()
```

The above class is the used in the for loop of the BookToScrapeSpider class method parse as below
```
for ebook_c, ebook_x in zip(ebooks_css, ebooks_xpath):
            book_item = BookItem() # call the book item class 
            
            book_item['title_c'] = ebook_c.css("a::text").get()
            book_item['title_x'] = ebook_x.xpath('.//a[@title]/text()').get()
            
            
            price_c = ebook_c.css("p.price_color::text").get()
            book_item['price_c'] = float(price_c[1:])
            
            price_x = ebook_x.xpath('.//p[@class = "price_color"]/text()').get()
            book_item['price_x'] = float(price_x[1:])
            
            
            book_item['link_c'] = ebook_c.css("a::attr(href)").get()
            book_item['link_x'] = ebook_x.xpath('.//a[@href]/text()').get()
            
            yield book_item
```

I have changed the below to a raw NBconvert

---

---
# SCRAPY ITEM
The price has a pound sign like £10.34 and I define a function te remove it like below.
Also, i can  round the results then the Mapcompose and the TakeFirst will be very helpful



```



# This Class item is for the collection of data extracted from the class BookToScrapeSpider
class BookItem(scrapy.Item):
    # I am able to remove the pound sign from the price_x and price_c but rounded only the price_c inside the MapCompose
    
    def get_price(txt):
        return float(txt.replace('£', ''))

    def round_get_price(x):
        return round(x)
        
    # use the field method from scrapy to get the data directly from the parse method  
    link_c = scrapy.Field()
    link_x = scrapy.Field()
    title_c = scrapy.Field()
    price_c = scrapy.Field(input_processor = MapCompose(get_price, round_get_price),
                                output_processor = TakeFirst())
    title_x = scrapy.Field()
    price_x = scrapy.Field(input_processor = MapCompose(get_price),
                                output_processor = TakeFirst())

```

Then with the help of ItemLoader, I was able to use the BookItem() class.

Also, i was abble to use the loader.add_css and loader.add_xpath to get the required data passing the names instead of using a dictionary directly


```
for ebook_c, ebook_x in zip(ebooks_css, ebooks_xpath):
            loader_c = ItemLoader(item=BookItem(), selector=ebook_c) # Initialize the Itemloader with the BookItem Class
            loader_c.add_css('title_c', 'a::text')
            loader_c.add_css('price_c', 'p.price_color::text')
            loader_c.add_css('link_c', 'a::attr(href)')
            
            # Create an ItemLoader for XPath context
            loader_x = ItemLoader(item=BookItem(), selector=ebook_x) # Initialize the Itemloader with the BookItem Class
            loader_x.add_xpath('title_x', './/a[@title]/text()')
            loader_x.add_xpath('price_x', './/p[@class="price_color"]/text()')
            loader_x.add_xpath('link_x', './/a[@href]/text()')
            
            # Merge items from both loaders (you could also choose to process separately)
            item = loader_c.load_item()
            item.update(loader_x.load_item())

```


---


# SCRAPY PIPELINE

The yield statement from our parse method in the BooksToScrapeSpider is passed to the pipline.

Install !pip install openpyxl

The spider in the method helps to access name and start url in the BooksToScrapeSpider class



---
# PAGINATION

Pagination helps us to get the next page of the website.

```
 def __init__(self):
        super().__init__()
        self.page_count = 1
        self.total_pages = 4
        
    def start_requests(self):
        base_url = 'https://books.toscrape.com/catalogue/category/books/sequential-art_5'
        
        while self.page_count <= self.total_pages:
            print(f'Page Count : {self.page_count}')
            yield scrapy.Request(f"{base_url}/page-{self.page_count}.html")
                        
            self.page_count += 1
```

---

# FOLLOWING THE LINK

## CSS SELECTOR METHOD

## Xpath method

# EXTRACTING TABLES

# EXTRACTING TABLE FROM codecademy beautifulsoup course using SCRAPY

In [2]:
class BooksToScrapeSpider(scrapy.Spider):
    name = 'bookspider'

    # Website spider sends requests to
    start_urls = ['https://content.codecademy.com/courses/beautifulsoup/cacao/index.html']
    
    # This will help to save the data generated in CSV format
    custom_settings = {
        'FEEDS': {
            'cacao_table.csv': {
                'format': 'csv',
                'overwrite': True
            }
        },
        'LOG_LEVEL': 'WARNING',  # Set the logging level to WARNING
        #'LOG_FILE': 'scrapy_log.txt'  # Optional: Save log messages to a file
    }

    def parse(self, response):
        print('[ OUR RESPONSE ]')
        
        # Initialize lists to store header and data
        header = []
        data = []

        # Select all rows in the table
        rows = response.xpath('//tr')

        # Iterate over each row
        for index, row in enumerate(rows):
            # Select all <td> elements in the current row and get their text
            tds = row.xpath('td')
            row_data = [td.xpath('text()').get() for td in tds]

            # Handle header row
            if index == 2:
                header = row_data
                #print('Header:', header)

            # Collect data rows starting from index 3 onward
            elif index >= 3:
                data.append(row_data)

        # Combine header with data into rows
        if header and data:
            for row in data:
                # Yield each row as a dictionary with header as keys
                yield dict(zip(header, row))

                
                
def run_spider():
    process = CrawlerProcess(settings=get_project_settings())
    process.crawl(BooksToScrapeSpider)
    process.start()

if __name__ == '__main__':
    # Configure logging
    logging.getLogger('scrapy').setLevel(logging.WARNING)  # Show only warnings and errors
    run_spider()

2024-09-12 19:48:16 [scrapy.utils.log] INFO: Scrapy 2.11.2 started (bot: scrapybot)
2024-09-12 19:48:16 [scrapy.utils.log] INFO: Versions: lxml 5.3.0.0, libxml2 2.11.7, cssselect 1.2.0, parsel 1.9.1, w3lib 2.2.1, Twisted 24.7.0, Python 3.10.13 | packaged by Anaconda, Inc. | (main, Sep 11 2023, 13:24:38) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 24.2.1 (OpenSSL 3.3.2 3 Sep 2024), cryptography 43.0.1, Platform Windows-10-10.0.22631-SP0


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)



[ OUR RESPONSE ]


# SCRAPING JAVASCRIPT RENDERED WEBSITES