In [1]:
import scrapy
import json
import logging
import pandas as pd
import requests
from scrapy.crawler import CrawlerProcess
from bs4 import BeautifulSoup
from time import sleep

In [2]:
url = 'https://www.rossmann.pl/kategoria/Perfumy,8512?Page=1&PageSize=96' # link to the category listing (results page 1)

In [3]:
# Use BeautifulSoup to read the number of the last results page from the pagination links.
# The header key has to be the real HTTP name 'User-Agent': a key spelled 'USER_AGENT' is sent
# as an unrelated header, so the browser-like agent string would never reach the server.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.34 Safari/537.36'}
page = requests.get(url, headers = headers, timeout = 30) # timeout so a stalled connection cannot hang the notebook
page.raise_for_status() # stop here on an HTTP error instead of parsing an error page
soup = BeautifulSoup(page.content, 'html.parser')
last_page_link = soup.find('a', href = True, class_ = "pages__last")
if last_page_link is None:
    raise RuntimeError('Pagination link with class "pages__last" was not found; cannot determine the number of pages')
zakres_max = int(last_page_link.next_element) # the link's first child is expected to be the page number text
sleep(2) # short pause before any further requests are sent to the site

In [4]:
# tworzę listę startowych adresów
# Build the list of start URLs for the scraper, one per results page (1..zakres_max).
# Every URL is derived from the untouched `url` template, so this cell can be re-run safely;
# rewriting `url` in place left it pointing at the last page and a second run produced wrong pages.
page_prefix, page_marker, page_rest = url.partition('Page=')
if not page_marker:
    raise ValueError(f"'Page=' parameter not found in category URL: {url}")
page_suffix = page_rest.lstrip('0123456789') # drop the template's own page number, keep the remaining query
urls = [f'{page_prefix}{page_marker}{i}{page_suffix}' for i in range(1, zakres_max + 1)]

In [5]:
# Output file format: one JSON document per line (JSON Lines)
class JsonWriterPipeline:
    """Item pipeline that writes every scraped item to ``output.jl`` as one JSON line."""

    def open_spider(self, spider):
        """Open (and truncate) the output file when the spider starts."""
        self.file = open('output.jl', 'w')

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.file.close()

    def process_item(self, item, spider):
        """Write ``item`` as a single JSON line and hand it on unchanged."""
        serialized = json.dumps(dict(item))
        self.file.write(serialized + "\n")
        return item

In [6]:
class ProductsSpider(scrapy.Spider):
    """Crawl the category listing pages and scrape every product linked from them.

    ``parse`` handles one listing page and schedules a request per product link;
    ``parse_product`` turns a product page into a single item.
    """

    name = 'products'
    start_urls = urls

    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1},
        # FEEDS replaces the deprecated FEED_FORMAT / FEED_URI pair.
        # 'overwrite' keeps output.json a single valid JSON array when the crawl is
        # repeated, instead of appending a second array to the existing file.
        'FEEDS': {'output.json': {'format': 'json', 'overwrite': True}},
        'DOWNLOAD_DELAY': 3,  # delay in seconds; the whole scrape takes about 10 minutes
        'RANDOMIZE_DOWNLOAD_DELAY': True,  # randomised delay: (0.5, 1.5) * DOWNLOAD_DELAY
    }

    def parse_product(self, response):
        """Yield one item (name, EAN, price, promo price, URL) for a product page."""

        xpath_name = '//h1[@class="h1"]/text()'  # path to the product name
        xpath_EAN = "//*[contains(text(), 'Kod EAN')]/following-sibling::text()[1]"  # path to the EAN
        xpath_price = '//meta[@property="product:price:amount"]/@content'  # path to the price
        xpath_price_promo = '//meta[@property="product:sale_price:amount"]/@content'  # path to the promo price

        # The name may be split across several text nodes, so all of them are joined.
        name = ''.join(response.xpath(xpath_name).getall())
        # .get() returns None when the node is absent (e.g. no promotional price).
        EAN = response.xpath(xpath_EAN).get()
        price = response.xpath(xpath_price).get()
        price_promo = response.xpath(xpath_price_promo).get()

        yield {
                'Name': name,
                'EAN': EAN,
                'cena': price,
                'cena promo': price_promo,
                'url': response.url
                }

    def parse(self, response):
        """Schedule a ``parse_product`` request for every product link on a listing page."""

        xpath_url = '//a[@class = "tile-product__name"]/@href'  # path to each product link on the page

        # .getall() is the current name for .extract() and matches parse_product above.
        for url in response.xpath(xpath_url).getall():
            href = response.urljoin(url)  # product links may be relative to the listing page
            yield scrapy.Request(href, callback=self.parse_product)

In [7]:
# Create the crawler process with a browser-like user agent,
# register the products spider and start the crawl.
process = CrawlerProcess(
    settings={
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.34 Safari/537.36',
    }
)
process.crawl(ProductsSpider)
process.start()

2023-02-03 01:38:03 [scrapy.utils.log] INFO: Scrapy 2.7.1 started (bot: scrapybot)
2023-02-03 01:38:03 [scrapy.utils.log] INFO: Versions: lxml 4.9.2.0, libxml2 2.9.12, cssselect 1.2.0, parsel 1.7.0, w3lib 2.1.1, Twisted 22.10.0, Python 3.9.4 (tags/v3.9.4:1f2e308, Apr  6 2021, 13:40:21) [MSC v.1928 64 bit (AMD64)], pyOpenSSL 23.0.0 (OpenSSL 3.0.7 1 Nov 2022), cryptography 39.0.0, Platform Windows-10-10.0.19041-SP0
2023-02-03 01:38:03 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEVEL': 30,
 'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
               '(KHTML, like Gecko) Chrome/101.0.4951.34 Safari/537.36'}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

  exporter = cls(crawler)



In [8]:
# Load the JSON Lines file written by the pipeline; dtype=str asks for every column as text
dfjl = pd.read_json('output.jl', lines=True, dtype = str)
# Remove spaces from the EAN codes. The vectorised .str accessor passes missing values through,
# whereas calling x.replace on each element raises as soon as one value is not a string.
dfjl['EAN'] = dfjl['EAN'].str.replace(' ', '', regex=False)
# Keep only products that are on promotion. A missing promo price may surface as the text
# 'None' / 'nan' (a null converted to str) or as a real NaN, so all three are treated as absent.
promo_price = dfjl['cena promo']
has_promo = promo_price.notna() & ~promo_price.isin(['None', 'nan'])
dfjl_ = dfjl[has_promo]
# Save to Excel with the header row frozen
dfjl_.to_excel('Wynik1.xlsx', sheet_name='Arkusz', index = False, freeze_panes = (1, 0))