In [1]:
import pandas as pd
import numpy as np
import json
from time import sleep

In [2]:
#### part 1 list of all products

# it is not possible to download links to products on the page using BS or Scrapy. It is necessary to use Selenium.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

# Start a Chrome session. The chromedriver location is handed over through a
# Service object: passing `executable_path` straight to webdriver.Chrome is
# deprecated in Selenium 4 and no longer accepted by later 4.x releases.
d = webdriver.Chrome(service=Service(executable_path='chromedriver'))

In [3]:
# Open the perfume listing; the next cell reads its pagination from this page.
listing_url = "https://www.notino.pl/perfumy/"
d.get(listing_url)

In [4]:
# Number of the last listing page: the text of the final pagination item
# found on the currently loaded page, converted to int.
page_items = d.find_elements("xpath", "//span[@data-testid='page-item']")
max_range = int(page_items[-1].text)

In [5]:
# Collect the link of every product shown on listing pages 1..max_range.
products = [] # list of links to each product

# Links that show up inside the product containers but are not product pages.
unnecessary_links = ['https://www.notino.pl/opakowanie-prezentowe/', 'https://www.notino.pl/mobile-application/',
                     'https://www.notino.pl/wyprzedaz-perfum-kosmetykow/', 'https://www.notino.pl/weekly-promo']

for page in range(1, max_range + 1):
    # Listing pages follow the scheme ?f=<page>-1-55544, so the URL is built
    # directly from the page number instead of rewriting the previous URL.
    d.get(f'https://www.notino.pl/perfumy/?f={page}-1-55544')
    # Pause BEFORE querying the DOM: gives the page time to finish rendering
    # and avoids holding element handles across the wait, where they could go stale.
    sleep(2)
    anchors = d.find_elements("xpath", "//div[@data-testid='product-container']//a") # find links to products
    # Separate loop name so the page counter is never shadowed.
    for anchor in anchors:
        products.append(anchor.get_attribute('href')) # add links to the list

# Unique values only; drop anchors without an href (None) and the known
# non-product links so neither ends up among the spider's start URLs.
products = [link for link in set(products) if link and link not in unnecessary_links]

In [6]:
# quit() ends the whole WebDriver session (all windows plus the chromedriver
# process); close() would only close the current window and leave the driver running.
d.quit()

In [7]:
# Persist the collected links to link_list.csv so the Selenium crawl above
# does not have to be repeated on every run. Rows holding a missing link are
# dropped before writing; the row index is not written to the file.
df = pd.DataFrame(products).dropna()
df.to_csv("link_list.csv", index=False)

In [8]:
#### Part 2 - scraping each product

In [9]:
import scrapy
import logging
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor

In [10]:
class ProductsSpider(scrapy.Spider):
    """Spider that scrapes name, EAN and prices for each product variant.

    Starts from the links in ``products``. ``parse`` looks for variant links
    on each start page and hands every variant page (or the start page itself
    when no variant links are found) to ``parse_product``, which yields one
    item per page. Items are exported to ``res3.csv`` through the FEEDS setting.
    """

    name = 'products'
    start_urls = products

    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'FEEDS': {'res3.csv': {'format':'csv'}}
        ,'DOWNLOAD_DELAY': 3 
        ,'RANDOMIZE_DOWNLOAD_DELAY' : True 
    }

    def _extract_ean(self, response):
        """Return the 'gtin13' value of the first JSON-LD script that has one, else None."""
        xpath_EAN = '//script[@type="application/ld+json"]//text()'
        for raw in response.xpath(xpath_EAN).getall():
            try:
                data = json.loads(raw)
            except ValueError:
                # Not valid JSON (json.JSONDecodeError is a ValueError): try the next script.
                continue
            if isinstance(data, dict) and 'gtin13' in data:
                return data['gtin13']
        return None

    def parse_product(self, response):
        """Yield one item (Name, EAN, price, price promo, url) for a product page.

        Fields that cannot be found on the page are emitted as None instead of
        raising, so one incomplete page does not lose the rest of its data.
        """
        xpath_name = '//title/text()'
        xpath_price_promo = '//div[@id = "pd-price"]/span/@content'
        xpath_price = '//span[@data-testid = "originalPriceLineThroughWrapper"]/span/span/@content'

        title = response.xpath(xpath_name).get()
        # Drops the last 12 characters of the page title - presumably a fixed
        # site-name suffix; TODO confirm against a live page.
        name = title[:-12] if title is not None else None

        EAN = self._extract_ean(response)
        if EAN is None:
            self.logger.warning('No gtin13 found in JSON-LD data on %s', response.url)

        price_promo = response.xpath(xpath_price_promo).get()
        price = response.xpath(xpath_price).get()

        yield {
                'Name': name,
                'EAN': EAN,
                'price': price,
                'price promo': price_promo,
                'url': response.url
                }

    def parse(self, response):
        """Dispatch a start page: scrape it directly or follow its variant links."""
        # product can have many variants
        xpath_variant = '//div[@id = "pdVariantsTile"]//@href'

        # Evaluate the selector once and reuse the result.
        variant_hrefs = response.xpath(xpath_variant).getall()

        if not variant_hrefs:
            # Only one variant: the response for this URL is already in hand,
            # so parse it directly instead of downloading the same page again.
            yield from self.parse_product(response)
            return

        # Several variants: request every variant page and scrape each of them.
        for href in variant_hrefs:
            yield scrapy.Request(response.urljoin(href), self.parse_product)

In [1]:
# Settings handed to the crawler process: only the User-Agent string is set here.
crawler_settings = {
    'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.34 Safari/537.36',
}
process = CrawlerProcess(settings=crawler_settings)

# Register the spider with the process, then start the crawl.
process.crawl(ProductsSpider)
process.start()