### Biblioteki

In [8]:
import json 
import os
import requests
from bs4 import BeautifulSoup

### Funkcja do ekstrakcji zawartości strony html

In [10]:
def extract(ancestor, selector = None, attribute = None, returns_list = False):
    if selector:
        if returns_list:
            if attribute:
                return [tag[attribute].strip()
                for tag in ancestor.select(selector)]
            return [tag.get_text().strip() for tag in ancestor.select(selector)]
        if attribute:
            try:
                return ancestor.select_one(selector)[attribute].strip()
            except TypeError:
                return None
        if attribute:
            return ancestor[attribute].strip()
        return ancestor.get_text().strip()  

## Ekstracja składowych pojedynczej opinii
|składowa|Selektor|Zmienna|
|--------|--------|-------|
|id opinii|["data-entry-id"]|opinion_id
|autor |span.user-post__author-name|author
|rekomendacja |span.user-post__author-recomendation > em|recommendation
|gwiazdki |span.user-post__score-count|rating
|treść |div.user-post__text|content
|lista zalet |div.review-feature__title--positives ~ div.review-feature__item|pros
|lista wad |div.review-feature__title--negatives ~ div.review-feature__item|cons
|dla ilu przydatna |span[id^="votes-yes"]|useful
|dla ilu nieprzydatna |span[id^="votes-no"]|useless
|data wystawienia |span.user-post__published > time:nth-child(1)["datetime"]|post_date
|data zakupu |span.user-post__published > time:nth-child(2)["datetime"]|purchase_date

### Słownik reprezentujący strukturę

In [11]:
selectors = {
                "opinion_id" : (None, "data-entry-id"),
                "author": ("span.user-post__author-name",),
                "recommendation":("span.user-post__author-recomendation > em",),
                "rating": ("span.user-post__score-count",),
                "content":("div.user-post__text",),
                "pros": ("div.review-feature__title--positives ~ div.review-feature__item",  None, True),
                "cons": ("div.review-feature__title--negatives ~ div.review-feature__item", None, True),
                "useful" : ("span[id^='votes-yes']",),
              "useless": ("span[id^='votes-no']",),
              "post_date": ("span.user-post__published > time:nth-child(1)","datetime"),
              "purchase_date": ("span.user-post__published > time:nth-child(2)","datetime"),
            }

### Link do pierwszej strony z opiniami o wskazanym produkcie

In [12]:
product_id = input('Podaj kod produktu z serwisu CeneoPl')
url = f"https://www.ceneo.pl/{product_id}/opinie-1"


### Pobranie wszystkich opiniii o wskazanym produkcie

In [13]:
all_opinions = []
while(url):
    print(url)
    response = requests.get(url)
    page_dom = BeautifulSoup(response.text, "html.parser")
    opinions = page_dom.select("div.js_product-review")
    for opinion in opinions:
        single_opinion = {
            key: extract(opinion, *value)
            for key, value in selectors.items()
        }
    try: 
        url = "https://www.ceneo.pl"+extract(page_dom,"a.pagination__next","href")
    except TypeError:
        url = None


https://www.ceneo.pl/114700014/opinie-1
https://www.ceneo.pl/114700014/opinie-2
https://www.ceneo.pl/114700014/opinie-3
https://www.ceneo.pl/114700014/opinie-4
https://www.ceneo.pl/114700014/opinie-5
https://www.ceneo.pl/114700014/opinie-6
https://www.ceneo.pl/114700014/opinie-7
https://www.ceneo.pl/114700014/opinie-8
https://www.ceneo.pl/114700014/opinie-9
https://www.ceneo.pl/114700014/opinie-10
https://www.ceneo.pl/114700014/opinie-11
https://www.ceneo.pl/114700014/opinie-12
https://www.ceneo.pl/114700014/opinie-13
https://www.ceneo.pl/114700014/opinie-14
https://www.ceneo.pl/114700014/opinie-15
https://www.ceneo.pl/114700014/opinie-16
https://www.ceneo.pl/114700014/opinie-17
https://www.ceneo.pl/114700014/opinie-18
https://www.ceneo.pl/114700014/opinie-19


### Zapis opinii o wskazanym produkcie do pliku JSON

In [14]:
if not os.path.exists("opinions"):
    os.mkdir("opinions")

with open(f"opinions/{product_id}.json","w",encoding="UTF-8") as jf:
    json.dump(all_opinions, jf, indent = 4, ensure_ascii = False )
    

In [15]:
len(opinions)

9