### Extraction of opinions

Imports

In [25]:
import json
import requests
from bs4 import BeautifulSoup
import os
from deep_translator import GoogleTranslator
import datetime

Utils

In [26]:
# Return the requested child element(s), or one of their attributes, from within a given ancestor.
def extract(ancestor, selector=None, attribute=None, multiple=False):
    """Read text or an attribute value from a parsed HTML element.

    Args:
        ancestor: Element to read from. It must provide ``select``,
            ``select_one``, ``text`` and item access for attributes
            (e.g. a BeautifulSoup ``Tag``).
        selector: CSS selector of the descendant(s) to read. When falsy,
            the value is read from ``ancestor`` itself.
        attribute: Name of the attribute to return. When falsy, the
            element's stripped text is returned instead.
        multiple: When true (and a selector is given), return a list with
            one value per matching descendant instead of a single value.

    Returns:
        * no selector: ``ancestor[attribute]`` (not stripped) or the stripped
          text of ``ancestor``;
        * ``multiple``: a list of stripped strings (empty if nothing matches);
        * otherwise: the stripped string for the first match, or ``None`` when
          nothing matches or the matched element lacks ``attribute``.
    """
    if not selector:
        # No selector: the value lives on the ancestor element itself.
        return ancestor[attribute] if attribute else ancestor.text.strip()

    if multiple:
        matches = ancestor.select(selector)
        if attribute:
            return [tag[attribute].strip() for tag in matches]
        return [tag.text.strip() for tag in matches]

    match = ancestor.select_one(selector)
    if match is None:
        return None
    if not attribute:
        return match.text.strip()
    try:
        return match[attribute].strip()
    except KeyError:
        # The element exists but does not carry the attribute; report it as
        # "not found" just like a missing element instead of crashing.
        return None

In [27]:
# Maps each output field to the positional arguments handed to `extract`
# after the opinion element: (selector[, attribute[, multiple]]).
selectors = {
    # Attribute read from the opinion element itself (no selector).
    "opinion_id": (None, "data-entry-id"),

    # Text of the first matching descendant.
    "author": ("span.user-post__author-name",),
    "recommendation": ("span.user-post__author-recomendation > em",),
    "score": ("span.user-post__score-count",),
    "content_pl": ("div.user-post__text",),

    # Text of every matching descendant (multiple=True).
    "pros_pl": ("div.review-feature__item--positive", None, True),
    "cons_pl": ("div.review-feature__item--negative", None, True),

    # Attribute of the first matching descendant.
    "thumbs_up": ("button.vote-yes", "data-total-vote"),
    "thumbs_down": ("button.vote-no", "data-total-vote"),
    "date_published": ("span.user-post__published > time:nth-child(1)", "datetime"),
    "date_purchased": ("span.user-post__published > time:nth-child(2)", "datetime"),
}

Translation function

In [28]:
def translate(text, source='pl', target='en'):
    """Translate ``text`` with ``GoogleTranslator``.

    Args:
        text: The value to translate.
        source: Source language code; defaults to ``'pl'``.
        target: Target language code; defaults to ``'en'``.

    Returns:
        Whatever ``GoogleTranslator(source, target).translate(text)`` returns.
    """
    translator = GoogleTranslator(source, target)
    translated = translator.translate(text)
    return translated

Obtaining the webpage

In [29]:
# Request headers are kept outside the notebook in cookie.json
# (presumably including the session cookie, judging by the file name).
with open("./cookie.json", "r", encoding="UTF-8") as json_file:
    headers = json.load(json_file)
requests_url = "https://www.ceneo.pl/"
# Strip stray whitespace so it cannot leak into the URL or the output file name.
product_id = input('Please enter a produt ID').strip() # 161123001, 163107768, 66915598
# First page of the product's reviews. The page is fetched by the extraction
# loop below, so no request is issued here (the previous extra request
# downloaded the same page twice).
url = f"{requests_url}{product_id}#tab=reviews"

Extracting the opinions

In [30]:
def _try_translate(value):
    """Translate a string or a list of strings; return None if translation fails.

    Translation is optional, so any error (including a ``None`` input) is
    deliberately turned into ``None`` rather than aborting the scrape.
    """
    try:
        if isinstance(value, list):
            return [translate(item) for item in value]
        return translate(value)
    except Exception:
        return None


# Walk through the review pages, following the "next page" link until there
# is none, and collect one dict per opinion.
all_opinions = []
while url is not None:
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code != 200:
        # `url` would otherwise never change and the same failing page
        # would be requested in an endless loop.
        print(f"Request for {url} returned status {response.status_code}; stopping.")
        break

    page_doc = BeautifulSoup(response.text, 'html.parser')
    opinions = page_doc.select("div.js_product-review:not(.user-post--highlight)")
    for opinion in opinions:
        single_opinion = {
            key: extract(opinion, *values)  # each tuple is unpacked into separate positional arguments
            for key, values in selectors.items()
        }

        # optional translation
        single_opinion['content_en'] = _try_translate(single_opinion['content_pl'])
        single_opinion['pros_en'] = _try_translate(single_opinion['pros_pl'])
        single_opinion['cons_en'] = _try_translate(single_opinion['cons_pl'])

        # Compare case-insensitively: the positive label was previously matched
        # only in lower case while the negative one was matched capitalised.
        recommendation = (single_opinion['recommendation'] or "").strip().lower()
        single_opinion['recommendation'] = {"polecam": True, "nie polecam": False}.get(recommendation)

        # Score text has the form "<value>/<max>" with a decimal comma.
        score = single_opinion['score']
        single_opinion['score'] = float(score.split('/')[0].replace(",", ".")) if score else None

        # Conversion of the dates to datetime objects is currently disabled,
        # so they stay as the raw `datetime` attribute strings.
        # single_opinion['date_published'] = datetime.datetime.fromisoformat(single_opinion['date_published'])
        # single_opinion['date_purchased'] = datetime.datetime.fromisoformat(single_opinion['date_purchased']) if single_opinion['date_purchased'] else None

        # Convert both vote counters in place (the down-vote count used to be
        # written to a misspelled 'thumbs__down' key).
        for vote_key in ('thumbs_up', 'thumbs_down'):
            votes = single_opinion[vote_key]
            single_opinion[vote_key] = int(votes) if votes else None

        all_opinions.append(single_opinion)

    try:
        url = "https://www.ceneo.pl" + page_doc.select_one("a.pagination__next")["href"]
    except (TypeError, KeyError):
        # No "next" link (or a link without href): this was the last page.
        url = None

In [31]:
# Turn date values that support ``strftime`` into formatted strings so they can
# be serialized; falsy values become None, and values without ``strftime``
# (e.g. ones that are already strings) are left untouched.
for single_opinion in all_opinions:
    for date_key in ('date_published', 'date_purchased'):
        try:
            date_value = single_opinion[date_key]
            if date_value:
                single_opinion[date_key] = date_value.strftime("%Y-%m-%d, %H:%M:%S")
            else:
                single_opinion[date_key] = None
        except AttributeError:
            pass

Generating json file

In [32]:
# Create the output directory if it is missing; exist_ok makes this a single
# call without a separate existence check that could race with another process.
os.makedirs("./opinions", exist_ok=True)
# One JSON file per product; ensure_ascii=False keeps non-ASCII characters readable.
with open(f"./opinions/{product_id}.json", "w", encoding="UTF-8") as json_file:
    json.dump(all_opinions, json_file, indent=4, ensure_ascii=False)