In [None]:
import os
import re
import time
import json
import logging
import requests
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from dotenv import load_dotenv
from dateutil import parser

def resolve_env_vars(value, env_dict):
    """Expand ``${NAME}`` placeholders in *value* using *env_dict*.

    Each placeholder whose name is a key of *env_dict* is replaced by the
    corresponding value; unknown placeholders are left untouched. Falsy
    input (``None``, empty string) is returned unchanged.

    Substitutions are applied one after another in order of appearance, so
    text inserted by an earlier substitution can be affected by a later one.
    """
    if value:
        for name in re.findall(r"\$\{([^}]+)\}", value):
            if name not in env_dict:
                continue
            value = value.replace("${" + name + "}", env_dict[name])
    return value

# Call load_dotenv() first, then take a snapshot of the process environment
# that all later lookups read from.
load_dotenv()
env_vars = dict(os.environ)

# Default base directory: the folder containing this file. `__file__` is not
# defined in interactive / notebook sessions, so fall back to the current
# working directory there instead of failing with NameError.
try:
    _default_base_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    _default_base_dir = os.getcwd()

# BASE_DIR may itself contain ${VAR} placeholders, hence the extra resolution.
base_dir = env_vars.get("BASE_DIR", _default_base_dir)
base_dir = resolve_env_vars(base_dir, env_vars)

# Output directory for scraped data; defaults to <base_dir>/data/trustpilot.
data_raw_trustpilot = resolve_env_vars(env_vars.get("DATA_RAW_TRUSTPILOT"), env_vars)
if not data_raw_trustpilot:
    data_raw_trustpilot = os.path.join(base_dir, "data", "trustpilot")

# Directory for log files; defaults to <base_dir>/log.
log_dir = resolve_env_vars(env_vars.get("LOG_DIR"), env_vars)
if not log_dir:
    log_dir = os.path.join(base_dir, "log")

os.makedirs(data_raw_trustpilot, exist_ok=True)
os.makedirs(log_dir, exist_ok=True)

# One timestamped log file per run; messages go to both the file and the console.
log_file = os.path.join(log_dir, f"scraping_trustpilot_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[logging.FileHandler(log_file, encoding="utf-8"), logging.StreamHandler()]
)

class TrustpilotScraper:
    def __init__(self, domain, max_pages=30):
        """Set up a scraper for *domain*.

        The domain is lower-cased and stripped of surrounding whitespace. A
        trailing dot followed by two or more lowercase letters is removed to
        obtain the name of the per-domain output directory under
        ``data_raw_trustpilot``; that directory is created if missing.

        Args:
            domain: Domain string identifying the site to scrape.
            max_pages: Stored as ``self.max_pages`` (default 30).
        """
        cleaned = domain.lower().strip()
        short_name = re.sub(r"\.[a-z]{2,}$", "", cleaned)
        target_dir = os.path.join(data_raw_trustpilot, short_name)

        self.original_domain = cleaned
        self.domain = short_name
        self.domain_dir = target_dir
        os.makedirs(target_dir, exist_ok=True)

        self.max_pages = max_pages
        self.ua = UserAgent()
        self.session = self._init_session()
        # File in which the number of the last saved page is kept.
        self.last_page_path = os.path.join(target_dir, "derniere_page.txt")
        self.info_data = None
        self.last_successful_page = 0

    def _init_session(self):
        """Create a ``requests.Session`` with a retrying adapter on ``https://``.

        The adapter is configured with a ``Retry`` policy of 5 total retries,
        a backoff factor of 1 and a status force-list of 500, 502, 503 and 504.
        Only the ``https://`` prefix is mounted.
        """
        retry_policy = Retry(
            total=5,
            backoff_factor=1,
            status_forcelist=[500, 502, 503, 504],
        )
        http_session = requests.Session()
        http_session.mount('https://', HTTPAdapter(max_retries=retry_policy))
        return http_session

    def _headers(self):
        return {
            "User-Agent": self.ua.random,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7",
            "Referer": "https://www.google.com/"
        }

    def _load_last_page(self):
        if os.path.exists(self.last_page_path):
            try:
                page = int(open(self.last_page_path, "r", encoding="utf-8").read().strip())
                logging.info(f"Reprise à partir de la page {page + 1}")
                return page + 1
            except Exception as e:
                logging.warning(f"Erreur lecture {self.last_page_path} : {e}")
        return 1

    def _save_last_page(self, page):
        try:
            with open(self.last_page_path, "w", encoding="utf-8") as f:
                f.write(str(page))
            self.last_successful_page = page
        except Exception as e:
            logging.error(f"Erreur sauvegarde {self.last_page_path} : {e}")

    def _extract_json_ld(self, soup):