In [6]:
import requests
from bs4 import BeautifulSoup
import os
import time
import random
from urllib.parse import urljoin, urlparse
import logging
from fake_useragent import UserAgent
import cloudscraper
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import undetected_chromedriver as uc
import json

class AdvancedImageScraper:
    def __init__(self, download_folder: str = "Uva") -> None:
        """Create a scraper that saves images into ``download_folder``.

        Args:
            download_folder: Directory that downloaded images are written to.
                It is not created here; ``smart_scrape`` creates it on demand.
        """
        self.download_folder = download_folder
        # Source of randomized User-Agent strings (fake_useragent).
        self.ua = UserAgent()
        self.setup_logging()
        # URLs already saved by this instance; download_image skips them.
        self.downloaded_urls = set()
        
        # Initialize the HTTP clients used for scraping.
        # NOTE: setup_requests_session reads self.ua, so it must run after
        # self.ua has been assigned above.
        self.session = self.setup_requests_session()
        self.scraper = cloudscraper.create_scraper()
        
    def setup_requests_session(self):
        """Return a ``requests.Session`` preloaded with browser-like headers.

        The User-Agent value is drawn once from ``self.ua`` while the session
        is being built; the remaining headers are fixed.
        """
        http = requests.Session()
        http.headers.update({
            'User-Agent': self.ua.random,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9,es;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Cache-Control': 'max-age=0',
        })
        return http

    def setup_selenium_driver(self):
        """Start a plain Selenium Chrome driver with automation hints reduced.

        Returns:
            A running ``webdriver.Chrome`` instance. The caller owns it and
            must call ``quit()`` when done.

        Raises:
            Whatever Selenium raises if Chrome cannot be started or the
            stealth script cannot be registered (the driver is quit first in
            the latter case so no browser process is leaked).
        """
        options = Options()
        for flag in (
            '--no-sandbox',
            '--disable-dev-shm-usage',
            '--disable-blink-features=AutomationControlled',
            f'--user-agent={self.ua.random}',
            '--window-size=1920,1080',
        ):
            options.add_argument(flag)
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)

        driver = webdriver.Chrome(options=options)
        try:
            # execute_script would only patch the document that is loaded
            # right now (the blank start page), so the override would be gone
            # after the first driver.get(). Registering the script through the
            # DevTools protocol makes Chrome run it at the start of every new
            # document, before the page's own scripts.
            driver.execute_cdp_cmd(
                "Page.addScriptToEvaluateOnNewDocument",
                {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
            )
        except Exception:
            # Do not leak a browser process if setup fails half-way.
            driver.quit()
            raise
        return driver

    def setup_undetected_chrome(self):
        """Start an undetected-chromedriver Chrome session.

        Returns:
            The ``uc.Chrome`` driver; the caller is responsible for quitting it.
        """
        chrome_options = uc.ChromeOptions()
        launch_flags = (
            '--no-sandbox',
            '--disable-dev-shm-usage',
            '--window-size=1920,1080',
        )
        for flag in launch_flags:
            chrome_options.add_argument(flag)
        return uc.Chrome(options=chrome_options)

    def setup_logging(self):
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('scraping.log', encoding='utf-8'),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)

    def download_image(self, img_url, filename=None):
        """Descarga una imagen individual"""
        try:
            if img_url in self.downloaded_urls:
                return False

            if not filename:
                parsed = urlparse(img_url)
                filename = os.path.basename(parsed.path)
                if not filename or '.' not in filename:
                    filename = f"image_{int(time.time())}_{random.randint(1000,9999)}.jpg"

            filepath = os.path.join(self.download_folder, filename)
            
            # Intentar con cloudscraper primero
            response = self.scraper.get(img_url, timeout=30, stream=True)
            response.raise_for_status()

            content_type = response.headers.get('content-type', '')
            if 'image' not in content_type:
                self.logger.warning(f"No es una imagen: {img_url}")
                return False

            with open(filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

            self.downloaded_urls.add(img_url)
            self.logger.info(f"Descargada: {filename}")
            return True

        except Exception as e:
            self.logger.error(f"Error descargando {img_url}: {str(e)}")
            return False

    def scrape_istockphoto_with_scroll(self, url):
        """Scrape an iStockPhoto page, scrolling so lazily loaded images appear.

        The page is opened in an undetected Chrome session, scrolled to the
        bottom repeatedly (clicking any visible "Load more" control) until the
        document height stops growing or 50 rounds have run, and every
        ``<img>`` whose URL passes ``is_valid_image_url`` is downloaded.

        Args:
            url: Page to scrape; also used as the base for relative image URLs.

        Returns:
            Number of images downloaded, or 0 if any error occurred (the error
            is logged, not raised). The browser is always closed.
        """
        self.logger.info(f"Iniciando scraping iStockPhoto con scroll infinito: {url}")
        driver = self.setup_undetected_chrome()

        try:
            driver.get(url)
            time.sleep(5)  # wait for the initial page load

            # Scroll until the page stops growing so all images get loaded.
            self.logger.info("Iniciando scroll infinito...")
            last_height = driver.execute_script("return document.body.scrollHeight")
            scroll_attempts = 0
            max_scroll_attempts = 50  # safety cap against endless feeds

            while scroll_attempts < max_scroll_attempts:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(3)  # give newly requested images time to load

                # Clicking "Load more" is best effort: a button may go stale
                # between lookup and click. Only ordinary errors are tolerated,
                # so Ctrl+C still interrupts the run.
                try:
                    load_more_buttons = driver.find_elements(By.XPATH, "//button[contains(text(), 'Load more')] | //button[contains(text(), 'Cargar más')] | //a[contains(text(), 'Load more')]")
                    for button in load_more_buttons:
                        if button.is_displayed():
                            driver.execute_script("arguments[0].click();", button)
                            self.logger.info("Click en botón 'Load More'")
                            time.sleep(3)
                except Exception as e:
                    self.logger.debug(f"No se pudo pulsar 'Load More': {e}")

                new_height = driver.execute_script("return document.body.scrollHeight")
                self.logger.info(f"Scroll {scroll_attempts + 1}: Altura {new_height}")

                # An unchanged height means nothing new was appended.
                if new_height == last_height:
                    self.logger.info("No hay más contenido para cargar")
                    break

                last_height = new_height
                scroll_attempts += 1

            self.logger.info("Buscando imágenes después del scroll...")
            img_elements = driver.find_elements(By.TAG_NAME, "img")
            self.logger.info(f"Encontradas {len(img_elements)} elementos img")

            img_urls = []
            for img in img_elements:
                try:
                    # Lazy-loading pages keep the real URL in a data-* attribute.
                    for attr in ('src', 'data-src', 'data-original', 'data-lazy-src'):
                        img_url = img.get_attribute(attr)
                        if img_url and self.is_valid_image_url(img_url):
                            # urljoin resolves '//host/x', '/x' and 'x' against
                            # the page URL and leaves absolute URLs untouched.
                            img_urls.append(urljoin(url, img_url))
                            break
                except Exception as e:
                    # A single unreadable element must not abort the scrape.
                    self.logger.debug(f"Elemento img ignorado: {e}")
                    continue

            # De-duplicate while keeping page order so runs are reproducible.
            img_urls = list(dict.fromkeys(img_urls))
            self.logger.info(f"Encontradas {len(img_urls)} URLs de imágenes únicas")

            # Download every image found; there is no upper limit.
            downloaded = 0
            for i, img_url in enumerate(img_urls):
                self.logger.info(f"Procesando imagen {i+1}/{len(img_urls)}")
                if self.download_image(img_url):
                    downloaded += 1
                    # Random pause between successful downloads.
                    time.sleep(random.uniform(1, 3))

            self.logger.info(f"✅ iStockPhoto: {downloaded} imágenes descargadas de {len(img_urls)} encontradas")
            return downloaded

        except Exception as e:
            self.logger.error(f"Error en scraping iStockPhoto: {str(e)}")
            return 0
        finally:
            driver.quit()

    def is_valid_image_url(self, url):
        """Verifica si la URL es de una imagen válida"""
        if not url:
            return False
        
        # Filtrar URLs que no son imágenes
        invalid_patterns = [
            'logo', 'icon', 'avatar', 'spinner', 'loading',
            'placeholder', 'advertisement', 'banner', 'svg',
            'istock_global', 'googleusercontent'
        ]
        
        url_lower = url.lower()
        
        # Si contiene patrones inválidos, descartar
        for pattern in invalid_patterns:
            if pattern in url_lower:
                return False
        
        # Verificar extensiones de imagen
        valid_extensions = ['.jpg', '.jpeg', '.png', '.webp', '.gif', '.bmp']
        for ext in valid_extensions:
            if ext in url_lower:
                return True
        
        # Si no tiene extensión pero parece URL de imagen
        if any(domain in url_lower for domain in ['istockphoto', 'istock', 'gettyimages']):
            if any(keyword in url_lower for keyword in ['photo', 'image', 'picture', 'asset']):
                return True
        
        return False

    def smart_scrape(self, url):
        """Scraping inteligente que elige el mejor método"""
        self.logger.info(f"Scrapeando inteligentemente: {url}")
        
        # Crear carpeta si no existe
        if not os.path.exists(self.download_folder):
            os.makedirs(self.download_folder)
        
        # Para iStockPhoto usar método específico con scroll
        if 'istockphoto.com' in url:
            return self.scrape_istockphoto_with_scroll(url)
        else:
            # Para otros sitios, método genérico
            return self.scrape_with_selenium_infinite(url)

    def scrape_with_selenium_infinite(self, url):
        """Scrape images from an arbitrary page using infinite scrolling.

        The page is opened in an undetected Chrome session and scrolled to the
        bottom until its height stops growing or 30 rounds have run; every
        ``<img>`` whose URL passes ``is_valid_image_url`` is then downloaded.

        Args:
            url: Page to scrape; also used as the base for relative image URLs.

        Returns:
            Number of images downloaded.

        Raises:
            Any exception raised while driving the browser propagates to the
            caller; the browser is closed in all cases.
        """
        self.logger.info(f"Usando scroll infinito para: {url}")
        driver = self.setup_undetected_chrome()

        try:
            driver.get(url)
            time.sleep(5)  # wait for the initial page load

            last_height = driver.execute_script("return document.body.scrollHeight")
            scroll_attempts = 0
            max_scroll_attempts = 30  # safety cap against endless feeds

            while scroll_attempts < max_scroll_attempts:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)
                new_height = driver.execute_script("return document.body.scrollHeight")

                # An unchanged height means nothing new was appended.
                if new_height == last_height:
                    break

                last_height = new_height
                scroll_attempts += 1
                self.logger.info(f"Scroll {scroll_attempts}, altura: {new_height}")

            img_elements = driver.find_elements(By.TAG_NAME, "img")
            img_urls = []

            for img in img_elements:
                try:
                    # Lazy-loading pages keep the real URL in a data-* attribute.
                    for attr in ('src', 'data-src', 'data-original', 'data-lazy-src'):
                        img_url = img.get_attribute(attr)
                        if img_url and self.is_valid_image_url(img_url):
                            # urljoin resolves '//host/x', '/x' and 'x' against
                            # the page URL and leaves absolute URLs untouched.
                            img_urls.append(urljoin(url, img_url))
                            break
                except Exception as e:
                    # One element that went stale must not abort the scrape.
                    self.logger.debug(f"Elemento img ignorado: {e}")
                    continue

            # De-duplicate while keeping page order so runs are reproducible.
            unique_urls = list(dict.fromkeys(img_urls))

            # Download every image found; there is no upper limit.
            downloaded = 0
            for img_url in unique_urls:
                if self.download_image(img_url):
                    downloaded += 1
                    time.sleep(random.uniform(1, 3))

            self.logger.info(f"Descargadas {downloaded} imágenes")
            return downloaded

        finally:
            driver.quit()

# USO ESPECÍFICO PARA iStockPhoto
if __name__ == "__main__":
    scraper = AdvancedImageScraper()

    # iStockPhoto pages to scrape (add more entries as needed).
    istock_urls = [
        "https://www.istockphoto.com/es/search/more-like-this/474752880?assettype=image&page=2",
    ]

    last_index = len(istock_urls) - 1
    for index, url in enumerate(istock_urls):
        try:
            scraper.logger.info(f"🔄 Iniciando scraping: {url}")
            downloaded = scraper.smart_scrape(url)
            scraper.logger.info(f"✅ Completado: {downloaded} imágenes descargadas de {url}")
            # Pause between sites only; sleeping after the final URL would
            # just delay exit by 10-15 seconds.
            if index < last_index:
                time.sleep(random.uniform(10, 15))
        except Exception as e:
            # Keep going with the remaining URLs if one site fails.
            scraper.logger.error(f"❌ Error con {url}: {str(e)}")


2025-09-29 09:14:34,330 - INFO - 🔄 Iniciando scraping: https://www.istockphoto.com/es/search/more-like-this/474752880?assettype=image&page=2
2025-09-29 09:14:34,331 - INFO - Scrapeando inteligentemente: https://www.istockphoto.com/es/search/more-like-this/474752880?assettype=image&page=2
2025-09-29 09:14:34,331 - INFO - Iniciando scraping iStockPhoto con scroll infinito: https://www.istockphoto.com/es/search/more-like-this/474752880?assettype=image&page=2
2025-09-29 09:14:36,611 - INFO - patching driver executable C:\Users\MSI LAPTOP\appdata\roaming\undetected_chromedriver\undetected_chromedriver.exe
2025-09-29 09:14:46,572 - INFO - Iniciando scroll infinito...
2025-09-29 09:14:49,598 - INFO - Scroll 1: Altura 6439
2025-09-29 09:14:49,598 - INFO - No hay más contenido para cargar
2025-09-29 09:14:49,599 - INFO - Buscando imágenes después del scroll...
2025-09-29 09:14:49,608 - INFO - Encontradas 64 elementos img
2025-09-29 09:14:49,821 - INFO - Encontradas 61 URLs de imágenes únicas
20