In [4]:
import sys
import os
import time
import asyncio
import aiohttp
import requests
import pandas as pd
import tldextract
import ssl
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from io import BytesIO
from PIL import Image, UnidentifiedImageError
import imagehash
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import undetected_chromedriver as uc
from selenium_stealth import stealth
from concurrent.futures import ThreadPoolExecutor
import nest_asyncio
from time import sleep
from urllib.parse import urlparse

# Module-level setup. Statement order matters: the event-loop policy is set
# before nest_asyncio patches asyncio, and both happen before any asyncio
# primitive (the semaphore below) is created.

# On Windows, force the Selector Event Loop Policy to avoid socket errors.
if sys.platform == 'win32':
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

# Apply nest_asyncio immediately for Jupyter compatibility
# (presumably so coroutines can be awaited while the notebook's own loop is running).
nest_asyncio.apply()

# Load the Parquet file containing the domains to process.
# Only the 'domain' column is used; it is flattened to a plain list that
# run_async_tasks slices into batches.
df = pd.read_parquet("logos.snappy.parquet", engine="fastparquet")
domains = df['domain'].tolist()

# Ensure the output directory exists; download_logo writes "logos/<domain>.png".
os.makedirs("logos", exist_ok=True)

# Concurrency cap. The semaphore is acquired by extract_logo_html, so at most
# CONCURRENT_REQUESTS HTML fetches are in flight at once.
CONCURRENT_REQUESTS = 20  # Lowered from 50
semaphore = asyncio.Semaphore(CONCURRENT_REQUESTS)

# ---- DEDUPLICATION SETUP ----
# Maps domain -> image hash of that domain's accepted logo.
# Read and written by is_duplicate_logo; lives only for the kernel session.
saved_logo_hashes = {}

def is_duplicate_logo(new_logo_path, domain, threshold=5):
    """Report whether the logo at ``new_logo_path`` near-duplicates a stored logo.

    The image's average hash is compared against every hash already recorded
    in ``saved_logo_hashes`` for a *different* domain. If no near-duplicate is
    found, the new hash is recorded under ``domain``.

    Args:
        new_logo_path: Path of the image file to hash.
        domain: Domain the logo belongs to (key into ``saved_logo_hashes``).
        threshold: Hashes whose difference is strictly below this value are
            treated as the same logo.

    Returns:
        True if a similar logo from another domain is already stored;
        False otherwise, including when the image cannot be hashed.
    """
    try:
        # The context manager releases the file handle as soon as hashing is
        # done, so the caller can delete the file without hitting an open handle.
        with Image.open(new_logo_path) as logo_image:
            new_hash = imagehash.average_hash(logo_image)
    except Exception as e:
        print(f"Error computing hash for {domain}: {e}")
        return False
    for saved_domain, saved_hash in saved_logo_hashes.items():
        if saved_domain != domain and (new_hash - saved_hash) < threshold:
            print(f"Duplicate detected: {domain} logo is similar to logo from {saved_domain}")
            return True
    # Not a duplicate: remember this hash for later comparisons.
    saved_logo_hashes[domain] = new_hash
    return False

# ---- ASYNC HTML EXTRACTION ----
async def extract_logo_html(session, domain):
    """Find a logo image URL in the domain's landing-page HTML.

    The page is requested over HTTPS first, then HTTP. The first ``<img>``
    whose ``alt`` or ``src`` contains "logo" (case-insensitive) and that has a
    usable, non-``data:`` ``src`` wins.

    Args:
        session: aiohttp client session used for the requests.
        domain: Bare host name, without scheme.

    Returns:
        Absolute URL of the matching image, or None if neither protocol
        yields one.
    """
    async with semaphore:
        for protocol in ["https", "http"]:
            url = f"{protocol}://{domain}"
            try:
                async with session.get(url, timeout=10, ssl=False) as response:
                    if response.status != 200:
                        continue
                    html = await response.text()
                    soup = BeautifulSoup(html, "html.parser")
                    for img in soup.find_all("img"):
                        src = (img.get("src") or "").strip()
                        # Without a src there is nothing to download; urljoin(url, None)
                        # would hand back the page URL itself. Inline data: URIs are
                        # rejected by download_logo, so keep looking for a real URL.
                        if not src or src.lower().startswith("data:"):
                            continue
                        alt = (img.get("alt") or "").lower()
                        if "logo" in alt or "logo" in src.lower():
                            return urljoin(url, src)
            except Exception as e:
                print(f"⚠️ {protocol.upper()} failed for {domain}: {e}")
        print(f"❌ HTML extraction failed for {domain}")
        return None

# ---- ASYNC FAVICON RETRIEVAL ----
async def get_favicon(session, domain, max_retries=10, timeout=20):
    """Probe ``https://<registered domain>/favicon.ico`` and return its URL if it exists.

    Transient failures (exceptions, 5xx, 408, 429) are retried up to
    ``max_retries`` times with a one-second pause between attempts. Any other
    4xx answer is treated as definitive and ends the probe at once.

    Args:
        session: aiohttp client session used for the requests.
        domain: Host name; reduced to its registered domain via tldextract.
        max_retries: Maximum number of attempts.
        timeout: Per-request timeout passed to ``session.get``.

    Returns:
        The favicon URL when the server answers 200, otherwise None.
    """
    # Fall back to the raw domain when tldextract finds no registered domain
    # (otherwise the URL would be "https:///favicon.ico").
    root_domain = tldextract.extract(domain).registered_domain or domain
    favicon_url = f"https://{root_domain}/favicon.ico"
    for attempt in range(max_retries):
        try:
            async with session.get(favicon_url, timeout=timeout) as response:
                status = response.status
            if status == 200:
                return favicon_url
            # A client error such as 404 will not change on retry; give up now
            # instead of burning max_retries sleeps.
            if 400 <= status < 500 and status not in (408, 429):
                print(f"❌ No favicon for {domain} (HTTP {status}).")
                return None
        except Exception as e:
            print(f"⚠️ Favicon fetch error for {domain} (attempt {attempt+1}/{max_retries}): {e}")
        # Pause only when another attempt will follow.
        if attempt + 1 < max_retries:
            await asyncio.sleep(1)
    print(f"❌ Failed to fetch favicon for {domain} after {max_retries} attempts.")
    return None

# ---- SELENIUM RETRIEVAL (WRAPPED IN EXECUTOR) ----
# A single WebDriver instance is shared by every task. Driving it from several
# executor threads at once lets one task's navigation replace another's page,
# so all browser use is serialized through this lock.
_selenium_lock = asyncio.Lock()


async def get_logo_selenium_async(loop, driver, domain, max_retries=5):
    """Look for a logo image with a real browser, retrying on failure.

    The blocking Selenium work runs in the loop's default executor. Calls are
    serialized because the same ``driver`` is shared between tasks.

    Args:
        loop: Event loop whose default executor runs the Selenium call.
        driver: Selenium WebDriver shared by all tasks.
        domain: Bare host name, without scheme.
        max_retries: Maximum number of page loads to attempt.

    Returns:
        The ``src`` of the first ``<img>`` whose src or alt contains "logo"
        (case-insensitive), or None if no attempt finds one.
    """
    def get_logo_selenium(driver, domain):
        try:
            url = f"https://{domain}"
            driver.get(url)
            time.sleep(5)  # Blocking sleep – allow images to load.
            logo_elements = driver.find_elements(By.TAG_NAME, "img")
            for img in logo_elements:
                logo_url = img.get_attribute("src")
                alt_text = img.get_attribute("alt")
                # Inline data: URIs are rejected by download_logo; skip them so
                # a later <img> with a real URL can still match.
                if not logo_url or logo_url.lower().startswith("data:"):
                    continue
                if "logo" in logo_url.lower() or (alt_text and "logo" in alt_text.lower()):
                    return logo_url
        except Exception as e:
            print(f"❌ Selenium failed for {domain}: {e}")
        return None

    for attempt in range(max_retries):
        # Hold the lock only for the browser call, not for the back-off sleep.
        async with _selenium_lock:
            result = await loop.run_in_executor(None, get_logo_selenium, driver, domain)
        if result:
            return result
        print(f"⚠️ Selenium retry {attempt+1}/{max_retries} for {domain}")
        # Back off only when another attempt will follow.
        if attempt + 1 < max_retries:
            await asyncio.sleep(2)
    return None

# ---- DOWNLOAD LOGO ----
def download_logo(url, domain):
    """Download ``url`` to ``logos/<domain>.png`` unless it is already there.

    After saving, the file is checked with ``is_duplicate_logo`` and removed
    again if it matches another domain's logo. Whenever this function returns
    False it leaves no file at the target path, so a failed attempt is never
    reported as "already exists" on a later run.

    Args:
        url: Image URL; ``data:image`` URLs are skipped.
        domain: Domain the logo belongs to; used for the file name.

    Returns:
        True if the logo file exists after the call (freshly saved or already
        present), False otherwise.
    """
    if url.startswith("data:image"):
        print(f"⚠️ Skipping data URL for {domain}.")
        return False
    logo_path = f"logos/{domain}.png"
    if os.path.exists(logo_path):
        print(f"🔄 Logo for {domain} already exists. Skipping download.")
        return True
    try:
        # The whole body is needed in memory for PIL, so streaming buys nothing.
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            print(f"❌ Error saving logo for {domain}: HTTP {response.status_code}")
            return False
        try:
            with Image.open(BytesIO(response.content)) as img:
                img.save(logo_path)
        except UnidentifiedImageError:
            print(f"❌ Error saving logo for {domain}: cannot identify image file")
            return False
        # Check for duplicates using perceptual hash.
        if is_duplicate_logo(logo_path, domain):
            print(f"❌ Duplicate logo detected for {domain}. Removing file.")
            os.remove(logo_path)
            return False
        print(f"✅ Logo saved for {domain}")
        return True
    except Exception as e:
        print(f"❌ Error saving logo for {domain}: {e}")
        # The file did not exist when we started, so anything at logo_path now
        # is a partial or unverified write from this call; remove it.
        if os.path.exists(logo_path):
            try:
                os.remove(logo_path)
            except OSError as cleanup_error:
                print(f"⚠️ Could not remove incomplete logo for {domain}: {cleanup_error}")
    return False

# ---- MAIN PIPELINE FUNCTION ----
# download_logo is blocking (requests + PIL) and updates the shared hash store,
# so it runs in a worker thread, one download at a time.
_download_lock = asyncio.Lock()


async def process_domain(session, loop, driver, domain):
    """Resolve a logo URL for ``domain`` and download it.

    Sources are tried in order: landing-page HTML, ``/favicon.ico``, then a
    Selenium-rendered page. The first source that yields a URL is downloaded.

    Args:
        session: aiohttp client session for the HTML and favicon lookups.
        loop: Event loop whose default executor runs the blocking work.
        driver: Selenium WebDriver shared by all tasks.
        domain: Bare host name, without scheme.

    Returns:
        The result of ``download_logo`` when a URL was found, else False.
    """
    # First, try HTML extraction.
    logo_url = await extract_logo_html(session, domain)
    # If that fails, try retrieving the favicon.
    if not logo_url:
        logo_url = await get_favicon(session, domain)
    # If favicon retrieval fails, try Selenium.
    if not logo_url:
        logo_url = await get_logo_selenium_async(loop, driver, domain)
    if not logo_url:
        print(f"❌ No logo found for {domain}")
        return False
    # Calling download_logo directly would freeze the event loop for the whole
    # HTTP round trip and stall every other task's timeouts. Offload it, but
    # keep downloads sequential so the dedup dictionary is only ever touched by
    # one thread at a time.
    async with _download_lock:
        return await loop.run_in_executor(None, download_logo, logo_url, domain)

# ---- RUN ASYNC TASKS ----
async def run_async_tasks(batch_size=50):  # Reduced batch size from 100 to 50
    """Run logo extraction for every entry in ``domains``, batch by batch.

    One headless Chrome instance is started up front and shared by all tasks.
    It is always shut down, even when the run fails or is cancelled. Each batch
    gets its own aiohttp session.

    Args:
        batch_size: Number of domains processed concurrently per batch.

    Returns:
        A list with one entry per processed domain, in order: the value
        returned by ``process_domain``, or the exception object it raised
        (``gather`` is called with ``return_exceptions=True``).
    """
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--window-size=1920,1080")

    driver = uc.Chrome(options=options, headless=True)
    try:
        stealth(driver, languages=["en-US", "en"], vendor="Google Inc.", platform="Win32",
                webgl_vendor="Intel Inc.", renderer="Intel Iris OpenGL Engine", fix_hairline=True)
        driver.set_page_load_timeout(15)

        loop = asyncio.get_running_loop()
        total_results = []
        # Process domains in batches.
        for i in range(0, len(domains), batch_size):
            batch = domains[i:i+batch_size]
            print(f"Processing batch {i//batch_size+1} with {len(batch)} domains")
            # Use a TCPConnector with a limit to restrict the number of open sockets;
            # the limit follows the module-wide concurrency setting.
            connector = aiohttp.TCPConnector(limit=CONCURRENT_REQUESTS)
            async with aiohttp.ClientSession(connector=connector) as session:
                try:
                    tasks = [process_domain(session, loop, driver, domain) for domain in batch]
                    batch_results = await asyncio.gather(*tasks, return_exceptions=True)
                    # Exceptions come back as values; report them so failures are visible.
                    for domain, outcome in zip(batch, batch_results):
                        if isinstance(outcome, BaseException):
                            print(f"❌ Unhandled error for {domain}: {outcome!r}")
                    total_results.extend(batch_results)
                except OSError as ose:
                    print(f"OSError encountered during batch processing: {ose}")
            # Pause between batches only; nothing follows the last one.
            if i + batch_size < len(domains):
                await asyncio.sleep(2)
        return total_results
    finally:
        # Always release the browser process, including on error or cancellation.
        driver.quit()


In [5]:
results = await run_async_tasks()
print(f"{sum(results)} logos saved")



could not detect version_main.therefore, we are assuming it is chrome 108 or higher


Processing batch 1 with 50 domains
⚠️ HTTPS failed for ymcasteuben.org: Cannot connect to host ymcasteuben.org:443 ssl:False [[SSL: SSLV3_ALERT_HANDSHAKE_FAILURE] sslv3 alert handshake failure (_ssl.c:992)]
❌ HTML extraction failed for ymcasteuben.org
⚠️ Favicon fetch error for ymcasteuben.org (attempt 1/10): Cannot connect to host ymcasteuben.org:443 ssl:default [[SSL: SSLV3_ALERT_HANDSHAKE_FAILURE] sslv3 alert handshake failure (_ssl.c:992)]
🔄 Logo for freseniusmedicalcare.ca already exists. Skipping download.
⚠️ HTTPS failed for greatplacetowork.com.bo: Cannot connect to host greatplacetowork.com.bo:443 ssl:False [None]
🔄 Logo for kia-moeller-wunstorf.de already exists. Skipping download.
🔄 Logo for plameco-hannover.de already exists. Skipping download.
❌ Error saving logo for autosecuritas-ct-seysses.fr: cannot identify image file
❌ Error saving logo for synlab.com.tr: cannot identify image file
🔄 Logo for toyota-buchreiter-eisenstadt.at already exists. Skipping download.
❌ Error s

CancelledError: 