In [1]:
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin, urlparse
import tldextract
import concurrent.futures

# Patterns for email, phone, and social links.
#
# EMAIL_PATTERN: local part, "@", then a dot-separated domain. Every dot in
# the domain must be followed by at least one label character, so the full
# stop that ends a sentence ("write to info@example.com.") is not captured.
# The group is non-capturing so findall() keeps returning whole-match strings.
EMAIL_PATTERN = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+')
# PHONE_PATTERN: an optional "("/"+" prefix with 1-3 digits, then three groups
# of 2-4 digits, each optionally separated by whitespace, "." or "-".
# NOTE(review): permissive -- other digit runs of this shape (e.g. ISO dates
# such as 2023-10-12) match as well, so expect false positives.
PHONE_PATTERN = re.compile(r'\(?\+?\d{1,3}\)?[\s.-]?\(?\d{2,4}\)?[\s.-]?\d{2,4}[\s.-]?\d{2,4}')
# Domains whose links are collected as social-media links.
SOCIAL_DOMAINS = ["facebook.com", "twitter.com", "instagram.com", "linkedin.com", "youtube.com", "t.me"]

# Keywords looked for in same-site hrefs to pick sub-pages worth scraping.
RELEVANT_SUBPAGES = ['contact', 'about', 'support', 'help', 'info', 'customer-service', 'faq']
# Request headers sent with every page download (browser-like User-Agent).
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

def get_domain(url):
    """Return the "<domain>.<suffix>" part of *url*, without any subdomain.

    The split into domain and public suffix is delegated to ``tldextract``.
    """
    parts = tldextract.extract(url)
    return f"{parts.domain}.{parts.suffix}"

def is_same_domain(url, base_domain):
    """Return True when the domain extracted from *url* equals *base_domain*.

    *base_domain* is expected in the same "<domain>.<suffix>" form that
    ``get_domain`` produces.
    """
    return get_domain(url) == base_domain

def fetch_page(url):
    """Download *url* and return the response body as text.

    Any failure (connection problem, timeout after 10 seconds, HTTP error
    status, ...) is reported on stdout and turned into a ``None`` result so
    that callers can simply skip the page.
    """
    try:
        response = requests.get(url, timeout=10, headers=HEADERS)
        response.raise_for_status()
        body = response.text
    except Exception as e:
        # Best-effort scraping: report the problem and carry on.
        print(f"  [!] Failed to fetch {url}: {e}")
        body = None
    return body

def extract_info_from_text(text):
    """Find every email address and phone number in *text*.

    Returns:
        A ``(emails, phones)`` tuple of two sets holding the distinct matches
        of EMAIL_PATTERN and PHONE_PATTERN respectively.
    """
    return set(EMAIL_PATTERN.findall(text)), set(PHONE_PATTERN.findall(text))

def _link_host(href):
    """Return the lower-cased host named by *href*, or '' when it has none."""
    candidate = href.strip()
    try:
        parsed = urlparse(candidate)
        if not parsed.scheme and not parsed.netloc:
            # Scheme-less links such as "facebook.com/page": re-parse so the
            # leading segment is read as the host instead of as a path.
            parsed = urlparse('//' + candidate)
        # .hostname is already lower-cased and stripped of port/credentials.
        return parsed.hostname or ''
    except ValueError:
        # Malformed URL (e.g. a broken IPv6 literal): no usable host.
        return ''


def extract_social_links(soup):
    """Collect the hrefs of all anchors that point at a known social network.

    A link counts as social when its *host* is one of SOCIAL_DOMAINS or a
    subdomain of one (``www.facebook.com``, ``m.youtube.com``). Matching on
    the host instead of on the raw href avoids false positives such as
    ``https://about.me/...`` (contains "t.me") or
    ``mailto:someone@facebook.com``.

    Args:
        soup: Parsed document exposing ``find_all`` (a BeautifulSoup object).

    Returns:
        set: The matching href values exactly as they appear in the page.
    """
    social_links = set()
    for a_tag in soup.find_all("a", href=True):
        href = a_tag['href']
        host = _link_host(href)
        if not host:
            continue
        if any(host == domain or host.endswith('.' + domain) for domain in SOCIAL_DOMAINS):
            social_links.add(href)
    return social_links

def find_relevant_links(soup, base_url, base_domain):
    """Return same-site links whose href mentions a relevant sub-page keyword.

    Args:
        soup: Parsed document exposing ``find_all`` (a BeautifulSoup object).
        base_url: URL of the page *soup* was parsed from; relative hrefs are
            resolved against it.
        base_domain: "<domain>.<suffix>" of the site being crawled; links to
            other domains are ignored.

    Returns:
        list: Unique absolute http(s) URLs, in the order they first appear
        in the document.
    """
    links = {}  # dict used as an insertion-ordered set
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href'].strip()
        try:
            # Resolve the href as written: URL paths can be case-sensitive,
            # so only the keyword comparison below is case-insensitive.
            full_url = urljoin(base_url, href)
            scheme = urlparse(full_url).scheme
        except ValueError:
            # Malformed href; nothing sensible to fetch.
            continue
        if scheme not in ('http', 'https'):
            # mailto:, tel:, javascript: ... are not fetchable pages.
            continue
        if not is_same_domain(full_url, base_domain):
            continue
        lowered = href.lower()
        if any(keyword in lowered for keyword in RELEVANT_SUBPAGES):
            links[full_url] = None
    return list(links)

def scrape_info_from_url(url):
    """Download one page and pull emails, phone numbers and social links from it.

    Progress is printed to stdout. When the page cannot be fetched, three
    empty sets are returned.

    Returns:
        A ``(emails, phones, socials)`` tuple of sets.
    """
    print(f"[*] Scraping: {url}")
    page_source = fetch_page(url)
    if page_source:
        document = BeautifulSoup(page_source, 'html.parser')
        # Emails and phones come from the visible text, socials from anchors.
        found_emails, found_phones = extract_info_from_text(document.get_text(separator=' '))
        return found_emails, found_phones, extract_social_links(document)
    return set(), set(), set()

def extract_all_info_from_website(url):
    """Gather contact details for a site, falling back to its sub-pages.

    The homepage is downloaded and parsed once. If it yields any email,
    phone number or social link, that result is returned immediately.
    Otherwise the same parsed document is searched for relevant same-site
    links (contact, about, ...) and those pages are scraped concurrently.

    Args:
        url: Absolute URL of the site's homepage.

    Returns:
        A ``(emails, phones, socials)`` tuple of sets; all three are empty
        when the homepage cannot be fetched or nothing is found.
    """
    print(f"[*] Scraping: {url}")
    homepage_html = fetch_page(url)
    if not homepage_html:
        return set(), set(), set()

    # Parse once and reuse the tree for both extraction and link discovery,
    # instead of issuing a second request for the same page.
    soup = BeautifulSoup(homepage_html, 'html.parser')
    emails, phones = extract_info_from_text(soup.get_text(separator=' '))
    socials = extract_social_links(soup)
    if emails or phones or socials:
        return emails, phones, socials

    relevant_links = find_relevant_links(soup, url, get_domain(url))
    all_emails, all_phones, all_socials = set(), set(), set()
    if not relevant_links:
        # Nothing to crawl; avoid spinning up a thread pool for no work.
        return all_emails, all_phones, all_socials

    # Sub-page downloads are I/O-bound, so a small thread pool overlaps them.
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(scrape_info_from_url, link) for link in relevant_links]
        for future in concurrent.futures.as_completed(futures):
            e, p, s = future.result()
            all_emails.update(e)
            all_phones.update(p)
            all_socials.update(s)

    return all_emails, all_phones, all_socials

def extract_emails_from_links(input_csv, output_csv):
    """Scrape every site listed in *input_csv* and write the findings to *output_csv*.

    Args:
        input_csv: Path of a CSV file with a ``website`` column.
        output_csv: Path of the CSV file to create. It gets the columns
            ``website``, ``emails``, ``phones`` and ``social_links``; multiple
            values are joined with ", " in sorted order.

    Rows whose ``website`` cell is missing or blank are skipped with a
    warning. Sites given without a scheme are assumed to be https.
    """
    df = pd.read_csv(input_csv)
    columns = ['website', 'emails', 'phones', 'social_links']
    results = []

    for raw_site in df['website']:
        # pandas reads empty cells as NaN (a float), which has no str methods.
        if pd.isna(raw_site) or not str(raw_site).strip():
            print("  [!] Skipping row with empty website")
            continue
        website = str(raw_site).strip()
        if not website.lower().startswith(('http://', 'https://')):
            website = 'https://' + website

        emails, phones, socials = extract_all_info_from_website(website)
        # Sets have no stable order; sort so repeated runs give identical files.
        results.append({
            'website': website,
            'emails': ', '.join(sorted(emails)) if emails else 'no email',
            'phones': ', '.join(sorted(phones)) if phones else 'no phone',
            'social_links': ', '.join(sorted(socials)) if socials else 'no social links'
        })

    # Explicit columns keep the header row even when no site was processed.
    df_result = pd.DataFrame(results, columns=columns)
    df_result.to_csv(output_csv, index=False)
    print(f"\n[✓] Saved results to {output_csv}")

# Script entry point: read the sites from "selected_websites.csv" and write
# the scraped contact details to "final_emails_output.csv".
if __name__ == "__main__":
    extract_emails_from_links("selected_websites.csv", "final_emails_output.csv")


[*] Scraping: https://oldjewelry.net/
[*] Scraping: https://diamondsourcenyc.com/
[*] Scraping: https://www.popular.jewelry/
[*] Scraping: https://www.veranda.com/luxury-lifestyle/luxury-fashion-jewelry/g37925974/best-jewelry-shops/
[*] Scraping: https://stores.cartier.com/united-states/ny/new-york/653-fifth-avenue
[*] Scraping: https://www.davidyurman.com/stores/new-york-city-dy57.html
[*] Scraping: https://www.instagram.com/mosesjewelry/
[*] Scraping: https://www.foreverdiamondsny.com/
[*] Scraping: https://www.martinbuschjewelers.com/
[*] Scraping: https://monaghansrvc.com/post/13-best-jewelry-stores-in-new-york-city.p2220
[*] Scraping: https://diamondjewelersonline.com/
[*] Scraping: https://www.melvinhjoyeria.com/
[*] Scraping: https://www.frassanitojewelers.com/
[*] Scraping: https://www.hanikenjewelry.com/
[*] Scraping: https://solomonsfinejewelers.com/
[*] Scraping: https://www.vrai.com/
[*] Scraping: https://stores.zales.com/ny/syracuse/1637
[*] Scraping: https://oscarstonenyc