# In [None]:
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin, urlparse
import tldextract
import concurrent.futures

# Email regex pattern (standard, but can be tuned).
# The domain is written as dot-separated labels where every '.' must be
# followed by at least one label character, so sentence punctuation right
# after an address ("write to a@b.com.") is not captured as part of it.
EMAIL_PATTERN = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+')

# Common subpage keywords to look for if no emails on homepage
RELEVANT_SUBPAGES = ['contact', 'about', 'support', 'help', 'info', 'customer-service', 'faq']

# Browser-like User-Agent header used for outgoing HTTP requests
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

def get_domain(url):
    """Return the domain and public suffix of ``url`` joined by a dot.

    The subdomain reported by ``tldextract`` is ignored, so URLs that differ
    only in subdomain yield the same value.

    Args:
        url: URL or host name to analyse.

    Returns:
        ``"<domain>.<suffix>"``; when either part is empty (e.g. no public
        suffix was recognised) only the non-empty part is returned, without a
        dangling dot.
    """
    ext = tldextract.extract(url)
    # Join only the non-empty parts so suffix-less hosts don't end in '.'.
    return '.'.join(part for part in (ext.domain, ext.suffix) if part)

def is_same_domain(url, base_domain):
    """Tell whether ``url`` resolves to ``base_domain``.

    Args:
        url: URL whose domain is computed with ``get_domain``.
        base_domain: Domain string to compare against, in the format
            produced by ``get_domain``.

    Returns:
        ``True`` when the two domains are equal, ``False`` otherwise.
    """
    return get_domain(url) == base_domain

def fetch_page(url):
    """Download ``url`` and return the response body as text.

    The request is sent with the module-level ``HEADERS`` and a 10 second
    timeout. Any failure (network error, HTTP error status, ...) is reported
    on stdout and swallowed.

    Args:
        url: Address of the page to download.

    Returns:
        The response text, or ``None`` if the page could not be fetched.
    """
    body = None
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        # Turn 4xx/5xx responses into exceptions so they are reported below.
        response.raise_for_status()
        body = response.text
    except Exception as e:
        print(f"  [!] Failed to fetch {url}: {e}")
    return body

def extract_emails_from_text(text):
    """Collect every distinct ``EMAIL_PATTERN`` match found in ``text``.

    Args:
        text: String to search.

    Returns:
        A set of the matched address strings (empty if there are none).
    """
    return set(EMAIL_PATTERN.findall(text))

def find_relevant_links(soup, base_url, base_domain):
    """Return same-domain links whose href mentions a relevant subpage keyword.

    Args:
        soup: Parsed document; must provide ``find_all('a', href=True)``.
        base_url: URL of the page ``soup`` was parsed from, used to resolve
            relative hrefs.
        base_domain: Domain (as produced by ``get_domain``) that a link must
            belong to in order to be kept.

    Returns:
        A list of absolute http(s) URLs, without duplicates, in the order the
        links first appear in the document.
    """
    # A dict is used as an insertion-ordered set so the result is deterministic.
    links = {}
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href'].strip()
        # Resolve the href with its original casing: URL paths are
        # case-sensitive, so lower-casing them can point at a missing page.
        full_url = urljoin(base_url, href)
        # Skip mailto:, tel:, javascript: and similar non-fetchable targets.
        if urlparse(full_url).scheme not in ('http', 'https'):
            continue
        if not is_same_domain(full_url, base_domain):
            continue
        # Keyword matching stays case-insensitive.
        href_lower = href.lower()
        if any(keyword in href_lower for keyword in RELEVANT_SUBPAGES):
            links[full_url] = None
    return list(links)

def scrape_emails_from_url(url):
    """Fetch a single page and return the email addresses in its text.

    A progress line is printed before the download. The HTML is parsed with
    BeautifulSoup's ``html.parser`` and only the extracted text (nodes joined
    by a space) is searched.

    Args:
        url: Page to scrape.

    Returns:
        A set of addresses; empty when the page could not be fetched or
        contains none.
    """
    print(f"[*] Scraping: {url}")
    page_source = fetch_page(url)
    if page_source:
        page_text = BeautifulSoup(page_source, 'html.parser').get_text(separator=' ')
        return extract_emails_from_text(page_text)
    return set()

def extract_emails_from_website(url):
    """Find email addresses on a website's homepage or its relevant subpages.

    The homepage is downloaded and parsed once. If its text contains any
    addresses they are returned immediately; otherwise the same parsed page
    is searched for links to relevant subpages (see ``find_relevant_links``),
    which are then scraped concurrently.

    Args:
        url: Homepage URL, including scheme.

    Returns:
        A set of addresses; empty when the homepage cannot be fetched or
        nothing is found.
    """
    base_domain = get_domain(url)

    print(f"[*] Scraping: {url}")
    homepage_html = fetch_page(url)
    if not homepage_html:
        return set()

    # Parse once and reuse the soup for both email and link extraction,
    # instead of requesting the homepage a second time.
    soup = BeautifulSoup(homepage_html, 'html.parser')
    emails = extract_emails_from_text(soup.get_text(separator=' '))
    if emails:
        return emails

    # No emails on homepage, try relevant subpages (the homepage itself has
    # already been searched, so it is not fetched again).
    relevant_links = [
        link for link in find_relevant_links(soup, url, base_domain)
        if link != url
    ]
    if not relevant_links:
        return set()

    all_emails = set()
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        futures = {
            executor.submit(scrape_emails_from_url, link): link
            for link in relevant_links
        }
        for future in concurrent.futures.as_completed(futures):
            try:
                all_emails.update(future.result())
            except Exception as e:
                # One broken subpage must not discard the other results.
                print(f"  [!] Failed to scrape {futures[future]}: {e}")

    return all_emails

def extract_emails_from_links(input_csv, output_csv):
    """Scrape every website listed in a CSV and write the emails found.

    Args:
        input_csv: Path of a CSV file with a ``website`` column.
        output_csv: Path of the CSV file to write. It has the columns
            ``website`` and ``email``, one row per address found, or a single
            row with ``'no email'`` for a site where nothing was found.

    Raises:
        KeyError: If ``input_csv`` has no ``website`` column.
    """
    df = pd.read_csv(input_csv)
    results = []

    for raw_site in df['website']:
        # Empty cells are read as NaN; they have no URL to scrape.
        if pd.isna(raw_site):
            continue
        website = str(raw_site).strip()
        if not website:
            continue
        # Normalize URL (ensure scheme exists); schemes are case-insensitive.
        if not website.lower().startswith(('http://', 'https://')):
            website = 'https://' + website

        emails = extract_emails_from_website(website)
        if emails:
            # Sort so the output row order is stable between runs.
            results.extend(
                {'website': website, 'email': email} for email in sorted(emails)
            )
        else:
            results.append({'website': website, 'email': 'no email'})

    # Explicit columns keep the header even when there are no result rows.
    df_result = pd.DataFrame(results, columns=['website', 'email'])
    df_result.to_csv(output_csv, index=False)
    print(f"\n[✓] Saved results to {output_csv}")

# Script entry point: read the site list and write the scraped addresses.
if __name__ == "__main__":
    extract_emails_from_links(
        input_csv="selected_websites.csv",
        output_csv="final_emails_output.csv",
    )
