In [53]:
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}

def extract_bing_links(html):
    soup = BeautifulSoup(html, "lxml")
    links = []
    # Print a sample of the page to debug
    # print(soup.prettify()[:1000])  # Uncomment to inspect the page
    
    # Bing search results are usually in <li> with class "b_algo"
    results = soup.find_all("li", class_="b_algo")
    print(f"Found {len(results)} results")
    for result in results:
        a_tag = result.find("a")
        if a_tag and a_tag.has_attr("href"):
            links.append(a_tag["href"])
    return links

def test_bing_scraper():
    query = "Eyeglasses store Texas USA"
    encoded_query = query.replace(" ", "+")
    url = f"https://www.bing.com/search?q={encoded_query}&first=1"
    print(f"Fetching: {url}")
    response = requests.get(url, headers=headers)
    
    if response.status_code != 200:
        print(f"Failed to fetch page, status code: {response.status_code}")
        return
    
    links = extract_bing_links(response.text)
    print("Extracted links:")
    for link in links:
        print(link)

if __name__ == "__main__":
    test_bing_scraper()


Fetching: https://www.bing.com/search?q=Eyeglasses+store+Texas+USA&first=1
Found 10 results
Extracted links:
https://www.americasbest.com/location/tx/
https://www.eyeglassworld.com/location/tx/
https://www.stantonoptical.com/
https://eyepiecestexas.com/
https://local.targetoptical.com/tx.html
https://paireyewear.com/store-locator
http://eyemaxusa.com/
https://www.eyeglassesoftexas.com/
https://www.americasbest.com/find-a-store
https://25optical.com/


In [54]:
import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}

query = "Eyeglasses store Texas USA"
encoded_query = query.replace(" ", "+")
url = f"https://www.bing.com/search?q={encoded_query}&first=1"

response = requests.get(url, headers=headers)

with open("bing_page.html", "w", encoding="utf-8") as f:
    f.write(response.text)

print("Saved bing_page.html")


Saved bing_page.html


In [55]:
from bs4 import BeautifulSoup

with open("bing_page.html", "r", encoding="utf-8") as f:
    html = f.read()

soup = BeautifulSoup(html, "lxml")

# Bing search results usually live in <li class="b_algo">, but let's confirm with a broader search
results = soup.find_all("li", class_="b_algo")

print(f"Found {len(results)} results")

urls = []
for result in results:
    a_tag = result.find("a")
    if a_tag and a_tag.has_attr("href"):
        urls.append(a_tag["href"])

print("Extracted URLs:")
for url in urls:
    print(url)


Found 10 results
Extracted URLs:
https://www.eyeglassworld.com/location/tx/
https://www.americasbest.com/find-a-store
https://www.americasbest.com/location/tx/
https://www.eyeglassesoftexas.com/
https://eyemaxusa.com/
/images/search?view=detailV2&ccid=YpX9Geh5&id=CE3B3C5B407D04BAFFD4541452B09750DC86966E&thid=OIP.YpX9Geh5uhzftcaabp35jwHaD1&mediaurl=https://eyepiecestexas.com/wp-content/uploads/2023/09/our-service-block-image-min.jpg&q=Eyeglasses+store+Texas+USA&ck=ACA23155C156C0F66FE1A3C5627D81FA&idpp=rc&idpview=singleimage&form=rc2idp
https://paireyewear.com/store-locator
https://www.stantonoptical.com/locations/weatherford-tx/
https://factoryeyeglassoutlet.com/
/images/search?view=detailV2&ccid=N3fPx3iP&id=AAD81FE68C701EFA5FCBCF67DB4B3B082E47FB25&thid=OIP.N3fPx3iPmQjHhKr-fB7MawHaHa&mediaurl=https://santafeoptical.com/wp-content/uploads/2024/08/IMG_3896.jpg&q=Eyeglasses+store+Texas+USA&ck=35B850026E5283283A029357D5A62691&idpp=rc&idpview=singleimage&form=rc2idp


In [56]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}

def extract_bing_links(html):
    soup = BeautifulSoup(html, "lxml")
    results = soup.find_all("li", class_="b_algo")
    links = []
    for result in results:
        a_tag = result.find("a")
        if a_tag and a_tag.has_attr("href"):
            links.append(a_tag["href"])
    return links

def generate_bing_actual_urls(country, city, industry, count=20, max_pages=10):
    query = f"{industry} {city} {country}"
    encoded_query = query.replace(" ", "+")
    websites = set()
    page = 0

    print(f"[+] Searching Bing for: {query}")

    while len(websites) < count and page < max_pages:
        url = f"https://www.bing.com/search?q={encoded_query}&first={page * 10 + 1}"
        print(f"    → Fetching page {page + 1}: {url}")
        try:
            response = requests.get(url, headers=headers)
            if response.status_code != 200:
                print(f"    [!] Failed to fetch page: Status {response.status_code}")
                break

            links = extract_bing_links(response.text)
            prev_count = len(websites)
            for link in links:
                if link not in websites:
                    websites.add(link)
                if len(websites) >= count:
                    break

            if len(websites) == prev_count:
                print("    [!] No new results found. Stopping early.")
                break

        except Exception as e:
            print(f"    [!] Error: {e}")
            break

        page += 1
        time.sleep(1)  # polite delay

    df = pd.DataFrame(list(websites), columns=["website"])
    df["search_query"] = query
    df.to_csv("actual_websites.csv", index=False)
    print(f"[✓] Saved {len(df)} website URLs to actual_websites.csv")

# Example usage
if __name__ == "__main__":
    generate_bing_actual_urls(
        country="USA",
        city="Newyork",
        industry="Jewelery Store",
        count=200,
        max_pages=20
    )


[+] Searching Bing for: Jewelery Store Newyork USA
    → Fetching page 1: https://www.bing.com/search?q=Jewelery+Store+Newyork+USA&first=1
    → Fetching page 2: https://www.bing.com/search?q=Jewelery+Store+Newyork+USA&first=11
    → Fetching page 3: https://www.bing.com/search?q=Jewelery+Store+Newyork+USA&first=21
    → Fetching page 4: https://www.bing.com/search?q=Jewelery+Store+Newyork+USA&first=31
    → Fetching page 5: https://www.bing.com/search?q=Jewelery+Store+Newyork+USA&first=41
    → Fetching page 6: https://www.bing.com/search?q=Jewelery+Store+Newyork+USA&first=51
    → Fetching page 7: https://www.bing.com/search?q=Jewelery+Store+Newyork+USA&first=61
    → Fetching page 8: https://www.bing.com/search?q=Jewelery+Store+Newyork+USA&first=71
    → Fetching page 9: https://www.bing.com/search?q=Jewelery+Store+Newyork+USA&first=81
    → Fetching page 10: https://www.bing.com/search?q=Jewelery+Store+Newyork+USA&first=91
    → Fetching page 11: https://www.bing.com/search?q=Jewel

In [58]:
import pandas as pd
import requests
import time
from urllib.parse import urlparse
from bs4 import BeautifulSoup

# --- UTILITY FUNCTIONS ---

def is_valid_url(url):
    try:
        parsed = urlparse(url)
        return parsed.scheme in ("http", "https")
    except:
        return False

def get_domain_status(url):
    try:
        start = time.time()
        response = requests.get(url, timeout=5)
        load_time = round(time.time() - start, 2)
        return response.status_code, load_time
    except Exception as e:
        return str(e), None

def detect_cms(url):
    try:
        response = requests.get(url, timeout=5)
        soup = BeautifulSoup(response.text, "html.parser")

        # Basic CMS fingerprints
        if "wp-content" in response.text or "WordPress" in response.text:
            return "WordPress"
        if "shopify" in response.text:
            return "Shopify"
        if "drupal" in response.text:
            return "Drupal"
        if "joomla" in response.text:
            return "Joomla"
        if "squarespace" in response.text:
            return "Squarespace"
        if "wix.com" in response.text:
            return "Wix"

        return "Unknown"
    except:
        return "Error"

# --- MAIN FUNCTION ---

def filter_links(input_csv="actual_websites.csv", output_csv="filtered_websites.csv"):
    df = pd.read_csv(input_csv)

    if "website" not in df.columns:
        raise ValueError("Input CSV must contain a 'website' column.")

    print(f"[+] Processing {len(df)} websites for status, CMS, and speed...")

    results = []

    for idx, row in df.iterrows():
        url = row["website"]

        if not is_valid_url(url):
            print(f"[!] Skipping invalid URL: {url}")
            continue

        print(f"    → Checking: {url}")

        status, speed = get_domain_status(url)
        cms = detect_cms(url)

        results.append({
            "website": url,
            "status": status,
            "cms": cms,
            "load_time_sec": speed
        })

    result_df = pd.DataFrame(results)
    result_df.to_csv(output_csv, index=False)

    print(f"[✓] Saved filtered results to {output_csv}")

# --- OPTIONAL: TESTING CLI ---
if __name__ == "__main__":
    filter_links()


[+] Processing 154 websites for status, CMS, and speed...
    → Checking: https://www.itshot.com/
    → Checking: https://www.tiffany.com/jewelry-stores/washington-dc/
    → Checking: https://www.gemsny.com/
    → Checking: https://oldjewelry.net/
[!] Skipping invalid URL: /images/search?view=detailV2&ccid=ImGuB90r&id=88C3DCEDC1FC7558231E01F1508A8E9D6B507714&thid=OIP.ImGuB90rYAgKlEsEuak9TAHaJT&mediaurl=https://oldjewelry.net/cdn/shop/products/IMG_7334.jpg?v=1703017338&width=1066&q=Jewelery+Store+Newyork+USA&ck=8DA650A1C7B8B5D8A63B094F3E5B6F34&idpp=rc&idpview=singleimage&form=rc2idp
    → Checking: https://www.rolex.com/en-us/store-locator/unitedstates/newyork/newyork
    → Checking: https://diamondsourcenyc.com/
    → Checking: https://www.popular.jewelry/
    → Checking: https://www.veranda.com/luxury-lifestyle/luxury-fashion-jewelry/g37925974/best-jewelry-shops/
    → Checking: https://www.brilliantearth.com/stores/
    → Checking: https://www.tiffany.com/jewelry-stores/new-york-wall

In [59]:
import pandas as pd

def save_valid_websites(input_csv, output_csv, max_load_time):
    """
    Filters websites that have status 200 and load time <= max_load_time,
    then saves them to output_csv.

    Parameters:
    - input_csv (str): Path to filtered_websites.csv file.
    - output_csv (str): Path to save selected websites CSV.
    - max_load_time (float): Maximum acceptable load time in seconds.
    """

    # Load filtered websites data
    df = pd.read_csv(input_csv)

    # Ensure status is numeric for filtering, coerce errors to NaN (like errors)
    df['status'] = pd.to_numeric(df['status'], errors='coerce')
    df['load_time_sec'] = pd.to_numeric(df['load_time_sec'], errors='coerce')

    # Filter conditions: status == 200, load_time_sec <= max_load_time
    filtered_df = df[(df['status'] == 200) & (df['load_time_sec'] <= max_load_time)]

    # Save filtered valid websites
    filtered_df.to_csv(output_csv, index=False)
    print(f"[✓] Saved {len(filtered_df)} valid websites to '{output_csv}'")

# Example usage:
if __name__ == "__main__":
    save_valid_websites("filtered_websites.csv", "selected_websites.csv", max_load_time=2.0)


[✓] Saved 58 valid websites to 'selected_websites.csv'


In [60]:
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin, urlparse
import tldextract
import concurrent.futures

# Email regex pattern (standard, but can be tuned)
EMAIL_PATTERN = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')

# Common subpage keywords to look for if no emails on homepage
RELEVANT_SUBPAGES = ['contact', 'about', 'support', 'help', 'info', 'customer-service', 'faq']

HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

def get_domain(url):
    ext = tldextract.extract(url)
    return ext.domain + '.' + ext.suffix

def is_same_domain(url, base_domain):
    domain = get_domain(url)
    return domain == base_domain

def fetch_page(url):
    try:
        resp = requests.get(url, headers=HEADERS, timeout=10)
        resp.raise_for_status()
        return resp.text
    except Exception as e:
        print(f"  [!] Failed to fetch {url}: {e}")
        return None

def extract_emails_from_text(text):
    emails = set(re.findall(EMAIL_PATTERN, text))
    return emails

def find_relevant_links(soup, base_url, base_domain):
    links = []
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href'].lower()
        full_url = urljoin(base_url, href)
        if is_same_domain(full_url, base_domain):
            for keyword in RELEVANT_SUBPAGES:
                if keyword in href:
                    links.append(full_url)
                    break
    return list(set(links))  # unique links

def scrape_emails_from_url(url):
    print(f"[*] Scraping: {url}")
    html = fetch_page(url)
    if not html:
        return set()
    soup = BeautifulSoup(html, 'html.parser')
    text = soup.get_text(separator=' ')
    emails = extract_emails_from_text(text)
    return emails

def extract_emails_from_website(url):
    base_domain = get_domain(url)
    emails = scrape_emails_from_url(url)

    if emails:
        return emails

    # No emails on homepage, try relevant subpages
    homepage_html = fetch_page(url)
    if not homepage_html:
        return set()
    soup = BeautifulSoup(homepage_html, 'html.parser')
    relevant_links = find_relevant_links(soup, url, base_domain)

    all_emails = set()
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(scrape_emails_from_url, link) for link in relevant_links]
        for future in concurrent.futures.as_completed(futures):
            all_emails.update(future.result())

    return all_emails

def extract_emails_from_links(input_csv, output_csv):
    df = pd.read_csv(input_csv)
    results = []

    for _, row in df.iterrows():
        website = row['website']
        # Normalize URL (ensure scheme exists)
        if not website.startswith(('http://', 'https://')):
            website = 'https://' + website

        emails = extract_emails_from_website(website)
        if emails:
            for email in emails:
                results.append({'website': website, 'email': email})
        else:
            results.append({'website': website, 'email': 'no email'})

    df_result = pd.DataFrame(results)
    df_result.to_csv(output_csv, index=False)
    print(f"\n[✓] Saved results to {output_csv}")

if __name__ == "__main__":
    extract_emails_from_links("selected_websites.csv", "final_emails_output.csv")


[*] Scraping: https://oldjewelry.net/
[*] Scraping: https://diamondsourcenyc.com/
[*] Scraping: https://www.popular.jewelry/
[*] Scraping: https://www.veranda.com/luxury-lifestyle/luxury-fashion-jewelry/g37925974/best-jewelry-shops/
[*] Scraping: https://www.veranda.com/about/a1033/advertise-online/
[*] Scraping: https://www.veranda.com/about/a492/contact-us/
[*] Scraping: https://www.veranda.com/about/a481/about-us/
[*] Scraping: https://www.veranda.com/about/a1032/community-guidelines/
[*] Scraping: https://stores.cartier.com/united-states/ny/new-york/653-fifth-avenue
[*] Scraping: https://www.cartier.com/en-us/contact-customer-care
[*] Scraping: https://int.cartier.com/en/contact-us.html
[*] Scraping: https://www.cartier.com/en-us/faq/
[*] Scraping: https://www.davidyurman.com/stores/new-york-city-dy57.html
[*] Scraping: https://www.instagram.com/mosesjewelry/
[*] Scraping: https://www.foreverdiamondsny.com/
[*] Scraping: https://www.martinbuschjewelers.com/
[*] Scraping: https://ww