# In [None]:
import csv
import os
import time
from urllib.parse import urlparse, urljoin

import pandas as pd
import requests
import urllib3
from bs4 import BeautifulSoup
from openai import AzureOpenAI
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# ============================
# CONFIGURATION & CONSTANTS
# ============================

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Credentials are read from the environment when set, so real keys never have
# to be written into this file; the original placeholder literals are kept as
# fallbacks so behavior is unchanged when the variables are absent.
# NOTE(review): no azure_endpoint / api_version is passed to AzureOpenAI here;
# presumably the SDK picks them up from its own environment variables - confirm.
client = AzureOpenAI(
    api_key=os.environ.get("AZURE_OPENAI_API_KEY", "your openai key"),
)

GOOGLE_API_KEY   = os.environ.get("GOOGLE_API_KEY", 'your GCP api key')
BASE_URL_SEARCH  = "https://maps.googleapis.com/maps/api/place/textsearch/json"
BASE_URL_DETAILS = "https://maps.googleapis.com/maps/api/place/details/json"

# Search phrases; each one is combined with a city name to form a query.
QUERIES = [
    'specialty coffee roaster',
    'third wave coffee roaster',
    'artisan coffee roaster',
]

# Cities to process, one output file set per city.
CITIES = [
    'Huntsville',
    'Des Moines',
    'Allentown',
    'Modesto',
    'Syracuse',
]

# Value sent as the `region` parameter of the Places text search.
REGION = 'us'

# Default number of pages find_coffee_store_page may visit per website.
MAX_DEPTH = 5

# ============================
# SESSION SETUP WITH RETRIES
# ============================
def create_session():
    """Build a ``requests.Session`` whose adapters retry failed requests.

    The retry policy is configured with ``total=5`` and ``backoff_factor=1``
    and applies to HEAD/GET/OPTIONS requests answered with status 429, 500,
    502, 503 or 504.

    Returns:
        requests.Session: session with the retrying adapter mounted for both
        ``https://`` and ``http://`` URLs.
    """
    retrying_adapter = HTTPAdapter(
        max_retries=Retry(
            total=5,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "OPTIONS"],
        )
    )
    http = requests.Session()
    for prefix in ("https://", "http://"):
        http.mount(prefix, retrying_adapter)
    return http


# Module-level session used by the HTTP helpers in this file.
session = create_session()

# ============================
# GOOGLE PLACES HELPERS
# ============================
def get_places_data(query, region, next_page_token=None):
    """Run one Google Places text-search request and return the decoded JSON.

    Args:
        query: Free-text search string sent as the ``query`` parameter.
        region: Value sent as the ``region`` parameter.
        next_page_token: Optional token from a previous response; when given
            it is sent as ``pagetoken`` to fetch the following page.

    Returns:
        The parsed JSON response, or ``None`` when the HTTP request fails,
        the body is not valid JSON, or the API reports an error status.
    """
    params = {'query': query, 'region': region, 'key': GOOGLE_API_KEY}
    if next_page_token:
        params['pagetoken'] = next_page_token
    try:
        response = session.get(BASE_URL_SEARCH, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException.
        print(f"Request failed for query '{query}': {e}")
        return None

    # The Places API reports quota/key/request problems with HTTP 200 and a
    # non-OK `status` field; surface those instead of treating them as
    # "no results".
    status = data.get('status') if isinstance(data, dict) else None
    if status not in (None, 'OK', 'ZERO_RESULTS'):
        detail = data.get('error_message', 'no error message')
        print(f"Places API returned status '{status}' for query '{query}': {detail}")
        return None
    return data

def get_place_details(place_id):
    """Fetch name, formatted address and website for one place.

    Args:
        place_id: Google Places identifier sent as the ``place_id`` parameter.

    Returns:
        The parsed JSON response, or ``None`` when the HTTP request fails,
        the body is not valid JSON, or the API reports a non-OK status.
    """
    params = {
        'place_id': place_id,
        'fields': 'name,formatted_address,website',
        'key': GOOGLE_API_KEY
    }
    try:
        response = session.get(BASE_URL_DETAILS, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException.
        print(f"Error fetching details for place ID '{place_id}': {e}")
        return None

    # API-level failures arrive with HTTP 200 and a non-OK `status`; report
    # them rather than silently returning a payload without a 'result'.
    status = data.get('status') if isinstance(data, dict) else None
    if status not in (None, 'OK'):
        detail = data.get('error_message', 'no error message')
        print(f"Place details for place ID '{place_id}' returned status '{status}': {detail}")
        return None
    return data

def canonicalize_website(url):
    """Reduce a website URL to a bare lower-case host name.

    The result is used as a de-duplication key, so anything that does not
    identify the site itself is removed: credentials, port and a leading
    ``www.``.

    Args:
        url: Absolute URL string, e.g. ``"https://www.example.com:443/shop"``.

    Returns:
        The host name (``"example.com"`` for the example above), an empty
        string when the URL has no network location (e.g. no scheme), or
        ``None`` when ``url`` is not a string or cannot be parsed.
    """
    try:
        if not isinstance(url, str):
            raise TypeError(f"expected str, got {type(url).__name__}")
        # `.hostname` (unlike `.netloc`) drops any "user:pass@" prefix and
        # ":port" suffix and is already lower-cased.
        host = urlparse(url).hostname or ""
    except (TypeError, ValueError) as e:
        print(f"Error parsing URL '{url}': {e}")
        return None
    if host.startswith("www."):
        host = host[len("www."):]
    return host

def extract_place_info(place):
    """Collect the fields of interest for one text-search result.

    Args:
        place: One entry from the ``results`` list of a text-search response.

    Returns:
        Tuple ``(place_id, name, address, website, canonical_domain)``.
        ``address`` falls back to ``'No address provided'``, ``website`` to
        ``'No website provided'``, and ``canonical_domain`` is ``None`` when
        no website is known.
    """
    missing_site = 'No website provided'

    pid = place.get('place_id')
    name = place.get('name')
    address = place.get('formatted_address', 'No address provided')

    # The website is only available from the details lookup.
    details = get_place_details(pid)
    if details and 'result' in details:
        website = details['result'].get('website', missing_site)
    else:
        website = missing_site

    if website == missing_site:
        canon = None
    else:
        canon = canonicalize_website(website)
    return pid, name, address, website, canon

# ============================
# SCRAPE RAW DATA FOR ONE CITY
# ============================
def scrape_google_places_for_city(city):
    """Search Google Places for roasters in ``city`` and save them to a CSV.

    Every phrase in ``QUERIES`` is searched (prefixed with the city name) and
    all result pages are followed. A place is kept only if it has a website
    and neither its name nor its canonical domain has been seen before.

    Args:
        city: City name; also used as the prefix of the output file name.

    Returns:
        Path of the written CSV (columns ``Name``, ``Address``, ``Website``).
    """
    all_places   = []
    unique_names = set()
    unique_webs  = set()

    for q in QUERIES:
        full_query = f"{city} {q}"
        print(f"\nSearching for '{full_query}' in {city}...")
        next_page_token = None
        while True:
            data = get_places_data(full_query, REGION, next_page_token)
            if not data or 'results' not in data:
                break
            for place in data['results']:
                pid, name, addr, web, canon = extract_place_info(place)
                if (web == 'No website provided'
                        or name in unique_names
                        or (canon and canon in unique_webs)):
                    continue
                unique_names.add(name)
                if canon:
                    unique_webs.add(canon)
                all_places.append((name, addr, web))
                print(f"  Added: {name}, {addr}, {web}")
                time.sleep(1)
            next_page_token = data.get('next_page_token')
            if not next_page_token:
                break
            # Pause before requesting the next page (presumably to give the
            # page token time to become valid - confirm against the API docs).
            time.sleep(2)

    raw_csv = f"{city}_specialty_coffee_roasters.csv"
    # newline='' is what the csv module expects from the file object.
    with open(raw_csv, 'w', encoding='utf-8', newline='') as f:
        # Header stays unquoted, exactly as before.
        f.write("Name,Address,Website\n")
        # csv.writer does the quoting/escaping and writes a missing (None)
        # name as an empty field instead of crashing on `.replace`.
        writer = csv.writer(f, quoting=csv.QUOTE_ALL, lineterminator="\n")
        writer.writerows(all_places)

    print(f"[Saved] raw data → {raw_csv}")
    return raw_csv

# ============================
# GPT-DRIVEN ITERATIVE LINK-FINDING
# ============================
def scrape_website(url):
    """Download a page and return a short text digest plus its parsed HTML.

    Args:
        url: Address of the page to fetch.

    Returns:
        ``(text, soup)`` where ``text`` is the space-joined text of the first
        50 ``h1``/``h2``/``h3``/``p``/``a`` elements, cut to 3000 characters,
        and ``soup`` is the ``BeautifulSoup`` document. ``(None, None)`` if
        anything goes wrong while fetching or parsing.
    """
    try:
        page = session.get(url, timeout=10)
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'html.parser')
        snippets = [
            tag.get_text(strip=True)
            for tag in soup.find_all(['h1', 'h2', 'h3', 'p', 'a'], limit=50)
        ]
        digest = ' '.join(snippets)
        return digest[:3000], soup
    except Exception as e:
        print(f"Failed to fetch {url}: {e}")
        return None, None

def analyze_with_gpt(website_text, address, city):
    """Ask the chat model whether a site sells beans and is local to ``city``.

    The website text is echoed to stdout before the request is made.

    Args:
        website_text: Text digest of the site's page.
        address: Address of the business, inserted into the prompt.
        city: City the business is expected to be in or near.

    Returns:
        The model's reply, stripped and lower-cased, or ``""`` if the request
        raised an exception.
    """
    print(website_text)

    # Joined with blank lines; the two numbered questions share one section.
    sections = [
        "Based on the following content from this website and the provided address, "
        "please answer the following questions:",
        f"Website Content: {website_text}",
        f"Address: {address}",
        "1. Does the site offer a shop where users can buy coffee beans? Provide a yes or no answer without explanation.\n"
        f"2. Is the business genuinely located in {city} or near it based on the address and website content? (If address not provided, assume yes.)",
    ]
    prompt = "\n\n".join(sections)

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user",   "content": prompt},
    ]
    try:
        completion = client.chat.completions.create(
            model="gpt-4o",
            messages=messages,
            temperature=0.3,
            max_tokens=500,
        )
        reply = completion.choices[0].message.content
        return reply.strip().lower()
    except Exception as e:
        print(f"Error with GPT request: {e}")
        return ""

def verify_page_sells_coffee_gpt(soup, base_url):
    """Ask the chat model whether a page is a coffee-bean shopping hub.

    Only the page's links are shown to the model: the first 100 anchors that
    have an ``href``, each as its text plus its absolute URL.

    Args:
        soup: Parsed page (``BeautifulSoup`` document).
        base_url: URL of the page, used to resolve relative links.

    Returns:
        ``True`` if the model's reply contains "yes"; ``False`` if it does
        not, if the page has no links, or if the request raised an exception.
    """
    links = soup.find_all('a', href=True)[:100]
    if not links:
        return False

    link_lines = [
        f"Text: {link.get_text(strip=True)}, URL: {urljoin(base_url, link['href'])}"
        for link in links
    ]

    intro = (
        "Below is a list of anchor tags (text and URL) from a webpage. "
        "We want to determine if this page serves as the main shopping hub for coffee beans—"
        "that is, a page aggregating many links to individual product listings. "
        "If it’s just a single product page, an about page, a blog, or only links out "
        "without listing multiple products, it does NOT count.\n\n"
    )
    question = (
        "\nQuestion: Based on these links alone, is this page a central shopping hub "
        "that aggregates many coffee-bean product listings? Answer 'yes' or 'no' without explanation."
    )
    prompt = intro + "\n".join(link_lines) + question

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user",   "content": prompt},
    ]
    try:
        completion = client.chat.completions.create(
            model="gpt-4o",
            messages=messages,
            temperature=0,
            max_tokens=50,
        )
        verdict = completion.choices[0].message.content.lower()
        return "yes" in verdict
    except Exception as e:
        print(f"Error in verify_page_sells_coffee_gpt: {e}")
        return False

def pick_most_likely_store_page_with_gpt(soup, base_url, visited):
    """Ask the chat model which unvisited link most likely leads to the store.

    Args:
        soup: Parsed page (``BeautifulSoup`` document).
        base_url: URL of the page, used to resolve relative links.
        visited: Collection of URLs already visited; they are excluded from
            the candidates and listed in the prompt as off-limits.

    Returns:
        The chosen absolute URL, or ``None`` when there are no unvisited
        links, the model answers "None", or the request raised an exception.
    """
    anchors = soup.find_all('a', href=True)[:100]
    visited_str = "\n".join(visited) if visited else "(none)"
    choices = []
    candidate_urls = set()
    for a in anchors:
        absu = urljoin(base_url, a['href'])
        if absu not in visited:
            candidate_urls.add(absu)
            choices.append(f"Text: {a.get_text(strip=True)}, URL: {absu}")
    if not choices:
        return None
    prompt = (
        "Already visited the following URLs, do NOT pick them again:\n"
        f"{visited_str}\n\n"
        "Below is a list of anchor texts and their URLs from this webpage. "
        "Among these unvisited links, find exactly one link that most likely leads "
        "to an online store selling coffee beans. If none seems relevant, say 'None'.\n\n"
        + "\n".join(choices) +
        "\n your answer should only be the single best URL or 'None' without extra texts."
    )
    try:
        resp = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user",   "content": prompt}
            ],
            temperature=0.3, max_tokens=150
        )
        ans = resp.choices[0].message.content.strip()
        if ans.lower() == "none":
            return None
        resolved = urljoin(base_url, ans)
        # A link we actually offered is a real answer even if its URL happens
        # to contain the letters "none" (e.g. ".../nonesuch-blend").
        if resolved in candidate_urls:
            return resolved
        # Anything else mentioning "none" is treated as a refusal, as before.
        if "none" in ans.lower():
            return None
        return resolved
    except Exception as e:
        print(f"Error in pick_most_likely_store_page_with_gpt: {e}")
        return None

def find_coffee_store_page(root_url, max_depth=MAX_DEPTH):
    """Follow model-chosen links from ``root_url`` until a shop hub is found.

    Starting at ``root_url``, each visited page (except the root itself) is
    checked with ``verify_page_sells_coffee_gpt``; if it does not qualify,
    the model picks the next link to follow.

    Args:
        root_url: Page to start from.
        max_depth: Maximum number of pages to visit, the root included.

    Returns:
        URL of the first page judged to be a coffee-bean shopping hub, or
        ``None`` if a page cannot be fetched, no further link is suggested,
        a URL repeats, or ``max_depth`` pages were visited without success.
    """
    visited = set()
    current = root_url
    for depth in range(max_depth):
        if current in visited:
            break
        visited.add(current)
        print(f"[Depth {depth+1}] Checking: {current}")
        _, soup = scrape_website(current)
        if not soup:
            return None
        # The root page is never verified, only used as a source of links.
        if depth > 0 and verify_page_sells_coffee_gpt(soup, current):
            print(f"Found coffee-selling page at {current}")
            return current
        if depth == max_depth - 1:
            # Budget exhausted: a further link would never be visited, so
            # skip the model call that would choose it.
            break
        nxt = pick_most_likely_store_page_with_gpt(soup, current, visited)
        if not nxt:
            return None
        current = nxt
    return None

# ============================
# FILTER & WRITE FOR ONE CITY
# ============================
def filter_coffee_roasters(raw_csv, city):
    """Keep only roasters that sell beans online and are local to ``city``.

    Each website in ``raw_csv`` is scraped and judged by the chat model. For
    sites that pass both questions, the store page is looked up; when one is
    found the row is kept with ``Website`` replaced by the store page and the
    homepage preserved in ``Original Website``. The result is written to
    ``Filtered_<city>_specialty_coffee_roasters.csv`` if any row qualifies.

    Args:
        raw_csv: Path of the CSV to read; it must have ``Website`` and
            ``Address`` columns.
        city: City name used in the model prompt and the output file name.
    """
    df = pd.read_csv(raw_csv)
    filtered = []
    for _, row in df.iterrows():
        site = row['Website']
        addr = row['Address']
        print(f"\nProcessing {site}")
        text, soup = scrape_website(site)
        if not soup:
            continue
        resp = analyze_with_gpt(text, addr, city)
        if not resp:
            continue

        offers = False
        located = False
        for line in resp.splitlines():
            # Tolerate indentation and markdown decoration ("**1.** yes",
            # "- 1. yes") in front of the numbered answers.
            answer = line.strip().lstrip("*#>- \t")
            if answer.startswith("1.") and "yes" in answer:
                offers = True
            if answer.startswith("2.") and "yes" in answer:
                located = True

        if not (offers and located):
            print(f"Skipped (offers={offers}, located={located}): {site}")
            continue

        store = find_coffee_store_page(site)
        if store:
            # Build a fresh record instead of mutating the row handed out by
            # iterrows(); key order matches the original column order.
            record = row.to_dict()
            record['Website'] = store              # store page
            record['Original Website'] = site      # keep homepage
            filtered.append(record)

    if filtered:
        out_df = pd.DataFrame(filtered)
        out_csv = f"Filtered_{city}_specialty_coffee_roasters.csv"
        out_df.to_csv(out_csv, index=False)
        print(f"[Saved] filtered → {out_csv}")
    else:
        print(f"No roasters passed the filter for {city}; no file written.")

# ============================
# MAIN EXECUTION FLOW
# ============================
def main():
    """Run the scrape-then-filter pipeline for every city in ``CITIES``."""
    for town in CITIES:
        print(f"\n=== PROCESSING CITY: {town} ===")
        # The raw CSV written by the scrape step is the filter step's input.
        filter_coffee_roasters(scrape_google_places_for_city(town), town)


# Only run the pipeline when executed as a script, not when imported.
if __name__ == "__main__":
    main()
