In [None]:
import sys
import os
import importlib
# Add the sibling ../src directory to sys.path so data_generation can be imported
sys.path.append(os.path.abspath("../src"))
import data_generation
importlib.reload(data_generation)
from data_generation import collect_businesses_data
from data_generation import is_meaningful_description
from data_generation import test_pipeline
from data_generation import process_collected_data
from data_generation import analyze_fallback_patterns
from data_generation import load_existing_results

In [None]:
# Business sectors and cities that define the scope of the collection run.
# NOTE(review): the collection cell calls collect_businesses_data() with no
# arguments, so confirm the collector actually reads these two lists.
sectors_to_collect = [
    "restaurant",
    "retail store",
    "law firm",
    "beauty salon",
    "gym",
    "nonprofit organization",
    "medical clinic",
    "nightclub",
    "café / bakery",
    "hotel / motel",
    "real estate agency",
    "construction / home services",
    "cleaning service",
    "veterinary clinic",
    "dentist",
    "physiotherapy clinic",
    "entertainment venue (cinema, bowling, etc.)",
    "transportation / taxi service",
    "accounting firm",
    "insurance broker",
    "financial advisory service",
    "IT services / software consultancy",
    "computer / phone repair shop",
    "coworking space",
    "education / tutoring center",
    "language school",
    "training institute",
]

cities = [
    "New York, USA",
    "Los Angeles, USA",
]


In [None]:

# Per-sector, per-city cap quoted to the user and used for the upper-bound
# estimate printed below.
# NOTE(review): this value only drives the printed messages; confirm it
# matches the limit that collect_businesses_data() actually enforces.
MAX_BUSINESSES_PER_SECTOR_CITY = 50

print("Starting business data collection...")
print(f"Will collect up to {MAX_BUSINESSES_PER_SECTOR_CITY} businesses per sector per city")
print(f"Sectors: {len(sectors_to_collect)}")
print(f"Cities: {len(cities)}")
print(
    "Maximum total businesses: "
    f"{len(sectors_to_collect) * len(cities) * MAX_BUSINESSES_PER_SECTOR_CITY}"
)

# Ask for confirmation before starting (the prompt warns the run may take a
# while). Surrounding whitespace is stripped so an answer such as "y " or
# " yes" is accepted instead of silently cancelling.
response = input("\nDo you want to proceed? This may take a while... (y/n): ")
if response.strip().lower() in ("y", "yes"):
    # NOTE(review): called without arguments, so sectors_to_collect / cities
    # are not passed explicitly -- confirm the collector picks them up.
    data = collect_businesses_data()
    print("\nCollection completed!")
else:
    print("Collection cancelled.")

In [None]:
import json
from collections import defaultdict

# Input and output file paths
raw_file = "../data/all_businesses_data.json"
clean_file = "../data/all_businesses_data_clean.json"

# Load raw JSON
with open(raw_file, "r", encoding="utf-8") as f:
    raw_data = json.load(f)

# Extract raw businesses (a missing "businesses" key yields an empty list)
businesses = raw_data.get("businesses", [])

# Description values that signal a failed or empty scrape rather than content.
SCRAPE_ERROR_PREFIXES = (
    "Could not access website",
    "An error occurred during scraping",
)
EMPTY_DESCRIPTIONS = ("", "No description found on the website.")

# Filter + clean businesses
cleaned_businesses = []
for b in businesses:
    # `or ""` also covers a description that is present but null: dict.get's
    # default only applies when the key is absent, and None has no
    # .startswith()/.strip().
    desc = b.get("scraped_description") or ""
    website = b.get("website")

    # Skip if no website
    if not website:
        continue

    # Skip businesses with scraping errors or empty text
    if desc.startswith(SCRAPE_ERROR_PREFIXES):
        continue
    if desc.strip() in EMPTY_DESCRIPTIONS:
        continue

    # Final content check delegated to the data_generation helper.
    if not is_meaningful_description(desc):
        continue

    cleaned_businesses.append({
        "fsq_place_id": b.get("fsq_place_id"),
        "name": b.get("name"),
        "sector": b.get("sector"),
        "city": b.get("city"),
        "website": website,
        "scraped_description": desc,
        "address": b.get("address"),
        # Keep only category names; a null "categories" value is treated as
        # having no categories.
        "categories": [c["name"] for c in (b.get("categories") or []) if "name" in c],
    })

# ---- Rebuild metadata ----
# The counters are recomputed from the filtered list, so they describe the
# cleaned data rather than the raw collection.
collection_stats = {
    "total_collected": len(cleaned_businesses),
    "by_sector": defaultdict(lambda: {"total": 0, "by_city": defaultdict(int)}),
    "by_city": defaultdict(int),
    "errors": []  # all filtered out, so we don't carry them
}

for b in cleaned_businesses:
    sector = b["sector"]
    city = b["city"]

    # "total_collected" is already set from len(cleaned_businesses) above.
    sector_stats = collection_stats["by_sector"][sector]
    sector_stats["total"] += 1
    sector_stats["by_city"][city] += 1
    collection_stats["by_city"][city] += 1

# Convert defaultdicts back to normal dicts for JSON
def deep_convert(d):
    """Return ``d`` with every dict/defaultdict rebuilt as a plain ``dict``.

    Recurses through dict values and list items, so a defaultdict stored
    inside a list is converted as well. Any other value is returned unchanged.

    Args:
        d: Arbitrary value, typically a nested structure of dicts and lists.

    Returns:
        A new plain ``dict`` (or ``list``) with converted contents, or ``d``
        itself when it is neither a dict nor a list.
    """
    # defaultdict is a dict subclass, so a single isinstance check covers both.
    if isinstance(d, dict):
        return {k: deep_convert(v) for k, v in d.items()}
    if isinstance(d, list):
        return [deep_convert(item) for item in d]
    return d

# Flatten the nested defaultdict counters into plain dicts before serialising.
collection_stats = deep_convert(collection_stats)

# Timestamp and sector/city lists are carried over from the raw file's
# metadata; the totals and stats describe the filtered data.
raw_metadata = raw_data.get("collection_metadata", {})
clean_metadata = {
    "timestamp": raw_metadata.get("timestamp"),
    "total_businesses": len(cleaned_businesses),
    "sectors_collected": raw_metadata.get("sectors_collected", []),
    "cities_collected": raw_metadata.get("cities_collected", []),
    "stats": collection_stats,
}
clean_output_data = {
    "collection_metadata": clean_metadata,
    "businesses": cleaned_businesses,
}

# Save clean version (ensure_ascii=False keeps non-ASCII text readable on disk)
with open(clean_file, "w", encoding="utf-8") as f:
    json.dump(clean_output_data, f, indent=2, ensure_ascii=False)

# Report how many records survived the filter.
print(f"Clean data saved to: {clean_file}")
print(f"Total businesses before filtering: {len(businesses)}")
print(f"Total businesses after filtering: {len(cleaned_businesses)}")


In [None]:
# Hand-written sample record used to run test_pipeline on a single business.
sample_description = "Welcome to our cozy coffee shop where we serve the finest organic coffee beans sourced directly from sustainable farms. We also offer fresh pastries baked daily and provide a comfortable space for work or relaxation."

test_business = {
    "fsq_place_id": "test1",
    "name": "Sunrise Organic Coffee",
    "sector": "café / bakery",
    "city": "New York, USA",
    "scraped_description": sample_description,
}

test_pipeline(test_business)

In [None]:
# Feed the cleaned dataset to process_collected_data, which is given the
# path of the file to write its output to.
input_file = "../data/all_businesses_data_clean.json"
output_file = "../data/all_businesses_descriptions_and_domains.json"

processed_collected_data = process_collected_data(
    input_file=input_file,
    output_file=output_file,
)

In [None]:
# Paths used by the resume run.
# NOTE(review): temp_file is relative to the notebook's working directory and
# is not passed to process_collected_data below -- confirm that the function
# locates the same file on its own when resuming.
temp_file = "temp_data.json"
input_file = "../data/all_businesses_data_clean.json"
output_file = "../data/all_businesses_descriptions_and_domains.json"

# First, analyze what's in the temp file
print("Analyzing current temp file...")
analyze_fallback_patterns(temp_file)

# Then resume processing, announced with a banner between two rules
separator = "=" * 60
print("\n" + separator)
print("STARTING RESUME PROCESSING")
print(separator)

processed_collected_data = process_collected_data(input_file=input_file, output_file=output_file)

In [None]:
import json


# Load the "results" list stored in temp_file. UTF-8 is requested explicitly,
# matching the other JSON reads/writes in this notebook, instead of relying on
# the platform default encoding.
# NOTE(review): assumes the code that writes temp_file also uses UTF-8.
with open(temp_file, "r", encoding="utf-8") as f:
    data = json.load(f)["results"]

# Single pass over the results: an fsq_place_id already in `seen` is a repeat.
seen = set()
duplicates = []  # one entry per repeated occurrence, in encounter order

for entry in data:
    business_id = entry["business"]["fsq_place_id"]
    if business_id in seen:
        duplicates.append(business_id)
    else:
        seen.add(business_id)

if duplicates:
    print("❌ Found duplicates:", duplicates)
    print(f"Total duplicates found: {len(duplicates)}")
else:
    print("✅ No duplicates found!")
