In [None]:
import requests
import re
import json
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import time

# --- Configuration -------------------------------------------------------
# Common Crawl index endpoint that is queried for URLs.
INDEX_URL = "https://index.commoncrawl.org/CC-MAIN-2025-13-index"
# CSV file the collected rows are written to.
OUTPUT_FILE = "australian_companies.csv"
# Number of rows to collect before the fetch loop stops.
TARGET_COUNT = 250_000

# Industry label -> keywords looked for in the lower-cased URL and company
# name. Entry order matters: when two industries score the same number of
# keyword hits, the one listed first wins.
INDUSTRY_KEYWORDS = {
    "banking": [
        "bank",
        "finance",
        "investment",
        "wealth",
        "superannuation",
    ],
    "retail": [
        "retail",
        "shop",
        "store",
        "shopping",
        "ecommerce",
    ],
    "technology": [
        "software",
        "technology",
        "it",
        "digital",
        "tech",
    ],
    "healthcare": [
        "health",
        "medical",
        "hospital",
        "clinic",
        "pharmaceutical",
    ],
    "education": [
        "education",
        "university",
        "school",
        "college",
        "training",
    ],
    "manufacturing": [
        "manufacturing",
        "production",
        "factory",
        "industrial",
    ],
    "construction": [
        "construction",
        "building",
        "development",
        "property",
    ],
    "mining": [
        "mining",
        "resources",
        "minerals",
        "energy",
        "oil",
        "gas",
    ],
    "agriculture": [
        "agriculture",
        "farming",
        "food",
        "agribusiness",
    ],
    "hospitality": [
        "hotel",
        "restaurant",
        "cafe",
        "tourism",
        "hospitality",
    ],
    "legal": [
        "legal",
        "law",
        "solicitor",
        "lawyer",
        "attorney",
    ],
    "consulting": [
        "consulting",
        "consultant",
        "advisory",
        "services",
    ],
}


def extract_company_name(url):
    """Derive a readable company name from the host part of a URL.

    The host is lower-cased, its Australian suffix (``.au`` optionally
    preceded by a common second-level label such as ``com``, ``net``,
    ``org``, ``gov``, ``edu``, ``asn`` or ``id``) and a leading ``www.``
    are removed, dashes/underscores become spaces, and each word is
    capitalised. Any remaining dots (other subdomains) are left in place.

    Args:
        url: Absolute URL, e.g. ``"https://www.my-shop.com.au/about"``.

    Returns:
        The derived name (``"My Shop"`` for the example above), or an
        empty string when the URL has no host component.
    """
    # ``hostname`` (unlike ``netloc``) excludes credentials and the port and
    # is already lower-cased, so the anchored patterns below match reliably.
    host = urlparse(url).hostname or ''

    # Strip the Australian suffix first, then the ``www.`` prefix.
    host = re.sub(r'\.(?:(?:com|net|org|gov|edu|asn|id)\.)?au$', '', host)
    host = re.sub(r'^www\.', '', host)

    # Dashes and underscores separate words in domain names.
    words = host.replace('-', ' ').replace('_', ' ').split()

    return ' '.join(word.capitalize() for word in words)


# Keywords shorter than this many characters must match a whole token.
# Plain substring matching would make a keyword like "it" fire on words
# such as "fitness", "city" or "site".
_MIN_SUBSTRING_KEYWORD_LEN = 3

# Splits lower-cased text into alphanumeric tokens.
_TOKEN_SPLIT_RE = re.compile(r'[^a-z0-9]+')


def guess_industry(url, company_name, keywords=None):
    """Guess an industry from keywords found in the URL and company name.

    Each industry scores one point per distinct keyword found in the
    lower-cased ``url`` + ``company_name`` text. Keywords of at least
    three characters match anywhere in the text (so "bank" is found in
    "commonwealthbank"); shorter keywords must equal a whole alphanumeric
    token to avoid spurious hits.

    Args:
        url: URL of the page.
        company_name: Name derived for the company.
        keywords: Optional mapping of industry label -> list of keywords.
            Defaults to the module-level ``INDUSTRY_KEYWORDS``.

    Returns:
        The industry label with the most keyword hits (the earliest entry
        in the mapping wins ties), or ``None`` when nothing matches.
    """
    if keywords is None:
        keywords = INDUSTRY_KEYWORDS

    text = f"{url} {company_name}".lower()
    tokens = set(_TOKEN_SPLIT_RE.split(text))

    def is_present(keyword):
        if len(keyword) < _MIN_SUBSTRING_KEYWORD_LEN:
            return keyword in tokens
        return keyword in text

    best_industry = None
    best_count = 0
    for industry, words in keywords.items():
        count = sum(1 for word in words if is_present(word))
        # Strict comparison keeps the first industry on ties.
        if count > best_count:
            best_industry, best_count = industry, count

    return best_industry


def _is_html_success(record):
    """Return True if an index record is a status-200 HTML page worth keeping."""
    url = record.get('url')
    # 'mime-detected' may be missing from a record; treat that as "not HTML"
    # instead of letting the membership test raise on None.
    mime = record.get('mime-detected') or ''
    return (
        bool(url)
        and record.get('status') == '200'
        and 'html' in mime
        and 'robots.txt' not in url
    )


def _fetch_index_page(page, timeout):
    """Download one result page of the index and parse its JSON lines.

    Raises ``requests.RequestException`` on network/HTTP errors and
    ``ValueError`` (``json.JSONDecodeError``) on malformed lines.
    """
    params = {
        'url': '*.au',
        'output': 'json',
        'page': page
    }
    response = requests.get(INDEX_URL, params=params, timeout=timeout)
    response.raise_for_status()

    # Each non-empty line in the response is a JSON record.
    parsed = (json.loads(line) for line in response.text.strip().split('\n') if line)
    return [record for record in parsed if isinstance(record, dict)]


def fetch_australian_websites(*, max_retries=3, request_timeout=60):
    """Fetch Australian websites from the Common Crawl index and save them.

    Pages of the index are requested one after another until
    ``TARGET_COUNT`` unique URLs have been collected, a page comes back
    empty, or a page keeps failing. Each kept URL is stored together with
    a derived company name and a guessed industry, and the result is
    written to ``OUTPUT_FILE`` as CSV.

    Args:
        max_retries: How many times a failing page is retried before the
            crawl stops and the rows collected so far are saved.
        request_timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        A ``pandas.DataFrame`` with columns ``url``, ``company_name`` and
        ``industry`` (possibly empty).
    """
    all_records = []
    seen_urls = set()
    page = 0
    consecutive_failures = 0

    with tqdm(total=TARGET_COUNT, desc="Fetching Australian websites") as pbar:
        while len(all_records) < TARGET_COUNT:
            # Only the download/parse step is guarded, so a failure can
            # never leave a half-processed page to be appended again.
            try:
                records = _fetch_index_page(page, request_timeout)
            except (requests.RequestException, ValueError) as e:
                consecutive_failures += 1
                print(f"Error fetching page {page}: {e}")
                if consecutive_failures > max_retries:
                    # A page that keeps failing (e.g. a page number the
                    # server rejects) must not be retried forever.
                    print(f"Giving up on page {page} after {consecutive_failures} failed attempts")
                    break
                time.sleep(5)  # Wait longer on error
                continue

            consecutive_failures = 0

            if not records:
                print(f"No more records found after page {page}")
                break

            for record in records:
                # Skip non-HTML and error pages
                if not _is_html_success(record):
                    continue

                url = record['url']
                # De-duplicate while collecting so the target count refers
                # to unique URLs.
                if url in seen_urls:
                    continue
                seen_urls.add(url)

                company_name = extract_company_name(url)
                all_records.append({
                    'url': url,
                    'company_name': company_name,
                    'industry': guess_industry(url, company_name)
                })

                pbar.update(1)
                if len(all_records) >= TARGET_COUNT:
                    break

            page += 1
            # Be nice to the Common Crawl server
            time.sleep(0.5)

    # Explicit columns keep the CSV header stable even when nothing was found.
    df = pd.DataFrame(all_records, columns=['url', 'company_name', 'industry'])
    df.to_csv(OUTPUT_FILE, index=False)

    print(f"Extracted {len(df)} Australian websites to {OUTPUT_FILE}")
    return df


def main():
    """Run the extraction, printing a message before and after it."""
    start_message = "Starting extraction of Australian websites from Common Crawl..."
    done_message = "Extraction complete!"

    print(start_message)
    fetch_australian_websites()
    print(done_message)


# Run the extraction only when this file is executed as a script.
if __name__ == "__main__":
    main()