# In [None]:
import glob
import os
import xml.etree.ElementTree as ET
import zipfile
from urllib.parse import urlparse

import pandas as pd
import requests


# === CONFIGURATION ===
# Package id sent to the data.gov.au `package_show` API to list the
# dataset's downloadable resources.
DATASET_ID = "5bd7fcab-e315-42cb-8daf-50b7efc2027e"
# Working directory for every file this script reads or writes.
# NOTE(review): hard-coded, user-specific Windows path; adjust per machine.
BASE_DIR = r"C:\Users\anujh\Desktop\Firmable\abn_bulk_extracts"
# ZIP archives are downloaded straight into the working directory.
ZIP_DIR = BASE_DIR
# Archives are unpacked here; the XML processing step reads *.xml from it.
EXTRACT_DIR = os.path.join(BASE_DIR, "unzipped")
# Parsed records are appended to this CSV file.
OUTPUT_CSV = os.path.join(BASE_DIR, "abr_output.csv")


def fetch_zip_files(dataset_id, download_dir, timeout=60):
    """Download every ZIP resource attached to a data.gov.au dataset.

    Queries the ``package_show`` API for ``dataset_id``, selects the
    resources whose URL path ends in ``.zip`` and hands each one to
    ``download_zip_file``.

    Args:
        dataset_id: Package id understood by the ``package_show`` API.
        download_dir: Directory the archives are saved to; created if missing.
        timeout: Seconds to wait for the metadata request before giving up.

    Raises:
        requests.RequestException: If the metadata request fails or returns
            an HTTP error status.
    """
    os.makedirs(download_dir, exist_ok=True)
    api_url = 'https://data.gov.au/data/api/3/action/package_show'
    # Pass the id via `params` so it is URL-encoded, and always set a timeout
    # so a stalled connection cannot hang the script forever.
    response = requests.get(api_url, params={'id': dataset_id}, timeout=timeout)
    response.raise_for_status()
    resources = response.json()['result']['resources']

    zip_links = []
    for res in resources:
        file_url = res.get('url') or ''
        # Inspect only the path component: a query string or fragment must
        # neither hide the ".zip" suffix nor leak into the local file name.
        # basename() also keeps the name free of directory separators.
        file_name = os.path.basename(urlparse(file_url).path)
        if file_name.lower().endswith('.zip'):
            zip_links.append((file_url, file_name))

    print(f"Found {len(zip_links)} ZIP files to download.")

    for file_url, file_name in zip_links:
        save_path = os.path.join(download_dir, file_name)
        download_zip_file(file_url, save_path)


def download_zip_file(url, path, timeout=60, chunk_size=1024 * 1024):
    """Download ``url`` to ``path``, streaming the body to disk.

    The body is written in chunks to ``<path>.part`` and renamed to ``path``
    only after the transfer completes, so an interrupted download never
    leaves a truncated ``.zip`` behind. Failures (non-200 status or a network
    error) are reported on stdout and the function returns normally, so a
    batch of downloads can continue.

    Args:
        url: Address of the archive.
        path: Destination file path.
        timeout: Seconds to wait for the connection or for the next bytes.
        chunk_size: Number of bytes read from the response per iteration.
    """
    print(f"Downloading {os.path.basename(path)}...")
    tmp_path = f"{path}.part"
    try:
        # stream=True avoids holding the whole archive in memory.
        with requests.get(url, stream=True, timeout=timeout) as resp:
            if resp.status_code != 200:
                print(f"Failed to download {url} - Status: {resp.status_code}")
                return
            with open(tmp_path, 'wb') as f:
                for chunk in resp.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
        # Atomic on the same filesystem: `path` is either absent/old or complete.
        os.replace(tmp_path, path)
    except requests.RequestException as e:
        print(f"Failed to download {url} - Error: {e}")
    else:
        print(f"Saved: {path}")
    finally:
        # Remove a leftover partial file from any failed attempt.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def extract_all_zip_files(zip_dir, extract_to):
    """Extract every ``*.zip`` directly inside ``zip_dir`` into ``extract_to``.

    Archives are processed in sorted order. An archive that is not a valid
    ZIP file is reported on stdout and skipped so the remaining archives are
    still extracted.

    Args:
        zip_dir: Directory searched (non-recursively) for ``*.zip`` files.
        extract_to: Destination directory; created if missing.
    """
    os.makedirs(extract_to, exist_ok=True)
    for zip_path in sorted(glob.glob(os.path.join(zip_dir, "*.zip"))):
        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(extract_to)
        except zipfile.BadZipFile as e:
            # A corrupt or truncated archive must not abort the whole batch.
            print(f"Failed to extract {zip_path}: {e}")
            continue
        print(f"Extracted: {zip_path}")


def _first_text(entity, *paths):
    """Return the text of the first path that exists under ``entity``, else None."""
    for path in paths:
        value = entity.findtext(path)
        if value is not None:
            return value
    return None


def _entity_name(entity):
    """Return the organisation name, or the joined individual name, or None."""
    name = entity.findtext("MainEntity/NonIndividualName/NonIndividualNameText")
    if name is not None:
        return name
    # NOTE(review): per the published ABR bulk-extract schema, individuals are
    # stored under LegalEntity/IndividualName with GivenName*/FamilyName
    # children instead of MainEntity; confirm against the current XSD.
    person = entity.find("LegalEntity/IndividualName")
    if person is None:
        return None
    parts = [given.text for given in person.findall("GivenName")]
    parts.append(person.findtext("FamilyName"))
    full_name = " ".join(p.strip() for p in parts if p and p.strip())
    return full_name or None


def _abr_record(entity):
    """Convert one ``<ABR>`` element into the flat output row dictionary."""
    abn_elem = entity.find("ABN")
    # A record without an <ABN> child yields None for the ABN-derived fields
    # instead of raising and cutting the rest of the file short.
    abn_attrs = abn_elem.attrib if abn_elem is not None else {}
    return {
        "ABN": entity.findtext("ABN"),
        "Entity Name": _entity_name(entity),
        "Entity Type": entity.findtext("EntityType/EntityTypeText"),
        "Entity Status": abn_attrs.get("status"),
        "Entity Address": "",
        "Entity Postcode": _first_text(
            entity,
            "MainEntity/BusinessAddress/AddressDetails/Postcode",
            "LegalEntity/BusinessAddress/AddressDetails/Postcode",
        ),
        "Entity State": _first_text(
            entity,
            "MainEntity/BusinessAddress/AddressDetails/State",
            "LegalEntity/BusinessAddress/AddressDetails/State",
        ),
        "Entity Start Date": abn_attrs.get("ABNStatusFromDate"),
    }


def parse_abn_xml(xml_path):
    """Parse an ABN bulk-extract XML file into a list of row dictionaries.

    The file is read incrementally with ``iterparse`` and each ``<ABR>``
    element is cleared once converted, so the full document tree is never
    held in memory. Fields missing from a record are returned as ``None``.

    Args:
        xml_path: Path of the XML file to read.

    Returns:
        One dict per ``<ABR>`` element below the root, with the keys "ABN",
        "Entity Name", "Entity Type", "Entity Status", "Entity Address"
        (always an empty string), "Entity Postcode", "Entity State" and
        "Entity Start Date". An empty list if the file cannot be opened or is
        not well-formed XML; the failure is reported on stdout.
    """
    records = []
    root = None
    try:
        with open(xml_path, "rb") as source:
            for event, elem in ET.iterparse(source, events=("start", "end")):
                if event == "start":
                    # The first element opened is the document root.
                    if root is None:
                        root = elem
                    continue
                # Only descendants of the root count, never the root itself.
                if elem.tag != "ABR" or elem is root:
                    continue
                records.append(_abr_record(elem))
                # Drop the subtree just processed to keep memory flat.
                elem.clear()
    except (ET.ParseError, OSError) as e:
        print(f"Failed to parse {xml_path}: {e}")
        # Do not hand back a partial result from a broken file.
        return []
    return records


def process_all_xml_files(xml_dir, output_csv):
    """Parse every ``*.xml`` directly inside ``xml_dir`` and append rows to a CSV.

    Files are handled one at a time in sorted order and their rows are
    appended to ``output_csv`` immediately, so memory use is bounded by the
    largest single file rather than by the whole data set. The header row is
    written only when the CSV does not exist yet or is empty; rows are always
    appended, so re-running against an existing CSV adds to it.

    Args:
        xml_dir: Directory searched (non-recursively) for ``*.xml`` files.
        output_csv: Path of the CSV file to append to.
    """
    wrote_any = False
    for xml_file in sorted(glob.glob(os.path.join(xml_dir, "*.xml"))):
        records = parse_abn_xml(xml_file)
        file_name = os.path.basename(xml_file)
        if not records:
            print(f"No records found in {file_name}")
            continue

        print(f"Parsed {len(records)} records from {file_name}")
        # Re-evaluated for every file: only the very first write to a new or
        # empty CSV emits the header row.
        write_header = (
            not os.path.exists(output_csv) or os.path.getsize(output_csv) == 0
        )
        pd.DataFrame(records).to_csv(
            output_csv, mode='a', header=write_header, index=False
        )
        wrote_any = True

    if wrote_any:
        print(f"\nOutput saved to: {output_csv}")


def main():
    """Run the pipeline end to end: download, unzip, then convert XML to CSV."""
    archive_dir, xml_dir = ZIP_DIR, EXTRACT_DIR

    # Step 1: pull every ZIP resource of the dataset into the archive folder.
    fetch_zip_files(DATASET_ID, archive_dir)
    # Step 2: unpack the downloaded archives.
    extract_all_zip_files(archive_dir, xml_dir)
    # Step 3: flatten the extracted XML into the output CSV.
    process_all_xml_files(xml_dir, OUTPUT_CSV)


# Only run the pipeline when executed as a script, not when imported.
if __name__ == "__main__":
    main()
