In [1]:
import pandas as pd
import requests, re, time, json

# load data used to locate plans
retailers = pd.read_csv("./data/retailers.csv")

standing_offers = pd.read_csv(
    "./data/dmo.csv",
    thousands=",",
    converters={"price": lambda x: float(re.sub(r"[$,]", "", x))})

# collect basic plans data from retailers
plans = []

for retailer in retailers.sample(frac=1).index:
    
    brand, base_url = retailers.loc[retailer]
    print(f"collecting plans from {brand} ...".ljust(80), end="\r")

    # request parameters
    headers = {
        "Content-Type": "application/json",
        "x-v": "1"}
    params = {
        "fuelType": "ELECTRICITY",
        "effective": "ALL",
        "page-size": 1000}        

    # page loop begins
    page = 1

    while True:
        # update page parameter
        params["page"] = page

        # request generic plan information
        response = requests.get(
            base_url + "cds-au/v1/energy/plans/",
            params=params,
            headers=headers)

        # check for success
        if response.status_code == 200:

            # extract plan data
            data = response.json()["data"]["plans"]            
            number_of_plans = len(data)

            if number_of_plans > 0:
                plans.extend(data)
                print(f"page {page}: {number_of_plans} plans".ljust(80), end="\r")
                page = page + 1            
            else:
                print(f"no{' more' if page != 1 else ''} plans".ljust(80), end="\r")
                break
        else:
            break
print("finished loading basic plan data".ljust(80))
print(f"{len(plans):,.0f} plans in dataset".ljust(80))

# save raw plan data as JSON
with open("./data/plan_outlines_raw.json", "w") as json_file:
    json.dump(plans, json_file)

finished loading basic plan data                                                
16,548 plans in dataset                                                         


In [2]:
# filter data down to relevant details
filtered_plans = []

# iterate through plan JSON
for plan in plans:
    filtered_plan = {
        "planId": plan.get("planId"),
        "brand": plan.get("brand"),
        "brandName": plan.get("brandName"),
        "fuelType": plan.get("fuelType"),
        "type": plan.get("type"),
        "customerType": plan.get("customerType"),
        "distributors": plan.get("geography", {})
            .get("distributors", {}),
        "displayName": plan.get("displayName"),
        "effectiveFrom": plan.get("effectiveFrom")}
    filtered_plans.append(filtered_plan)

# save filtered plan data as JSON
with open("./data/plan_outlines_filtered.json", "w") as json_file:
    json.dump(filtered_plans, json_file)

# include only residential electricity plans
filtered_residential_plans = []
for plan in filtered_plans:
    if plan.get("customerType") == "RESIDENTIAL" and plan.get("fuelType") == "ELECTRICITY":
            filtered_residential_plans.append(plan)

# include only plans in relevant distribution zones
zones = standing_offers.distribution_zone.unique()
relevant_plans = []

for plan in filtered_residential_plans:      
    for zone in zones:
        if zone in plan["distributors"]:
            plan["distributor"] = zone
            plan.pop("distributors")
            relevant_plans.append(plan)
            break

# save relevant plan data as JSON
with open("./data/relevant_plan_outlines.json", "w") as json_file:
    json.dump(relevant_plans, json_file)

In [3]:
import random

# collect details of relevant plans
plan_details = []

headers = {
    "Content-Type": "application/json",
    "x-v": "1"}


start = time.time()
interval = 2

print("collecting details for plans ...", end="\r")
k = len(relevant_plans)

for i, plan in enumerate(relevant_plans):
    # make request for plan details
    response = requests.get(
        "https://cdr.energymadeeasy.gov.au/" + plan["brand"] + "/cds-au/v1/energy/plans/" + plan["planId"],
        headers=headers)

    # check for success
    if response.status_code == 200:

        # append details
        plan["contractDetails"] = response.json()["data"]["electricityContract"]
        plan_details.append(plan)

    if time.time() - interval > start:
        interval = interval + 2
        print(f"collecting details for plans ... {(i + 1) / k:.1%}", end="\r")

    if i % 50 == 0:
        with open("./data/relevant_plan_details.json", "w") as json_file:
            json.dump(plan_details, json_file)

print("collecting details for plans ... complete", end="\r")

# save plan details as JSON
with open("./data/relevant_plan_details.json", "w") as json_file:
    json.dump(plan_details, json_file)

collecting details for plans ... 42.8%

ProxyError: HTTPSConnectionPool(host='cdr.energymadeeasy.gov.au', port=443): Max retries exceeded with url: /amber/cds-au/v1/energy/plans/AMB873985MRE1@EME (Caused by ProxyError('Unable to connect to proxy', ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x0000021E9A7C6B40>, 'Connection to proxy.swg.internal timed out. (connect timeout=None)')))