In [1]:
import polars as pl

from benchmarks.utils import mock_snakemake

# Use a mocked `snakemake` object for the "process_generator_data" rule when
# no `snakemake` global exists yet, or when the existing one has a `mock`
# attribute (presumably set by `mock_snakemake` on a previous run of this
# cell -- TODO confirm).
use_mock = "snakemake" not in globals() or hasattr(snakemake, "mock")  # noqa: F821
if use_mock:
    snakemake = mock_snakemake("process_generator_data")

In [2]:
# Load the raw generator table. The file pads both its column names and its
# string cells with whitespace, so both are stripped before numeric parsing.
raw = pl.read_csv(snakemake.input[0], encoding="iso-8859-1")
df = (
    # Rename instead of assigning to `.columns`, so `raw` is left untouched.
    raw.rename({name: name.strip() for name in raw.columns})
    # Select every string column by dtype rather than zipping names and dtypes.
    .with_columns(pl.col(pl.String).str.strip_chars())
    # Now that the padding is removed, these columns can be converted to
    # numbers. This must be a separate step: the casts read the stripped values.
    .with_columns(
        pl.col("PlantCode").cast(pl.Int64),
        pl.col("Lat").cast(pl.Float64),
        pl.col("Lon").cast(pl.Float64),
        pl.col("bus").cast(pl.UInt32),
    )
)
df

PlantCode,GenID,bus,FuelType,Pg,Pmax,Pmin,Qg,Qmax,Qmin,Lat,Lon
i64,str,u32,str,f64,f64,i64,i64,f64,f64,f64,f64
34,"""1P""",745,"""Conventional Hydroelectric""",4.0333628,12.1,0,0,18.777143,-14.670714,39.134259,-120.953341
151,"""1""",1804,"""Petroleum Liquids""",0.0,71.2,0,0,34.5,-23.7,37.62936,-120.93139
151,"""2""",1804,"""Natural Gas Fired Combustion T…",0.0,71.2,0,0,34.5,-23.7,37.62936,-120.93139
161,"""1""",1964,"""Conventional Hydroelectric""",0.366669,1.1,0,0,18.777143,-14.670714,37.611461,-120.594678
161,"""2""",1964,"""Conventional Hydroelectric""",0.366669,1.1,0,0,18.777143,-14.670714,37.611461,-120.594678
…,…,…,…,…,…,…,…,…,…,…,…
,,8862,,0.0,0.0,0,0,200.0,-200.0,,
,,8863,,0.0,0.0,0,0,200.0,-200.0,,
,,8864,,0.0,0.0,0,0,200.0,-200.0,,
,,8865,,0.0,0.0,0,0,200.0,-200.0,,


In [3]:
# Remove reactive elements.
# Pmin is dropped only after confirming it is zero on every row.
assert (df.get_column("Pmin") == 0).all(), "Expected Pmin == 0 for every row"
df2 = df.drop("Pmin")

# Rows with Pmax == 0 are treated as reactive elements. Before discarding
# them, check that they really have no fuel type and a non-zero Qmax.
reactive_elements = df2.filter(pl.col("Pmax") == 0)
assert (
    reactive_elements["FuelType"].is_null().all()
    and (reactive_elements["Qmax"] != 0).all()
), "Expected rows with Pmax == 0 to have a null FuelType and Qmax != 0"

# NOTE(review): `Pmax > 0` also drops any row with a negative or null Pmax,
# and those rows are not covered by the assertion above -- confirm none exist.
# The reactive-power columns are dropped along with the reactive elements.
df2 = df2.filter(pl.col("Pmax") > 0).drop("Qmax", "Qmin", "Qg")
df2

PlantCode,GenID,bus,FuelType,Pg,Pmax,Lat,Lon
i64,str,u32,str,f64,f64,f64,f64
34,"""1P""",745,"""Conventional Hydroelectric""",4.0333628,12.1,39.134259,-120.953341
151,"""1""",1804,"""Petroleum Liquids""",0.0,71.2,37.62936,-120.93139
151,"""2""",1804,"""Natural Gas Fired Combustion T…",0.0,71.2,37.62936,-120.93139
161,"""1""",1964,"""Conventional Hydroelectric""",0.366669,1.1,37.611461,-120.594678
161,"""2""",1964,"""Conventional Hydroelectric""",0.366669,1.1,37.611461,-120.594678
…,…,…,…,…,…,…,…
0,"""0""",7583,"""IMPORT""",0.0,200.0,34.288557,-114.103924
0,"""0""",7604,"""IMPORT""",0.0,200.0,34.086702,-113.914313
0,"""0""",1293,"""IMPORT""",0.0,200.0,39.52124,-120.008658
0,"""0""",1898,"""IMPORT""",0.0,200.0,42.079884,-121.390208


In [4]:
# Total Pmax per FuelType, largest first.
(
    df2.group_by("FuelType")
    .agg(pl.col("Pmax").sum())
    .sort("Pmax", descending=True)
)

FuelType,Pmax
str,f64
"""Natural Gas Fired Combined Cyc…",19649.3
"""Solar Photovoltaic""",11639.398673
"""Natural Gas Fired Combustion T…",10893.4
"""Conventional Hydroelectric""",7923.0
"""Onshore Wind Turbine""",5303.0
…,…
"""Other Natural Gas""",131.7
"""All Other""",113.7
"""Conventional Steam Coal""",55.0
"""Petroleum Coke""",27.3


In [5]:
# Keywords that mark a generator as thermal; each is matched as a substring
# of the lower-cased fuel type.
thermal_fuels = ["coal", "gas", "petroleum", "municipal solid waste"]

fuel_type_lower = pl.col("FuelType").str.to_lowercase()
df3 = df2.with_columns(
    # Native polars expressions replace the per-row Python lambda that was
    # passed to `map_elements` (slow, and it had no `return_dtype`).
    pl.when(fuel_type_lower.str.contains_any(thermal_fuels))
    .then(pl.lit("thermal"))
    .otherwise(fuel_type_lower)
    # Collapse the remaining fuel types into coarser categories; values that
    # are not listed here are kept unchanged.
    .replace(
        {
            "wood/wood waste biomass": "other renewables",
            "geothermal": "other renewables",
            "other waste biomass": "other renewables",
            "solar photovoltaic": "solar",
            "solar thermal without energy storage": "solar",
            "conventional hydroelectric": "hydro",
            "onshore wind turbine": "wind",
        }
    )
    # when/then takes its output name from the literal, so restore the
    # column name explicitly to overwrite FuelType in place.
    .alias("FuelType")
).rename({"FuelType": "type"})

# Total Pmax per simplified type, largest first.
(
    df3.select("type", "Pmax")
    .group_by("type")
    .sum()
    .sort("Pmax", descending=True)
)



type,Pmax
str,f64
"""thermal""",36756.9
"""solar""",12795.400002
"""hydro""",7923.0
"""wind""",5303.0
"""import""",5200.0
"""nuclear""",2323.0
"""other renewables""",1768.6
"""hydroelectric pumped storage""",759.1
"""batteries""",229.5
"""all other""",113.7


In [6]:
# Storage is left out to simplify the model, and 'all other' is dropped
# because it is negligible.
excluded_types = ["batteries", "all other", "hydroelectric pumped storage"]
df4 = df3.filter(~pl.col("type").is_in(excluded_types))

# Total Pmax per remaining type, largest first.
(
    df4.group_by("type")
    .agg(pl.col("Pmax").sum())
    .sort("Pmax", descending=True)
)

type,Pmax
str,f64
"""thermal""",36756.9
"""solar""",12795.400002
"""hydro""",7923.0
"""wind""",5303.0
"""import""",5200.0
"""nuclear""",2323.0
"""other renewables""",1768.6


In [7]:
# Display the filtered per-generator table before it is aggregated by bus.
df4

PlantCode,GenID,bus,type,Pg,Pmax,Lat,Lon
i64,str,u32,str,f64,f64,f64,f64
34,"""1P""",745,"""hydro""",4.0333628,12.1,39.134259,-120.953341
151,"""1""",1804,"""thermal""",0.0,71.2,37.62936,-120.93139
151,"""2""",1804,"""thermal""",0.0,71.2,37.62936,-120.93139
161,"""1""",1964,"""hydro""",0.366669,1.1,37.611461,-120.594678
161,"""2""",1964,"""hydro""",0.366669,1.1,37.611461,-120.594678
…,…,…,…,…,…,…,…
0,"""0""",7583,"""import""",0.0,200.0,34.288557,-114.103924
0,"""0""",7604,"""import""",0.0,200.0,34.086702,-113.914313
0,"""0""",1293,"""import""",0.0,200.0,39.52124,-120.008658
0,"""0""",1898,"""import""",0.0,200.0,42.079884,-121.390208


In [8]:
# Drop the per-generator identifier and location columns, then sum the
# remaining numeric columns (Pg, Pmax) for each (type, bus) pair.
without_identifiers = df4.drop(["Lat", "Lon", "PlantCode", "GenID"])
per_type_and_bus = without_identifiers.group_by(["type", "bus"]).sum()
df5 = per_type_and_bus.sort(["bus", "type"])
df5

type,bus,Pg,Pmax
str,u32,f64,f64
"""hydro""",6,1.633345,4.9
"""solar""",16,1.362354,1.939383
"""thermal""",16,0.0,2.8
"""solar""",17,0.999059,1.422214
"""thermal""",17,0.0,1.1
…,…,…,…
"""hydro""",8832,5.233371,15.7
"""thermal""",8832,0.0,23.0
"""thermal""",8838,49.900364,49.9
"""thermal""",8843,1.40001,2.9


In [9]:
# Write the aggregated table to the first snakemake output as parquet.
output_path = snakemake.output[0]
df5.write_parquet(output_path)