## Generate outputs

What we want out of this algorithm is a snapshot of all the fires at a given time `t`, and a time series of each fire across time.

In [1]:
import os
import datetime
import pandas as pd
import geopandas as gpd

import FireTime, FireObj, FireConsts, FireGpkg_sfs, FireGpkg, FireIO
from utils import timed

# Region to process. Only the region name is needed here, not its shape.
region = ["CONUS"]

# Start and end of the time window, each given as [year, month, day, "AM"/"PM"].
tst = [2023, 8, 28, "AM"]
ted = [2023, 9, 6, "AM"]

## Read from disk

In [2]:
%%time
# One pixel text file exists per time step yielded by FireTime.t_generator(tst, ted).
pixel_paths = [
    f"out/{region[0]}/{t[0]}{t[1]:02}{t[2]:02}_{t[3]}.txt"
    for t in FireTime.t_generator(tst, ted)
]

# Read every step (indexed by pixel uuid, with `t` parsed as a datetime) and
# stack them into a single pixel table.
allpixels = pd.concat(
    [
        pd.read_csv(pixel_path, index_col="uuid", parse_dates=["t"])
        for pixel_path in pixel_paths
    ]
)

CPU times: user 62.9 ms, sys: 23.7 ms, total: 86.6 ms
Wall time: 85.9 ms


In [3]:
%%time
# The allfires parquet file is named after the last time step of the run.
t = ted
allfires_path = f"out/{region[0]}/allfires_{t[0]}{t[1]:02}{t[2]:02}_{t[3]}.parq"
allfires_gdf = gpd.read_parquet(allfires_path)

CPU times: user 67.4 ms, sys: 72.4 ms, total: 140 ms
Wall time: 140 ms


## Pick out the large fires

Let's compare the existing object-oriented approach with the new geodataframe approach

In [4]:
%%time
t = ted
dt = FireTime.t2dt(t)

# Work on a flat copy so that the index levels become ordinary columns.
gdf = allfires_gdf.reset_index()

# Optional filter, currently disabled:
# gdf = gdf[gdf.t >= dt - datetime.timedelta(days=20)]

# Keep only the final row recorded for each fire ...
last_seen = gdf.drop_duplicates("fireID", keep="last")

# ... and of those, the ones that are still valid and larger than the
# area threshold of 4 (in the units of `farea`).
is_large = last_seen.farea > 4
is_valid = last_seen.invalid == False
last_large = last_seen[is_large & is_valid]

large_fires = last_large.fireID.values

CPU times: user 5.43 ms, sys: 1.68 ms, total: 7.11 ms
Wall time: 6.72 ms


First we'll use the `allpixels` object to create the `nfplist` layer

In [5]:
%%time
# Write one `nfplist` FlatGeobuf file per large fire from the raw pixel table.

# Source column in `allpixels` -> column name in the output file. The mapping
# order also fixes the column order of the written layer.
nfplist_columns = {
    "x": "x",
    "y": "y",
    "FRP": "frp",
    "DS": "DS",
    "DT": "DT",
    "ampm": "ampm",
    "YYYYMMDD_HHMM": "datetime",
    "Sat": "sat",
}

for fid in large_fires:
    # Use the configured region rather than a hard-coded name, so the outputs
    # land next to the inputs that were read from `out/{region[0]}/`.
    output_dir = f"out/{region[0]}/fires/{fid}"
    os.makedirs(output_dir, exist_ok=True)

    data = allpixels[allpixels["fid"] == fid]
    # Select and rename by name instead of assigning `columns` positionally,
    # so the two lists cannot drift out of sync.
    subset = data[list(nfplist_columns)].rename(columns=nfplist_columns)
    # NOTE(review): no CRS is attached to the points; confirm that consumers
    # of nfplist.fgb do not need one.
    subset["geometry"] = gpd.points_from_xy(subset.x, subset.y)
    subset = subset.set_geometry("geometry")

    subset.to_file(f"{output_dir}/nfplist.fgb", driver="FlatGeobuf")

CPU times: user 627 ms, sys: 38.2 ms, total: 665 ms
Wall time: 687 ms


The rest of the layers will be created directly from the `allfires_gdf`

First let's do a naive version without the merge fixups

In [6]:
%%time
# Naive export: write the perimeter / fireline / newfirepix layers for every
# large fire straight from `gdf`, without merging rows that share a `t`.

# Output layer -> column of `gdf` holding that layer's geometry.
layer_geometry = {"perimeter": "hull", "fireline": "fline", "newfirepix": "nfp"}

for fid, data in gdf[gdf["fireID"].isin(large_fires)].groupby("fireID"):
    # Use the configured region rather than a hard-coded name, so the outputs
    # land next to the inputs that were read from `out/{region[0]}/`.
    output_dir = f"out/{region[0]}/fires/{fid}"
    os.makedirs(output_dir, exist_ok=True)

    for layer, geometry_col in layer_geometry.items():
        # Attribute columns for this layer are the keys of its data dictionary.
        columns = list(FireGpkg_sfs.getdd(layer))
        subset = data[columns].copy()
        subset["geometry"] = data[geometry_col]
        if layer == "fireline":
            # Only the fireline layer drops rows with a missing geometry.
            subset = subset.dropna(subset=["geometry"])
        subset = subset.set_geometry("geometry")

        subset.to_file(f"{output_dir}/{layer}.fgb", driver="FlatGeobuf")

CPU times: user 945 ms, sys: 16.1 ms, total: 962 ms
Wall time: 984 ms


## Now with merge

These ones need some merge help:

In [7]:
# A row needs merge handling when it is still valid but its `mergeid` points
# at a different fire than its own `fireID`.
differs_from_mergeid = gdf["mergeid"].ne(gdf["fireID"])
still_valid = gdf["invalid"].eq(False)
merge_needed = differs_from_mergeid & still_valid
print(f"{merge_needed.sum()} rows that potentially need a merge")

# Re-label those rows with the id of the fire they belong to after the merge.
# NOTE: this modifies `gdf` in place; the cells below rely on the re-labelled ids.
merged_ids = gdf.loc[merge_needed, "mergeid"]
gdf.loc[merge_needed, "fireID"] = merged_ids

93 rows that potentially need a merge


I have two different ideas of how to merge rows:

1) The first version of the `merge_rows` function uses a unary union to join the hulls and then recalculates the fline; the ftype is copied from the first of the merged rows.
2) The second version of the `merge_rows` function uses code that is more similar to the existing merge function. It constructs a MultiGeometry out of the various geometry objects.

In [8]:
@timed
def merge_rows(data):
    """Merge rows of a single fire that share the same `t` (unary-union variant).

    For every timestamp that occurs more than once, a temporary
    ``FireObj.Fire`` is built, its hull is set to the unary union of the
    duplicated rows' hulls and its fire line is refreshed with
    ``updatefline()``. Every attribute named in ``FireGpkg_sfs.getdd("all")``
    is then copied from that object onto all rows with that timestamp, and the
    now-identical rows are collapsed with ``drop_duplicates``.

    Parameters
    ----------
    data : DataFrame or GeoDataFrame
        Rows of the allfires table for exactly one fire. The columns ``t``,
        ``t_st``, ``hull``, ``ftype`` and ``fireID`` are read directly; the
        keys of ``FireGpkg_sfs.getdd("all")`` are expected to be columns too.

    Returns
    -------
    DataFrame or GeoDataFrame
        The merged rows, with ``t`` restored as an ordinary column.

    Notes
    -----
    Reads the notebook-level ``allpixels`` table when building the ``Fire``
    object. The fire id is taken from the rows themselves rather than from a
    loop variable of the calling cell, so the function does not depend on the
    caller having a global named ``fid``.
    """

    from shapely.ops import unary_union

    dd = FireGpkg_sfs.getdd("all")
    output = data.set_index("t").copy()

    # clean up any merges that are needed
    for dt, rows in data[data.t.duplicated(False)].groupby("t"):
        # `rows` is never empty here, and the input holds a single fire, so the
        # first fireID identifies the fire being merged.
        fid = rows["fireID"].iloc[0]

        f = FireObj.Fire(fid, FireTime.dt2t(dt), allpixels)
        f.t_st = FireTime.dt2t(rows["t_st"].min())
        f.hull = unary_union(rows["hull"].values)

        # this might be doing more work than it needs to
        f.updatefline()

        # ftype is unused in the output files
        f.ftype = rows.ftype.iloc[0]

        # `dt` is duplicated in the index, so each assignment below writes the
        # same value to every row that shares this timestamp.
        for k, tp in dd.items():
            if tp == "datetime64[ns]":
                output.loc[dt, k] = FireTime.t2dt(getattr(f, k))
            else:
                output.loc[dt, k] = getattr(f, k)

    # Restore the dtypes declared in the data dictionary after the
    # element-wise assignments above.
    for k, tp in dd.items():
        output[k] = output[k].astype(tp)

    return output.drop_duplicates().reset_index()

In [9]:
@timed
def merge_rows(data):
    """Merge rows of a single fire that share the same `t` (aggregation variant).

    The first row of every timestamp is kept as the base. For each timestamp
    that occurs more than once, the additive columns are summed, ``t_st`` is
    the earliest start time, ``pixden`` and ``meanFRP`` are recomputed as
    weighted means (weighted by ``farea`` and ``n_pixels`` respectively), and
    each geometry column (``hull``, ``fline``, ``nfp``) becomes the union of
    that column across the duplicated rows.

    Parameters
    ----------
    data : GeoDataFrame
        Rows of the allfires table for exactly one fire, with a ``t`` column
        and the attribute / geometry columns named above.

    Returns
    -------
    GeoDataFrame
        One row per distinct ``t``, with ``t`` restored as an ordinary column.
    """
    from shapely.ops import unary_union

    output = data.drop_duplicates(subset=["t"]).set_index("t").copy()

    # clean up any merges that are needed
    for dt, rows in data[data.t.duplicated(False)].groupby("t"):
        # first get the weighted sums for pixden and meanFRP
        pixweight = (rows["pixden"] * rows["farea"]).sum()
        FRPweight = (rows["meanFRP"] * rows["n_pixels"]).sum()

        for col in ["n_pixels", "n_newpixels", "farea", "fperim", "flinelen"]:
            output.loc[dt, col] = rows[col].sum()

        output.loc[dt, "t_st"] = rows["t_st"].min()
        output.loc[dt, "pixden"] = pixweight / output.loc[dt, "farea"]
        output.loc[dt, "meanFRP"] = FRPweight / output.loc[dt, "n_pixels"]

        # Union every geometry column explicitly. `rows.dissolve()` only
        # unions the active geometry column; all other columns (including the
        # remaining geometry columns) are aggregated with "first", which would
        # silently drop the geometries of every row but the first.
        for col in ["hull", "fline", "nfp"]:
            parts = list(rows[col].dropna())
            # Keep a missing value when no row has a geometry, so callers can
            # still drop empty layers with `dropna`.
            output.loc[dt, col] = unary_union(parts) if parts else None

    return output.reset_index()

In [10]:
%%time
# Export the perimeter / fireline / newfirepix layers for every large fire,
# merging rows that share a timestamp before writing.

# Output layer -> column of the fire table holding that layer's geometry.
layer_geometry = {"perimeter": "hull", "fireline": "fline", "newfirepix": "nfp"}

for fid, data in gdf[gdf["fireID"].isin(large_fires)].groupby("fireID"):

    # merge any rows that have the same t
    if data.t.duplicated().any():
        data = merge_rows(data)

    # Use the configured region rather than a hard-coded name, so the outputs
    # land next to the inputs that were read from `out/{region[0]}/`.
    output_dir = f"out/{region[0]}/fires/{fid}"
    os.makedirs(output_dir, exist_ok=True)

    for layer, geometry_col in layer_geometry.items():
        # Attribute columns for this layer are the keys of its data dictionary.
        columns = list(FireGpkg_sfs.getdd(layer))
        subset = data[columns].copy()
        subset["geometry"] = data[geometry_col]
        if layer == "fireline":
            # Only the fireline layer drops rows with a missing geometry.
            subset = subset.dropna(subset=["geometry"])
        subset = subset.set_geometry("geometry")

        subset.to_file(f"{output_dir}/{layer}.fgb", driver="FlatGeobuf")

2024-02-06 13:25:55,474 - FireLog - INFO - func:merge_rows took: 9.33 ms
2024-02-06 13:25:55,555 - FireLog - INFO - func:merge_rows took: 13.59 ms
2024-02-06 13:25:55,585 - FireLog - INFO - func:merge_rows took: 8.72 ms
2024-02-06 13:25:55,895 - FireLog - INFO - func:merge_rows took: 67.29 ms
2024-02-06 13:25:55,966 - FireLog - INFO - func:merge_rows took: 7.64 ms
2024-02-06 13:25:56,057 - FireLog - INFO - func:merge_rows took: 17.19 ms
2024-02-06 13:25:56,104 - FireLog - INFO - func:merge_rows took: 23.05 ms
2024-02-06 13:25:56,184 - FireLog - INFO - func:merge_rows took: 8.15 ms
2024-02-06 13:25:56,532 - FireLog - INFO - func:merge_rows took: 12.45 ms
2024-02-06 13:25:56,560 - FireLog - INFO - func:merge_rows took: 7.71 ms


CPU times: user 1.12 s, sys: 36.2 ms, total: 1.16 s
Wall time: 1.16 s


## Experiments

Does it make a big difference if you filter first rather than after?

In [11]:
%%time
# Experiment 1: filter `gdf` down to the large fires first, then group by fire.
# The loop body is a placeholder so that only the iteration itself is timed.
for fid, data in gdf[gdf["fireID"].isin(large_fires)].groupby("fireID"):
    f = fid

CPU times: user 22.6 ms, sys: 376 µs, total: 23 ms
Wall time: 21.8 ms


In [12]:
%%time
# Experiment 2: group the whole of `gdf` by fire and filter inside the loop.
# Every group is materialised and tested for membership in `large_fires`,
# including the fires that are then skipped.
for fid, data in gdf.groupby("fireID"):
    if fid in large_fires:
        f = fid

CPU times: user 564 ms, sys: 13 ms, total: 577 ms
Wall time: 572 ms


In [13]:
%%time
# Experiment 3: no groupby; select each large fire's rows with a boolean mask
# computed over the full `gdf` on every iteration.
for fid in large_fires:
    data = gdf[gdf["fireID"] == fid]
    f = fid

CPU times: user 28.7 ms, sys: 276 µs, total: 29 ms
Wall time: 27.7 ms


In [14]:
%%time
# Experiment 4: look each large fire up by index label on the original
# `allfires_gdf` instead of masking the flattened `gdf`.
# NOTE(review): assumes fireID is the first index level of `allfires_gdf` — confirm.
# NOTE(review): this reads `allfires_gdf`, not `gdf`, so the fireID -> mergeid
# re-labelling applied to `gdf` earlier is presumably not reflected here — verify.
for fid in large_fires:
    data = allfires_gdf.loc[fid]
    f = fid

CPU times: user 20.2 ms, sys: 2.82 ms, total: 23 ms
Wall time: 22.4 ms
