The Notebook explores and showcases Global Wildfire events

In [1]:
import os
import glob
import pandas as pd
import geopandas as gpd
import dask_geopandas as dgpd
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster

In [2]:
cluster = LocalCluster()
client = Client(cluster)
print(f"Dask Dashboard: {client.dashboard_link}")

Perhaps you already have a cluster running?
Hosting the HTTP server on port 40197 instead
  next(self.gen)


Dask Dashboard: http://127.0.0.1:40197/status


In [None]:
yearly_fire = {
    "2015": "/workspace/_output/GLOBFIRE_burned_area_full_dataset_2002_2023/original_globfire_filtered_2015.shp", 
    "2016": "/workspace/_output/GLOBFIRE_burned_area_full_dataset_2002_2023/original_globfire_filtered_2016.shp",
    "2017": "/workspace/_output/GLOBFIRE_burned_area_full_dataset_2002_2023/original_globfire_filtered_2017.shp",
    "2018": "/workspace/_output/GLOBFIRE_burned_area_full_dataset_2002_2023/original_globfire_filtered_2018.shp",
    "2019": "/workspace/_output/GLOBFIRE_burned_area_full_dataset_2002_2023/original_globfire_filtered_2019.shp",
    "2020": "/workspace/_output/GLOBFIRE_burned_area_full_dataset_2002_2023/original_globfire_filtered_2020.shp", 
    "2021": "/workspace/_output/GLOBFIRE_burned_area_full_dataset_2002_2023/original_globfire_filtered_2021.shp", 
    "2022": "/workspace/_output/GLOBFIRE_burned_area_full_dataset_2002_2023/original_globfire_filtered_2022.shp", 
    "2023": "/workspace/_output/GLOBFIRE_burned_area_full_dataset_2002_2023/original_globfire_filtered_2023.shp"
}

years = sorted(yearly_fire.keys())

In [37]:
data_paths = yearly_fire
filtered_gdf = dgpd.read_file(data_paths[years[0]], chunksize=2048)


def select_unique_intersections(gdf):
    """Finds intersecting indices to remove"""
    gdf = gdf[gdf.index != gdf.index_right]
    return gdf.index
    
for i in range(1, len(years)):
    current_year = years[i]
    print(f"Processing: {years[i-1]} vs {current_year}")
        
    # Load the next year
    next_year_gdf = dgpd.read_file(data_paths[current_year], chunksize=2048)
        
    print(f"Checking for intersecting geometries: {years[i-1]} vs {current_year}")
    intersections = dgpd.sjoin(filtered_gdf, next_year_gdf, how="inner", predicate="intersects")
        
    print(f"Computing indices of intersecting geometries: {years[i-1]} vs {current_year}")
    intersecting_ids = intersections.map_partitions(select_unique_intersections).compute()
        
    print(f"Removing intersections from {years[i-1]}")
    filtered_gdf = filtered_gdf.map_partitions(lambda df: df[~df.index.isin(intersecting_ids)])
        
    print(f"Filtering non-intersecting geometries from {current_year}")
    non_intersecting_next_year = next_year_gdf.map_partitions(lambda df: df[~df.index.isin(intersecting_ids)])
        
    print(f"Appending non-intersecting geometries from {current_year} to the main dataset")
    filtered_gdf = dgpd.GeoDataFrame(dd.concat([filtered_gdf, non_intersecting_next_year], axis=0))
        
    # print(f"Remaining records after processing {current_year}: {filtered_gdf.compute().shape[0]}")git
    
final_filtered_gdf = filtered_gdf.compute()
final_filtered_gdf.to_file("final_non_intersecting_wildfires_2015_2023.shp")

Processing: 2021 vs 2023
Checking for intersecting geometries: 2021 vs 2023
Computing indices of intersecting geometries: 2021 vs 2023


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
2025-02-27 01:40:54,469 - distributed.worker - ERROR - failed during get data with tcp://127.0.0.1:37357 -> tcp://127.0.0.1:37703
Traceback (most recent call last):
  File "/home/rufaib/miniconda3/envs/pace_chl/lib/python3.12/site-packages/tornado/iostream.py", line 861, in _read_to_buffer
    bytes_read = self.read_from_fd(buf)
                 ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/rufaib/miniconda3/envs/pace_chl/lib/python3.12/site-packages/tornado/iostream.py", line 1116, in read_from_fd
    return self.socket.recv_into(buf, len(buf))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TimeoutError: [Errno 110] Connection timed out

The above exception was the direct cause of the following exception:

Traceback (most recent 

Removing intersections from 2021
Filtering non-intersecting geometries from 2023
Appending non-intersecting geometries from 2023 to the main dataset


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
  ogr_write(
  ogr_write(


In [3]:
unique_fire_df = dgpd.read_file("/workspace/_output/unique_fires/final_non_intersecting_wildfires_2015_2023.shp", chunksize=2048)
unique_fire_df = unique_fire_df.compute()
unique_fire_df.head()

Unnamed: 0,_uid_,id,initialdat,finaldate,area_ha,geometry
0,19.0,17157261.0,2015-02-27,2015-02-27,42.756104,"POLYGON ((-11.62481 9.70833, -11.61636 9.70833..."
1,152.0,17157623.0,2015-02-25,2015-02-25,42.754171,"POLYGON ((-10.14458 9.10833, -10.13614 9.10833..."
2,174.0,17157680.0,2015-02-27,2015-02-27,21.37684,"POLYGON ((-11.03013 8.95, -11.02591 8.95, -11...."
3,181.0,17157690.0,2015-02-27,2015-02-27,21.376795,"POLYGON ((-10.95755 8.92083, -10.95333 8.92083..."
4,213.0,17157734.0,2015-02-27,2015-03-08,320.648062,"MULTIPOLYGON (((-11.34913 8.7625, -11.34491 8...."


In [None]:
sample_50 = unique_fire_df.query("area_ha > 50")
len(sample_50)

847243

In [11]:
sample_500 = unique_fire_df.query("area_ha > 500")
len(sample_500)

121563

In [12]:
sample_500.head()

Unnamed: 0,_uid_,id,initialdat,finaldate,area_ha,geometry
56,343.0,17157903.0,2015-02-20,2015-03-01,534.383946,"POLYGON ((-11.328 7.8875, -11.32789 7.88333, -..."
70,380.0,17157944.0,2015-02-19,2015-03-01,1175.635966,"MULTIPOLYGON (((-11.11125 7.80417, -11.10704 7..."
206,755.0,17158465.0,2015-02-28,2015-03-02,513.270555,"POLYGON ((-3.04089 13.71667, -3.0366 13.71667,..."
461,2276.0,17161368.0,2015-02-17,2015-02-27,598.590119,"POLYGON ((-3.48021 9.825, -3.47598 9.825, -3.4..."
613,3158.0,17163164.0,2015-02-09,2015-02-25,748.156468,"MULTIPOLYGON (((-5.67209 8.3125, -5.67215 8.31..."


In [None]:
unique_fire_df.query(40)

Unnamed: 0,_uid_,id,initialdat,finaldate,area_ha,geometry
1704548,211539.0,26364409.0,2023-04-15,2023-04-15,149.751199,"POLYGON ((18.91852 -16.52083, 18.92287 -16.520..."
823228,836115.0,23104567.0,2020-04-22,2020-04-28,213.887702,"POLYGON ((-85.24192 14.85833, -85.24027 14.854..."
2066031,618089.0,26857822.0,2023-09-01,2023-09-01,21.372591,"POLYGON ((26.45299 -5.54583, 26.45717 -5.54583..."
278249,176550.0,20172872.0,2017-08-20,2017-08-20,64.164493,"POLYGON ((35.74614 -14.65417, 35.75045 -14.654..."
2118990,679732.0,26893386.0,2023-09-21,2023-09-22,106.95823,"POLYGON ((38.40719 -15.83333, 38.41152 -15.833..."
323422,583693.0,19523771.0,2017-01-10,2017-01-10,21.371843,"POLYGON ((12.79317 4.70833, 12.79735 4.70833, ..."
1484369,880911.0,25643146.0,2022-08-01,2022-08-05,42.743007,"MULTIPOLYGON (((26.85791 -4.27083, 26.86209 -4..."
775900,664116.0,22982375.0,2020-02-03,2020-02-03,42.741768,"POLYGON ((28.24791 3.3375, 28.25208 3.3375, 28..."
2165284,735048.0,27012298.0,2023-10-14,2023-10-14,21.404825,"POLYGON ((129.15227 -20.42917, 129.15672 -20.4..."
1056818,738100.0,24836345.0,2021-10-07,2021-10-07,42.7922,"POLYGON ((25.86866 -17.59583, 25.87303 -17.595..."


In [43]:
len(unique_fire_df)

2315073