# Machine Learning for Assessing Brush Fire Risk in The United States

## Import required packages

In [1]:
# !pip install geopandas shapely
#%pip install xarray
#%pip install zarr
#%pip install fsspec
#%pip install cartopy
#%pip install netCDF4
#%pip install scipy
#%pip install bottleneck
#%pip install gcsfs

In [2]:
#Importing required packages
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import os
import re
import csv
import xarray as xr
import zarr
import fsspec
import cartopy.crs as ccrs
import glob as glob
import netCDF4 as nc
from netCDF4 import Dataset
from scipy.stats import skew,stats
import bottleneck
import gcsfs
import matplotlib.ticker as mticker
import warnings
warnings.filterwarnings("ignore") 

# import geopandas as gpd
# from shapely.geometry import Point

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

## Fire data

In [3]:
# Directory containing the CSV files
#directory = './'
'''

directory = 'dbfs:/FileStore/'

# Create a dictionary to hold the dataframes. The keys will be years.
modis_data = {}
all_dataframes = []


# Iterate over all files in the directory
for filename in os.listdir(directory):
    # Use a regex to match the pattern "modis" followed by a year and ending with "United_States.csv"
    match = re.match(r'modis_(\d{4})_United_States.csv', filename)
    if match:
        # Extract the year from the matched filename
        year = match.group(1)
        # Load the CSV file into a dataframe
        df = pd.read_csv(os.path.join(directory, filename))
        # Store the dataframe in the dictionary with the year as the key
        modis_data[year] = df
        all_dataframes.append(df)
        
all_in_one_data = pd.concat(all_dataframes, ignore_index=True)
'''

'\n\ndirectory = \'dbfs:/FileStore/\'\n\n# Create a dictionary to hold the dataframes. The keys will be years.\nmodis_data = {}\nall_dataframes = []\n\n\n# Iterate over all files in the directory\nfor filename in os.listdir(directory):\n    # Use a regex to match the pattern "modis" followed by a year and ending with "United_States.csv"\n    match = re.match(r\'modis_(\\d{4})_United_States.csv\', filename)\n    if match:\n        # Extract the year from the matched filename\n        year = match.group(1)\n        # Load the CSV file into a dataframe\n        df = pd.read_csv(os.path.join(directory, filename))\n        # Store the dataframe in the dictionary with the year as the key\n        modis_data[year] = df\n        all_dataframes.append(df)\n        \nall_in_one_data = pd.concat(all_dataframes, ignore_index=True)\n'

In [4]:
'''
modis_2012 = modis_data['2012']
modis_2013 = modis_data['2013']
modis_2014 = modis_data['2014']
modis_2015 = modis_data['2015']
modis_2016 = modis_data['2016']
modis_2017 = modis_data['2017']
modis_2018 = modis_data['2018']
modis_2019 = modis_data['2019']
modis_2020 = modis_data['2020']
modis_2021 = modis_data['2021']
modis_2022 = modis_data['2022']
'''

"\nmodis_2012 = modis_data['2012']\nmodis_2013 = modis_data['2013']\nmodis_2014 = modis_data['2014']\nmodis_2015 = modis_data['2015']\nmodis_2016 = modis_data['2016']\nmodis_2017 = modis_data['2017']\nmodis_2018 = modis_data['2018']\nmodis_2019 = modis_data['2019']\nmodis_2020 = modis_data['2020']\nmodis_2021 = modis_data['2021']\nmodis_2022 = modis_data['2022']\n"

In [5]:
'''
print(all_in_one_data.iloc[0])
print(type(all_in_one_data))
all_in_one_data.head()
'''

'\nprint(all_in_one_data.iloc[0])\nprint(type(all_in_one_data))\nall_in_one_data.head()\n'

In [6]:
# Create a GeoDataFrame from the latitude and longitude data
# Note: 'latitude' and 'longitude' will be the column names in your CSV that contain the geographical coordinates
# gdf = gpd.GeoDataFrame(
#     all_in_one_data,
#     geometry=gpd.points_from_xy(all_in_one_data.longitude, all_in_one_data.latitude)
# )

# Load the US state boundaries GeoJSON file
# us_states = gpd.read_file('US_State_Boundaries.geojson')

# Perform the spatial join between the points and the states
# This operation returns a GeoDataFrame with all the columns from the dataframe plus the attributes from the GeoJSON
# gdf = gdf.set_crs(us_states.crs)  # Make sure both GeoDataFrames use the same coordinate reference system
# gdf_with_states = gpd.sjoin(gdf, us_states, how="left", op='intersects')

# Now gdf_with_states will have a column with the state names associated with each point
# You might need to inspect the us_states GeoDataFrame to find out the name of the column that contains state names
# Let's assume the column is named 'STATE_NAME'
# all_in_one_data['state_name'] = gdf_with_states['NAME']

In [7]:
# all_in_one_data.to_csv('all_in_one_data_withStatesName.csv', index=False)

In [8]:
#AIO_df = pd.read_csv('all_in_one_withStatesName.csv')

In [9]:
#AIO_df.head()

## CMIP6 Data

#### burntFractionAll

In [10]:
#Loading CMIP6 data stored on google cloud
df = pd.read_csv('https://storage.googleapis.com/cmip6/cmip6-zarr-consolidated-stores.csv')

#subselect for surface temparature and the two experiments:
# df_pr = df.query("activity_id=='HighResMIP' & (variable_id == 'pr' ) & (experiment_id == 'highresSST-present'|experiment_id == 'highresSST-future') & source_id=='MRI-AGCM3-2-S' & table_id == 'Amon'")


df_burntFractionAll = df.query("variable_id == 'burntFractionAll' & source_id == 'CNRM-ESM2-1' & member_id == 'r1i1p1f2'")

df_burntFractionAll

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,zstore,dcpp_init_year,version
44114,CMIP,CNRM-CERFACS,CNRM-ESM2-1,historical,r1i1p1f2,Lmon,burntFractionAll,gr,gs://cmip6/CMIP6/CMIP/CNRM-CERFACS/CNRM-ESM2-1...,,20181206
53402,CMIP,CNRM-CERFACS,CNRM-ESM2-1,esm-hist,r1i1p1f2,Lmon,burntFractionAll,gr,gs://cmip6/CMIP6/CMIP/CNRM-CERFACS/CNRM-ESM2-1...,,20190215
68560,ScenarioMIP,CNRM-CERFACS,CNRM-ESM2-1,ssp245,r1i1p1f2,Lmon,burntFractionAll,gr,gs://cmip6/CMIP6/ScenarioMIP/CNRM-CERFACS/CNRM...,,20190328
376059,ScenarioMIP,CNRM-CERFACS,CNRM-ESM2-1,ssp585,r1i1p1f2,Lmon,burntFractionAll,gr,gs://cmip6/CMIP6/ScenarioMIP/CNRM-CERFACS/CNRM...,,20191021


In [11]:
df

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,zstore,dcpp_init_year,version
0,HighResMIP,CMCC,CMCC-CM2-HR4,highresSST-present,r1i1p1f1,Amon,ps,gn,gs://cmip6/CMIP6/HighResMIP/CMCC/CMCC-CM2-HR4/...,,20170706
1,HighResMIP,CMCC,CMCC-CM2-HR4,highresSST-present,r1i1p1f1,Amon,rsds,gn,gs://cmip6/CMIP6/HighResMIP/CMCC/CMCC-CM2-HR4/...,,20170706
2,HighResMIP,CMCC,CMCC-CM2-HR4,highresSST-present,r1i1p1f1,Amon,rlus,gn,gs://cmip6/CMIP6/HighResMIP/CMCC/CMCC-CM2-HR4/...,,20170706
3,HighResMIP,CMCC,CMCC-CM2-HR4,highresSST-present,r1i1p1f1,Amon,rlds,gn,gs://cmip6/CMIP6/HighResMIP/CMCC/CMCC-CM2-HR4/...,,20170706
4,HighResMIP,CMCC,CMCC-CM2-HR4,highresSST-present,r1i1p1f1,Amon,psl,gn,gs://cmip6/CMIP6/HighResMIP/CMCC/CMCC-CM2-HR4/...,,20170706
...,...,...,...,...,...,...,...,...,...,...,...
523769,CMIP,EC-Earth-Consortium,EC-Earth3-Veg,historical,r1i1p1f1,Amon,tas,gr,gs://cmip6/CMIP6/CMIP/EC-Earth-Consortium/EC-E...,,20211207
523770,CMIP,EC-Earth-Consortium,EC-Earth3-Veg,historical,r1i1p1f1,Amon,tauu,gr,gs://cmip6/CMIP6/CMIP/EC-Earth-Consortium/EC-E...,,20211207
523771,CMIP,EC-Earth-Consortium,EC-Earth3-Veg,historical,r1i1p1f1,Amon,hur,gr,gs://cmip6/CMIP6/CMIP/EC-Earth-Consortium/EC-E...,,20211207
523772,CMIP,EC-Earth-Consortium,EC-Earth3-Veg,historical,r1i1p1f1,Amon,hus,gr,gs://cmip6/CMIP6/CMIP/EC-Earth-Consortium/EC-E...,,20211207


In [12]:
burntFractionAll_store_present = df_burntFractionAll.zstore.values[0]
mapper = fsspec.get_mapper(burntFractionAll_store_present)
burntFractionAll_present = xr.open_zarr(mapper, consolidated=True)

burntFractionAll_present

Unnamed: 0,Array,Chunk
Bytes,30.94 kiB,30.94 kiB
Shape,"(1980, 2)","(1980, 2)"
Count,2 Graph Layers,1 Chunks
Type,datetime64[ns],numpy.ndarray
"Array Chunk Bytes 30.94 kiB 30.94 kiB Shape (1980, 2) (1980, 2) Count 2 Graph Layers 1 Chunks Type datetime64[ns] numpy.ndarray",2  1980,

Unnamed: 0,Array,Chunk
Bytes,30.94 kiB,30.94 kiB
Shape,"(1980, 2)","(1980, 2)"
Count,2 Graph Layers,1 Chunks
Type,datetime64[ns],numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,247.50 MiB,75.00 MiB
Shape,"(1980, 128, 256)","(600, 128, 256)"
Count,2 Graph Layers,4 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 247.50 MiB 75.00 MiB Shape (1980, 128, 256) (600, 128, 256) Count 2 Graph Layers 4 Chunks Type float32 numpy.ndarray",256  128  1980,

Unnamed: 0,Array,Chunk
Bytes,247.50 MiB,75.00 MiB
Shape,"(1980, 128, 256)","(600, 128, 256)"
Count,2 Graph Layers,4 Chunks
Type,float32,numpy.ndarray


In [13]:
burntFractionAll_store_present

'gs://cmip6/CMIP6/CMIP/CNRM-CERFACS/CNRM-ESM2-1/historical/r1i1p1f2/Lmon/burntFractionAll/gr/v20181206/'

In [14]:
min_value = burntFractionAll_present['burntFractionAll'].min()
max_value = burntFractionAll_present['burntFractionAll'].max()

print(f"Minimum burntFractionAll: {min_value.values}")
print(f"Maximum burntFractionAll: {max_value.values}")

Minimum burntFractionAll: 0.0
Maximum burntFractionAll: 1.2239598035812378


In [15]:
# Define a threshold for burnt fraction to classify as fire
fire_threshold = 0.3

# Label the data
burntFractionAll_present['fire_label'] = (burntFractionAll_present['burntFractionAll'] > fire_threshold).astype(int)

In [16]:
# Convert the DataArray to a pandas DataFrame
label_df = burntFractionAll_present['fire_label'].to_dataframe()

# Use value_counts on the DataFrame
label_counts = label_df['fire_label'].value_counts()

# Display the value counts
print(label_counts)

0    64708175
1      172465
Name: fire_label, dtype: int64


In [17]:
burntFractionAll_present

Unnamed: 0,Array,Chunk
Bytes,30.94 kiB,30.94 kiB
Shape,"(1980, 2)","(1980, 2)"
Count,2 Graph Layers,1 Chunks
Type,datetime64[ns],numpy.ndarray
"Array Chunk Bytes 30.94 kiB 30.94 kiB Shape (1980, 2) (1980, 2) Count 2 Graph Layers 1 Chunks Type datetime64[ns] numpy.ndarray",2  1980,

Unnamed: 0,Array,Chunk
Bytes,30.94 kiB,30.94 kiB
Shape,"(1980, 2)","(1980, 2)"
Count,2 Graph Layers,1 Chunks
Type,datetime64[ns],numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,247.50 MiB,75.00 MiB
Shape,"(1980, 128, 256)","(600, 128, 256)"
Count,2 Graph Layers,4 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 247.50 MiB 75.00 MiB Shape (1980, 128, 256) (600, 128, 256) Count 2 Graph Layers 4 Chunks Type float32 numpy.ndarray",256  128  1980,

Unnamed: 0,Array,Chunk
Bytes,247.50 MiB,75.00 MiB
Shape,"(1980, 128, 256)","(600, 128, 256)"
Count,2 Graph Layers,4 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,495.00 MiB,150.00 MiB
Shape,"(1980, 128, 256)","(600, 128, 256)"
Count,4 Graph Layers,4 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 495.00 MiB 150.00 MiB Shape (1980, 128, 256) (600, 128, 256) Count 4 Graph Layers 4 Chunks Type int64 numpy.ndarray",256  128  1980,

Unnamed: 0,Array,Chunk
Bytes,495.00 MiB,150.00 MiB
Shape,"(1980, 128, 256)","(600, 128, 256)"
Count,4 Graph Layers,4 Chunks
Type,int64,numpy.ndarray


#### pr

In [18]:
df_pr = df.query("variable_id == 'pr' & source_id == 'CNRM-ESM2-1' & member_id == 'r1i1p1f2' & experiment_id == 'historical' & table_id == 'Amon'")

pr_store_present = df_pr.zstore.values[0]

#pr_store_future = df_pr.zstore.values[1]
mapper = fsspec.get_mapper(pr_store_present)
pr_present = xr.open_zarr(mapper, consolidated=True)
pr_present_split = pr_present.sel(time=slice('2012-01-16T12:00:00' , '2014-12-16T12:00:00'))

#pr_combined = xr.concat([pr_present_split, pr_future_split], dim='time')
pr_combined = pr_present.pr.sel(time=slice('1940-01-16T12:00:00' , '1940-2-16T12:00:00'))
pr_df = pr_combined.to_dataframe()
pr_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,pr
time,lat,lon,Unnamed: 3_level_1
1940-01-16 12:00:00,-88.927735,0.00000,4.780071e-07
1940-01-16 12:00:00,-88.927735,1.40625,4.780071e-07
1940-01-16 12:00:00,-88.927735,2.81250,4.780071e-07
1940-01-16 12:00:00,-88.927735,4.21875,4.780071e-07
1940-01-16 12:00:00,-88.927735,5.62500,4.780071e-07
...,...,...,...
1940-02-15 12:00:00,88.927735,352.96875,1.903327e-06
1940-02-15 12:00:00,88.927735,354.37500,1.903327e-06
1940-02-15 12:00:00,88.927735,355.78125,1.903327e-06
1940-02-15 12:00:00,88.927735,357.18750,1.903327e-06


#### sfcWind

In [20]:
df_sfcWind = df.query("variable_id == 'sfcWind' & source_id == 'CNRM-ESM2-1' & member_id == 'r1i1p1f2' & experiment_id == 'historical' & table_id == 'Amon'")

sfcWind_store_present = df_sfcWind.zstore.values[0]
mapper = fsspec.get_mapper(sfcWind_store_present)
sfcWind_present = xr.open_zarr(mapper, consolidated=True)

sfcWind_present_split = sfcWind_present.sel(time=slice('2012-01-16T12:00:00' , '2014-12-16T12:00:00'))
sfcWind_present_split

#sfcWind_combined = xr.concat([sfcWind_present_split, sfcWind_future_split], dim='time')
sfcWind_combined = sfcWind_present.sfcWind.sel(time=slice('1940-01-16T12:00:00' , '1940-2-16T12:00:00'))
sfcWind_df = sfcWind_combined.to_dataframe()
sfcWind_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,height,sfcWind
time,lat,lon,Unnamed: 3_level_1,Unnamed: 4_level_1
1940-01-16 12:00:00,-88.927735,0.00000,10.0,2.963656
1940-01-16 12:00:00,-88.927735,1.40625,10.0,2.963656
1940-01-16 12:00:00,-88.927735,2.81250,10.0,2.963656
1940-01-16 12:00:00,-88.927735,4.21875,10.0,2.963656
1940-01-16 12:00:00,-88.927735,5.62500,10.0,2.963656
...,...,...,...,...
1940-02-15 12:00:00,88.927735,352.96875,10.0,6.793680
1940-02-15 12:00:00,88.927735,354.37500,10.0,6.793680
1940-02-15 12:00:00,88.927735,355.78125,10.0,6.793680
1940-02-15 12:00:00,88.927735,357.18750,10.0,6.793680


In [21]:
pr_store_present

'gs://cmip6/CMIP6/CMIP/CNRM-CERFACS/CNRM-ESM2-1/historical/r1i1p1f2/Amon/pr/gr/v20181206/'

In [22]:
sfcWind_store_present

'gs://cmip6/CMIP6/CMIP/CNRM-CERFACS/CNRM-ESM2-1/historical/r1i1p1f2/Amon/sfcWind/gr/v20181206/'

#### hur

In [None]:
df_hur = df.query("variable_id == 'hur' & source_id == 'CNRM-ESM2-1' & member_id == 'r1i1p1f2' & experiment_id == 'historical' & table_id == 'Amon'")

hur_store_present = df_hur.zstore.values[0]
mapper = fsspec.get_mapper(hur_store_present)
hur_present = xr.open_zarr(mapper, consolidated=True)

hur_combined = hur_present.hur.sel(time=slice('1940-01-16T12:00:00' , '1940-2-16T12:00:00'))

hur_df = hur_combined.to_dataframe()
hur_df

#### ta

In [24]:
df_ta = df.query("variable_id == 'tas' & source_id == 'CNRM-ESM2-1' & member_id == 'r1i1p1f2' & experiment_id == 'historical' & table_id == 'Amon'")

ta_store_present = df_ta.zstore.values[0]
mapper = fsspec.get_mapper(ta_store_present)
ta_present = xr.open_zarr(mapper, consolidated=True)

ta_combined = ta_present.ta.sel(time=slice('1940-01-16T12:00:00' , '1940-2-16T12:00:00'))

tadf = ta_combined.to_dataframe()
ta_df


AttributeError: 'Dataset' object has no attribute 'ta'

#### Combining Datasets