# Machine Learning for Assessing Brush Fire Risk in the United States

## Import required packages

In [1]:
!pip install geopandas shapely



In [2]:
#Importing required packages
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import os
import re
import csv
import xarray as xr
import zarr
import fsspec
import cartopy.crs as ccrs
import glob as glob
import netCDF4 as nc
from netCDF4 import Dataset
from scipy.stats import skew,stats
import bottleneck
import gcsfs
import matplotlib.ticker as mticker
import warnings
warnings.filterwarnings("ignore") 

import geopandas as gpd
from shapely.geometry import Point
import fiona

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

## Loading data

In [3]:
# Directory containing the yearly MODIS CSV files
directory = './'

# modis_data maps the four-digit year string taken from the filename (e.g. '2012')
# to that year's dataframe; all_dataframes holds the same frames for concatenation.
modis_data = {}
all_dataframes = []

# Exact filename pattern "modis_<4-digit year>_United_States.csv".
# The dot is escaped and fullmatch() is used so that look-alike names such as
# "modis_2012_United_States.csv.bak" are not loaded by accident.
modis_filename = re.compile(r'modis_(\d{4})_United_States\.csv')

# os.listdir() returns entries in arbitrary order; sorting makes the row order of
# the combined frame reproducible (and chronological, since the year is zero-padded).
for filename in sorted(os.listdir(directory)):
    match = modis_filename.fullmatch(filename)
    if match:
        # Extract the year from the matched filename
        year = match.group(1)
        # Load the CSV file and register it under its year
        year_df = pd.read_csv(os.path.join(directory, filename))
        modis_data[year] = year_df
        all_dataframes.append(year_df)

# Fail with an actionable message instead of pandas' "No objects to concatenate".
if not all_dataframes:
    raise FileNotFoundError(
        f"No files named 'modis_<year>_United_States.csv' were found in {directory!r}"
    )

# Single table with every year's rows and a fresh 0..n-1 index
all_in_one_data = pd.concat(all_dataframes, ignore_index=True)

In [4]:
# Bind one notebook-level name per year, 2012 through 2022, to the matching
# entry of modis_data. A year missing from the dictionary raises KeyError.
(
    modis_2012,
    modis_2013,
    modis_2014,
    modis_2015,
    modis_2016,
    modis_2017,
    modis_2018,
    modis_2019,
    modis_2020,
    modis_2021,
    modis_2022,
) = (modis_data[str(year)] for year in range(2012, 2023))

# test_df refers to the very same 2022 dataframe object (no copy is made).
test_df = modis_2022

In [5]:
# Quick look at the combined table: the first record and the container type are
# printed on separate lines, then the leading rows are shown as the cell output.
print(all_in_one_data.iloc[0], type(all_in_one_data), sep="\n")
all_in_one_data.head()

latitude         33.0156
longitude       -97.0675
brightness         325.4
scan                 1.1
track                1.1
acq_date      2012-01-01
acq_time             426
satellite          Terra
instrument         MODIS
confidence           100
version              6.2
bright_t31         285.5
frp                 30.5
daynight               N
type                   0
Name: 0, dtype: object
<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,latitude,longitude,brightness,scan,track,acq_date,acq_time,satellite,instrument,confidence,version,bright_t31,frp,daynight,type
0,33.0156,-97.0675,325.4,1.1,1.1,2012-01-01,426,Terra,MODIS,100,6.2,285.5,30.5,N,0
1,36.3413,-96.521,313.3,1.0,1.0,2012-01-01,427,Terra,MODIS,87,6.2,279.3,15.9,N,0
2,36.3396,-96.5321,320.3,1.0,1.0,2012-01-01,427,Terra,MODIS,100,6.2,279.1,21.8,N,0
3,36.3306,-96.53,326.9,1.0,1.0,2012-01-01,427,Terra,MODIS,100,6.2,280.2,28.8,N,0
4,36.122,-96.0743,302.7,1.0,1.0,2012-01-01,427,Terra,MODIS,51,6.2,279.4,9.0,N,0


In [None]:
# Create a GeoDataFrame with one point per row of all_in_one_data.
# points_from_xy takes x (longitude) first, then y (latitude).
# NOTE(review): the points are declared as WGS84 lon/lat (EPSG:4326) — confirm
# against the documentation of the MODIS CSV export.
gdf = gpd.GeoDataFrame(
    all_in_one_data,
    geometry=gpd.points_from_xy(all_in_one_data.longitude, all_in_one_data.latitude),
    crs="EPSG:4326",
)

# Load the US state boundaries GeoJSON file
us_states = gpd.read_file('./US_State_Boundaries.geojson')

# Bring the boundaries into the points' CRS instead of merely relabelling the
# points with the boundary file's CRS: relabelling is only correct when the
# boundary file already uses lon/lat degrees.
states_for_join = us_states if us_states.crs == gdf.crs else us_states.to_crs(gdf.crs)

# Left spatial join: every point is kept and receives the attributes of the
# state polygon(s) it intersects. `predicate=` replaces the `op=` keyword, which
# was deprecated and later removed from geopandas.
gdf_with_states = gpd.sjoin(gdf, states_for_join, how="left", predicate='intersects')

# A point lying on a shared border can match more than one polygon, in which
# case the join returns several rows carrying the same index label. Keep the
# first match per original row so the assignment below aligns one-to-one
# (assigning a Series with a duplicated index would raise).
first_match = ~gdf_with_states.index.duplicated(keep='first')

# The state name is read from the boundary file's 'NAME' column; inspect
# us_states.columns if a different boundary file is used.
all_in_one_data['state_name'] = gdf_with_states.loc[first_match, 'NAME']

In [None]:
# Write the combined table to disk, leaving the row index out of the file.
all_in_one_data.to_csv(
    path_or_buf='all_in_one_data_withStatesName.csv',
    index=False,
)

## Model

In [None]:
# Read the CMIP6 catalog CSV hosted on Google Cloud Storage into a dataframe.
cmip6_catalog_url = 'https://storage.googleapis.com/cmip6/cmip6-zarr-consolidated-stores.csv'
df = pd.read_csv(cmip6_catalog_url)

# Keep only catalog rows for variable 'pr' (presumably precipitation — confirm
# against the CMIP6 variable list) in table 'Amon', from the HighResMIP activity
# and the MRI-AGCM3-2-S source, restricted to the two experiments
# 'highresSST-present' and 'highresSST-future'.
pr_filter = (
    "activity_id=='HighResMIP'"
    " & (variable_id == 'pr' )"
    " & (experiment_id == 'highresSST-present'|experiment_id == 'highresSST-future')"
    " & source_id=='MRI-AGCM3-2-S'"
    " & table_id == 'Amon'"
)
df_pr = df.query(pr_filter)
df_pr