In [1]:

# %pip install geopandas
# %pip install shapely
# %pip install pygeos
# %pip install tensorflow-cpu
# %pip install tensorflow-data-validation
# %pip install tensorflow-transform
# %pip install tensorflow-model-analysis
# %pip install scikit-learn
# %pip install scipy
# %pip install matplotlib
# %pip install python-dotenv
# %pip install seaborn

In [2]:
import os
import pandas as pd
import geopandas as gpd
import pygeos as pg
import numpy as np
import tensorflow as tf
import tensorflow_data_validation as tfdv
import sklearn as sk
import scipy as sp
import seaborn as sns
from IPython.display import clear_output
from matplotlib import pyplot as plt
from shapely import wkt


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [4]:
# The following lines adjust the granularity of reporting.
#pd.options.display.max_rows = 10
pd.options.display.float_format = "{:.1f}".format
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
os.chdir('F:\\Uni Files\\4710\\4710 Project\\MLweatherForestFire')

In [5]:
def getGPDfromPD(df: pd.DataFrame, geomCol: str, crs: str = "EPSG:3978") -> gpd.GeoDataFrame:
    """
    Convert a pandas dataframe to a geopandas dataframe
    :param df: pandas dataframe
    :param geomCol: name of the geometry column
    :param crs: coordinate reference system
    :return: geopandas dataframe
    """
    if 'geom' in df.columns:
        df.rename(columns={'geom': 'geometry'}, inplace=True)

    df[geomCol] = df[geomCol].apply(wkt.loads)
    gdf = gpd.GeoDataFrame(df, geometry=geomCol, crs=crs)
    return gdf

In [6]:
provinces = "Data/GEOProvincialBoundaries.csv"
dfProvinces = pd.read_csv(provinces)
dfProvinces['geom'] = dfProvinces['geom'].apply(wkt.loads)
gdfProvinces = gpd.GeoDataFrame(dfProvinces, geometry='geom', crs="EPSG:3347")
del dfProvinces
del provinces

In [7]:
# Load fire data
fireTable = "Data/GEOlgFireFifty.csv"
dfFire = pd.read_csv(fireTable)
dfFire['geom'] = dfFire['geom'].apply(wkt.loads)
gdfFire = gpd.GeoDataFrame(dfFire, geometry='geom', crs="EPSG:3347")
del dfFire
del fireTable

##### plot both to check
fig, ax = plt.subplots(figsize=(20, 20))
gdfProvinces.plot(ax=ax, color='white', edgecolor='black')
gdfFire.plot(ax=ax, color='red', markersize=1)


In [8]:
centroids = "Data/GEOlgFireFiftyCentroids.csv"
dfCentroids = pd.read_csv(centroids)
dfCentroids['geom'] = dfCentroids['geom'].apply(wkt.loads)
gdfCentroids = gpd.GeoDataFrame(dfCentroids, geometry='geom', crs="EPSG:3347")
del dfCentroids
del centroids

##### plot both to check
fig, ax = plt.subplots(figsize=(20, 20))
gdfProvinces.plot(ax=ax, color='white', edgecolor='black')
gdfCentroids.plot(ax=ax, color='red', markersize=1)


In [9]:
gdfFire['size_ha_bin'] = pd.qcut(gdfFire['SIZE_HA'], 4, labels=False)
gdfFire.drop(columns=['DECADE', 'CALC_HA', 'CFS_REF_ID', 'CAUSE', 'OUT_DATE'], inplace=True)

In [10]:
# join fire with centroids
gdfMerged = gdfFire.merge(gdfCentroids, on='EntryID', how='left')
gdfMerged.set_geometry('geom_y')
gdfMerged.drop(columns=['geom_x'], inplace=True)

In [11]:
# use fire name where fire id is null
gdfMerged['FIRE_ID'].fillna(gdfMerged['FIRENAME'], inplace=True)
# use fire id where fire name is null
gdfMerged['FIRENAME'].fillna(gdfMerged['FIRE_ID'], inplace=True)

In [12]:
# keep only fires in SK
gdfMerged = gpd.GeoDataFrame(gdfMerged, geometry='geom_y', crs="EPSG:3347")
gdfMerged = gpd.sjoin(gdfMerged, gdfProvinces, how='left', op='within')


  if (await self.run_code(code, result,  async_=asy)):


In [13]:
gdfMerged.head()

Unnamed: 0,EntryID,FIRE_ID,FIRENAME,YEAR,MONTH,DAY,REP_DATE,SIZE_HA,size_ha_bin,geom_y,index_right,provID
0,1,HWF278,Birch Complex Fire,2015,7,6,2015-07-06,3329.7,2,POINT (4934547.488 2608092.858),8.0,AB
1,2,HWF280,HWF280,2017,9,1,2017-09-01,13628.3,3,POINT (4820621.875 2904194.413),8.0,AB
2,3,HWF286,HWF286,2004,7,15,2004-07-15,4257.6,2,POINT (4879486.406 2874903.284),8.0,AB
3,15,LWF026,LWF026,2001,4,27,2001-04-27,656.2,1,POINT (5072729.959 2366897.953),8.0,AB
4,16,LWF026,LWF026,2003,5,25,2003-05-25,1230.1,1,POINT (5037099.451 2246123.844),8.0,AB


In [14]:
gdfSaskFires = gdfMerged[gdfMerged['provID'] == 'SK']

In [15]:
gdfSaskFires.count()

EntryID        1558
FIRE_ID        1558
FIRENAME       1558
YEAR           1558
MONTH          1558
DAY            1558
REP_DATE       1558
SIZE_HA        1558
size_ha_bin    1558
geom_y         1558
index_right    1558
provID         1558
dtype: int64

In [17]:
gdfSaskFires.rename(columns={'geom_y': 'geom'}, inplace=True)
gdfSaskFires = gdfSaskFires[gdfSaskFires['YEAR'] >= 2000]
gdfSaskFires.count()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gdfSaskFires.rename(columns={'geom_y': 'geom'}, inplace=True)


EntryID        937
FIRE_ID        937
FIRENAME       937
YEAR           937
MONTH          937
DAY            937
REP_DATE       937
SIZE_HA        937
size_ha_bin    937
geom           937
index_right    937
provID         937
dtype: int64

In [None]:
# save to csv GEOSKFiresAfter2000centroids.csv
gdfSaskFires.to_csv('Data/GEOSKFiresAfter2000centroids.csv', index=False)