In [None]:
# FIXME(review): this cell evaluated the bare name `nb`, which no earlier cell
# defines, so it raised NameError on a fresh kernel and stopped Restart & Run All.
# The expression is parked as a comment until its purpose is known.
# nb

In [None]:
# %pip install python-dotenv
# %pip install seaborn
# %pip install tensorflow_data_validation

In [None]:
import os
import pandas as pd
import geopandas as gpd
import pygeos as pg
import numpy as np
import tensorflow as tf
import tensorflow_data_validation as tfdv
import sklearn as sk
import scipy as sp
import seaborn as sns
from datetime import datetime
from dotenv import load_dotenv
from IPython.display import clear_output
from matplotlib import pyplot as plt
from shapely import wkt

In [None]:
# Reporting granularity: floats are shown with one decimal place, and neither
# columns nor rows are truncated when a frame is displayed.
#pd.options.display.max_rows = 10
pd.set_option('display.float_format', "{:.1f}".format)
pd.options.display.max_columns = None
pd.options.display.max_rows = None
# Every relative "Data/..." path below is resolved against this directory.
# NOTE(review): hard-coded absolute path — presumably a container mount; confirm.
os.chdir('/tf')

In [None]:
def getGPDfromPD(df: pd.DataFrame, geomCol: str, crs: str = "EPSG:3978") -> gpd.GeoDataFrame:
    """
    Convert a pandas dataframe holding WKT geometries to a geopandas dataframe.

    The input frame is not modified; the conversion works on a copy. A column
    named 'geom' is renamed to 'geometry' (unless a 'geometry' column already
    exists), so callers may pass either 'geom' or 'geometry' as geomCol for
    such a frame.

    :param df: pandas dataframe
    :param geomCol: name of the geometry column
    :param crs: coordinate reference system
    :return: geopandas dataframe
    :raises KeyError: if geomCol is not a column of the frame after the rename
    """
    # Work on a copy so the caller's frame keeps its original columns and values.
    frame = df.copy()

    # Renaming onto an existing 'geometry' column would create duplicate labels.
    if 'geom' in frame.columns and 'geometry' not in frame.columns:
        frame = frame.rename(columns={'geom': 'geometry'})
        if geomCol == 'geom':
            # The requested column now lives under its new name.
            geomCol = 'geometry'

    if geomCol not in frame.columns:
        raise KeyError(f"geometry column {geomCol!r} not found in dataframe")

    # Only strings are parsed, so values that are already geometry objects pass
    # through unchanged and the function can be re-applied safely.
    frame[geomCol] = frame[geomCol].apply(
        lambda value: wkt.loads(value) if isinstance(value, str) else value
    )
    return gpd.GeoDataFrame(frame, geometry=geomCol, crs=crs)

In [None]:
# Provincial boundaries: parse the WKT text in 'geom' into geometry objects and
# wrap the table as a GeoDataFrame declared as EPSG:3347.
provinces = "Data/GEOProvincialBoundaries.csv"
dfProvinces = pd.read_csv(provinces)
dfProvinces['geom'] = dfProvinces['geom'].map(wkt.loads)
gdfProvinces = gpd.GeoDataFrame(dfProvinces, crs="EPSG:3347", geometry='geom')

In [None]:
# Fire records: parse the WKT text in 'geom' and wrap as a GeoDataFrame (EPSG:3347).
fireTable = "Data/GEOlgFireFifty.csv"
dfFire = pd.read_csv(fireTable)
dfFire['geom'] = dfFire['geom'].map(wkt.loads)
gdfFire = gpd.GeoDataFrame(dfFire, crs="EPSG:3347", geometry='geom')

# Visual sanity check: fires in red drawn over the provincial outlines.
fig = plt.figure(figsize=(20, 20))
ax = fig.add_subplot()
gdfProvinces.plot(color='white', edgecolor='black', ax=ax)
gdfFire.plot(color='red', markersize=1, ax=ax)


In [None]:
# Fire centroids: parse the WKT text in 'geom' and wrap as a GeoDataFrame (EPSG:3347).
centroids = "Data/GEOlgFireFiftyCentroids.csv"
dfCentroids = pd.read_csv(centroids)
dfCentroids['geom'] = dfCentroids['geom'].map(wkt.loads)
gdfCentroids = gpd.GeoDataFrame(dfCentroids, crs="EPSG:3347", geometry='geom')

# Visual sanity check: centroids in red drawn over the provincial outlines.
fig = plt.figure(figsize=(20, 20))
ax = fig.add_subplot()
gdfProvinces.plot(color='white', edgecolor='black', ax=ax)
gdfCentroids.plot(color='red', markersize=1, ax=ax)


In [None]:
# Bucket fires into four SIZE_HA quantiles (labels=False stores integer bin
# indices), then remove five attribute columns from the fire frame.
gdfFire['size_ha_bin'] = pd.qcut(dfFire['SIZE_HA'], q=4, labels=False)
gdfFire = gdfFire.drop(columns=['DECADE', 'CALC_HA', 'CFS_REF_ID', 'CAUSE', 'OUT_DATE'])

In [None]:
# Join fire with centroids. Both inputs carry a 'geom' column, so the merge
# yields 'geom_x' (from the fire frame) and 'geom_y' (from the centroids).
gdfMerged = gdfFire.merge(dfCentroids, on='EntryID', how='left')
# set_geometry returns a NEW frame unless inplace=True; the original call threw
# the result away, so dropping 'geom_x' afterwards removed the only active
# geometry. Keep the result so the centroid column becomes the geometry.
gdfMerged = gdfMerged.set_geometry('geom_y')
gdfMerged = gdfMerged.drop(columns=['geom_x'])

In [None]:
# Back-fill the two identifier columns from each other. The results are assigned
# back to the frame: fillna(..., inplace=True) on a column selection acts on an
# intermediate object and is not guaranteed to update gdfMerged (it is a no-op
# under pandas Copy-on-Write).
# use fire name where fire id is null
gdfMerged['FIRE_ID'] = gdfMerged['FIRE_ID'].fillna(gdfMerged['FIRENAME'])
# use fire id where fire name is null
gdfMerged['FIRENAME'] = gdfMerged['FIRENAME'].fillna(gdfMerged['FIRE_ID'])

In [None]:
# Read the daily-aggregated weather table; the trailing expression displays its column names.
dailyWeather = "Data/WeatherDataHourlyAggDaily.csv"
dfWeather = pd.read_csv(dailyWeather)
dfWeather.columns

In [None]:
# Display the weather table's column names (same expression that ends the previous cell).
dfWeather.columns

In [None]:
# Give the weather columns explicit dtypes. astype returns a new frame, so the
# result is assigned back; the original call discarded it and left dfWeather
# unchanged (copy=False does not make astype operate in place).
dfWeather = dfWeather.astype({'ClimateID': 'str', 'ProvinceCode': 'str',
                'Year': 'int', 'Month': 'int', 'Day': 'int',
                'MeanTemp': 'float', 'MinTemp': 'float', 'MaxTemp': 'float',
                'MeanDewPoint': 'float', 'MinDewPoint': 'float', 'MaxDewPoint': 'float',
                'MeanHumidity': 'float', 'MinHumidity': 'float', 'MaxHumidity': 'float',
                'MeanPressure': 'float', 'MinPressure': 'float', 'MaxPressure': 'float',
                'MeanWindSpeed': 'float', 'MinWindSpeed': 'float', 'MaxWindSpeed': 'float',
                'MeanWindChill': 'float', 'MinWindChill': 'float', 'MaxWindChill': 'float',
                'TotalPrecip': 'float', 'MeanWindDirection': 'float'})
# Show the resulting dtypes instead of rendering the whole table.
dfWeather.dtypes

In [None]:
# Climate station list, read as-is from CSV.
# NOTE(review): the name `stations` is rebound to a different CSV path in a later cell.
stations = "Data/climate_station_list.csv"
dfAllStations = pd.read_csv(stations)

In [None]:
# FIXME(review): this cell held the unfinished assignment `dfStationIDGeom = `,
# which is a SyntaxError and stops Restart & Run All. The right-hand side was
# never written and no later cell in view reads the name, so the statement is
# parked as a comment until the intended expression is known.
# dfStationIDGeom = ...

In [None]:
# Load the FireWaterElev table; it is joined to the fire centroids on EntryID further down.
fireWaterElev = "Data/FireWaterElev.csv"
dfFireWaterElev = pd.read_csv(fireWaterElev)

In [None]:
# Quick profile of the table: non-null count per column, summary statistics,
# and the total number of missing cells.
print(
    dfFireWaterElev.count(),
    dfFireWaterElev.describe(),
    dfFireWaterElev.isnull().sum().sum(),
    sep='\n',
)

In [None]:
# Load the fire centroids again, this time through getGPDfromPD, and rebind
# dfCentroids (a plain DataFrame until now) to the resulting GeoDataFrame.
# The column is addressed as 'geometry' because the helper renames 'geom' first.
# NOTE(review): an earlier cell builds gdfCentroids from this same file with
# crs="EPSG:3347", while the helper default used here is "EPSG:3978" — confirm
# which CRS the coordinates are really in.
fireCentroids = "Data/GEOlgFireFiftyCentroids.csv"
dfTemp = pd.read_csv(fireCentroids)

dfCentroids = getGPDfromPD(dfTemp, 'geometry')
print(dfCentroids.columns)

In [None]:
# Attach the FireWaterElev attributes to the centroids; the inner join keeps
# only EntryIDs present in both tables.
dfTemp = dfCentroids.merge(dfFireWaterElev, how='inner', on='EntryID')
# dfTemp = dfFireWaterElevGPD.merge(dfCentroids, on='EntryID', how='left')
print(dfTemp.columns)

# dfFireWECent is another name for the same merged frame (no copy is made).
dfFireWECent = dfTemp
print(dfFireWECent.count(), dfFireWECent.columns, dfFireWECent.head(), sep='\n')

In [None]:
# Weather stations with geometry, converted with the helper's default CRS.
# NOTE(review): presumably the stations that have ten years of hourly data, going by the file name — confirm.
stations = "Data/GEOTenYrStationsHourly.csv"
dfTemp = pd.read_csv(stations)

dfStations = getGPDfromPD(dfTemp, 'geometry')

In [None]:
# dfFireWECent.drop(columns=['ClimateID'], inplace=True)

# Restrict the stations to Saskatchewan and discard the 'dataAvailable' column
# in one chained expression.
dfStations = (
    dfStations
    .loc[dfStations['Province'] == 'SASKATCHEWAN']
    .drop(columns=['dataAvailable'])
)
print(dfStations.columns, dfStations.head(), sep='\n')

In [None]:

# Pair every fire with its nearest station. how='left' keeps fires that have no
# station within max_distance; their station columns are left empty.
# NOTE(review): 117590 is presumably in CRS units (metres) — confirm its origin.
dfNearest = gpd.sjoin_nearest(
    left_df=dfFireWECent,
    right_df=dfStations,
    how='left',
    max_distance=117590,
)
print(dfNearest.count(), dfNearest.columns, dfNearest.head(), sep='\n')

In [None]:
# Keep only rows that are complete in every column.
dfNearest = dfNearest.dropna()
print(dfNearest.count(), dfNearest.columns, dfNearest.head(), sep='\n')


In [None]:
# After the spatial join both sides contribute a ClimateID; keep the right-hand
# one (from the station frame) under the plain name.
dfNearest = (
    dfNearest
    .drop(columns=['ClimateID_left'])
    .rename(columns={'ClimateID_right': 'ClimateID'})
)

In [None]:
# Read the daily weather table into dfWeatherDaily (same CSV path that an earlier cell reads into dfWeather).
weatherDaily = "Data/WeatherDataHourlyAggDaily.csv"
dfWeatherDaily = pd.read_csv(weatherDaily)


In [None]:
# Quick profile of the weather table: non-null count per column, summary
# statistics, and the total number of missing cells.
print(
    dfWeatherDaily.count(),
    dfWeatherDaily.describe(),
    dfWeatherDaily.isnull().sum().sum(),
    sep='\n',
)

In [None]:
# Rename the weather columns used in the join with the fire table: ClimateID,
# upper-case YEAR/MONTH, the weather day as DAYw, and the timestamp as utcWeather.
weatherColumnNames = {
    'climateid': 'ClimateID',
    'Year': 'YEAR',
    'Month': 'MONTH',
    'Day': 'DAYw',
    'utc': 'utcWeather',
}
dfWeatherDaily = dfWeatherDaily.rename(columns=weatherColumnNames)
print(dfWeatherDaily.columns)

In [None]:
# Compare column dtypes of the two frames ahead of the join.
print(dfNearest.dtypes, dfWeatherDaily.dtypes, sep='\n')

In [None]:
# dfAll = dfFireWECent.copy(deep=True)

# dfAll.rename(columns={'utc': 'utcFire'}, inplace=True)
# # left join dfAll and dfWeatherDaily on ClimateID and YEAR MONTH
# dfAll = dfAll.merge(dfWeatherDaily, on=['ClimateID', 'YEAR', 'MONTH', 'DAY'], how='left')


# print(dfAll.columns)
# print(dfFireWECent.count())
# print(dfAll.count())


In [None]:
# Summary statistics of the fire / nearest-station table.
dfNearest.describe()


In [None]:
# The join key must have the same dtype on both sides: store ClimateID as str in each frame.
for keyedFrame in (dfNearest, dfWeatherDaily):
    keyedFrame['ClimateID'] = keyedFrame['ClimateID'].astype(str)

In [None]:
# Join each fire (already paired with its nearest station) to that station's
# weather on the calendar day BEFORE the fire.

dfAll = dfNearest.copy(deep=True)
dfAll = dfAll.rename(columns={'utc': 'utcFire'})

# Work out the previous calendar date per fire. The earlier approach joined on
# ClimateID/MONTH/YEAR and then kept DAYw == DAY - 1; for a fire on the 1st of a
# month DAY - 1 is 0, which no weather row has, so those fires were silently
# dropped. Date arithmetic rolls back across month and year boundaries instead.
# Fire dates that cannot be assembled become NaT and are excluded.
fireDate = pd.to_datetime(
    dfAll[['YEAR', 'MONTH', 'DAY']].rename(
        columns={'YEAR': 'year', 'MONTH': 'month', 'DAY': 'day'}
    ),
    errors='coerce',
)
prevDate = fireDate - pd.Timedelta(days=1)
hasPrevDate = prevDate.notna().to_numpy()
dfAll = dfAll[hasPrevDate]
prevDate = prevDate[hasPrevDate]
# Plain arrays are assigned so no index alignment takes place.
dfAll = dfAll.assign(
    _prevYEAR=prevDate.dt.year.to_numpy(),
    _prevMONTH=prevDate.dt.month.to_numpy(),
    _prevDAY=prevDate.dt.day.to_numpy(),
)

# The weather YEAR/MONTH get temporary names so the fire's own YEAR/MONTH
# columns come through unchanged; DAYw stays in the result as before.
weatherByDate = dfWeatherDaily.rename(columns={'YEAR': '_wYEAR', 'MONTH': '_wMONTH'})
dfAll = dfAll.merge(
    weatherByDate,
    left_on=['ClimateID', '_prevYEAR', '_prevMONTH', '_prevDAY'],
    right_on=['ClimateID', '_wYEAR', '_wMONTH', 'DAYw'],
    how='inner',
)
dfAll = dfAll.drop(columns=['_prevYEAR', '_prevMONTH', '_prevDAY', '_wYEAR', '_wMONTH'])
print(dfAll.head())

print(dfAll.columns)
print(dfNearest.count())
print(dfAll.count())


In [None]:
# Print every row of dfAll that has at least one missing value in any column.
print(dfAll[dfAll.isna().any(axis=1)])

In [None]:
# Read the provincial boundaries CSV and convert it with getGPDfromPD, rebinding
# dfProvinces to a GeoDataFrame whose geometry column is named 'geometry'.
# NOTE(review): an earlier cell builds gdfProvinces from this same file with
# crs="EPSG:3347", while the helper default used here is "EPSG:3978" — confirm
# which CRS the coordinates are really in.
dfTemp = pd.read_csv('Data/GEOProvincialBoundaries.csv')

dfProvinces = getGPDfromPD(dfTemp, 'geometry')

In [None]:
# Flag the rows of dfAll whose geometry is contained in a Saskatchewan polygon.
# The province rows are selected once and each polygon is tested against all
# fire geometries in a single vectorized call, instead of re-filtering
# dfProvinces for every row inside iterrows. a.within(b) is the converse of
# b.contains(a), so the flag has the same meaning as before.
skPolygons = dfProvinces[dfProvinces['provID'] == 'SK'].geometry
insideSask = np.zeros(len(dfAll), dtype=bool)
for skPolygon in skPolygons:
    insideSask |= dfAll['geometry'].within(skPolygon).to_numpy()
dfAll['keep'] = insideSask


In [None]:
# Preview the flagged frame and its non-null counts per column.
print(dfAll.head(), dfAll.count(), sep='\n')

In [None]:
# Retain only the rows flagged in the boolean 'keep' column.
dfAll = dfAll.loc[dfAll['keep']]
print(dfAll.head(), dfAll.count(), sep='\n')

In [None]:
# Remove the listed weather aggregates from dfAll; weather columns not named
# here (e.g. TotalPrecip, read by a later cell) are left in place.
droppedWeatherColumns = [
    'MeanTemp', 'MinTemp',
    'MeanDewPoint', 'MinDewPoint', 'MaxDewPoint',
    'MinHumidity', 'MaxHumidity',
    'MeanPressure', 'MinPressure', 'MaxPressure',
    'MinWindSpeed',
    'MeanWindChill', 'MinWindChill', 'MaxWindChill',
    'MeanWindDirection',
]
dfAll = dfAll.drop(columns=droppedWeatherColumns)

In [None]:
# Independent copy of dfAll with a boolean 'Rain' flag: True where TotalPrecip is above zero.
dfRain = dfAll.copy(deep=True)
dfRain['Rain'] = dfRain['TotalPrecip'].gt(0).astype(bool)

In [None]:
# Split each geometry into its x and y coordinates.
# NOTE(review): x/y are in the frame's CRS; if that CRS is projected these are
# easting/northing rather than degrees of longitude/latitude — confirm.
rainPoints = dfRain['geometry']
dfRain['Longitude'] = rainPoints.x
dfRain['Latitude'] = rainPoints.y
print(dfRain.head(), dfRain.count(), sep='\n')

In [None]:
# Discretise both coordinates into 570 equal-width bins; labels=False stores the
# integer bin index and retbins=True also returns the bin edges.
longitudeCut = pd.cut(dfRain['Longitude'], bins=570, labels=False, retbins=True)
dfRain['LongitudeBin'], longBins = longitudeCut
latitudeCut = pd.cut(dfRain['Latitude'], bins=570, labels=False, retbins=True)
dfRain['LatitudeBin'], latBins = latitudeCut
print(dfRain.head(), dfRain.count(), longBins, latBins, sep='\n')

In [None]:
# Scatter plot of the binned coordinates: LongitudeBin on x, LatitudeBin on y.
# NOTE(review): `c` is a fixed colour name here, so `colormap` presumably has no visible effect — confirm and drop one of them.
dfRain.plot.scatter(x='LongitudeBin', y='LatitudeBin', c='blue', colormap='viridis')


In [None]:
# Write the final feature table to CSV without the index column.
dfRain.to_csv('Data/FinalFeature.csv', index=False)