In [30]:
%%writefile ../../src/utils/dataset.py
from enum import Enum

# Define the dataset enum
class Dataset(Enum):
    """Catalogue of the datasets used by the pipeline.

    Every member's value is a dict with the same five keys:

    * ``id``           -- short lowercase identifier of the dataset
    * ``index_col``    -- column used as the DataFrame index when loading
    * ``is_geo``       -- True when the dataset carries geometry
    * ``date_column``  -- name of the date column, or None
    * ``data_columns`` -- list of data column names, or None
    """

    FIRE = dict(
        id="fire",
        index_col="division_id",
        is_geo=False,
        date_column="start_date",
        data_columns=["start_date", "area_burnt_ha"],
    )

    SUBDIVISION = dict(
        id="subdivision",
        index_col="cid",
        is_geo=True,
        date_column=None,
        data_columns=None,
    )

    WEATHER = dict(
        id="weather",
        index_col="division_id",
        is_geo=False,
        date_column="date",
        # NOTE: spellings (e.g. "irradience") mirror the database column names.
        data_columns=[
            "extraterrestrial_irradiance",
            "global_horizontal_irradiance",
            "direct_normal_irradience",
            "diffuse_horizontal_irradiance",
            "global_horizontal_illumination_klux",
            "direct_normal_illumination_klux",
            "diffuse_horizontal_illumination_klux",
            "zenith_illumination",
            "sunlight_min",
            "ceiling_height_meters",
            "sky_layer_1",
            "sky_layer_2",
            "sky_layer_3",
            "sky_layer_4",
            "visibility_km",
            "weather_thunderstorm",
            "weather_rain",
            "weather_drizzle",
            "weather_snow_1",
            "weather_snow_2",
            "weather_ice",
            "weather_visibility_1",
            "weather_visibility_2",
            "pressure_kpa",
            "dry_bulb_temp_c",
            "dew_point_temp_c",
            "wind_direction_deg",
            "wind_speed_mps",
            "sky_cover",
            "sky_cover_opaque",
            "snow",
        ],
    )

    LIGHTNING = dict(
        id="lightning",
        index_col="division_id",
        is_geo=False,
        date_column="timestamp",
        data_columns=[
            "multiplicity_sum",
            "multiplicity_min",
            "multiplicity_max",
            "multiplicity_mean",
            "event_strength_kiloamperes_mean",
            "event_strength_kiloamperes_min",
            "event_strength_kiloamperes_max",
        ],
    )

Overwriting ../../src/utils/dataset.py


In [8]:
%%writefile ../../src/utils/generate_subdivision.py
from sqlalchemy.engine import URL
from sqlalchemy import create_engine, inspect, text
from pandas import read_sql, DataFrame
from geopandas import read_postgis, GeoDataFrame
from utils.dataset import Dataset


class GenSubdivision():
    """Loads fire, weather or lightning data grouped by subdivision.

    Data is read from PostgreSQL through SQLAlchemy. The result of the
    dataset query is cached in a table named after the dataset and the
    ``k``/``n``/``m`` settings, and returned grouped by the dataset's
    index column.
    """

    def __init__(
            self,
            n:int,
            m:int,
            k:int,
            d_full:Dataset,
            s:Dataset = Dataset.SUBDIVISION,
            db_url:URL = None
    ) -> None:
        """ Constructor for the subdivision class

        Args:
            n (int): num of historical days to consider in a datapoint
            m (int): num of months to look back at to consider non fire points
            k (int): num of nearest stations
            d_full (Dataset): type of dataset being processed into subdivisions
            s (Dataset, optional): subdivision data type. Defaults to Dataset.SUBDIVISION.
            db_url (URL, optional): database connection info. Defaults to None.
        """
        self.n = n
        self.m = m
        self.k = k
        self.d_full = d_full
        # Previously accepted but dropped; kept so get_subdivision_dataset can use it.
        self.s = s
        self.engine = create_engine(db_url)

    def __get_subdivion_data_query(self) -> str:
        """Return the query selecting every row of the "S" table."""
        query =  """SELECT * FROM "S";"""
        return query

    def __get_lightning_data_query(self) -> str:
        """Return the query selecting every row of the "L_s" table."""
        query =  """SELECT * FROM "L_s";"""
        return query

    def __get_fire_data_query(self) -> str:
        """Return the query selecting fires from "F_s" whose cause is 'L'."""
        query = """
            SELECT
                fs.division_id,
                fs.start_date,
                fs.area_burnt_ha
            FROM
                "F_s" fs
            WHERE
                fs.cause = 'L'
        """
        return query

    def __create_cache_table(
        self,
        table_name:str,
        query:str,
    ) -> None:
        """Materialise ``query`` as table ``table_name`` and add its primary key.

        Args:
            table_name (str): name of the table to create
            query (str): SELECT statement whose result populates the table
        """
        create_query = f"""
        CREATE TABLE IF NOT EXISTS "{table_name}"
        as (
            {query}
        );
        """
        add_pk_query = (
            f'ALTER TABLE "{table_name}" ADD PRIMARY KEY '
            '("division_id", "climate_ID", "area_burnt_ha", "start_date");'
        )

        # engine.begin() commits when the block succeeds and rolls back on
        # error. A plain engine.connect() never commits under SQLAlchemy 2.x,
        # so the table would be discarded when the connection closes. Running
        # both statements in one transaction also avoids leaving behind a
        # table without its primary key.
        with self.engine.begin() as con:
            con.execute(text(create_query))
            con.execute(text(add_pk_query))

    def __get_cache_k_nearest_table_query(self) -> str:
        """Return the query pairing each "F_s" fire with its k nearest stations.

        Stations come from "W_ms" and are ordered by ``geometry <-> geometry``
        (presumably the PostGIS distance operator). Only stations whose
        ``first_yr``/``last_yr`` strictly bracket the fire's start year are
        considered.
        """
        query = f"""
        select
            f.division_id,
            f.start_date,
            f.area_burnt_ha,
            w."climate_ID"
        from (
            select
                division_id,
                start_date,
                area_burnt_ha,
                cause,
                geometry
            from "F_s" as fs
        ) as f
        cross join lateral (
            select
                wm."climate_ID",
                wm."geometry" <-> f."geometry" as distance
            from (
                select
                    wms."climate_ID",
                    wms."geometry",
                    wms."first_yr",
                    wms."last_yr"
                from "W_ms" as wms
            ) as wm
            where
                EXTRACT(year FROM f.start_date) > wm.first_yr and
                EXTRACT(year FROM f.start_date) < wm.last_yr
            order by distance
            limit {int(self.k)}
        ) as w
        """
        return query

    def __get_cache_weather_table_name(self) -> str:
        """Return the name of the k-nearest-station cache table for this ``k``."""
        return f"W_m_k{self.k}"

    def __get_weather_data_query(self) -> str:
        """Return the weather query, creating the k-nearest cache table if needed.

        The query joins the k-nearest cache table for this instance's ``k``
        with "W_sp" and keeps the weather rows dated from ``n`` days plus
        ``m`` months before each fire's start date up to the start date.
        """
        table_name = self.__get_cache_weather_table_name()
        table_exists = inspect(self.engine).has_table(table_name, schema="public")
        if table_exists:
            print(f"Cache Weather table found for k={self.k}!")
        else:
            print(f"Table does not exist for weather cache k={self.k}!!! Creating table.")
            self.__create_cache_table(
                table_name = table_name,
                query = self.__get_cache_k_nearest_table_query()
            )

        # Coerce to int so only numeric literals are interpolated into the SQL.
        n_days = int(self.n)
        n_months = int(self.m)
        query = f"""
            select
                wmk.division_id,
                wmk.start_date,
                wmk.area_burnt_ha,
                wsk.date,
                wsk.extraterrestrial_irradiance,
                wsk.global_horizontal_irradiance,
                wsk.direct_normal_irradience,
                wsk.diffuse_horizontal_irradiance,
                wsk.global_horizontal_illumination_klux,
                wsk.direct_normal_illumination_klux,
                wsk.diffuse_horizontal_illumination_klux,
                wsk.zenith_illumination,
                wsk.sunlight_min,
                wsk.ceiling_height_meters,
                wsk.sky_layer_1,
                wsk.sky_layer_2,
                wsk.sky_layer_3,
                wsk.sky_layer_4,
                wsk.visibility_km,
                wsk.weather_thunderstorm,
                wsk.weather_rain,
                wsk.weather_drizzle,
                wsk.weather_snow_1,
                wsk.weather_snow_2,
                wsk.weather_ice,
                wsk.weather_visibility_1,
                wsk.weather_visibility_2,
                wsk.pressure_kpa,
                wsk.dry_bulb_temp_c,
                wsk.dew_point_temp_c,
                wsk.wind_direction_deg,
                wsk.wind_speed_mps,
                wsk.sky_cover,
                wsk.sky_cover_opaque,
                wsk.snow
            from "{table_name}" wmk
            inner join "W_sp" wsk
                on wsk.climate_id = wmk."climate_ID"
            where
                DATE(wsk.date) <= DATE(wmk.start_date) and
                DATE(wmk.start_date) - make_interval(days => {n_days}) - make_interval(months => {n_months}) <= DATE(wsk.date)
        """
        return query

    def __read_geodata(
            self,
            query:str,
            index_col:str = None,
            geom_col:str = 'geometry',
            crs:str = "EPSG:4326"
    ) -> GeoDataFrame:
        """Run ``query`` through geopandas ``read_postgis`` and return the result.

        Args:
            query (str): SQL to execute
            index_col (str, optional): column to use as the index. Defaults to None.
            geom_col (str, optional): geometry column name. Defaults to 'geometry'.
            crs (str, optional): CRS passed to ``read_postgis``. Defaults to "EPSG:4326".
        """
        data = read_postgis(
            sql = query,
            con = self.engine,
            geom_col = geom_col,
            index_col = index_col,
            crs = crs
        )
        return data

    def __read_data(
            self,
            query:str,
            index_col:str = None,
    ) -> DataFrame:
        """Run ``query`` through pandas ``read_sql`` and return the result.

        Args:
            query (str): SQL to execute
            index_col (str, optional): column to use as the index. Defaults to None.
        """
        data = read_sql(
            sql = query,
            con = self.engine,
            index_col = index_col
        )
        return data

    def get_subdivision_dataset(self) -> GeoDataFrame:
        """Return the "S" table as a GeoDataFrame indexed by the subdivision index column."""
        subdivision_data_query = self.__get_subdivion_data_query()
        subdivision_data = self.__read_geodata(
            subdivision_data_query,
            index_col = self.s.value['index_col']
        )
        return subdivision_data

    def __get_data_query(self) -> str:
        """ Generates the query to get the appropriate data

        Raises:
            ValueError: The dataset type is invalid.

        Returns:
            str: dataset query
        """
        if self.d_full == Dataset.LIGHTNING:
            return self.__get_lightning_data_query()
        elif self.d_full == Dataset.WEATHER:
            return self.__get_weather_data_query()
        elif self.d_full == Dataset.FIRE:
            return self.__get_fire_data_query()
        else:
            raise ValueError("Invalid return dataset type!!!")

    def __get_full_data_cache_table_name(self) -> str:
        """Return the cache table name for this dataset and k/n/m combination."""
        return f"{self.d_full.name}_K{self.k}_N{self.n}_M{self.m}"

    def __get_full_data_cache_query(self) -> str:
        """Return the query selecting every row of the full-data cache table."""
        table_name = self.__get_full_data_cache_table_name()
        return f"""SELECT * from "{table_name}"; """

    def gen_subdivisions(self):
        """Load the dataset (from cache when available) grouped by its index column.

        When the cache table is missing, the dataset query is executed and
        its result is written to the cache table before grouping.

        Raises:
            ValueError: the dataset type has no data query (cache miss only).

        Returns:
            pandas groupby object keyed by the dataset's index column.
        """
        cache_table_name = self.__get_full_data_cache_table_name()
        index_col = self.d_full.value['index_col']

        # Check if data is cached
        table_exists = inspect(self.engine).has_table(cache_table_name, schema="public")
        if table_exists:
            print(f"Cache of {cache_table_name} found!")
            # load cache data
            data = self.__read_data(
                self.__get_full_data_cache_query(),
                index_col = index_col
            )
        else:
            print(f"Cache of {cache_table_name} not found!")
            # process data
            data_query = self.__get_data_query()
            if self.d_full.value['is_geo']:
                data = self.__read_geodata(data_query, index_col = index_col)
            else:
                data = self.__read_data(data_query, index_col = index_col)

            # cache data
            data.to_sql(
                name=cache_table_name,
                con=self.engine,
                if_exists='replace',
                index=True
            )

        return data.groupby(by = index_col)

Overwriting ../../src/utils/generate_subdivision.py


In [1]:
# Environment handling: os reads variables, load_dotenv loads them from a .env file.
import os
from dotenv import load_dotenv

# URL builds the database connection URL passed to GenSubdivision.
from sqlalchemy.engine import URL

# Put the project's src directory on sys.path so the `utils` modules written
# by the %%writefile cells (dataset.py, generate_subdivision.py) can be imported.
import sys
src_path = "../../src/"
sys.path.append(src_path)
from utils.generate_subdivision import GenSubdivision
from utils.dataset import Dataset

In [2]:
# Location of the .env file holding the database settings (relative to this notebook).
PATH_TO_DOT_ENV = "../../.env"

# Non-secret connection settings used when building DATABASE_URL.
DATABASE_TYPE = "postgresql"
DATABASE_HOST = "localhost"


In [3]:
# Load the .env file into the process environment, then read the database
# settings. os.getenv returns None for any variable that is not defined.
load_dotenv(PATH_TO_DOT_ENV)

DATABASE_NAME = os.getenv("DATABASE_NAME")
POSTGRES_USER = os.getenv("POSTGRES_USER")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD")
POSTGRES_HOST_PORT = os.getenv("POSTGRES_HOST_PORT")
POSTGRES_CONTAINER_PORT = os.getenv("POSTGRES_CONTAINER_PORT")

In [4]:
# Assemble the connection URL from the settings above; URL.create takes the
# password as plain (unescaped) text.
DATABASE_URL = URL.create(
    drivername=DATABASE_TYPE,
    username=POSTGRES_USER,
    password=POSTGRES_PASSWORD,
    host=DATABASE_HOST,
    port=POSTGRES_HOST_PORT,
    database=DATABASE_NAME,
)

In [25]:
# Subdivision whose records are inspected in the following cells.
DIVISION_ID = 71

# Settings shared by the three generators below.
N_DAYS = 7
M_MONTHS = 1
K_NEAREST = 1


def make_generator(dataset):
    """Build a GenSubdivision for `dataset` using the shared notebook settings."""
    return GenSubdivision(
        d_full = dataset,
        db_url = DATABASE_URL,
        n = N_DAYS,
        m = M_MONTHS,
        k = K_NEAREST,
    )


# get_group raises KeyError when the division is missing. The previous
# `for ... break` lookup silently kept the last group in that case.
fire_generator = make_generator(Dataset.FIRE)
f_map = fire_generator.gen_subdivisions()
f_data = f_map.get_group(DIVISION_ID)

w_generator = make_generator(Dataset.WEATHER)
w_map = w_generator.gen_subdivisions()
w_data = w_map.get_group(DIVISION_ID)

l_generator = make_generator(Dataset.LIGHTNING)
l_map = l_generator.gen_subdivisions()
l_data = l_map.get_group(DIVISION_ID)

Cache Weather table found for k=1!


In [26]:
# Fire records (start date, burnt area) for the subdivision selected above.
f_data

Unnamed: 0_level_0,start_date,area_burnt_ha
division_id,Unnamed: 1_level_1,Unnamed: 2_level_1
71,1919-07-01,1094.543856
71,1919-07-25,517.811840
71,1919-07-30,1504.863833
71,1919-08-01,7.385316
71,1919-08-01,78.171012
...,...,...
71,2020-08-17,34.179079
71,2020-08-17,380.653945
71,2020-08-18,3.529941
71,2020-08-18,6.553449


In [27]:
# Lightning records for the subdivision selected above.
l_data

Unnamed: 0_level_0,timestamp,multiplicity_sum,multiplicity_min,multiplicity_max,multiplicity_mean,event_strength_kiloamperes_mean,event_strength_kiloamperes_min,event_strength_kiloamperes_max
division_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
71,1999-02-01,4,1.0,1.0,1.000000,39.575000,22.6,70.4
71,1999-02-04,1,1.0,1.0,1.000000,80.300000,80.3,80.3
71,1999-02-05,4,1.0,1.0,1.000000,86.500000,24.9,191.1
71,1999-02-06,7,1.0,2.0,1.166667,82.033333,39.1,120.2
71,1999-02-09,2,1.0,1.0,1.000000,39.000000,17.6,60.4
...,...,...,...,...,...,...,...,...
71,2022-06-20,4,1.0,1.0,1.000000,35.825000,25.8,60.1
71,2022-06-22,20,1.0,2.0,1.052632,23.747368,10.2,56.6
71,2022-06-23,21,1.0,2.0,1.105263,34.752632,12.1,107.8
71,2022-06-28,271,1.0,3.0,1.101626,26.507317,2.7,125.2


In [28]:
# Weather rows joined to each fire for the subdivision selected above;
# every fire (start_date, area_burnt_ha) repeats once per matching weather date.
w_data

Unnamed: 0_level_0,start_date,area_burnt_ha,date,extraterrestrial_irradiance,global_horizontal_irradiance,direct_normal_irradience,diffuse_horizontal_irradiance,global_horizontal_illumination_klux,direct_normal_illumination_klux,diffuse_horizontal_illumination_klux,...,weather_visibility_1,weather_visibility_2,pressure_kpa,dry_bulb_temp_c,dew_point_temp_c,wind_direction_deg,wind_speed_mps,sky_cover,sky_cover_opaque,snow
division_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
71,2013-07-07,0.203668,2013-06-17,1738.208333,1105.541667,952.958333,398.833333,34.704167,28.654167,13.379167,...,0.0,0.0,101.222917,17.108333,12.604167,186.666667,2.904167,7.125000,6.750000,0.0
71,2013-07-07,0.203668,2013-06-16,1737.208333,967.958333,616.458333,513.958333,30.833333,18.591667,17.112500,...,0.0,0.0,101.326250,17.820833,12.087500,143.333333,1.895833,7.750000,6.708333,0.0
71,2013-07-07,0.203668,2013-06-15,1736.125000,960.875000,588.125000,528.125000,30.629167,17.720833,17.554167,...,0.0,0.0,101.300000,16.025000,10.670833,117.083333,1.670833,7.666667,5.333333,0.0
71,2013-07-07,0.203668,2013-06-14,1734.750000,1018.666667,852.166667,477.041667,32.120833,24.887500,15.900000,...,0.0,0.0,101.795833,15.158333,10.862500,179.166667,3.600000,6.125000,5.083333,0.0
71,2013-07-07,0.203668,2013-06-13,1733.291667,954.000000,718.333333,418.750000,30.091667,21.604167,13.925000,...,0.0,0.0,101.826667,14.433333,9.333333,190.000000,2.162500,5.583333,5.041667,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,2009-07-25,863.074895,2009-07-11,1696.166667,1192.500000,1284.708333,336.166667,37.200000,37.883333,11.475000,...,0.0,0.0,99.041667,20.875000,11.050000,152.916667,1.625000,0.000000,0.000000,1.0
71,2009-07-25,863.074895,2009-07-10,1700.250000,1135.041667,1104.166667,418.125000,35.650000,32.479167,14.129167,...,0.0,0.0,99.292500,20.154167,11.525000,122.916667,1.550000,0.000000,0.000000,1.0
71,2009-07-25,863.074895,2009-07-09,1704.125000,870.416667,527.958333,522.166667,27.795833,15.766667,17.312500,...,0.0,0.0,99.260833,17.458333,10.737500,234.166667,1.850000,0.000000,0.000000,1.0
71,2009-07-25,863.074895,2009-07-08,1707.791667,609.625000,144.583333,509.958333,19.633333,4.354167,16.629167,...,0.0,0.0,99.030417,14.895833,10.195833,215.416667,2.004167,0.000000,0.000000,1.0


In [29]:
# Count weather rows per burnt-area value, used here as a stand-in for a fire id
# (assumes burnt areas are unique per fire — TODO confirm). The 38-39 rows per
# value match the 7-day + 1-month look-back window of the weather query.
w_data['area_burnt_ha'].value_counts()

area_burnt_ha
0.203668       39
1146.301656    39
89.085929      39
13.067582      39
94.280205      39
               ..
0.109925       38
4303.334354    38
766.135392     38
474.993860     38
863.074895     38
Name: count, Length: 491, dtype: int64