In [1]:
# Install the PyTorch packages into the environment of the running kernel.
# TODO(review): pin exact versions (pkg==X.Y.Z) so a fresh run reproduces the same setup.
%pip install torch torchvision torchaudio

Note: you may need to restart the kernel to use updated packages.


In [2]:
%%writefile ../../src/utils/target_types.py
from enum import Enum

# Define the target-type enum.
class DTarget(Enum):
    """Selects which kind of label a data generator produces for each sample."""
    # Binary label: STASDataGenerator emits 1 for fire samples and 0 otherwise.
    BOOLEAN = 0
    # Burned-area label: STASDataGenerator emits the group's area value for
    # fire samples and 0 otherwise.
    AREA = 1

Overwriting ../../src/utils/target_types.py


In [3]:
%%writefile ../../src/utils/stas.py
from pandas import DataFrame, DateOffset, merge
from torch import tensor, float32
import sys
from utils.dataset import Dataset
from utils.target_types import DTarget

class STASDataGenerator():
    """Turn merged fire/event records into standardized train/test tensors.

    Construction runs the whole pipeline (see ``generate``): the data frame
    ``d`` is merged with the fire frame ``f`` on ``division_id``, "fire" and
    "non-fire" windows are selected and grouped, each group is flattened into
    one feature row, and the rows are split 80/20 and standardized with the
    statistics of the training part.

    Attributes produced by the pipeline:
        train_x, train_y, test_x, test_y: ``float32`` tensors.
        mean, std: per-column statistics of the training rows used for the
            standardization (zero or undefined ``std`` entries are stored as 1).
    """

    def __init__(
            self,
            k:int,
            n:int,
            m:int,
            d_target:"DTarget",
            d_station:bool,
            d:DataFrame,
            d_type:"Dataset",
            f:DataFrame,
            random_state=None,
    ) -> None:
        """Store the configuration and immediately build the tensors.

        Args:
            k: Stored on the instance; no method of this class reads it yet.
            n: Window length in days; every sample gets ``n + 1`` groups of
                feature columns.
            m: Offset in months of the non-fire window.
            d_target: Kind of label to emit (see ``DTarget``).
            d_station: ``True`` selects the k-nearest-station path (not
                implemented), ``False`` the pre-aggregated path.
            d: Data frame merged with ``f`` on ``division_id``.
            d_type: Dataset member describing ``d``; its
                ``value['data_columns']`` lists the feature columns.
            f: Fire frame merged with ``d`` on ``division_id``.
            random_state: Forwarded to ``DataFrame.sample`` for the train/test
                split; ``None`` keeps the split unseeded.
        """
        self.k = k
        self.n = n
        self.m = m
        self.d_target = d_target
        self.d_station = d_station
        self.d_type = d_type
        self.d = d
        self.f = f
        self.random_state = random_state

        self.generate()

    def get_processed_data(self):
        """Return the fire frame merged with the prepared data frame.

        The merge is done once here so the window selections do not have to
        repeat it.
        """
        if self.d_station:
            d_p = self._get_k_nearest_station_data()
        else:
            d_p = self._get_aggregrated_dataset()
        # Merge the frame prepared above, not the raw input: the result of the
        # branch must not be discarded.
        return merge(
            left = self.f,
            right = d_p,
            on = 'division_id'
        )

    def _get_k_nearest_station_data(self):
        """Placeholder for the station-based path.

        Raises:
            NotImplementedError: always; fail loudly instead of silently
                falling back to the unprocessed frame.
        """
        raise NotImplementedError(
            "k-nearest-station data selection (d_station=True) is not implemented yet"
        )

    def _get_aggregrated_dataset(self):
        """Return ``self.d`` unchanged.

        The aggregation itself was already completed in the preprocessing
        notebooks to save on computation.
        """
        assert self.d_type == Dataset.LIGHTNING, "ONLY event-based data can be passed to this function!"
        return self.d

    def get_fire_points(self, d_p):
        """Group the rows whose ``start_date`` lies in the ``n`` days up to ``timestamp``.

        Rows are kept when ``timestamp - n days <= start_date <= timestamp``
        and are then grouped by the fire data columns.
        """
        # NOTE(review): presumably `start_date` comes from the fire frame and
        # `timestamp` from the data frame - confirm that the direction of this
        # window is the intended one.
        window_end = d_p["timestamp"]
        window_start = d_p["timestamp"]-DateOffset(days=self.n)
        in_window = (d_p["start_date"] <= window_end) & (d_p["start_date"] >= window_start)
        return d_p[in_window].groupby(Dataset.FIRE.value['data_columns'])

    def get_non_fire_points(self, d_p):
        """Group the rows of the same-length window shifted back by ``m`` months.

        Rows are kept when ``start_date`` lies between
        ``timestamp - (m months, n days)`` and ``timestamp - m months``, and
        are then grouped by the fire data columns.
        """
        window_end = d_p["timestamp"]-DateOffset(months=self.m)
        window_start = d_p["timestamp"]-DateOffset(months=self.m,days=self.n)
        in_window = (d_p["start_date"] <= window_end) & (d_p["start_date"] >= window_start)
        return d_p[in_window].groupby(Dataset.FIRE.value['data_columns'])

    def __del_spatio_temporal_info(self, df:DataFrame):
        """Keep only the feature columns listed by ``d_type``."""
        return df[self.d_type.value['data_columns']]

    def __flatten_data(self, df:DataFrame):
        """Flatten ``df`` row by row into a plain list.

        ``DataFrame.stack`` is deliberately not used: by default it drops NaN
        entries, which would shift every following value into the wrong
        feature column.
        """
        return df.to_numpy().ravel().tolist()

    def _target_value(self, is_fire, area_burn):
        """Return the label for one sample, or ``None`` for an unknown target type."""
        if self.d_target == DTarget.BOOLEAN:
            return 1 if is_fire else 0
        if self.d_target == DTarget.AREA:
            return area_burn if is_fire else 0
        return None

    @staticmethod
    def _standardize(dataset_df, train_index):
        """Standardize ``dataset_df`` with the statistics of its training rows.

        Returns:
            ``(standardized_frame, mean, std)``. Columns whose training std is
            zero or undefined are divided by 1 so they do not become NaN/inf.
        """
        mean = dataset_df.loc[train_index].mean()
        std = dataset_df.loc[train_index].std()
        # `~(std > 0)` is True for zero as well as for NaN entries.
        std[~(std > 0)] = 1.0
        return (dataset_df - mean) / std, mean, std

    def get_dataset(self, e_f, e_nf):
        """Build the standardized train/test tensors from the grouped windows.

        Args:
            e_f: Grouped fire windows (result of ``get_fire_points``).
            e_nf: Grouped non-fire windows (result of ``get_non_fire_points``).

        Sets ``train_x``, ``train_y``, ``test_x``, ``test_y``, ``mean`` and
        ``std`` on the instance.
        """
        dataset_columns = [f"{column}_{n}" for n in range(self.n+1) for column in self.d_type.value['data_columns']]
        width = len(dataset_columns)

        dataset = []
        targets = []
        # Fire groups first, then non-fire groups, so the row order is stable.
        for is_fire, groups in ((True, e_f), (False, e_nf)):
            # NOTE(review): the unpacking assumes the groups are keyed by
            # exactly two columns, (fire date, burned area) - TODO confirm
            # against Dataset.FIRE.value['data_columns'].
            for (_fire_date, area_burn), datapoint_df in groups:
                targets.append(self._target_value(is_fire, area_burn))
                # Drop spatial/temporal columns, then flatten to one row.
                datapoint = self.__flatten_data(
                    self.__del_spatio_temporal_info(datapoint_df)
                )
                # Pad short windows so the width does not depend on another
                # row happening to be complete; padding is zero-filled below.
                datapoint.extend([float("nan")] * (width - len(datapoint)))
                dataset.append(datapoint)

        target_df = DataFrame({
            "target": targets
        })
        dataset_df = DataFrame(
            data = dataset,
            columns = dataset_columns
        ).fillna(0)

        # Randomly sample train and test rows (80/20).
        train_index = dataset_df.sample(frac=0.8, random_state=self.random_state).index
        test_index = dataset_df.drop(train_index).index

        # Standardize with training statistics only.
        dataset_df, self.mean, self.std = self._standardize(dataset_df, train_index)

        # Split into tensors; the intermediate frames are locals and are not
        # kept on the instance.
        self.train_x = tensor(
            dataset_df.loc[train_index].values,
            dtype=float32
        )
        self.train_y = tensor(
            target_df.loc[train_index].values,
            dtype=float32
        )
        self.test_x = tensor(
            dataset_df.loc[test_index].values,
            dtype=float32
        )
        self.test_y = tensor(
            target_df.loc[test_index].values,
            dtype=float32
        )

    def generate(self):
        """Run the pipeline: merge, select both windows, build the tensors."""
        d_p = self.get_processed_data()
        e_f = self.get_fire_points(d_p = d_p)
        e_nf = self.get_non_fire_points(d_p = d_p)
        self.get_dataset(e_f=e_f, e_nf=e_nf)

Overwriting ../../src/utils/stas.py


In [13]:
import os
from dotenv import load_dotenv

from sqlalchemy.engine import URL

import sys
src_path = "../../src/"
sys.path.append(src_path)
from utils.generate_subdivision import GenSubdivision
from utils.dataset import Dataset
from utils.stas import STASDataGenerator
from utils.target_types import DTarget

In [5]:
# Location of the .env file that is passed to load_dotenv.
PATH_TO_DOT_ENV = "../../.env"

# Connection settings passed to URL.create (driver name and host).
DATABASE_TYPE = "postgresql"
DATABASE_HOST = "localhost"

# Parameters handed to STASDataGenerator as k, n and m.
K = 1  # stored by the generator but not read in its visible code; presumably the number of nearest stations - TODO confirm
N = 7  # window length in days; each sample gets N+1 groups of feature columns
M = 3  # offset in months of the non-fire window


In [6]:
# Load the .env file, then read the database settings from the environment.
load_dotenv(PATH_TO_DOT_ENV)

# os.getenv yields None for any variable that is not set.
DATABASE_NAME = os.getenv("DATABASE_NAME")
POSTGRES_USER = os.getenv("POSTGRES_USER")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD")
POSTGRES_HOST_PORT = os.getenv("POSTGRES_HOST_PORT")
POSTGRES_CONTAINER_PORT = os.getenv("POSTGRES_CONTAINER_PORT")

In [7]:
# Assemble the SQLAlchemy connection URL from the configuration values.
DATABASE_URL = URL.create(
    drivername=DATABASE_TYPE,
    username=POSTGRES_USER,
    # The password is passed as plain (unescaped) text.
    password=POSTGRES_PASSWORD,
    host=DATABASE_HOST,
    port=POSTGRES_HOST_PORT,
    database=DATABASE_NAME,
)

In [8]:
# Generate the subdivisions of the lightning dataset from the database.
# NOTE(review): the result is later iterated and unpacked as (id, data) pairs;
# the exact return type of gen_subdivisions is not visible here.
lightning_generator = GenSubdivision(
    d_full = Dataset.LIGHTNING,
    db_url = DATABASE_URL
)
lightning_d_map = lightning_generator.gen_subdivisions()

In [9]:
# Generate the subdivisions of the fire dataset from the same database.
# NOTE(review): the result is later iterated and unpacked as (id, data) pairs;
# the exact return type of gen_subdivisions is not visible here.
fire_generator = GenSubdivision(
    d_full = Dataset.FIRE,
    db_url = DATABASE_URL
)
fire_d_map = fire_generator.gen_subdivisions()

In [10]:
# Take the first item of each map and unpack it into (subdivision id, data).
# next() without a default raises StopIteration when a map yields nothing.
lightning_s_id, lightning_d = next(iter(lightning_d_map))
fire_s_id, fire_d = next(iter(fire_d_map))

In [11]:
# Both frames must belong to the same subdivision before they are combined;
# the message shows the two ids when they differ.
assert lightning_s_id == fire_s_id, (
    f"subdivision mismatch: lightning={lightning_s_id!r}, fire={fire_s_id!r}"
)

In [14]:
# Build the tensors for the first subdivision. The constructor runs the whole
# generation pipeline, so train_x/train_y/test_x/test_y exist afterwards.
# NOTE(review): the name `stas_generatr` is misspelled; it is left unchanged
# because the following cell references it.
stas_generatr = STASDataGenerator(
    k=K,
    n=N,
    m=M,
    d_target=DTarget.BOOLEAN,  # labels: 1 for fire samples, 0 for non-fire samples
    d_station=False,  # use the pre-aggregated (event-based) path
    d=lightning_d,
    d_type=Dataset.LIGHTNING,
    f=fire_d
)

In [20]:
# Test that the dataset has (N+1) x (number of features) columns, i.e. one
# group of data columns for each of the N+1 slots.
assert stas_generatr.test_x.shape[1] == (N+1)*len(Dataset.LIGHTNING.value['data_columns'])