In [2]:
%pip install geopandas pandas sqlalchemy psycopg2-binary openpyxl geoalchemy2 python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [4]:
import os

import geopandas as gpd
import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL

In [6]:
# Path of the .env file that is loaded for the database settings.
PATH_TO_DOT_ENV = "../.env"

# Scheme and host portions of the SQLAlchemy connection URL.
DATABASE_TYPE = "postgresql"
DATABASE_HOST = "localhost"

# Database table names: the two source tables that are read, and the table
# that receives the aggregated result.
# NOTE(review): "SUBDIVISON" and "AGGREGRATION" are misspelled; the names are
# referenced by other cells, so a rename has to be applied everywhere at once.
LIGHTNING_TABLE_NAME = "L"
SUBDIVISON_TABLE_NAME = "S"
LIGHTNING_SUBDIVISION_AGGREGRATION_TABLE_NAME = "L_s"

In [8]:
# Merge the entries of the .env file into the process environment, then read
# the connection settings from it. Any variable that is not set yields None.
load_dotenv(PATH_TO_DOT_ENV)

DATABASE_NAME = os.getenv("DATABASE_NAME")
POSTGRES_USER = os.getenv("POSTGRES_USER")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD")
POSTGRES_HOST_PORT = os.getenv("POSTGRES_HOST_PORT")
POSTGRES_CONTAINER_PORT = os.getenv("POSTGRES_CONTAINER_PORT")

In [9]:
# Assemble the connection URL from its parts instead of interpolating them
# into a string: URL.create() escapes characters such as "@", ":", "/" or "%"
# in the user name or password that would otherwise corrupt the URL.
# NOTE(review): this expects the raw (not percent-encoded) password in .env.
connection_url = URL.create(
    drivername=DATABASE_TYPE,
    username=POSTGRES_USER,
    password=POSTGRES_PASSWORD,
    host=DATABASE_HOST,
    # Environment values are strings, while URL.create() wants an integer port;
    # an unset port falls back to the driver's default.
    port=int(POSTGRES_HOST_PORT) if POSTGRES_HOST_PORT else None,
    database=DATABASE_NAME,
)
engine = create_engine(connection_url)

In [29]:
# Full-table reads of the two source tables. The table names are wrapped in
# double quotes so that PostgreSQL keeps their exact case.
query_lightning = f"""select * from "{LIGHTNING_TABLE_NAME}" l """
query_subdivision = f"""select * from "{SUBDIVISON_TABLE_NAME}" s """

# Index DDL for the aggregation table (one index on division_id, one on
# timestamp); "if not exists" keeps the statements safe to run repeatedly.
add_cluster_id_index = f"""create index if not exists idx_cluster_id on "{LIGHTNING_SUBDIVISION_AGGREGRATION_TABLE_NAME}" (division_id)"""
add_cluster_datetime_index = f"""create index if not exists idx_cluster_time on "{LIGHTNING_SUBDIVISION_AGGREGRATION_TABLE_NAME}" (timestamp)"""

In [10]:
# Run the lightning query and load the result as a GeoDataFrame, using the
# "geometry" column as the geometry and declaring its CRS as EPSG:4326.
lightning_gdf = gpd.read_postgis(query_lightning, engine, geom_col="geometry", crs="EPSG:4326")

In [13]:
# Run the subdivision query and load the result as a GeoDataFrame, using the
# "geometry" column as the geometry and declaring its CRS as EPSG:4326.
subdivision_gdf = gpd.read_postgis(query_subdivision, engine, geom_col="geometry", crs="EPSG:4326")

In [14]:
# Row count of the lightning table as loaded, before de-duplication.
len(lightning_gdf)

7659443

In [15]:
# Discard rows that are exact duplicates of an earlier row and renumber the
# index from zero.
lightning_gdf = lightning_gdf.drop_duplicates(ignore_index=True)

In [16]:
# Row count after de-duplication; compare with the count shown before it.
len(lightning_gdf)

7659443

In [17]:
# Number of rows loaded from the subdivision table.
len(subdivision_gdf)

19

In [18]:
# Keep every lightning row whose geometry lies within a subdivision geometry
# and attach that subdivision's columns to it (inner join: rows that match no
# subdivision are dropped). `predicate=` replaces the deprecated `op=` keyword,
# which newer geopandas releases no longer accept.
division_lightning_gdf = gpd.sjoin(lightning_gdf, subdivision_gdf, how="inner", predicate="within")

  if await self.run_code(code, result, async_=asy):


In [20]:
# Preview the first rows of the spatial-join result.
division_lightning_gdf.head()

Unnamed: 0,index,event_strength_kiloamperes,multiplicity,timestamp,geometry,index_right,cid
0,2091302,27.8,1,2020-01-04 02:24:45.435503,POINT (-126.45540 51.65900),16,71
33,2091335,146.1,1,2020-02-02 09:44:09.206257,POINT (-127.55540 50.33000),16,71
34,2091336,40.5,2,2020-02-02 08:57:49.432394,POINT (-127.80590 50.38300),16,71
35,2091337,42.3,1,2020-02-02 07:47:58.341129,POINT (-127.92800 50.53210),16,71
36,2091338,79.2,1,2020-02-02 07:51:15.851691,POINT (-127.90200 50.53680),16,71


In [21]:
def _flatten_column(levels):
    """Join the levels of one aggregated column label into a single name."""
    joined = "_".join(levels)
    # Columns that were not aggregated (the timestamp key) have an empty second
    # level, which leaves a trailing "_" that is removed here.
    return joined[:-1] if joined[-1] == "_" else joined


def _daily_summary(strikes, cluster_id):
    """Summarise the rows of one `cid` value into one row per calendar day."""
    daily = (
        strikes[strikes["cid"] == cluster_id]
        .groupby([pd.Grouper(key="timestamp", freq="1d", dropna=False)])
        .agg({
            "multiplicity": ["sum", "min", "max", "mean"],
            "event_strength_kiloamperes": ["mean", "min", "max"],
        })
        .reset_index()
    )
    # Collapse the two-level column labels, e.g. ("multiplicity", "sum")
    # becomes "multiplicity_sum".
    daily.columns = [_flatten_column(levels) for levels in daily.columns]
    # Record which `cid` value these daily rows belong to.
    daily["division_id"] = cluster_id
    return daily


# Build one daily summary per distinct `cid` and stack them into one table.
cluster_ids = division_lightning_gdf["cid"].unique()
division_dfs = [
    _daily_summary(division_lightning_gdf, cluster_id) for cluster_id in cluster_ids
]

# NOTE: this rebinds the join result's name to the aggregated table, so this
# cell cannot be re-run without first re-running the spatial join.
division_lightning_gdf = pd.concat(division_dfs)

In [25]:
# Remove every row that contains a missing value.
# NOTE(review): presumably these are the daily bins without any lightning
# rows, whose min/max/mean aggregates are NaN - confirm.
division_lightning_gdf = division_lightning_gdf.dropna()

In [26]:
# Display the aggregated table (pandas truncates the rendering of long frames).
division_lightning_gdf

Unnamed: 0,timestamp,multiplicity_sum,multiplicity_min,multiplicity_max,multiplicity_mean,event_strength_kiloamperes_mean,event_strength_kiloamperes_min,event_strength_kiloamperes_max,division_id
0,1999-02-01,4,1.0,1.0,1.000000,39.575000,22.6,70.4,71
3,1999-02-04,1,1.0,1.0,1.000000,80.300000,80.3,80.3,71
4,1999-02-05,4,1.0,1.0,1.000000,86.500000,24.9,191.1,71
5,1999-02-06,7,1.0,2.0,1.166667,82.033333,39.1,120.2,71
8,1999-02-09,2,1.0,1.0,1.000000,39.000000,17.6,60.4,71
...,...,...,...,...,...,...,...,...,...
8489,2022-06-25,411,1.0,4.0,1.090186,32.107427,5.4,177.1,29
8490,2022-06-26,19,1.0,2.0,1.266667,30.473333,10.3,110.5,29
8491,2022-06-27,904,1.0,5.0,1.177083,32.875130,5.1,272.1,29
8492,2022-06-28,194,1.0,4.0,1.141176,36.043529,3.5,255.6,29


In [30]:
# Write the aggregated table to the database, replacing any existing table of
# the same name; the DataFrame index is not stored as a column.
division_lightning_gdf.to_sql(
    LIGHTNING_SUBDIVISION_AGGREGRATION_TABLE_NAME,
    engine,
    index=False,
    if_exists="replace",
)

182

In [31]:
# Create the two indexes on the aggregation table.
# engine.begin() opens a transaction that is committed when the block exits
# normally and rolled back on error. The statements are wrapped in text()
# because SQLAlchemy 2.x no longer executes plain strings.
with engine.begin() as con:
    con.execute(text(add_cluster_id_index))
    con.execute(text(add_cluster_datetime_index))