In [1]:
# %pip install geopandas pandas sqlalchemy psycopg2-binary openpyxl geoalchemy2 python-dotenv tqdm

# Import Block

In [2]:
import os

import geopandas as gpd
import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine
from sqlalchemy.engine import URL
from tqdm import tqdm

# Global Constants

In [3]:
# --- File locations ---------------------------------------------------------
# Destination of the CSV cache written once the flashes have been processed.
PREPROCESSED_POSITIVE_FLASHES_2016_2022 = "../../data/lighting/porcessed_lightning_csv/2016-2022-positive-cg-flashes.csv"
# .env file that holds the database settings.
PATH_TO_DOT_ENV = "../../.env"

# --- Raw file schema --------------------------------------------------------
# Names given, in order, to the columns of the header-less raw monthly files.
LIGHTNING_COLUMNS = [
    "date",
    "time",
    "lat",
    "long",
    "event_strength_kiloamperes",
    "chi_square",
    "major_axies_confidence",
    "minor_axies_confidence",
    "angle_of_confidence",
    "multiplicity",
    "cloud_or_ground",
    "na",
]

# --- Period to process (both ends are inclusive in the loading loop) --------
START_YEAR = 2016
END_YEAR = 2022
START_MONTH = 1
END_MONTH = 12

# --- Database ---------------------------------------------------------------
DATABASE_TYPE = "postgresql"
DATABASE_HOST = "localhost"

# Table names used when reading the boundary and writing the flashes.
LIGHTNING_TABLE_NAME = "L"
CANADIAN_BOUNDARY_TABLE_NAME = "C"

In [4]:
# Load the variables defined in the .env file, then pick out the database
# settings. Any variable that is not set comes back as None.
load_dotenv(PATH_TO_DOT_ENV)

DATABASE_NAME = os.getenv("DATABASE_NAME")
POSTGRES_USER = os.getenv("POSTGRES_USER")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD")
POSTGRES_HOST_PORT = os.getenv("POSTGRES_HOST_PORT")
POSTGRES_CONTAINER_PORT = os.getenv("POSTGRES_CONTAINER_PORT")

In [5]:
# Assemble the connection URL from its parts rather than by string formatting,
# so characters such as "@", ":" or "/" in the user name or password are
# escaped instead of corrupting the URL.
database_url = URL.create(
    drivername=DATABASE_TYPE,
    username=POSTGRES_USER,
    password=POSTGRES_PASSWORD,
    host=DATABASE_HOST,
    # Environment values are strings; an unset port falls back to the driver default.
    port=int(POSTGRES_HOST_PORT) if POSTGRES_HOST_PORT else None,
    database=DATABASE_NAME,
)
engine = create_engine(database_url)

# Helper Functions

In [6]:
def dataset_location(year: int, month: int) -> str:
    """Return the relative path of the raw lightning file for one month.

    The file name is the year followed by the month, where months below 10
    are prefixed with a single "0", and ends in ``f.txt``.

    Args:
        year: Calendar year, e.g. 2016.
        month: Calendar month number.

    Returns:
        The path of the monthly file inside the raw-data directory.
    """
    zero_pad = "0" if month < 10 else ""
    return f"../../data/lighting/2016-2022-raw/{year}{zero_pad}{month}f.txt"


## Load Canadian geometry for filtering

In [7]:
# Read every row of the Canadian boundary table into a GeoDataFrame. The
# geometry lives in the "geometry" column and the CRS is set to EPSG:4326,
# the same CRS the lightning points are created with.
query_canada_boundary = f"""SELECT * from "{CANADIAN_BOUNDARY_TABLE_NAME}" """

canada_gdf = gpd.read_postgis(query_canada_boundary, engine, geom_col="geometry", crs="EPSG:4326")

## Filtering and merging lightning data

In [8]:
# Build a single GeoDataFrame of positive cloud-to-ground flashes inside the
# Canadian boundary: every monthly raw file is read, filtered, converted to
# points and clipped; the monthly results are concatenated at the end.
lighting_df = []  # one clipped GeoDataFrame per month that had matching flashes

for year in range(START_YEAR, END_YEAR + 1):

    lightning_year_data_progress_bar = tqdm(
        range(START_MONTH, END_MONTH + 1),
        desc=f"{year}"
    )

    for month in lightning_year_data_progress_bar:

        lightning_csv_path = dataset_location(year, month)

        try:
            month_lighting_data = pd.read_csv(
                lightning_csv_path,
                sep=" ",
                header=None,
                names=LIGHTNING_COLUMNS
            )

            # Keep ground events ('G') with a non-negative strength
            # (the >= comparison also keeps zero-strength rows).
            is_positive_ground_flash = (
                (month_lighting_data['event_strength_kiloamperes'] >= 0)
                & (month_lighting_data['cloud_or_ground'] == 'G')
            )
            filtered_lightning_df = month_lighting_data[is_positive_ground_flash]
            # The unfiltered month is no longer needed; release it before clipping.
            del month_lighting_data, is_positive_ground_flash

            if len(filtered_lightning_df) > 0:
                geo_lighting = gpd.GeoDataFrame(
                    filtered_lightning_df[['date', 'time', 'event_strength_kiloamperes', 'multiplicity']],
                    crs="EPSG:4326",
                    geometry=gpd.points_from_xy(filtered_lightning_df.long, filtered_lightning_df.lat)
                )
                del filtered_lightning_df
                canada_lightning = geo_lighting.clip(canada_gdf)
                del geo_lighting
                lighting_df.append(canada_lightning)
                del canada_lightning
            else:
                # Include the year: the loop spans several years, so the month alone is ambiguous.
                print(f"No Data for YEAR: {year} MONTH: {month}")
        except FileNotFoundError as e:
            # No raw file for this month (e.g. months that were never downloaded).
            print(f"No Data for YEAR: {year} MONTH: {month} --> {e}")
        except Exception as e:
            # Stay best-effort and move on to the next month, but do not
            # disguise a genuine failure (parse error, bad column, ...) as "no data".
            print(f"Failed to process YEAR: {year} MONTH: {month} --> {type(e).__name__}: {e}")

        lightning_year_data_progress_bar.set_postfix_str(lightning_csv_path)

if not lighting_df:
    # pd.concat would fail on an empty list with a far less helpful message.
    raise ValueError("No lightning data was loaded; check the raw data directory and the year/month range.")

# ignore_index=True gives the merged frame a fresh 0..n-1 index.
positive_flashes_gpd = pd.concat(lighting_df, ignore_index=True)
del lighting_df
print("Finished building positive lighting flashes geo data!!!")

2016:   0%|          | 0/12 [00:00<?, ?it/s]

2016: 100%|██████████| 12/12 [29:10<00:00, 145.90s/it, ../../data/lighting/2016-2022-raw/201612f.txt]
2017: 100%|██████████| 12/12 [30:26<00:00, 152.24s/it, ../../data/lighting/2016-2022-raw/201712f.txt]
2018: 100%|██████████| 12/12 [35:26<00:00, 177.25s/it, ../../data/lighting/2016-2022-raw/201812f.txt]
2019: 100%|██████████| 12/12 [29:30<00:00, 147.51s/it, ../../data/lighting/2016-2022-raw/201912f.txt]
2020: 100%|██████████| 12/12 [29:32<00:00, 147.73s/it, ../../data/lighting/2016-2022-raw/202012f.txt]
2021: 100%|██████████| 12/12 [29:32<00:00, 147.68s/it, ../../data/lighting/2016-2022-raw/202112f.txt]
2022: 100%|██████████| 12/12 [14:45<00:00, 73.76s/it, ../../data/lighting/2016-2022-raw/202212f.txt]


No Data for YEAR: 2022 MONTH: 7 --> [Errno 2] No such file or directory: '../../data/lighting/2016-2022-raw/202207f.txt'
No Data for YEAR: 2022 MONTH: 8 --> [Errno 2] No such file or directory: '../../data/lighting/2016-2022-raw/202208f.txt'
No Data for YEAR: 2022 MONTH: 9 --> [Errno 2] No such file or directory: '../../data/lighting/2016-2022-raw/202209f.txt'
No Data for YEAR: 2022 MONTH: 10 --> [Errno 2] No such file or directory: '../../data/lighting/2016-2022-raw/202210f.txt'
No Data for YEAR: 2022 MONTH: 11 --> [Errno 2] No such file or directory: '../../data/lighting/2016-2022-raw/202211f.txt'
No Data for YEAR: 2022 MONTH: 12 --> [Errno 2] No such file or directory: '../../data/lighting/2016-2022-raw/202212f.txt'
Finished building positive lighting flashes geo data!!!


In [9]:
# Preview the first five merged flashes.
positive_flashes_gpd.head(n=5)

Unnamed: 0,date,time,event_strength_kiloamperes,multiplicity,geometry
0,2016-01-17,23:11:09.950952548,20.1,1,POINT (-76.32870 44.28670)
1,2016-01-30,05:17:00.281329752,346.2,1,POINT (-124.93310 49.01510)
2,2016-01-30,04:36:45.432085524,60.8,1,POINT (-125.92340 49.33560)
3,2016-01-30,04:26:50.139326144,44.0,1,POINT (-126.50690 49.81860)
4,2016-01-30,05:12:30.572452623,296.3,1,POINT (-124.21370 48.54420)


In [10]:
# (rows, columns) of the merged frame, as a quick sanity check on the load.
positive_flashes_gpd.shape

(3588201, 5)

## Adding timing information

In [11]:
# Join the date and time columns into one "<date> <time>" string per row and
# parse it as a datetime. The concatenation is vectorised, which avoids a
# Python-level function call for every row of a multi-million-row frame.
positive_flashes_gpd["timestamp"] = pd.to_datetime(
    positive_flashes_gpd["date"].astype(str) + " " + positive_flashes_gpd["time"].astype(str)
)

In [12]:
# The separate date and time columns are now covered by "timestamp"; drop them
# and preview the result.
positive_flashes_gpd = positive_flashes_gpd.drop(columns=['date', 'time'])
positive_flashes_gpd.head()

Unnamed: 0,event_strength_kiloamperes,multiplicity,geometry,timestamp
0,20.1,1,POINT (-76.32870 44.28670),2016-01-17 23:11:09.950952548
1,346.2,1,POINT (-124.93310 49.01510),2016-01-30 05:17:00.281329752
2,60.8,1,POINT (-125.92340 49.33560),2016-01-30 04:36:45.432085524
3,44.0,1,POINT (-126.50690 49.81860),2016-01-30 04:26:50.139326144
4,296.3,1,POINT (-124.21370 48.54420),2016-01-30 05:12:30.572452623


In [13]:
# Write the processed flashes to the CSV cache file.
positive_flashes_gpd.to_csv(path_or_buf=PREPROCESSED_POSITIVE_FLASHES_2016_2022)

In [None]:
# Write the processed flashes, including the frame index, to the lightning
# table in PostGIS.
# NOTE(review): with if_exists="append", running this cell again adds the same
# rows a second time — confirm that is intended before re-running.
positive_flashes_gpd.to_postgis(
    con=engine,
    name=LIGHTNING_TABLE_NAME,
    index=True,
    if_exists="append",
)