In [2]:
# Install this notebook's dependencies into the running kernel's environment.
# NOTE(review): versions are unpinned, and `dask` is not imported by any cell
# visible in this notebook -- confirm it is still required.
%pip install geopandas pandas sqlalchemy psycopg2-binary openpyxl geoalchemy2 python-dotenv dask 

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os

import geopandas as gpd
import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

# Constants

In [2]:
# --- File locations (relative to this notebook) ---
# NOTE(review): "lighting" / "porcessed" are kept exactly as written because
# they may match the on-disk directory names -- verify before correcting.
RAW_LIGHTNING_DATA_2011_2015 = "../../data/lighting/2011-2015-raw/2011-2015.txt"
PREPROCESSED_POSITIVE_FLASHES_2011_2015 = (
    "../../data/lighting/porcessed_lightning_csv/2011-2015-positive-cg-flashes.csv"
)
PATH_TO_DOT_ENV = "../.env"

# --- Column names applied, in order, to the header-less raw file ---
LIGHTNING_COLUMNS = [
    "date",
    "time",
    "lat",
    "long",
    "event_strength_kiloamperes",
    "chi_square",
    "major_axies_confidence",
    "minor_axies_confidence",
    "angle_of_confidence",
    "multiplicity",
    "cloud_or_ground",
    "na",
]

# NOTE(review): these years do not match the 2011-2015 data handled here and
# are not referenced by any cell visible in this notebook -- confirm intent.
START_YEAR = 1998
END_YEAR = 2010

# --- Database connection pieces that are not secrets ---
DATABASE_TYPE = "postgresql"
DATABASE_HOST = "localhost"

# --- Table names in the database ---
CANADIAN_BOUNDARY_TABLE_NAME = "C"
LIGHTNING_TABLE_NAME = "L"

In [3]:
# Load the .env file, then read the database settings from the process
# environment.
load_dotenv(PATH_TO_DOT_ENV)

DATABASE_NAME = os.environ.get("DATABASE_NAME")
POSTGRES_USER = os.environ.get("POSTGRES_USER")
POSTGRES_PASSWORD = os.environ.get("POSTGRES_PASSWORD")
POSTGRES_HOST_PORT = os.environ.get("POSTGRES_HOST_PORT")
# Read for completeness; no cell visible in this notebook uses it, so it is
# not treated as required below.
POSTGRES_CONTAINER_PORT = os.environ.get("POSTGRES_CONTAINER_PORT")

# Fail fast with a clear message: a missing variable would otherwise come back
# as None and be interpolated into the connection URL as the text "None".
_missing_settings = [
    setting_name
    for setting_name, setting_value in (
        ("DATABASE_NAME", DATABASE_NAME),
        ("POSTGRES_USER", POSTGRES_USER),
        ("POSTGRES_PASSWORD", POSTGRES_PASSWORD),
        ("POSTGRES_HOST_PORT", POSTGRES_HOST_PORT),
    )
    if not setting_value
]
if _missing_settings:
    raise RuntimeError(
        f"Missing required database settings {_missing_settings}; "
        f"define them in {PATH_TO_DOT_ENV!r} or in the environment."
    )

In [4]:
# Build the connection URL from its components instead of string formatting,
# so credentials containing characters such as "@", ":" or "/" are escaped
# rather than corrupting the URL.
engine = create_engine(
    URL.create(
        drivername=DATABASE_TYPE,
        username=POSTGRES_USER,
        password=POSTGRES_PASSWORD,
        host=DATABASE_HOST,
        # URL.create expects an integer port; env values arrive as strings.
        port=int(POSTGRES_HOST_PORT) if POSTGRES_HOST_PORT else None,
        database=DATABASE_NAME,
    )
)

# Load Canadian geometry for filtering

In [6]:
# Pull every row of the boundary table into a GeoDataFrame; the table name
# comes from a notebook constant, not from user input.
query_canada_boundary = f'SELECT * from "{CANADIAN_BOUNDARY_TABLE_NAME}" '

canada_gdf = gpd.read_postgis(
    query_canada_boundary,
    engine,
    crs="EPSG:4326",
    geom_col="geometry",
)

# Read data file into pandas

In [5]:
# The raw export is space-delimited and has no header row, so the column
# names are supplied from LIGHTNING_COLUMNS.
lightning_data_2011_to_2015_df = pd.read_csv(
    filepath_or_buffer=RAW_LIGHTNING_DATA_2011_2015,
    sep=" ",
    names=LIGHTNING_COLUMNS,
    header=None,
)

In [7]:
# Display the loaded frame as a quick sanity check of the parsed columns
# (the rendered output shows only the first and last rows).
lightning_data_2011_to_2015_df

Unnamed: 0,date,time,lat,long,event_strength_kiloamperes,chi_square,major_axies_confidence,minor_axies_confidence,angle_of_confidence,multiplicity,cloud_or_ground,na
0,2011-01-01,00:42:20.864178221,40.4133,-86.1272,20.0,11.4,0.2,0.2,141,1,C,
1,2011-01-01,03:58:18.273089591,40.1700,-85.5770,-22.3,0.8,0.2,0.2,140,1,C,
2,2011-01-01,03:58:18.278492082,40.1642,-85.5767,-21.9,1.4,0.2,0.2,102,2,G,
3,2011-01-01,04:09:00.126383544,40.2733,-85.4885,18.4,0.7,0.2,0.2,122,3,G,
4,2011-01-01,04:09:00.148252640,40.3036,-85.5081,14.2,3.1,0.9,0.2,134,1,C,
...,...,...,...,...,...,...,...,...,...,...,...,...
59636405,2015-12-30,08:16:29.928067936,41.5109,-54.3392,-75.2,11.9,1.3,0.3,151,1,G,
59636406,2015-12-30,08:20:15.688690987,41.4905,-54.3313,-35.1,3.0,4.0,0.3,155,1,G,
59636407,2015-12-30,08:21:15.644064045,41.4145,-53.0374,67.0,2.8,3.6,0.4,155,1,G,
59636408,2015-12-30,08:21:15.806073037,41.8114,-53.1512,59.2,1.8,3.5,0.3,148,1,G,


# Preprocess the data 

In [None]:
# Keep only rows flagged "G" in `cloud_or_ground`; this is the step that
# removes cloud-to-cloud lightning.
lightning_data_2011_to_2015_df = lightning_data_2011_to_2015_df.loc[
    lightning_data_2011_to_2015_df["cloud_or_ground"].eq("G")
]

In [None]:
# Keep only flashes with strictly positive strength; negative (and zero)
# values are discarded.
lightning_data_2011_to_2015_df = lightning_data_2011_to_2015_df.loc[
    lightning_data_2011_to_2015_df["event_strength_kiloamperes"].gt(0)
]

In [None]:
# Discard the columns that are not carried forward into the saved dataset.
lightning_data_2011_to_2015_df = lightning_data_2011_to_2015_df.drop(
    columns=[
        "chi_square",
        "major_axies_confidence",
        "minor_axies_confidence",
        "angle_of_confidence",
        "cloud_or_ground",
        "na",
    ]
)

In [None]:
# Convert to a GeoDataFrame: keep the attribute columns and turn each
# (long, lat) pair into a point geometry in EPSG:4326.
lightning_data_2011_to_2015_gdf = gpd.GeoDataFrame(
    data=lightning_data_2011_to_2015_df[
        ["date", "time", "event_strength_kiloamperes", "multiplicity"]
    ],
    geometry=gpd.points_from_xy(
        x=lightning_data_2011_to_2015_df["long"],
        y=lightning_data_2011_to_2015_df["lat"],
    ),
    crs="EPSG:4326",
)

In [None]:
# Remove the name of the intermediate pandas frame now that the GeoDataFrame
# has been built from it (presumably to let its memory be reclaimed).
# After this runs, earlier cells that use this name need the load cell re-run.
del lightning_data_2011_to_2015_df

In [None]:
# Restrict the flashes to those falling inside the Canadian boundary geometry.
lightning_data_2011_to_2015_gdf = gpd.clip(
    lightning_data_2011_to_2015_gdf,
    mask=canada_gdf,
)

In [None]:
# Combine the separate `date` and `time` text columns into one datetime
# `timestamp` column, then drop the two source columns.
# The concatenation is vectorized: a row-wise apply(axis=1) runs a Python
# function per row, which is very slow on a frame of this size and does not
# yield a usable column when the frame is empty.
lightning_data_2011_to_2015_gdf = lightning_data_2011_to_2015_gdf.assign(
    timestamp=pd.to_datetime(
        lightning_data_2011_to_2015_gdf["date"].astype(str)
        + " "
        + lightning_data_2011_to_2015_gdf["time"].astype(str)
    )
).drop(columns=["date", "time"])

In [None]:
# Replace the index left over from filtering/clipping with a fresh 0..n-1
# range, discarding the old index values.
lightning_data_2011_to_2015_gdf = lightning_data_2011_to_2015_gdf.reset_index(drop=True)

# Save Dataset

In [None]:
# Write the preprocessed flashes to the CSV path configured in the constants.
lightning_data_2011_to_2015_gdf.to_csv(
    path_or_buf=PREPROCESSED_POSITIVE_FLASHES_2011_2015,
)

In [None]:
# Append the preprocessed flashes to the lightning table, writing the frame's
# index as a column.
# NOTE(review): the index was just reset, so appended rows start again at 0;
# if the table already holds rows from other runs the index values will
# repeat -- confirm that is acceptable. Re-running this cell appends again.
lightning_data_2011_to_2015_gdf.to_postgis(
    LIGHTNING_TABLE_NAME,
    engine,
    index=True,
    if_exists="append",
)