# Import block

In [1]:
! pip install geopandas pandas sqlalchemy psycopg2-binary openpyxl geoalchemy2 python-dotenv tqdm

/bin/bash: /media/mutakabbir/HDD_2TB_02/Forest_Fire/.venv/bin/pip: /media/mutakabbir/HDD_2TB_01/Forest_Fire/.venv/bin/python: bad interpreter: No such file or directory


In [2]:
import os

import geopandas as gpd
import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine
from sqlalchemy.engine import URL
from tqdm import tqdm

# Preprocess Lightning Data

## Constants

In [3]:
# --- File locations (relative to this notebook) ---
# Destination CSV for the combined, filtered flashes.
PREPROCESSED_POSITIVE_FLASHES_1998_2010 = (
    "../../data/lighting/porcessed_lightning_csv/"
    "1998-2010-positive-cg-flashes.csv"
)
# .env file that supplies the database settings.
PATH_TO_DOT_ENV = "../.env"

# --- Raw file layout ---
# The leading date/time fields are later combined into a single timestamp.
LIGHTNING_DATE_COLUMNS = ["year", "month", "day", "hour", "minute", "second"]
# Full column order of the raw, header-less text files; the trailing "na"
# column is dropped during preprocessing.
LIGHTNING_COLUMNS = LIGHTNING_DATE_COLUMNS + [
    "lat",
    "long",
    "event_strength_kiloamperes",
    "multiplicity",
    "cloud_or_ground",
    "na",
]

# --- Inclusive range of years to process ---
START_YEAR, END_YEAR = 1998, 2010

# --- Database connection pieces that do not come from the .env file ---
DATABASE_TYPE = "postgresql"
DATABASE_HOST = "localhost"

# --- Database table names ---
CANADIAN_BOUNDARY_TABLE_NAME = "C"
LIGHTNING_TABLE_NAME = "L"

In [4]:
# Load the database settings from the .env file into the process environment.
load_dotenv(PATH_TO_DOT_ENV)

DATABASE_NAME = os.environ.get("DATABASE_NAME")
POSTGRES_USER = os.environ.get("POSTGRES_USER")
POSTGRES_PASSWORD = os.environ.get("POSTGRES_PASSWORD")
POSTGRES_HOST_PORT = os.environ.get("POSTGRES_HOST_PORT")
POSTGRES_CONTAINER_PORT = os.environ.get("POSTGRES_CONTAINER_PORT")

# Fail fast with a readable message: an unset variable comes back as None and
# would otherwise be interpolated as the literal text "None" into the
# connection URL, surfacing much later as a confusing connection error.
missing_env_vars = [
    name
    for name, value in (
        ("DATABASE_NAME", DATABASE_NAME),
        ("POSTGRES_USER", POSTGRES_USER),
        ("POSTGRES_HOST_PORT", POSTGRES_HOST_PORT),
    )
    if not value
]
if missing_env_vars:
    raise RuntimeError(
        f"Missing required environment variable(s) {missing_env_vars}; "
        f"check the .env file at {PATH_TO_DOT_ENV!r}."
    )

In [5]:
# Build the connection URL from its components instead of interpolating them
# into a string, so special characters in the credentials (e.g. "@", ":" or
# "/" in the password) are escaped rather than corrupting the URL.
database_url = URL.create(
    drivername=DATABASE_TYPE,
    username=POSTGRES_USER,
    password=POSTGRES_PASSWORD,
    host=DATABASE_HOST,
    # URL.create expects an integer port; None lets the driver use its default.
    port=int(POSTGRES_HOST_PORT) if POSTGRES_HOST_PORT else None,
    database=DATABASE_NAME,
)

engine = create_engine(database_url)

## Helper Functions

In [6]:
def dataset_location(year: int) -> str:
    """Return the relative path of the raw lightning text file for ``year``.

    Args:
        year: Year whose file is wanted; it is converted with ``str`` and
            inserted into the file name.

    Returns:
        Path of the form
        ``../../data/lighting/1998-2010-raw/Canada_ltng_Jan-Dec_<year>.txt``.
    """
    return f"../../data/lighting/1998-2010-raw/Canada_ltng_Jan-Dec_{year!s}.txt"


## Load Canadian geometry for filtering

In [7]:
# Select every row of the boundary table; the table name is double-quoted in
# the SQL text.
query_canada_boundary = f"""SELECT * from "{CANADIAN_BOUNDARY_TABLE_NAME}" """

# Read the boundary into a GeoDataFrame, taking the geometry from the
# "geometry" column and passing EPSG:4326 as its CRS. It is used further down
# to clip the lightning points.
canada_gdf = gpd.read_postgis(
    query_canada_boundary,
    engine,
    crs="EPSG:4326",
    geom_col="geometry",
)

## Filtering and merging lightning data

In [8]:
# One clipped GeoDataFrame per year; concatenated once after the loop.
lighting_dfs = []
# Years whose file could not be processed. They are reported after the loop so
# that a partial result is not mistaken for a complete one.
failed_years = []

lightning_progress_bar = tqdm(
    range(START_YEAR, END_YEAR + 1),
    desc="Lightning"
)

for year in lightning_progress_bar:
    # Show the year being processed. Set first so that every path through the
    # loop body (including years with no matching flashes) updates the bar.
    lightning_progress_bar.set_postfix_str(f"{year}")

    try:
        # Read the raw, header-less, space-separated file for this year.
        yearly_lighting_df = pd.read_csv(
            dataset_location(year),
            sep=" ",
            header=None,
            names=LIGHTNING_COLUMNS
        )

        # Combine the separate date/time columns into a single timestamp.
        yearly_lighting_df["timestamp"] = pd.to_datetime(
            yearly_lighting_df[LIGHTNING_DATE_COLUMNS]
        )

        # Keep only non-negative-strength ground flashes. A value of 1 in
        # `cloud_or_ground` marks a ground flash; comparing the raw code
        # directly avoids a slow per-row re-encoding of a column that is not
        # kept in the output anyway.
        is_positive = yearly_lighting_df["event_strength_kiloamperes"] >= 0
        is_ground = yearly_lighting_df["cloud_or_ground"] == 1
        filtered_lightning_df = yearly_lighting_df.loc[is_positive & is_ground]

        # The full-year frame is no longer needed; drop the reference so its
        # memory can be reclaimed before the expensive clip below.
        del yearly_lighting_df

        if filtered_lightning_df.empty:
            # Nothing matched for this year.
            continue

        # Turn the remaining rows into points (x = longitude, y = latitude).
        filtered_lighting_gdf = gpd.GeoDataFrame(
            filtered_lightning_df[["event_strength_kiloamperes", "multiplicity", "timestamp"]],
            crs="EPSG:4326",
            geometry=gpd.points_from_xy(
                filtered_lightning_df["long"],
                filtered_lightning_df["lat"]
            )
        )
        del filtered_lightning_df

        # Keep only the flashes that fall inside the Canadian boundary.
        lighting_dfs.append(filtered_lighting_gdf.clip(canada_gdf))
        del filtered_lighting_gdf

    except Exception as e:
        # Best effort: keep going with the remaining years, but remember the
        # failure so it is summarised after the loop.
        failed_years.append(year)
        print(f"Error on data for YEAR: {year}: {e}")

if failed_years:
    print(f"WARNING: no data was loaded for these years: {failed_years}")

if not lighting_dfs:
    # pd.concat would raise an opaque "No objects to concatenate" here.
    raise ValueError(
        f"No positive ground flashes were loaded for {START_YEAR}-{END_YEAR}; "
        "check the raw data files and the errors printed above."
    )

# Merge all years into one frame with a fresh 0..n-1 index.
positive_flashes_gpd = pd.concat(lighting_dfs, ignore_index=True)

del lighting_dfs

print("Finished building positive lighting flashes geo data!!!")

Lightning: 100%|██████████| 13/13 [29:08<00:00, 134.50s/it, 2010]


Finished building positive lighting flashes geo data!!!


In [9]:
# Preview the first rows of the combined positive-flash GeoDataFrame.
positive_flashes_gpd.head()

Unnamed: 0,event_strength_kiloamperes,multiplicity,timestamp,geometry
0,204.3,2,1999-12-04 04:17:18.131325233,POINT (-82.16530 42.22310)
1,50.4,1,1999-04-22 07:52:12.552416300,POINT (-82.15990 42.22470)
2,55.0,1,1999-05-31 11:12:06.365744700,POINT (-82.16500 42.22820)
3,24.7,1,1999-07-09 21:08:12.541602400,POINT (-81.85270 42.26090)
4,17.0,1,1999-05-31 11:17:37.366857700,POINT (-82.14310 42.24170)


In [10]:
# Size of the combined result as (rows, columns).
positive_flashes_gpd.shape

(2786010, 4)

In [11]:
# Save the combined flashes to the preprocessed CSV path defined in the
# constants cell. No `index` argument is passed, so pandas' default applies.
positive_flashes_gpd.to_csv(PREPROCESSED_POSITIVE_FLASHES_1998_2010)

In [None]:
# Write the combined flashes to the lightning table, replacing any existing
# table of that name and storing the frame's index as a column.
positive_flashes_gpd.to_postgis(
    LIGHTNING_TABLE_NAME,
    engine,
    index=True,
    if_exists="replace",
)