In [11]:
! pip install geopandas pandas sqlalchemy psycopg2-binary openpyxl geoalchemy2 python-dotenv tqdm

/bin/bash: /media/mutakabbir/HDD_2TB_02/Forest_Fire/.venv/bin/pip: /media/mutakabbir/HDD_2TB_01/Forest_Fire/.venv/bin/python: bad interpreter: No such file or directory


In [2]:
import os
import warnings
from datetime import datetime
from urllib.parse import quote_plus

import geopandas as gpd
import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import TIMESTAMP, create_engine, text
from tqdm import tqdm

In [3]:
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below, so problems with newer library versions will not be visible here.
warnings.filterwarnings("ignore")

In [4]:
# Regex that splits a 10-digit YYYYMMDDHH stamp into named year/month/day/hour groups.
# Written as a raw string so that "\d" is not treated as an (invalid) string escape.
climate_ics_date_pattern = r'(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2})(?P<hour>\d{2})'

In [5]:
# dtype for each of the 44 CSV columns, keyed by column position.
# Positions 0-2 are read as text and position 3 as an integer; from position 4
# onwards the dtypes alternate: even positions are int64, odd positions are object.
climate_dtypes = {0: 'object', 1: 'object', 2: 'object', 3: 'int64'}
climate_dtypes.update(
    (position, 'int64' if position % 2 == 0 else 'object')
    for position in range(4, 44)
)

# SQL column type overrides handed to DataFrame.to_sql via its dtype argument:
# the timestamp column is created as TIMESTAMP.
sql_dtypes = {"Year Month Day Hour (YYYYMMDDHH)": TIMESTAMP}

In [6]:
# Filesystem locations, relative to the notebook's working directory.
PATH_TO_DOT_ENV = "../.env"
CWEEDS_STATION_DATA_DIR = "../../data/ics/CWEEDS"

# Database dialect and host used when building the connection URL.
DATABASE_TYPE, DATABASE_HOST = "postgresql", "localhost"

# Name of the table the station data is appended to.
CWEEDS_STATION_TABLE_NAME = "W_s"

In [7]:
# Load the variables defined in the .env file, then read the database settings
# from the process environment.
load_dotenv(PATH_TO_DOT_ENV)

DATABASE_NAME = os.environ.get("DATABASE_NAME")
POSTGRES_USER = os.environ.get("POSTGRES_USER")
POSTGRES_PASSWORD = os.environ.get("POSTGRES_PASSWORD")
POSTGRES_HOST_PORT = os.environ.get("POSTGRES_HOST_PORT")
POSTGRES_CONTAINER_PORT = os.environ.get("POSTGRES_CONTAINER_PORT")

# Fail fast with a readable message: a missing variable would otherwise be
# interpolated into the connection URL as the literal text "None".
_missing_settings = [
    setting_name
    for setting_name, setting_value in (
        ("DATABASE_NAME", DATABASE_NAME),
        ("POSTGRES_USER", POSTGRES_USER),
        ("POSTGRES_PASSWORD", POSTGRES_PASSWORD),
        ("POSTGRES_HOST_PORT", POSTGRES_HOST_PORT),
    )
    if setting_value is None
]
if _missing_settings:
    raise RuntimeError(
        f"Missing environment variable(s): {', '.join(_missing_settings)} "
        f"(looked in the process environment and in {PATH_TO_DOT_ENV})"
    )

In [8]:
# Build the SQLAlchemy engine for the target database.
# The user name and password are percent-encoded so that characters such as
# "@", ":" or "/" inside them are not mistaken for URL delimiters.
engine = create_engine(
    f"{DATABASE_TYPE}://{quote_plus(POSTGRES_USER)}:{quote_plus(POSTGRES_PASSWORD)}"
    f"@{DATABASE_HOST}:{POSTGRES_HOST_PORT}/{DATABASE_NAME}"
)

In [9]:
def _read_station_csv(file_path):
    """Read one station CSV into a DataFrame with named columns and a parsed timestamp.

    Column names are taken from the row at index 2 of the file; the rows after it
    are read positionally using the dtypes in ``climate_dtypes``.

    Args:
        file_path: Path of the station CSV file to read.

    Returns:
        pandas.DataFrame in which the "Year Month Day Hour (YYYYMMDDHH)" column
        holds datetimes assembled from the year/month/day/hour digits matched by
        ``climate_ics_date_pattern``.
    """
    date_column = "Year Month Day Hour (YYYYMMDDHH)"

    # Header names only (no data rows are loaded by this call).
    header_columns = pd.read_csv(file_path, nrows=0, header=2).columns

    # The timestamp column is read as text and parsed explicitly below, so
    # read_csv is not asked to parse dates: the .str accessor used for the
    # extraction needs string values.
    station_df = pd.read_csv(
        file_path, sep=",", skiprows=[0, 1, 2], header=None, dtype=climate_dtypes
    )
    station_df = (
        station_df
        # Positional column 44 is discarded when present (presumably an empty
        # column produced by a trailing delimiter - TODO confirm).
        .drop(columns=[44], errors="ignore")
        # Replace positional labels with the names from the header row.
        .rename(columns=dict(enumerate(header_columns)))
    )

    # Split the YYYYMMDDHH text into year/month/day/hour parts and assemble datetimes.
    date_parts = station_df[date_column].str.extract(climate_ics_date_pattern, expand=True)
    station_df[date_column] = pd.to_datetime(date_parts)
    return station_df


# Files whose insert failed; reported once the load has finished.
failed_files = []

# Only sub-directories are treated as provinces; sorting makes the load order
# deterministic (os.listdir returns entries in arbitrary order).
province_dirs = sorted(
    entry
    for entry in os.listdir(CWEEDS_STATION_DATA_DIR)
    if os.path.isdir(os.path.join(CWEEDS_STATION_DATA_DIR, entry))
)

for province_dir in province_dirs:
    province_dir_path = os.path.join(CWEEDS_STATION_DATA_DIR, province_dir)
    province_progress_bar = tqdm(sorted(os.listdir(province_dir_path)), desc=province_dir)

    for file_name in province_progress_bar:
        # Show the file currently being processed next to the progress bar.
        province_progress_bar.set_postfix_str(file_name)
        file_path = os.path.join(province_dir_path, file_name)

        climate_df = _read_station_csv(file_path)
        try:
            climate_df.to_sql(
                name=CWEEDS_STATION_TABLE_NAME,
                con=engine,
                if_exists='append',
                index=False,
                dtype=sql_dtypes,
            )
        except Exception as e:
            # Best effort: a failed insert must not abort the remaining files.
            # tqdm.write keeps the message from garbling the progress bar.
            failed_files.append(file_path)
            tqdm.write(f"{file_name}:   {e}")

        # Release the frame before the next file is read.
        del climate_df

if failed_files:
    print(f"{len(failed_files)} file(s) could not be written to {CWEEDS_STATION_TABLE_NAME}:")
    for failed_path in failed_files:
        print(f"  {failed_path}")

CWEEDS_2020_NS: 100%|██████████| 29/29 [06:47<00:00, 14.04s/it, CAN_NS_HART-ISLAND-(AUT)_8202318_CWEEDS2011_2003-2017.csv]       
CWEEDS_2020_NB: 100%|██████████| 13/13 [03:22<00:00, 15.61s/it, CAN_NB_CHARLO-AUTO_8100885_CWEEDS2011_2004-2017.csv]       
CWEEDS_2020_ON: 100%|██████████| 61/61 [16:27<00:00, 16.18s/it, CAN_ON_KEMPTVILLE-CS_6104027_CWEEDS2011_2002-2017.csv]           
CWEEDS_2020_NT_Rev_20210324: 100%|██████████| 31/31 [05:46<00:00, 11.18s/it, CAN_NT_FORT-LIARD_2201579_CWEEDS2011_2005-2017.csv]           
CWEEDS_2020_MB: 100%|██████████| 39/39 [10:22<00:00, 15.97s/it, CAN_MB_MELITA_501A7AR_CWEEDS2011_1998-2017.csv]              
CWEEDS_2020_SK: 100%|██████████| 44/44 [11:21<00:00, 15.48s/it, CAN_SK_KINDERSLEY-A_4043901_CWEEDS2011_1998-2017.csv]         
CWEEDS_2020_QC: 100%|██████████| 81/81 [21:38<00:00, 16.03s/it, CAN_QC_ROBERVAL-A_7066686_CWEEDS2011_1998-2017.csv]              
CWEEDS_2020_YT_Rev_20210324: 100%|██████████| 14/14 [02:24<00:00, 10.34s/it, CAN_YT_OLD-CROW-

In [10]:
# Add the composite primary key (station identifier + timestamp) to the loaded table.
# engine.begin() opens a transaction that is committed when the block exits
# without error, and text() wraps the raw SQL string so that
# Connection.execute accepts it.
with engine.begin() as con:
    con.execute(
        text(
            f'ALTER TABLE "{CWEEDS_STATION_TABLE_NAME}" '
            'ADD PRIMARY KEY ("ECCC station identifier", "Year Month Day Hour (YYYYMMDDHH)");'
        )
    )