In [1]:
import os
from datetime import datetime
from getpass import getpass
from urllib.parse import quote_plus

import geopandas as gpd
import pandas as pd
from sqlalchemy import TIMESTAMP, create_engine, text



In [2]:
# Regex that splits a 10-digit "YYYYMMDDHH" stamp into the named groups
# year / month / day / hour. The group names matter: the frame produced by
# `.str.extract` with this pattern is later handed to `pd.to_datetime`.
# Raw string so that `\d` is a regex token rather than an (invalid) string escape.
climate_ics_date_pattern = r'(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2})(?P<hour>\d{2})'

In [3]:
# Positional column -> dtype map handed to `pd.read_csv` (the data rows are read
# with header=None, so columns are addressed by integer position).
#   0-2   : object
#   3     : int64
#   4-43  : strict alternation, even position -> int64, odd position -> object
climate_dtypes = {
    **dict.fromkeys(range(3), 'object'),
    3: 'int64',
    **{
        position: ('int64' if position % 2 == 0 else 'object')
        for position in range(4, 44)
    },
}

# SQL column-type overrides passed to `DataFrame.to_sql`: the parsed
# observation-time column is stored as a TIMESTAMP.
sql_dtypes = dict([
    ("Year Month Day Hour (YYYYMMDDHH)", TIMESTAMP),
])

In [4]:
# Database connection settings.
# Every value can be overridden through an ICS_DB_* environment variable; the
# defaults are the values this notebook was originally run with.
# The password is deliberately NOT stored in the notebook: it is read from
# ICS_DB_PASSWORD, and if that is unset or empty the user is prompted for it.
DATABASE_TYPE = "postgresql"
USERNAME = os.environ.get("ICS_DB_USER", "mutakabbir")
PASSWORD = os.environ.get("ICS_DB_PASSWORD") or getpass("PostgreSQL password: ")
HOST = os.environ.get("ICS_DB_HOST", "localhost")
PORT = int(os.environ.get("ICS_DB_PORT", 5432))
DATABASE_NAME = os.environ.get("ICS_DB_NAME", "postgres")
ICS_TABLE_NAME = "ics"

# quote_plus keeps characters such as '@', ':' or '/' inside the user name or
# password from being misread as URL delimiters.
engine = create_engine(
    f"{DATABASE_TYPE}://{quote_plus(USERNAME)}:{quote_plus(PASSWORD)}"
    f"@{HOST}:{PORT}/{DATABASE_NAME}"
)

In [5]:
# # test DB insert
# from shapely.geometry import Point
# gpd.GeoDataFrame([], geometry=[Point(10,10)], crs='epsg:4326').to_postgis('test_table', engine)

In [6]:
# Root directory of the ICS input data; the load loop expects one level of
# sub-directories beneath it, each containing the CSV files to load.
# Override with the ICS_DATA_DIR environment variable to run on another machine.
data_dir = os.environ.get("ICS_DATA_DIR", "/data/mutakabbir/Lightning/data/ics")

In [7]:
def load_ics_file(file_path):
    """Read one ICS CSV file into a DataFrame ready for `to_sql`.

    The first three rows of the file are skipped as data; the third of them
    (row index 2) supplies the column names.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The data rows with named columns and the
        "Year Month Day Hour (YYYYMMDDHH)" column converted to datetime.

    Raises
    ------
    KeyError
        If the file has no positional column 44 or no timestamp column.
    """
    timestamp_column = "Year Month Day Hour (YYYYMMDDHH)"

    # nrows=0 reads only the column names from row index 2.
    header_names = pd.read_csv(file_path, nrows=0, header=2).columns

    # No `parse_dates` here: the timestamp is converted explicitly below via
    # `.str.extract`, which needs the raw strings.
    # NOTE(review): assumes positional column 2 is the timestamp column (it is
    # typed as object in `climate_dtypes`) — confirm against the file layout.
    climate_df = (
        pd.read_csv(file_path, sep=",", skiprows=[0, 1, 2], header=None, dtype=climate_dtypes)
        # Column 44 has no entry in `climate_dtypes`; presumably an empty column
        # produced by a trailing delimiter — TODO confirm.
        .drop(columns=[44])
        # Replace positional labels with the names taken from the header row.
        .rename(columns=dict(enumerate(header_names)))
    )

    # Parse as datetime: split the stamp into year/month/day/hour columns and
    # let pandas assemble them.
    climate_df[timestamp_column] = pd.to_datetime(
        climate_df[timestamp_column].str.extract(climate_ics_date_pattern, expand=True)
    )
    return climate_df


# Walk <data_dir>/<province dir>/<file>.csv and append every file to the ICS table.
# NOTE: `if_exists='append'` means re-running this cell inserts the same rows again.
failed_files = []
for province_dir in sorted(os.listdir(data_dir)):
    province_dir_path = f"{data_dir}/{province_dir}"
    if not os.path.isdir(province_dir_path):
        # Stray files next to the province directories are not input data.
        continue
    print(f"Started dir: {province_dir_path}")
    for file_name in sorted(os.listdir(province_dir_path)):
        if not file_name.lower().endswith(".csv"):
            continue
        file_path = f"{province_dir_path}/{file_name}"
        climate_df = load_ics_file(file_path)
        try:
            climate_df.to_sql(name=ICS_TABLE_NAME, con=engine, if_exists='append', index=False, dtype=sql_dtypes)
        except Exception as e:
            # Best effort: keep loading the remaining files, but remember the
            # failure so it is not mistaken for a successful load.
            failed_files.append(file_path)
            print(f"{file_name}:   {e}")
        else:
            print(f"\tDone: {file_name}")
        # Release the frame before the next file is read.
        del climate_df
    print(f"Finished dir: {province_dir_path}")

if failed_files:
    print(f"{len(failed_files)} file(s) failed to load:")
    for failed_path in failed_files:
        print(f"\t{failed_path}")

Started dir: /data/mutakabbir/Lightning/data/ics/CWEEDS_2020_NU
	Done: CAN_NU_TALOYOAK-A_2403855_CWEEDS2011_2005-2017.csv
	Done: CAN_NU_BROUGHTON-ISLAND_2400570_CWEEDS2011_2005-2017.csv
	Done: CAN_NU_LUPIN-CS_230N002_CWEEDS2011_2005-2017.csv
	Done: CAN_NU_GJOA-HAVEN-CLIMATE_2302340_CWEEDS2011_2005-2017.csv
	Done: CAN_NU_CAPE-MERCY_2400F63_CWEEDS2011_2005-2017.csv
	Done: CAN_NU_HAT-ISLAND_2302370_CWEEDS2011_2005-2017.csv
	Done: CAN_NU_IQALUIT-A_2402596_CWEEDS2011_2005-2017.csv
	Done: CAN_NU_CORAL-HARBOUR-A_2301002_CWEEDS2011_2005-2017.csv
	Done: CAN_NU_PANGNIRTUNG-A_2403054_CWEEDS2011_2005-2017.csv
	Done: CAN_NU_ARVIAT-CLIMATE_2301153_CWEEDS2011_2005-2017.csv
	Done: CAN_NU_CAPE-HOOPER_2400660_CWEEDS2011_2005-2017.csv
	Done: CAN_NU_SHEPHERD-BAY-A_2303685_CWEEDS2011_2005-2017.csv
	Done: CAN_NU_KUGAARUK-CLIMATE_2303094_CWEEDS2011_2005-2017.csv
	Done: CAN_NU_CAMBRIDGE-BAY-A_2400601_CWEEDS2011_2005-2017.csv
	Done: CAN_NU_ROWLEY-ISLAND_2403625_CWEEDS2011_2005-2017.csv
	Done: CAN_NU_HALL-BEACH

In [None]:
# Add a composite primary key (station identifier + observation time) to the
# ICS table. Run once, after all files have been loaded.
# `engine.begin()` opens a transaction that is committed when the block exits
# without error, and `text()` wraps the raw SQL string as an executable statement.
with engine.begin() as con:
    con.execute(
        text(
            f'ALTER TABLE {ICS_TABLE_NAME} ADD PRIMARY KEY '
            '("ECCC station identifier", "Year Month Day Hour (YYYYMMDDHH)");'
        )
    )