#Group3_Data_Operations

In [None]:
# %pip install jupyter-black

In [None]:
# @title Import relevant modules
import os
import sys
import pandas as pd
import geopandas as gpd  # type: ignore
import sqlalchemy as sq
import black
import jupyter_black as bl  # type: ignore
from matplotlib import pyplot as plt  # type: ignore
from dotenv import load_dotenv
from Shared.DataService import DataService

In [None]:
# Display settings for the notebook: print up to 100 rows, show every column,
# and render floats with a single decimal place.
pd.set_option("display.max_rows", 100)
pd.set_option("display.float_format", "{:.1f}".format)
pd.set_option("display.max_columns", None)

In [None]:
# Switch to the src directory; the data and .env paths used below are relative.
os.chdir("../../src")
bl.load()

# Pull the PostgreSQL connection settings out of the docker environment file.
load_dotenv("docker/.env")
PG_USER, PG_PW, PG_DB, PG_ADDR, PG_PORT = map(
    os.getenv,
    (
        "POSTGRES_USER",
        "POSTGRES_PW",
        "POSTGRES_DB",
        "POSTGRES_ADDR",
        "POSTGRES_PORT",
    ),
)

# Only the port is validated at this point; the push cell re-checks every value.
if PG_PORT is None:
    sys.exit("Error: Could not find the port for the database.")

In [None]:
# Load the census agricultural regions shapefile (path is relative to the
# working directory set earlier) into a GeoDataFrame.
regions = "WeatherStation/data/2006CensusAgRegions/gcar000b07a_e.shp"
gdfRegions = gpd.read_file(regions, encoding="utf-8")

In [None]:
# Load the climate station list as a plain DataFrame; later cells read its
# Longitude, Latitude, Province and "... Last Year" columns.
stations = "WeatherStation/data/climate_station_list.csv"
dfStations = pd.read_csv(stations)

In [None]:
# Hourly stations: build a point geometry per station from its coordinates,
# then keep the stations whose hourly record runs past 1995.
hourlyPoints = gpd.points_from_xy(dfStations["Longitude"], dfStations["Latitude"])
gdfStationsHly = gpd.GeoDataFrame(dfStations, geometry=hourlyPoints)
hasRecentHourly = dfStations["HLY Last Year"] > 1995
gdfStationsHly = gdfStationsHly.loc[hasRecentHourly]

In [None]:
# Daily stations: build a point geometry per station from its coordinates,
# then keep the stations whose DAILY record runs past 1995.
# (The filter previously used "HLY Last Year", which made this frame an exact
# duplicate of the hourly one.)
gdfStationsDly = gpd.GeoDataFrame(
    dfStations, geometry=gpd.points_from_xy(dfStations.Longitude, dfStations.Latitude)
)
gdfStationsDly = gdfStationsDly.loc[dfStations["DLY Last Year"] > 1995]

In [None]:
# Bring all three layers into EPSG:3347 so they can be plotted and joined together.
# The CRS is given as a plain authority string; the {"init": ...} dict form is
# deprecated in pyproj and emits a FutureWarning.
gdfRegions = gdfRegions.to_crs("EPSG:3347")

# The station points were built from longitude/latitude columns, so they are first
# tagged as EPSG:4326 and then reprojected.
gdfStationsHly = gdfStationsHly.set_crs("EPSG:4326", allow_override=True)
gdfStationsHly = gdfStationsHly.to_crs("EPSG:3347")
gdfStationsDly = gdfStationsDly.set_crs("EPSG:4326", allow_override=True)
gdfStationsDly = gdfStationsDly.to_crs("EPSG:3347")

In [None]:
# Overview map: all regions with the hourly stations drawn on top.
# The bounds are kept in notebook-level names because later plot cells reuse them.
minx, miny, maxx, maxy = gdfRegions.total_bounds

fig, ax = plt.subplots(figsize=(20, 20))
# Upper y limit is pulled in (max bound divided by 1.1).
ax.set(ylim=(miny, maxy / 1.1))
gdfRegions.plot(edgecolor="black", cmap="Pastel1", ax=ax)
gdfStationsHly.plot(markersize=0.4, color="red", ax=ax)
plt.show()

In [None]:
# Overview map: all regions with the daily stations drawn on top.
# The bounds are kept in notebook-level names because later plot cells reuse them.
minx, miny, maxx, maxy = gdfRegions.total_bounds

fig, ax = plt.subplots(figsize=(20, 20))
# Upper y limit is pulled in (max bound divided by 1.1).
ax.set(ylim=(miny, maxy / 1.1))
gdfRegions.plot(edgecolor="black", cmap="Pastel1", ax=ax)
gdfStationsDly.plot(markersize=0.4, color="red", ax=ax)
plt.show()

In [None]:
# Peek at the first rows of the regions layer to see the available columns.
gdfRegions.head()

In [None]:
# Work on a copy of the regions with the province id cast to int (gdfRegions
# itself is not modified), then keep only Alberta, Manitoba and Saskatchewan,
# i.e. province ids 46, 47 and 48.
gdfPruned = gdfRegions.assign(PRuid=gdfRegions["PRuid"].astype(int))
gdfPruned = gdfPruned.loc[lambda frame: frame["PRuid"].isin([46, 47, 48])]

In [None]:
# gdfPruned is derived from gdfRegions, which an earlier cell already reprojected
# to EPSG:3347, so this call just yields a separate frame in that same CRS.
# A former `gdfPruned.set_crs("EPSG:4326", allow_override=True)` line was removed:
# its return value was discarded (so it had no effect), and applying it would have
# mislabelled coordinates that are already in EPSG:3347.
gdfProjected = gdfPruned.to_crs("EPSG:3347")

In [None]:
# Plot the hourly stations over the three-province regions.
fig, ax = plt.subplots(figsize=(20, 20))
# Hand-tuned zoom: the limits are scaled versions of the full-extent bounds
# (minx/miny/maxx/maxy) computed in the overview plot cells.
# NOTE(review): scaling by a factor depends on the sign and magnitude of those
# bounds; confirm the window still frames the data if the source layer changes.
ax.set_ylim(miny * 2, maxy / 1.2)
ax.set_xlim(minx * 1.15, maxx / 1.4)
gdfProjected.plot(ax=ax, cmap="Pastel1", edgecolor="black")
gdfStationsHly.plot(ax=ax, color="red", markersize=1)
# Finish with plt.show(), as the overview plot cells do, so the cell renders the
# figure without also echoing the Axes repr.
plt.show()

In [None]:
# Plot the daily stations over the three-province regions.
fig, ax = plt.subplots(figsize=(20, 20))
# Hand-tuned zoom: the limits are scaled versions of the full-extent bounds
# (minx/miny/maxx/maxy) computed in the overview plot cells.
# NOTE(review): scaling by a factor depends on the sign and magnitude of those
# bounds; confirm the window still frames the data if the source layer changes.
ax.set_ylim(miny * 2, maxy / 1.2)
ax.set_xlim(minx * 1.15, maxx / 1.4)
gdfProjected.plot(ax=ax, cmap="Pastel1", edgecolor="black")
gdfStationsDly.plot(ax=ax, color="red", markersize=1)
# Finish with plt.show(), as the overview plot cells do, so the cell renders the
# figure without also echoing the Axes repr.
plt.show()

In [None]:
# Non-null value count for each column of the daily stations frame.
gdfStationsDly.count()

In [None]:
# Renumber the rows from 0. The previous index is kept as an "index" column,
# which later cells drop explicitly.
gdfProjected = gdfProjected.reset_index()

In [None]:
# Display the projected three-province regions frame for inspection.
gdfProjected

In [None]:
# Default fill colour for every region: gray, written as a hex colour code.
# Regions that are given a CRnum in later cells have this value overwritten.
gdfProjected["color"] = "#808080"

In [None]:
# Default CRnum of 0 for every region; later cells overwrite it for the CARuid
# groups they list, so 0 marks a region that was not assigned to any group.
gdfProjected["CRnum"] = 0

In [None]:
# Palette of 11 hex colour codes. Later cells pick entries from it by position
# to colour the regions according to their CRnum.
colors = [
    "#d3d3d3",
    "#556b2f",
    "#008080",
    "#6495ed",
    "#00ff7f",
    "#db7093",
    "#f0e68c",
    "#ffa07a",
    "#ee82ee",
    "#1e90ff",
    "#eee8aa",
]

In [None]:
# Cast CARuid to int; the following cells compare it against integer ids.
gdfProjected["CARuid"] = gdfProjected["CARuid"].astype(int)

In [None]:
# Regions whose CARuid starts with 46: first map groups of CARuid to a CRnum,
# then give every region with that CRnum its palette colour.
for crNumber, carUids in (
    (3, [4612, 4601, 4602, 4603, 4608]),
    (2, [4604, 4605, 4606]),
    (1, [4607, 4609, 4610, 4611]),
):
    gdfProjected.loc[gdfProjected["CARuid"].isin(carUids), "CRnum"] = crNumber

for crNumber, paletteIndex in ((3, 0), (2, 1), (1, 2)):
    gdfProjected.loc[gdfProjected["CRnum"] == crNumber, "color"] = colors[paletteIndex]

In [None]:
# Regions whose CARuid starts with 48: first map groups of CARuid to a CRnum,
# then give every region with that CRnum its palette colour.
for crNumber, carUids in (
    (8, [4810, 4820, 4830]),
    (9, [4840, 4841, 4850]),
    (10, [4860, 4870]),
):
    gdfProjected.loc[gdfProjected["CARuid"].isin(carUids), "CRnum"] = crNumber

# CRnum 10 uses the last palette entry (position 10), not position 5.
for crNumber, paletteIndex in ((8, 3), (9, 4), (10, 10)):
    gdfProjected.loc[gdfProjected["CRnum"] == crNumber, "color"] = colors[paletteIndex]

In [None]:
# Regions whose CARuid starts with 47: first map groups of CARuid to a CRnum,
# then give every region with that CRnum its palette colour.
for crNumber, carUids in (
    (4, [4710, 4711, 4720, 4721, 4731]),
    (5, [4730, 4732, 4733, 4740, 4741]),
    (6, [4750, 4751, 4760, 4780, 4781]),
    (7, [4761, 4770, 4771, 4790, 4791]),
):
    gdfProjected.loc[gdfProjected["CARuid"].isin(carUids), "CRnum"] = crNumber

# Palette positions are not in CRnum order here: CRnum 5 takes entry 7 and
# CRnum 6 takes entry 6.
for crNumber, paletteIndex in ((4, 5), (5, 7), (6, 6), (7, 8)):
    gdfProjected.loc[gdfProjected["CRnum"] == crNumber, "color"] = colors[paletteIndex]

In [None]:
# Plot the regions filled with the per-region value of the "color" column.
fig, ax = plt.subplots(figsize=(20, 20))
# Same hand-tuned zoom window as the other three-province plots; the bounds come
# from the overview plot cells.
ax.set_ylim(miny * 2, maxy / 1.2)
ax.set_xlim(minx * 1.15, maxx / 1.4)
gdfProjected.plot(ax=ax, color=gdfProjected["color"], edgecolor="black")
# Finish with plt.show(), as the overview plot cells do, so the cell renders the
# figure without also echoing the Axes repr.
plt.show()

In [None]:
# The regions and stations are now in the same projection, and each region row
# carries a "color" and a "CRnum". Here every daily station is tagged with the
# region it falls in.

# Keep only stations located in Alberta, Manitoba or Saskatchewan.
gdfStationsDly = gdfStationsDly[
    gdfStationsDly["Province"].isin(["ALBERTA", "MANITOBA", "SASKATCHEWAN"])
]

# Left join: stations that fall within no region are kept, with missing region
# columns. `predicate=` replaces the `op=` keyword, which is deprecated and has
# been removed in geopandas 1.0.
gdfStationsDly = gdfStationsDly.sjoin(gdfProjected, how="left", predicate="within")

# Drop the join bookkeeping and the region attributes that are not needed on the
# station rows; keep CARuid under the name "district" (CRnum is kept as is).
gdfStationsDly = gdfStationsDly.drop(
    columns=["index_right", "index", "CARname", "PRuid", "AGuid", "color"]
).rename(columns={"CARuid": "district"})

In [None]:
# Tag every hourly station with the region it falls in.

# Keep only stations located in Alberta, Manitoba or Saskatchewan.
gdfStationsHly = gdfStationsHly[
    gdfStationsHly["Province"].isin(["ALBERTA", "MANITOBA", "SASKATCHEWAN"])
]

# Left join: stations that fall within no region are kept, with missing region
# columns. `predicate=` replaces the `op=` keyword, which is deprecated and has
# been removed in geopandas 1.0.
gdfStationsHly = gdfStationsHly.sjoin(gdfProjected, how="left", predicate="within")

# Drop the join bookkeeping and the region attributes that are not needed on the
# station rows; keep CARuid under the name "district" (CRnum is kept as is).
gdfStationsHly = gdfStationsHly.drop(
    columns=["index_right", "index", "CARname", "PRuid", "AGuid", "color"]
).rename(columns={"CARuid": "district"})

In [None]:
# gdfStationsDly.head()
# print unique CR numbers
# gdfProjected["CRnum"].unique()
# print rows with CRnum = 0
# gdfProjected[gdfProjected["CRnum"] == 0]

In [None]:
# Give both station frames a "scraped" column, initialised to False for every row.
for stationFrame in (gdfStationsDly, gdfStationsHly):
    stationFrame["scraped"] = False

In [None]:
# Regions: discard the leftover "index" column and renumber the rows from 0.
gdfProjected = gdfProjected.drop(columns=["index"]).reset_index(drop=True)

# Stations: renumber the rows from 0, discarding the old index.
gdfStationsDly = gdfStationsDly.reset_index(drop=True)
gdfStationsHly = gdfStationsHly.reset_index(drop=True)

In [None]:
def pushGdfToPostGresDB(
    tableName: str,
    geodf: gpd.GeoDataFrame,
    db: str,
    addr: str,
    port: int,
    user: str,
    pw: str,
) -> None:
    """Write a GeoDataFrame to a PostGIS table, replacing the table if it exists.

    Args:
        tableName: Name of the destination table.
        geodf: Frame to write; its index is not stored.
        db: Database name.
        addr: Database host address.
        port: Database port.
        user: Database user.
        pw: Database password.

    Raises:
        ValueError: If any of the connection settings is None.
    """
    if any(setting is None for setting in (db, addr, port, user, pw)):
        raise ValueError("Environment variables not set")

    db_service = DataService(db, addr, port, user, pw)
    db_con = db_service.connect()

    # Always release the connection, including when the write itself fails.
    try:
        geodf.to_postgis(tableName, db_con, index=False, if_exists="replace")
    finally:
        db_service.cleanup()

In [None]:
# Switch the region identifier columns to snake_case names.
gdfProjected.rename(
    columns={
        "CARuid": "car_uid",
        "CARname": "car_name",
        "PRuid": "pr_uid",
        "AGuid": "ag_uid",
    },
    inplace=True,
)

In [None]:
# Switch the daily station columns to snake_case names in a single pass.
# "Climate ID" becomes "station_id"; names absent from the frame are ignored.
gdfStationsDly.rename(
    columns={
        "Station Name": "station_name",
        "Province": "province",
        "Latitude": "latitude",
        "Longitude": "longitude",
        "Elevation": "elevation",
        "Climate ID": "station_id",
        "WMO Identifier": "wmo_identifier",
        "TC Identifier": "tc_identifier",
        "First Year": "first_year",
        "Last Year": "last_year",
        "HLY First Year": "hly_first_year",
        "HLY Last Year": "hly_last_year",
        "DLY First Year": "dly_first_year",
        "DLY Last Year": "dly_last_year",
        "MLY First Year": "mly_first_year",
        "MLY Last Year": "mly_last_year",
        "CRnum": "cr_num",
    },
    inplace=True,
)

# Replace the full province names with their two-letter codes.
for provinceName, provinceCode in (
    ("ALBERTA", "AB"),
    ("MANITOBA", "MB"),
    ("SASKATCHEWAN", "SK"),
):
    gdfStationsDly.loc[
        gdfStationsDly["province"] == provinceName, "province"
    ] = provinceCode

In [None]:
# Switch the hourly station columns to snake_case names in a single pass.
# "Climate ID" becomes "station_id"; names absent from the frame are ignored.
gdfStationsHly.rename(
    columns={
        "Station Name": "station_name",
        "Province": "province",
        "Latitude": "latitude",
        "Longitude": "longitude",
        "Elevation": "elevation",
        "Climate ID": "station_id",
        "WMO Identifier": "wmo_identifier",
        "TC Identifier": "tc_identifier",
        "First Year": "first_year",
        "Last Year": "last_year",
        "HLY First Year": "hly_first_year",
        "HLY Last Year": "hly_last_year",
        "DLY First Year": "dly_first_year",
        "DLY Last Year": "dly_last_year",
        "MLY First Year": "mly_first_year",
        "MLY Last Year": "mly_last_year",
        "CRnum": "cr_num",
    },
    inplace=True,
)

# Replace the full province names with their two-letter codes.
for provinceName, provinceCode in (
    ("ALBERTA", "AB"),
    ("MANITOBA", "MB"),
    ("SASKATCHEWAN", "SK"),
):
    gdfStationsHly.loc[
        gdfStationsHly["province"] == provinceName, "province"
    ] = provinceCode

# change next cell to code to push to DB

In [None]:
### push dataframes to postgres

# Stop before any write if a connection setting is missing from the environment.
if (
    PG_DB is None
    or PG_ADDR is None
    or PG_PORT is None
    or PG_USER is None
    or PG_PW is None
):
    raise ValueError("Environment variables not set")

# Write each frame to its table, in this order; every push replaces the table.
for tblName, tableFrame in (
    ("census_ag_regions", gdfProjected),
    ("stations_dly", gdfStationsDly),
    ("stations_hly", gdfStationsHly),
):
    pushGdfToPostGresDB(
        tblName, tableFrame, PG_DB, PG_ADDR, int(PG_PORT), PG_USER, PG_PW
    )