#Group3_Data_Operations

In [None]:
# %pip install jupyter-black

In [None]:
# @title Import relevant modules
import os
import sys
import pandas as pd
import geopandas as gpd
import sqlalchemy as sq
import black
import jupyter_black as bl
from matplotlib import pyplot as plt
from dotenv import load_dotenv

In [None]:
# Control how pandas renders frames in this notebook: at most 100 rows, every
# column, and floats formatted with one decimal place.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", None)
pd.set_option("display.float_format", "{:.1f}".format)

In [None]:
# Run from the src directory so the relative data and .env paths used below resolve.
# NOTE(review): the path is relative, so re-running this cell changes directory
# again; restart the kernel before re-running.
os.chdir("../../src")
bl.load()
load_dotenv("docker/.env")

# Database connection settings, read from the environment (None when unset).
PG_USER = os.getenv("POSTGRES_USER")
PG_PW = os.getenv("POSTGRES_PW")
PG_DB = os.getenv("POSTGRES_DB")
PG_ADDR = os.getenv("POSTGRES_ADDR")
PG_PORT = os.getenv("POSTGRES_PORT")

if PG_PORT is None:
    sys.exit("Error: Could not find the port for the database.")

# The remaining settings are passed to the database push as well, so stop here
# with a clear message instead of failing later with an opaque connection error.
_missingSettings = [
    envName
    for envName, envValue in (
        ("POSTGRES_USER", PG_USER),
        ("POSTGRES_PW", PG_PW),
        ("POSTGRES_DB", PG_DB),
        ("POSTGRES_ADDR", PG_ADDR),
    )
    if envValue is None
]
if _missingSettings:
    sys.exit("Error: Missing database settings: " + ", ".join(_missingSettings))

In [None]:
# Read the census agricultural region shapefile into a GeoDataFrame.
regions = "WeatherStation/data/2006CensusAgRegions/gcar000b07a_e.shp"
gdfRegions = gpd.read_file(regions, encoding="utf-8")

In [None]:
# Read the climate station list into a plain DataFrame (geometry is added later).
stations = "WeatherStation/data/climate_station_list.csv"
dfStations = pd.read_csv(stations)

In [None]:
# Hourly-station frame: attach a point geometry (x = Longitude, y = Latitude) to
# every station, then keep the stations whose "HLY Last Year" is after 1995.
gdfStationsHly = gpd.GeoDataFrame(
    dfStations,
    geometry=gpd.points_from_xy(x=dfStations["Longitude"], y=dfStations["Latitude"]),
).loc[dfStations["HLY Last Year"] > 1995]

In [None]:
# Daily-station frame: attach a point geometry (x = Longitude, y = Latitude) to
# every station, then keep the stations whose daily record extends past 1995.
# The filter uses the daily column, not the hourly one, so that this frame is
# not simply a duplicate of gdfStationsHly.
# NOTE(review): assumes the CSV names the column "DLY Last Year", mirroring
# "HLY Last Year" — confirm against climate_station_list.csv.
gdfStationsDly = gpd.GeoDataFrame(
    dfStations, geometry=gpd.points_from_xy(dfStations.Longitude, dfStations.Latitude)
)
gdfStationsDly = gdfStationsDly.loc[dfStations["DLY Last Year"] > 1995]

In [None]:
# Put the regions and both station frames into the same projected CRS (EPSG:3347)
# so they can be overlaid and compared geometrically.
# The plain "EPSG:3347" string is used for the regions as well; the
# {"init": ...} dictionary form is deprecated by pyproj.
gdfRegions = gdfRegions.to_crs("EPSG:3347")

# The station points were built from longitude/latitude columns, so they are
# first labelled as EPSG:4326 and then reprojected.
gdfStationsHly = gdfStationsHly.set_crs("EPSG:4326", allow_override=True)
gdfStationsHly = gdfStationsHly.to_crs("EPSG:3347")
gdfStationsDly = gdfStationsDly.set_crs("EPSG:4326", allow_override=True)
gdfStationsDly = gdfStationsDly.to_crs("EPSG:3347")

In [None]:
# Draw every region with the hourly stations overlaid in red.
# The bounds are kept in minx/miny/maxx/maxy because later cells reuse them.
minx, miny, maxx, maxy = gdfRegions.total_bounds
fig, ax = plt.subplots(figsize=(20, 20))
ax.set_ylim(bottom=miny, top=maxy / 1.1)
for layer, layerStyle in (
    (gdfRegions, {"cmap": "Pastel1", "edgecolor": "black"}),
    (gdfStationsHly, {"color": "red", "markersize": 0.4}),
):
    layer.plot(ax=ax, **layerStyle)
plt.show()

In [None]:
# Draw every region with the daily stations overlaid in red.
# The bounds are kept in minx/miny/maxx/maxy because later cells reuse them.
minx, miny, maxx, maxy = gdfRegions.total_bounds
fig, ax = plt.subplots(figsize=(20, 20))
ax.set_ylim(bottom=miny, top=maxy / 1.1)
for layer, layerStyle in (
    (gdfRegions, {"cmap": "Pastel1", "edgecolor": "black"}),
    (gdfStationsDly, {"color": "red", "markersize": 0.4}),
):
    layer.plot(ax=ax, **layerStyle)
plt.show()

In [None]:
# Preview the first rows of the regions frame.
gdfRegions.head()

In [None]:
# Work on a deep copy so gdfRegions itself is left untouched.
gdfPruned = gdfRegions.copy(deep=True)
# Cast PRuid to int so it can be compared with numeric province codes, then keep
# only codes 46, 47 and 48 (the Alberta, Manitoba and Saskatchewan regions).
gdfPruned["PRuid"] = gdfPruned["PRuid"].astype(int)
keepProvince = gdfPruned["PRuid"].isin([46, 47, 48])
gdfPruned = gdfPruned[keepProvince]

In [None]:
# gdfPruned is derived from gdfRegions, which was already reprojected to
# EPSG:3347 earlier in the notebook, so no CRS override is applied here:
# relabelling the projected coordinates as EPSG:4326 would be incorrect.
# to_crs returns a new frame, which the rest of the notebook uses as gdfProjected.
gdfProjected = gdfPruned.to_crs("EPSG:3347")

In [None]:
# Hourly stations over the pruned regions, using hand-tuned axis limits derived
# from the bounds of the full regions frame.
fig, ax = plt.subplots(figsize=(20, 20))
ax.set(ylim=(miny * 2, maxy / 1.2), xlim=(minx * 1.15, maxx / 1.4))
gdfProjected.plot(ax=ax, cmap="Pastel1", edgecolor="black")
gdfStationsHly.plot(ax=ax, color="red", markersize=1)

In [None]:
# Daily stations over the pruned regions, using hand-tuned axis limits derived
# from the bounds of the full regions frame.
fig, ax = plt.subplots(figsize=(20, 20))
ax.set(ylim=(miny * 2, maxy / 1.2), xlim=(minx * 1.15, maxx / 1.4))
gdfProjected.plot(ax=ax, cmap="Pastel1", edgecolor="black")
gdfStationsDly.plot(ax=ax, color="red", markersize=1)

In [None]:
# Show the number of non-null values in each column of the daily-station frame.
gdfStationsDly.count()

In [None]:
# Renumber the rows from 0; the previous row labels are preserved in a new
# "index" column.
gdfProjected = gdfProjected.reset_index()

In [None]:
# Display the projected regions frame for inspection.
gdfProjected

In [None]:
# Default fill colour for every region: gray, as a hex colour code.
gdfProjected["color"] = "#808080"

In [None]:
# CR number of each region; 0 is the initial value for every row until a CR is assigned.
gdfProjected["CRnum"] = 0

In [None]:
# Palette of 11 hex colour codes; entries are picked by list position when
# colouring the CRs.
colors = [
    "#d3d3d3",
    "#556b2f",
    "#008080",
    "#6495ed",
    "#00ff7f",
    "#db7093",
    "#f0e68c",
    "#ffa07a",
    "#ee82ee",
    "#1e90ff",
    "#eee8aa",
]

In [None]:
# Cast CARuid to int so it can be matched against the integer region codes used below.
gdfProjected["CARuid"] = gdfProjected["CARuid"].astype(int)

In [None]:
# CR 1-3: each entry is (CR number, CARuid values in that CR, index into colors).
for crNum, carUids, colorIdx in (
    (3, [4612, 4601, 4602, 4603, 4608], 0),
    (2, [4604, 4605, 4606], 1),
    (1, [4607, 4609, 4610, 4611], 2),
):
    # Tag the member regions with the CR number, then colour that CR.
    gdfProjected.loc[gdfProjected["CARuid"].isin(carUids), "CRnum"] = crNum
    gdfProjected.loc[gdfProjected["CRnum"] == crNum, "color"] = colors[colorIdx]

In [None]:
# CR 8-10: each entry is (CR number, CARuid values in that CR, index into colors).
for crNum, carUids, colorIdx in (
    (8, [4810, 4820, 4830], 3),
    (9, [4840, 4841, 4850], 4),
    (10, [4860, 4870], 10),
):
    # Tag the member regions with the CR number, then colour that CR.
    gdfProjected.loc[gdfProjected["CARuid"].isin(carUids), "CRnum"] = crNum
    gdfProjected.loc[gdfProjected["CRnum"] == crNum, "color"] = colors[colorIdx]

In [None]:
# CR 4-7: each entry is (CR number, CARuid values in that CR, index into colors).
for crNum, carUids, colorIdx in (
    (4, [4710, 4711, 4720, 4721, 4731], 5),
    (5, [4730, 4732, 4733, 4740, 4741], 7),
    (6, [4750, 4751, 4760, 4780, 4781], 6),
    (7, [4761, 4770, 4771, 4790, 4791], 8),
):
    # Tag the member regions with the CR number, then colour that CR.
    gdfProjected.loc[gdfProjected["CARuid"].isin(carUids), "CRnum"] = crNum
    gdfProjected.loc[gdfProjected["CRnum"] == crNum, "color"] = colors[colorIdx]

In [None]:
# Draw the regions filled with their per-row "color" value, using the same
# hand-tuned axis limits as the earlier zoomed plots.
fig, ax = plt.subplots(figsize=(20, 20))
ax.set(ylim=(miny * 2, maxy / 1.2), xlim=(minx * 1.15, maxx / 1.4))
gdfProjected.plot(ax=ax, color=gdfProjected["color"], edgecolor="black")

In [None]:
# now the regions and stations are in the same projection
# the regions dataframe has a 'color' column holding each region's color and a 'CRnum' column holding its CR number
# we will now add a CRnum column to the station frames as well

In [None]:
# Keep only the stations whose Province is ALBERTA, MANITOBA or SASKATCHEWAN.
# .copy() makes the result an independent frame, so adding a column below does
# not write into a slice of the unfiltered frame (SettingWithCopyWarning).
gdfStationsDly = gdfStationsDly[
    gdfStationsDly["Province"].isin(["ALBERTA", "MANITOBA", "SASKATCHEWAN"])
].copy()

# Every station starts with CRnum 0, which remains the value for stations that
# fall inside no region.
gdfStationsDly["CRnum"] = 0

# For each region, flag all stations lying within its geometry in one vectorised
# test and give them that region's CRnum. Regions are visited in frame order, so
# if a station matched several regions the last one would win.
for _, region in gdfProjected.iterrows():
    stationsInRegion = gdfStationsDly.within(region["geometry"])
    gdfStationsDly.loc[stationsInRegion, "CRnum"] = region["CRnum"]

In [None]:
# Keep only the stations whose Province is ALBERTA, MANITOBA or SASKATCHEWAN.
# .copy() makes the result an independent frame, so adding a column below does
# not write into a slice of the unfiltered frame (SettingWithCopyWarning).
gdfStationsHly = gdfStationsHly[
    gdfStationsHly["Province"].isin(["ALBERTA", "MANITOBA", "SASKATCHEWAN"])
].copy()

# Every station starts with CRnum 0, which remains the value for stations that
# fall inside no region.
gdfStationsHly["CRnum"] = 0

# For each region, flag all stations lying within its geometry in one vectorised
# test and give them that region's CRnum. Regions are visited in frame order, so
# if a station matched several regions the last one would win.
for _, region in gdfProjected.iterrows():
    stationsInRegion = gdfStationsHly.within(region["geometry"])
    gdfStationsHly.loc[stationsInRegion, "CRnum"] = region["CRnum"]

In [None]:
# Preview the first rows of the daily-station frame, including the new CRnum column.
gdfStationsDly.head()

In [None]:
# Display the distinct CR numbers present in the regions frame.
gdfProjected["CRnum"].unique()

In [None]:
# Display the regions still at the initial CRnum of 0, i.e. not assigned to any CR.
gdfProjected[gdfProjected["CRnum"] == 0]

In [None]:
# Give both station frames a boolean "scraped" column, initially False for every row.
for stationFrame in (gdfStationsDly, gdfStationsHly):
    stationFrame["scraped"] = False

In [None]:
# Drop the "index" column from the regions frame and renumber its rows from 0.
gdfProjected = gdfProjected.drop(columns=["index"]).reset_index(drop=True)

# Renumber the rows of both station frames from 0 as well, discarding the old labels.
gdfStationsDly = gdfStationsDly.reset_index(drop=True)
gdfStationsHly = gdfStationsHly.reset_index(drop=True)

In [None]:
def pushGdfToPostGresDB(
    tableName: str,
    geodf: gpd.GeoDataFrame,
    db: str,
    addr: str,
    port: int,
    user: str,
    pw: str,
) -> None:
    """Write a GeoDataFrame to a database table, replacing the table if it exists.

    Args:
        tableName: Name of the destination table.
        geodf: Frame to write; its row index is not written.
        db: Database name, forwarded to DataService.
        addr: Database host address, forwarded to DataService.
        port: Database port, forwarded to DataService.
            NOTE(review): callers that read the port with os.getenv pass a
            string here — confirm what DataService expects.
        user: Database user, forwarded to DataService.
        pw: Database password, forwarded to DataService.

    Raises:
        Whatever DataService.connect() or GeoDataFrame.to_postgis() raises;
        cleanup() still runs when the write fails.
    """
    # DataService is imported from the parent directory. Register that directory
    # once so repeated calls do not keep appending duplicates to sys.path.
    if "../" not in sys.path:
        sys.path.append("../")
    from DataService import DataService

    db_service = DataService(db, addr, port, user, pw)
    db_con = db_service.connect()

    # Always release the service once connected, even if the write raises.
    try:
        geodf.to_postgis(tableName, db_con, index=False, if_exists="replace")
    finally:
        db_service.cleanup()

In [None]:
# Columns are renamed by POSITION: the i-th column of a frame receives the i-th
# name in the matching list, whatever the column is currently called. The lists
# therefore rely on the column order produced by the cells above.
regionColumnNames = ["car_uid", "car_name", "pr_uid", "ag_uid"]
gdfProjected.rename(
    columns={
        gdfProjected.columns[pos]: newName
        for pos, newName in enumerate(regionColumnNames)
    },
    inplace=True,
)

stationColumnNames = [
    "station_name",
    "province",
    "latitude",
    "longitude",
    "elevation",
    "station_id",
    "wmo_identifier",
    "tc_identifier",
    "first_year",
    "last_year",
    "hly_first_year",
    "hly_last_year",
    "dly_first_year",
    "dly_last_year",
    "mly_first_year",
    "mly_last_year",
    "geometry",
    "cr_num",
]

# Full province names and the two-letter codes that replace them.
provinceCodes = {"ALBERTA": "AB", "MANITOBA": "MB", "SASKATCHEWAN": "SK"}

# The daily and hourly station frames get exactly the same treatment.
for stationFrame in (gdfStationsDly, gdfStationsHly):
    stationFrame.rename(
        columns={
            stationFrame.columns[pos]: newName
            for pos, newName in enumerate(stationColumnNames)
        },
        inplace=True,
    )
    for provinceName, provinceCode in provinceCodes.items():
        stationFrame.loc[
            stationFrame["province"] == provinceName, "province"
        ] = provinceCode

# change next cell to code to push to DB

In [None]:
### push dataframes to postgres

# Destination table name paired with the frame written to it, in push order.
tablesToPush = (
    ("census_ag_regions", gdfProjected),
    ("stations_dly", gdfStationsDly),
    ("stations_hly", gdfStationsHly),
)
for tblName, frameToPush in tablesToPush:
    pushGdfToPostGresDB(tblName, frameToPush, PG_DB, PG_ADDR, PG_PORT, PG_USER, PG_PW)