In [None]:
import os
import xarray as xr
import geopandas as gpd  # type: ignore

from dotenv import load_dotenv
import sqlalchemy as sq
import sys
import pandas as pd
import matplotlib.pyplot as plt  # type: ignore

sys.path.append("../")
from Shared.DataService import DataService

In [None]:
# Connection settings are read from the process environment; load_dotenv()
# is called first so values from a .env file (when one is found) are visible
# to os.getenv. Any setting that is absent comes back as None.
load_dotenv()
PG_DB, PG_ADDR, PG_PORT, PG_USER, PG_PW = (
    os.getenv(setting)
    for setting in (
        "POSTGRES_DB",
        "POSTGRES_ADDR",
        "POSTGRES_PORT",
        "POSTGRES_USER",
        "POSTGRES_PW",
    )
)

# Names of the soil tables. The "surronding" spelling is the same one used by
# the SQL in this notebook, so it is intentionally left as-is.
SOIL_GEOM_TABLE = "soil_geometry"
SOIL_COMP_TABLE = "soil_components"
SOIL_SURRONDINGS_TABLE = "soil_surronding_land"
SOIL_DATA_TABLE = "soil_data"

In [None]:
# Notebook-level connection (main() opens its own; this one is what cells run
# by hand would pick up). Check the settings first: with an unset
# POSTGRES_PORT, int(None) would otherwise fail with an opaque TypeError.
if (
    PG_DB is None
    or PG_ADDR is None
    or PG_PORT is None
    or PG_USER is None
    or PG_PW is None
):
    raise ValueError("Environment variables not set")

db = DataService(PG_DB, PG_ADDR, int(PG_PORT), PG_USER, PG_PW)
conn = db.connect()

In [None]:
def pullSoilData(conn):
    """Read ``public.soil_data`` and keep a fixed subset of its columns.

    Args:
        conn: open database connection, passed straight to ``pd.read_sql``.

    Returns:
        DataFrame containing the columns listed below, in that order, with
        ``id`` renamed to ``soil_id`` (the key name used by soil_components).

    Raises:
        KeyError: if the table lacks one of the expected columns.
    """
    wanted_columns = [
        "id",
        "province",
        "kind",
        "name",
        "water_table",
        "root_restrict",
        "drainage",
        "parent_material_texture_1",
        "parent_material_chemical_1",
        "mode_of_depo_1",
        "percnt_coarse_frag",
        "sand_texture",
        "total_sand",
        "total_silt",
        "total_clay",
        "percnt_carbon",
        "calcium_ph",
        "proj_ph",
        "water_reten_0",
        "water_reten_10",
        "water_reten_33",
        "water_reten_1500",
        "bulk_density",
        "elec_cond",
        "percnt_wood",
    ]

    full_table = pd.read_sql(sq.text("select * FROM public.soil_data"), conn)
    return full_table[wanted_columns].rename(columns={"id": "soil_id"})

In [None]:
def pullSurroundingSoil(conn):
    """Read ``public.soil_surronding_land`` and keep its area columns.

    Args:
        conn: open database connection, passed straight to ``pd.read_sql``.

    Returns:
        DataFrame with the columns ``poly_id``, ``land_area`` and
        ``water_area``.

    Raises:
        KeyError: if the table lacks one of those columns.
    """
    wanted_columns = ["poly_id", "land_area", "water_area"]
    full_table = pd.read_sql(
        sq.text("select * FROM public.soil_surronding_land"), conn
    )
    return full_table[wanted_columns]

In [None]:
def pullSoilComponents(conn):
    """Read ``public.soil_components`` and keep a fixed subset of its columns.

    Args:
        conn: open database connection, passed straight to ``pd.read_sql``.

    Returns:
        DataFrame with the columns ``poly_id``, ``cmp``, ``percent``,
        ``province``, ``profile``, ``soil_id`` and ``water_holding_cap``.

    Raises:
        KeyError: if the table lacks one of those columns.
    """
    wanted_columns = [
        "poly_id",
        "cmp",
        "percent",
        "province",
        "profile",
        "soil_id",
        "water_holding_cap",
    ]
    full_table = pd.read_sql(sq.text("select * FROM public.soil_components"), conn)
    return full_table[wanted_columns]

In [None]:
def pullSoilGeometry(conn):
    """Tag each soil polygon with the agriculture district(s) it intersects.

    Args:
        conn: open database connection, passed to
            ``GeoDataFrame.from_postgis``.

    Returns:
        Plain pandas DataFrame with the non-geometry columns of
        ``public.soil_geometry`` plus ``district``. The spatial join is an
        inner join, so there is one row per (soil polygon, intersecting
        district) pair and polygons that intersect no district are absent.
    """
    # load the boundaries for the agriculture regions
    ag_query = sq.text("select district, geometry FROM public.census_ag_regions")  # type: ignore
    agRegions = gpd.GeoDataFrame.from_postgis(ag_query, conn, crs="EPSG:3347", geom_col="geometry")  # type: ignore

    soil_query = sq.text("select * FROM public.soil_geometry")
    soilRegions = gpd.GeoDataFrame.from_postgis(
        soil_query, conn, crs="EPSG:3347", geom_col="geometry"
    )

    # Join to add district to df. "intersects" is sjoin's default test, so it
    # is not named here: the keyword for it was renamed from `op` to
    # `predicate` between geopandas releases, and passing either one ties the
    # notebook to a specific version range.
    joined = gpd.sjoin(soilRegions, agRegions, how="inner")

    # Only the attribute columns are needed downstream; drop the geometry and
    # the right-hand index column that sjoin adds.
    return pd.DataFrame(joined.drop(columns=["geometry", "index_right"]))

In [None]:
def getMergedf(soil_data, surronding_soil, soil_components, soil_geometry):
    """Join the soil tables, one-hot encode categoricals and aggregate.

    Args:
        soil_data: frame keyed by ``soil_id`` and ``province`` that also
            carries ``name`` and the categorical columns encoded below.
        surronding_soil: frame keyed by ``poly_id``.
        soil_components: frame with ``poly_id``, ``soil_id``, ``province``,
            ``profile`` and ``water_holding_cap``.
        soil_geometry: frame keyed by ``poly_id``.

    Returns:
        DataFrame with one row per (poly_id, province, soil_id, name) group,
        holding the mean of every remaining numeric column (the one-hot
        columns therefore become the share of rows in each category).
    """
    # Commands to join tables (inner joins: unmatched rows are dropped),
    # followed by removal of exact duplicate rows
    merge_df = (
        soil_components.merge(soil_data, on=["soil_id", "province"], how="inner")
        .merge(surronding_soil, on="poly_id", how="inner")
        .merge(soil_geometry, on="poly_id", how="inner")
        .drop_duplicates()
    )

    # Commands to change attributes which are object types
    merge_df = pd.get_dummies(
        merge_df,
        columns=[
            "profile",
            "kind",
            "water_table",
            "root_restrict",
            "drainage",
            "parent_material_texture_1",
            "parent_material_chemical_1",
            "mode_of_depo_1",
            "sand_texture",
        ],
    )
    # Unparseable values become NaN instead of raising
    merge_df["water_holding_cap"] = pd.to_numeric(
        merge_df["water_holding_cap"], errors="coerce"
    )

    # Aggregate Data. The result must be returned: groupby does not modify
    # merge_df in place. numeric_only=True keeps any leftover non-numeric
    # column from raising on recent pandas versions.
    # NOTE(review): numeric identifier columns that are not grouping keys
    # (possibly `district` from the geometry join) are averaged as well -
    # confirm whether they should be added to the keys.
    return (
        merge_df.groupby(["poly_id", "province", "soil_id", "name"])
        .mean(numeric_only=True)
        .reset_index()
    )

In [None]:
def main():
    """Build the aggregated soil table and store it as ``public.agg_soil_data``.

    Pulls the four soil tables, merges and aggregates them with
    ``getMergedf`` and writes the result to the database, replacing any
    existing table of that name.

    Raises:
        ValueError: if any POSTGRES_* setting is missing, or if
            POSTGRES_PORT is not an integer.
    """
    if (
        PG_DB is None
        or PG_ADDR is None
        or PG_PORT is None
        or PG_USER is None
        or PG_PW is None
    ):
        raise ValueError("Environment variables not set")

    # The port is read from the environment as a string; convert it so
    # DataService receives an int, as the notebook-level connection does.
    db = DataService(PG_DB, PG_ADDR, int(PG_PORT), PG_USER, PG_PW)
    conn = db.connect()

    # get data
    soil_data = pullSoilData(conn)
    surronding_soil = pullSurroundingSoil(conn)
    soil_components = pullSoilComponents(conn)
    soil_geometry = pullSoilGeometry(conn)

    # Merge data and aggregate them
    merge_df = getMergedf(soil_data, surronding_soil, soil_components, soil_geometry)
    TABLENAME = "agg_soil_data"

    # Command to add to db (the DataFrame method is to_sql; `tosql` does not
    # exist and raised AttributeError)
    merge_df.to_sql(TABLENAME, conn, schema="public", if_exists="replace", index=False)

In [None]:
# Entry point: run the pull / merge / store pipeline defined in main() when
# this code is executed as the main program rather than imported.
if __name__ == "__main__":
    main()