In [None]:
import os
import xarray as xr
import geopandas as gpd  # type: ignore

from dotenv import load_dotenv
import sqlalchemy as sq
import sys
import pandas as pd
import matplotlib.pyplot as plt  # type: ignore

sys.path.append("../")
from Shared.DataService import DataService

In [None]:
# Pull settings from a local .env file (if present) into the process
# environment before they are read below.
load_dotenv()

# Names of the soil tables in the "public" schema.
# NOTE: the "surronding" spelling matches the table name queried in this notebook.
SOIL_GEOM_TABLE = "soil_geometry"
SOIL_COMP_TABLE = "soil_components"
SOIL_SURRONDINGS_TABLE = "soil_surronding_land"
SOIL_DATA_TABLE = "soil_data"

# PostgreSQL connection settings; each value is None when its variable is unset.
PG_DB, PG_ADDR, PG_PORT, PG_USER, PG_PW = map(
    os.getenv,
    ("POSTGRES_DB", "POSTGRES_ADDR", "POSTGRES_PORT", "POSTGRES_USER", "POSTGRES_PW"),
)

In [None]:
# Fail fast when any connection setting is missing. The explicit `is None`
# chain is kept so static type checkers can narrow each value to `str`.
if (
    PG_DB is None
    or PG_ADDR is None
    or PG_PORT is None
    or PG_USER is None
    or PG_PW is None
):
    # Name the offending variables so a misconfigured environment is easy to fix.
    missing_vars = [
        name
        for name, value in {
            "POSTGRES_DB": PG_DB,
            "POSTGRES_ADDR": PG_ADDR,
            "POSTGRES_PORT": PG_PORT,
            "POSTGRES_USER": PG_USER,
            "POSTGRES_PW": PG_PW,
        }.items()
        if value is None
    ]
    raise ValueError("Environment variables not set: " + ", ".join(missing_vars))

# Environment values are strings, so the port is converted to int for DataService.
db = DataService(PG_DB, PG_ADDR, int(PG_PORT), PG_USER, PG_PW)
conn = db.connect()

In [None]:
# Load the soil_data table. The table name comes from the module-level
# constant (a trusted literal, not user input) instead of being repeated here.
query = sq.text(f"select * FROM public.{SOIL_DATA_TABLE}")
soil_data = pd.read_sql(query, conn)
soil_data.head()

In [None]:
# Columns of soil_data kept for the rest of the notebook.
SOIL_DATA_COLUMNS = [
    "id",
    "province",
    "kind",
    "water_table",
    "root_restrict",
    "drainage",
    "parent_material_texture_1",
    "parent_material_chemical_1",
    "mode_of_depo_1",
    "percnt_coarse_frag",
    "sand_texture",
    "total_sand",
    "total_silt",
    "total_clay",
    "percnt_carbon",
    "calcium_ph",
    "proj_ph",
    "water_reten_0",
    "water_reten_10",
    "water_reten_33",
    "water_reten_1500",
    "bulk_density",
    "elec_cond",
    "percnt_wood",
]

# Narrow soil_data to the selected columns.
soil_data = soil_data[SOIL_DATA_COLUMNS]

In [None]:
# Preview the column-reduced frame; showing only the head avoids dumping the whole table.
soil_data.head()

In [None]:
# Load the surrounding-land table. The table name comes from the module-level
# constant (a trusted literal, not user input) instead of being repeated here.
query = sq.text(f"select * FROM public.{SOIL_SURRONDINGS_TABLE}")
surronding_soil = pd.read_sql(query, conn)
surronding_soil.head()

In [None]:
# Keep only the polygon key and the land/water area columns.
surronding_soil = surronding_soil[["poly_id", "land_area", "water_area"]]

In [None]:
# Preview the reduced surrounding-land frame.
surronding_soil.head()

In [None]:
# Load the soil components table. The table name comes from the module-level
# constant (a trusted literal, not user input) instead of being repeated here.
query = sq.text(f"select * FROM public.{SOIL_COMP_TABLE}")
soil_components = pd.read_sql(query, conn)
soil_components.head()

In [None]:
# Keep the join keys (poly_id, soil_id, province) plus the component
# attributes used later in the notebook.
soil_components = soil_components[
    ["poly_id", "cmp", "percent", "province", "profile", "soil_id", "water_holding_cap"]
]

In [None]:
# Preview the reduced components frame.
soil_components.head()

In [None]:
# Load the soil geometry table. The table name comes from the module-level
# constant (a trusted literal, not user input) instead of being repeated here.
query = sq.text(f"select * FROM public.{SOIL_GEOM_TABLE}")
soil_geometry = pd.read_sql(query, conn)
soil_geometry.head()

In [None]:
# Rename the key so it lines up with soil_components["soil_id"] for the merge.
# Reassigning (rather than inplace=True) avoids mutating a frame that earlier
# cells already displayed.
soil_data = soil_data.rename(columns={"id": "soil_id"})

In [None]:
# Preview the frame to check the soil_id column name; head() avoids dumping the whole table.
soil_data.head()

In [None]:
# Attach soil attributes to each component; rows without a match on
# (soil_id, province) in both frames are dropped by the inner join.
merge_df = pd.merge(
    soil_components,
    soil_data,
    how="inner",
    on=["soil_id", "province"],
)

In [None]:
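# Disabled on purpose: the soil_geometry join is performed in a later cell,
# after the surrounding-land merge.
# merge_df = merge_df.merge(soil_geometry, on="poly_id", how="inner")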

In [None]:
# Preview the components joined with their soil attributes.
merge_df.head()

In [None]:
# Add the land/water area of each polygon; inner join on poly_id.
merge_df = pd.merge(merge_df, surronding_soil, how="inner", on="poly_id")

In [None]:
# Preview the frame after adding the surrounding-land columns.
merge_df.head()

In [None]:
# Add the columns of soil_geometry for each polygon; inner join on poly_id.
merge_df = pd.merge(merge_df, soil_geometry, how="inner", on="poly_id")

In [None]:
# Inspect dtypes and non-null counts of the fully merged frame.
merge_df.info()

In [None]:
# Count rows that are exact duplicates of an earlier row.
merge_df.duplicated().sum()

In [None]:
# Drop the exact duplicate rows counted above; the first occurrence is kept
# (pandas default) and the original index labels are preserved.
merge_df = merge_df.drop_duplicates()

In [None]:
# Preview the de-duplicated frame.
merge_df.head()

In [None]:
# Categorical columns to expand into one indicator column per category.
CATEGORICAL_COLUMNS = [
    "profile",
    "kind",
    "water_table",
    "root_restrict",
    "drainage",
    "parent_material_texture_1",
    "parent_material_chemical_1",
    "mode_of_depo_1",
    "sand_texture",
]

# One-hot encode the categorical columns; all other columns pass through unchanged.
merge_df = pd.get_dummies(merge_df, columns=CATEGORICAL_COLUMNS)

In [None]:
# List the column names produced by the one-hot encoding.
merge_df.columns

In [None]:
# Convert water_holding_cap to a numeric dtype; values that cannot be parsed
# become NaN because of errors="coerce".
water_holding_cap_numeric = pd.to_numeric(
    merge_df["water_holding_cap"], errors="coerce"
)
merge_df = merge_df.assign(water_holding_cap=water_holding_cap_numeric)

In [None]:
# Re-check dtypes after the numeric conversion of water_holding_cap.
merge_df.info()

In [None]:
# Average every numeric column per (poly_id, province, soil_id).
# numeric_only=True skips any non-numeric columns still in the frame (for
# example, whatever soil_geometry contributed); without it, recent pandas
# versions raise a TypeError when asked for the mean of such columns.
# The result is stored under its own name so it is not silently discarded
# and merge_df itself stays untouched.
merge_df_mean = (
    merge_df.groupby(["poly_id", "province", "soil_id"])
    .mean(numeric_only=True)
    .reset_index()
)
merge_df_mean.head()

In [None]:
# Preview the merged, encoded frame; head() avoids dumping the whole table.
merge_df.head()

In [None]:
# Release the DataService resources once the analysis is finished.
# NOTE(review): presumably this closes the connection opened by db.connect() — confirm in Shared.DataService.
db.cleanup()