In [None]:
%pip install xarray
%pip install netCDF4

In [1]:
import os
import xarray as xr
import geopandas as gpd  # type: ignore
from QueryHandler import QueryHandler
from dotenv import load_dotenv
import sqlalchemy as sq
import sys

sys.path.append("../")
from Shared.DataService import DataService  # type: ignore


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In the next release, GeoPandas will switch to using Shapely by default, even if PyGEOS is installed. If you only have PyGEOS installed to get speed-ups, this switch should be smooth. However, if you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
TABLE = "soil_moisture"

In [3]:
load_dotenv()
PG_USER = os.getenv("POSTGRES_USER", "")
PG_PW = os.getenv("POSTGRES_PW", "")
PG_DB = os.getenv("POSTGRES_DB", "")
PG_ADDR = os.getenv("POSTGRES_ADDR", "")
PG_PORT = os.getenv("POSTGRES_PORT", 5432)

In [4]:
# netcdf_file_path = '/home/student/anghandk/CGC_Grain_Outcome_Predictions/src/MoistureSatelliteTest/Images/2004/ESACCI-SOILMOISTURE-L3S-SSMV-COMBINED_ADJUSTED-20040605000000-fv07.1.nc'
netcdf_file_path = "/data/common/Images/2004/ESACCI-SOILMOISTURE-L3S-SSMV-COMBINED-20040605000000-fv07.1.nc"
queryHandler = QueryHandler()

db = DataService(PG_DB, PG_ADDR, int(PG_PORT), PG_USER, PG_PW)
conn = db.connect()

queryHandler.createSoilMoistureTableReq(db)

In [5]:
query = sq.text("select cr_num, car_uid, geometry FROM public.census_ag_regions")
agRegions = gpd.GeoDataFrame.from_postgis(
    query, conn, crs="EPSG:3347", geom_col="geometry"
)

### Preparing NetCDF file

In [6]:
dataset = xr.open_dataset(netcdf_file_path)
df = (
    dataset.to_dataframe().reset_index()
)  # Converts the contents into a dataframe and corrects indexes

dataset.close()

In [7]:
df.drop(columns=["flag", "freqbandID", "dnflag", "mode", "sensor", "t0"], inplace=True)
df.rename(columns={df.columns[0]: "date"}, inplace=True)
df.rename(columns={df.columns[3]: "soil_moisture"}, inplace=True)
df = df[df["soil_moisture"].notna()]

In [8]:
df = gpd.GeoDataFrame(
    df, crs="EPSG:4326", geometry=gpd.points_from_xy(df.lon, df.lat)
)  # Creates geometry from df using lon and lat as cords to create points (points being geometry)
df = df.to_crs(  # type: ignore
    crs="EPSG:3347"
)  # Changes the points projection to match the agriculture regions of EPSG:3347
df = gpd.sjoin(
    df, agRegions, how="left", predicate="within"
)  # Join the two dataframes based on which points fit within what agriculture regions

In [9]:
df.drop(columns=["index_right", "geometry"], inplace=True)
df = df[df["cr_num"].notna()]  # Take rows that are valid numbers
df[["cr_num"]] = df[["cr_num"]].astype(int)

In [10]:
df = df.reset_index()

In [11]:
df.drop(columns=["index"], inplace=True)
df.drop(columns=["sm_uncertainty"], inplace=True)

In [13]:
df.to_sql(TABLE, conn, schema="public", if_exists="append", index=False)

661