In [1]:
import sys
import subprocess
import importlib

# ---- CONFIGURATION ----
REQUIRED_PYTHON = (3, 9)
REQUIRED_PACKAGES = [
    "pandas",
    "openpyxl",
    "sqlalchemy",
    "psycopg[binary]",
    "jupyterlab",
]

# ---- PYTHON VERSION CHECK ----
if sys.version_info < REQUIRED_PYTHON:
    raise SystemExit(
        f"‚ùå Python {REQUIRED_PYTHON[0]}.{REQUIRED_PYTHON[1]}+ required, "
        f"found {sys.version_info.major}.{sys.version_info.minor}"
    )
else:
    print(f"‚úÖ Python version OK: {sys.version_info.major}.{sys.version_info.minor}")

# ---- PACKAGE CHECK / AUTO-INSTALL ----
def install(pkg):
    """Install a package via pip in the current environment."""
    print(f"‚¨áÔ∏è  Installing missing package: {pkg}")
    subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])

missing = []
for pkg in REQUIRED_PACKAGES:
    # psycopg[binary] isn't a real import name, so handle that separately
    import_name = pkg.split("[")[0]
    try:
        importlib.import_module(import_name)
    except ImportError:
        missing.append(pkg)

if missing:
    print(f"\n‚ö†Ô∏è Missing packages detected: {missing}")
    for pkg in missing:
        install(pkg)
else:
    print("‚úÖ All required packages are already installed.")

# ---- SHOW INSTALLED VERSIONS ----
print("\nüì¶ Installed versions:")
for pkg in REQUIRED_PACKAGES:
    pkg_base = pkg.split("[")[0]
    try:
        mod = importlib.import_module(pkg_base)
        version = getattr(mod, "__version__", "unknown")
        print(f"  - {pkg_base} == {version}")
    except ImportError:
        print(f"  - {pkg_base} not installed")


‚úÖ Python version OK: 3.10


‚úÖ All required packages are already installed.

üì¶ Installed versions:
  - pandas == 2.3.3
  - openpyxl == 3.1.5
  - sqlalchemy == 2.0.44
  - psycopg == 3.2.12
  - jupyterlab == 4.4.10


In [2]:
!pip freeze > requirements.txt
print("‚úÖ requirements.txt updated")

‚úÖ requirements.txt updated


In [17]:
# --- CONFIG ---
from pathlib import Path

# PostgreSQL
PG_USER = "postgres"
PG_PASS = "greenage"
PG_HOST = "localhost"
# PG_HOST = "192.168.100.17"
# PG_USER = "agronomics"
# PG_PASS = "Agronomics/psql!!!"
# PG_HOST = "172.16.10.7"
PG_PORT = 5432
PG_DB   = "moin_weather"   # make sure DB is UTF8
# PG_DB   = "agronomics_weather_data"   # make sure DB is UTF8
PG_SCHEMA = "lgs2"
TABLE_NAME = "varieties_substages"

# Excel source
EXCEL_PATH = Path(r"./Variety Data V5.0 (4).xlsx")
SHEET_NAME = "Sunflower"
USECOLS    = "A:L"     # adjust if needed
HEADER_ROW = 0          # Excel row 4 -> pandas header=3 (0-indexed)

# --- ENGINE ---
from sqlalchemy import create_engine, text

conn_str = f"postgresql+psycopg://{PG_USER}:{PG_PASS}@{PG_HOST}:{PG_PORT}/{PG_DB}"
# client_encoding can be SET after connect; psycopg3 also accepts options, but SET works reliably
engine = create_engine(conn_str, pool_pre_ping=True)

with engine.connect() as conn:
    server_enc = conn.execute(text("SHOW SERVER_ENCODING;")).scalar_one()
    conn.execute(text("SET client_encoding TO 'UTF8';"))
    client_enc = conn.execute(text("SHOW CLIENT_ENCODING;")).scalar_one()
    print("SERVER_ENCODING:", server_enc)
    print("CLIENT_ENCODING:", client_enc)


SERVER_ENCODING: UTF8
CLIENT_ENCODING: UTF8


In [18]:
import pandas as pd

# Read as strings first to keep full control over conversion
df_raw = pd.read_excel(
    EXCEL_PATH,
    sheet_name=SHEET_NAME,
    header=HEADER_ROW,
    usecols=USECOLS,
    dtype=str,
    engine="openpyxl",
)

# strip whitespace in *all* string cells
df_raw = df_raw.applymap(lambda x: x.strip() if isinstance(x, str) else x)

# drop fully empty rows
df_raw = df_raw.dropna(how="all")

print("Shape after load & trim:", df_raw.shape)
print("Raw columns:")
print(df_raw.columns.tolist())

display(df_raw.head())
print("total rows : ", len(df_raw))


Shape after load & trim: (44, 12)
Raw columns:
['Crop_Name', 'Principal_Stage', 'Main_Stage', 'Sub_Stage', 'Start_GDD', 'End_GDD', 'Daily_N_Kg/ha', 'Daily_P_Kg/ha', 'Daily_K_Kg/ha', 'Crop_Coefficient', 'K_Ext (PAR)', 'SalineSensitivity']


  df_raw = df_raw.applymap(lambda x: x.strip() if isinstance(x, str) else x)


Unnamed: 0,Crop_Name,Principal_Stage,Main_Stage,Sub_Stage,Start_GDD,End_GDD,Daily_N_Kg/ha,Daily_P_Kg/ha,Daily_K_Kg/ha,Crop_Coefficient,K_Ext (PAR),SalineSensitivity
0,sunflower,Germination,Germination,00: Dry seed (achene),0,0,0.02,0.005,0.02,0.3,0.45,0.88
1,sunflower,Germination,Germination,01: Beginning of seed imbibition,1,10,,,,,,
2,sunflower,Germination,Germination,03: Seed imbibition complete,11,20,,,,,,
3,sunflower,Germination,Germination,05: Radicle emerged from seed,21,50,,,,,,
4,sunflower,Germination,Germination,"06: Radicle elongated, root hairs developing",51,70,,,,,,


total rows :  44


In [19]:
import re

def to_snake(s: str) -> str:
    s = str(s).strip()
    s = re.sub(r"[^\w\s]+", " ", s)  # punctuation -> spaces
    s = re.sub(r"\s+", "_", s)       # whitespace -> underscore
    return s.lower()

snake_map = {c: to_snake(c) for c in df_raw.columns}

# Apply initial snake-case
df_snake = df_raw.rename(columns=snake_map)

# --- Your custom overrides (keys are snake-case) ---
# e.g. raw "Crop Name" -> snake "crop_name" -> override to "crop_fk"
custom_overrides = {
    # "crop_name": "crop_fk",
    "variety_name": "variety_fk",
    # "k_ext_par_": "k_ext_par",
    # "hi": "harvest_index",
    # "opt__temp": "opt_temp"
    }

# Apply overrides
rename_map = {**{v: v for v in df_snake.columns}, **custom_overrides}
df_renamed = df_snake.rename(columns=rename_map)

print("Column rename preview:")
display(pd.DataFrame({
    "original": list(df_raw.columns),
    "snake": [snake_map[c] for c in df_raw.columns],
    "final": [rename_map.get(snake_map[c], snake_map[c]) for c in df_raw.columns],
}))

print("Renamed columns:")
print(df_renamed.columns.tolist())
display(df_renamed.head())


Column rename preview:


Unnamed: 0,original,snake,final
0,Crop_Name,crop_name,crop_name
1,Principal_Stage,principal_stage,principal_stage
2,Main_Stage,main_stage,main_stage
3,Sub_Stage,sub_stage,sub_stage
4,Start_GDD,start_gdd,start_gdd
5,End_GDD,end_gdd,end_gdd
6,Daily_N_Kg/ha,daily_n_kg_ha,daily_n_kg_ha
7,Daily_P_Kg/ha,daily_p_kg_ha,daily_p_kg_ha
8,Daily_K_Kg/ha,daily_k_kg_ha,daily_k_kg_ha
9,Crop_Coefficient,crop_coefficient,crop_coefficient


Renamed columns:
['crop_name', 'principal_stage', 'main_stage', 'sub_stage', 'start_gdd', 'end_gdd', 'daily_n_kg_ha', 'daily_p_kg_ha', 'daily_k_kg_ha', 'crop_coefficient', 'k_ext_par_', 'salinesensitivity']


Unnamed: 0,crop_name,principal_stage,main_stage,sub_stage,start_gdd,end_gdd,daily_n_kg_ha,daily_p_kg_ha,daily_k_kg_ha,crop_coefficient,k_ext_par_,salinesensitivity
0,sunflower,Germination,Germination,00: Dry seed (achene),0,0,0.02,0.005,0.02,0.3,0.45,0.88
1,sunflower,Germination,Germination,01: Beginning of seed imbibition,1,10,,,,,,
2,sunflower,Germination,Germination,03: Seed imbibition complete,11,20,,,,,,
3,sunflower,Germination,Germination,05: Radicle emerged from seed,21,50,,,,,,
4,sunflower,Germination,Germination,"06: Radicle elongated, root hairs developing",51,70,,,,,,


In [22]:
# # Put your exact desired order here (subset or superset is OK)
# REQUIRED_ORDER = [
#     # ---- identifiers / descriptors ----
#     "uuid",
#     "stage_uuid",
#     # "principal_stage",
#     "sub_stage",
#     "start_gdd",
#     "end_gdd",
#     # "variety_fk",
#     # ---- batch timestamps ----
#     "created_at",
#     "updated_at",
# ]

# # Ensure presence; create missing with NA
# for col in REQUIRED_ORDER:
#     if col not in df_renamed.columns:
#         df_renamed[col] = pd.NA

# # Put required columns first, keep any extras at the end (in their current order)
# ordered = [c for c in REQUIRED_ORDER if c in df_renamed.columns]
# extras  = [c for c in df_renamed.columns if c not in ordered]
# df_ordered = df_renamed[ordered + extras].copy()

# print("Final order (first 11 shown):", (ordered + extras)[:11], "...")
# display(df_ordered.head())


In [20]:
import pandas as pd
from decimal import Decimal, ROUND_HALF_UP

# ---- classify columns ----
INT_COLS = [
    "start_gdd",
    "end_gdd"
]

TS_COLS = [
    "created_at",
    "updated_at"
]

# IDs that must NOT be lowercased or otherwise normalized as strings
ID_COLS = [
    "uuid",
    "stage_uuid"
]

# data = df_ordered.copy()
data = df_renamed.copy()

# ---- normalize strings ----
string_cols = [c for c in data.columns if c not in INT_COLS + TS_COLS + ID_COLS]
for c in string_cols:
    data[c] = data[c].astype("string").str.strip().str.lower()

# ---- integers ----
for c in INT_COLS:
    if c in data.columns:
        data[c] = pd.to_numeric(data[c], errors="coerce").astype("Int64")

# ---- timestamps ----
current_ts = pd.Timestamp.now(tz="UTC")
data["created_at"] = current_ts
data["updated_at"] = current_ts

# ---- ensure NULLs ----
data = data.where(pd.notna(data), None)

print("‚úÖ Cleaning complete. dtypes summary:")
display(data.dtypes)
display(data.head())


‚úÖ Cleaning complete. dtypes summary:


crop_name                 string[python]
principal_stage           string[python]
main_stage                string[python]
sub_stage                 string[python]
start_gdd                          Int64
end_gdd                            Int64
daily_n_kg_ha             string[python]
daily_p_kg_ha             string[python]
daily_k_kg_ha             string[python]
crop_coefficient          string[python]
k_ext_par_                string[python]
salinesensitivity         string[python]
created_at           datetime64[us, UTC]
updated_at           datetime64[us, UTC]
dtype: object

Unnamed: 0,crop_name,principal_stage,main_stage,sub_stage,start_gdd,end_gdd,daily_n_kg_ha,daily_p_kg_ha,daily_k_kg_ha,crop_coefficient,k_ext_par_,salinesensitivity,created_at,updated_at
0,sunflower,germination,germination,00: dry seed (achene),0,0,0.02,0.005,0.02,0.3,0.45,0.88,2026-01-13 05:25:53.443258+00:00,2026-01-13 05:25:53.443258+00:00
1,sunflower,germination,germination,01: beginning of seed imbibition,1,10,,,,,,,2026-01-13 05:25:53.443258+00:00,2026-01-13 05:25:53.443258+00:00
2,sunflower,germination,germination,03: seed imbibition complete,11,20,,,,,,,2026-01-13 05:25:53.443258+00:00,2026-01-13 05:25:53.443258+00:00
3,sunflower,germination,germination,05: radicle emerged from seed,21,50,,,,,,,2026-01-13 05:25:53.443258+00:00,2026-01-13 05:25:53.443258+00:00
4,sunflower,germination,germination,"06: radicle elongated, root hairs developing",51,70,,,,,,,2026-01-13 05:25:53.443258+00:00,2026-01-13 05:25:53.443258+00:00


In [21]:
# --------------------------------------------------
# BLOCK 4 (NEW)
# Canonical substage template (Excel-driven)
# --------------------------------------------------

substage_template = (
    data[["principal_stage", "sub_stage", "start_gdd", "end_gdd"]]
    .dropna(subset=["principal_stage", "sub_stage"])
    .copy()
)

substage_template["principal_stage"] = (
    substage_template["principal_stage"]
    .astype("string").str.strip().str.lower()
)

substage_template["sub_stage"] = (
    substage_template["sub_stage"]
    .astype("string").str.strip().str.lower()
)

substage_template = (
    substage_template
    .sort_values(["principal_stage", "start_gdd"])
    .drop_duplicates(subset=["principal_stage", "sub_stage"])
    .reset_index(drop=True)
)

print(f"‚úÖ Canonical substage template rows: {len(substage_template)}")
display(substage_template.head())


‚úÖ Canonical substage template rows: 44


Unnamed: 0,principal_stage,sub_stage,start_gdd,end_gdd
0,development of fruit,71: seeds on outer edge of the inflorescence,801,850
1,development of fruit,73: seeds on outer third of the inflorescence,851,900
2,development of fruit,75: seeds on middle third of the inflorescence,901,950
3,development of fruit,79: seeds on inner third of the inflorescence,951,1000
4,flowering,"61: beginning of flowering:ray florets extended,",661,680


In [22]:
from sqlalchemy import MetaData, Table, Column, text, ForeignKey
from sqlalchemy import String, Integer, DateTime
from sqlalchemy.dialects.postgresql import UUID

# Use a metadata WITHOUT a default schema and reflect the parent first.
meta = MetaData()

# Ensure schema exists (safe if it already exists)
with engine.begin() as conn:
    conn.execute(text(f'CREATE SCHEMA IF NOT EXISTS "{PG_SCHEMA}";'))

# 1) Reflect the parent table so SQLAlchemy can resolve the FK at compile time
Table(
    "varieties_stages",
    meta,
    schema=PG_SCHEMA,
    autoload_with=engine
)

# 2) Define the child table with the FK to the reflected parent
Table(
    TABLE_NAME, meta,
    Column("uuid",         UUID(as_uuid=True), primary_key=True),
    Column("stage_uuid",   UUID(as_uuid=True),
           ForeignKey(f'{PG_SCHEMA}.varieties_stages.uuid',
                      onupdate="CASCADE", ondelete="CASCADE"),
           nullable=False),
    # Column("principal_stage", String),
    Column("sub_stage",       String),
    Column("start_gdd",       Integer),
    Column("end_gdd",         Integer),
    # Column("variety_fk",      String),  # denormalized convenience copy
    Column("created_at",      DateTime(timezone=True)),
    Column("updated_at",      DateTime(timezone=True)),
    schema=PG_SCHEMA,
)

# 3) Create (only) the child table; parent already exists
with engine.begin() as conn:
    meta.create_all(conn, checkfirst=True)
    print(f"‚úÖ Ensured table {PG_SCHEMA}.{TABLE_NAME} exists (or was created).")


‚úÖ Ensured table lgs2.varieties_substages exists (or was created).


In [23]:
from sqlalchemy import String as SA_String, Integer as SA_Integer, DateTime as SA_DateTime
from sqlalchemy.dialects.postgresql import UUID as SA_UUID

dtype_map = {
    "uuid":              SA_UUID(as_uuid=True),
    "stage_uuid":        SA_UUID(as_uuid=True),
    # "principal_stage":   SA_String(),
    "sub_stage":         SA_String(),
    "start_gdd":         SA_Integer(),
    "end_gdd":           SA_Integer(),
    # "variety_fk":        SA_String(),
    "created_at":        SA_DateTime(timezone=True),
    "updated_at":        SA_DateTime(timezone=True),
}


In [9]:
# import pandas as pd
# from uuid import uuid4, UUID
# from sqlalchemy import text

# # --- filter to specific varieties if you want (optional) ---
# TARGET_VARIETIES = [
#     "bars-2009",
#     "fakhar-e-bhakkar",
#     "farid-2006",
#     "lasani-2008",
#     "narc-2011",
#     "punjab-2011",
# ]
# if TARGET_VARIETIES:
#     norm_targets = {v.casefold() for v in TARGET_VARIETIES}
#     mask = data["variety_fk"].astype("string").str.strip().str.casefold().isin(norm_targets)
#     kept = int(mask.sum())
#     if kept == 0:
#         raise ValueError("No rows match the target varieties.")
#     data = data.loc[mask].copy()

# # --- UUIDs: ensure non-null, valid ---
# def _coerce_uuid(x):
#     if x is None or x is pd.NA or (isinstance(x, float) and pd.isna(x)) or (isinstance(x, str) and x.strip() == ""):
#         return uuid4()
#     try:
#         return x if isinstance(x, UUID) else UUID(str(x))
#     except Exception:
#         return uuid4()

# if "uuid" not in data.columns:
#     data["uuid"] = [uuid4() for _ in range(len(data))]
# else:
#     data["uuid"] = data["uuid"].map(_coerce_uuid)

# # --- DEDUPE per (variety_fk, principal_stage, sub_stage) ---
# keys = ["variety_fk", "principal_stage", "sub_stage"]
# before = len(data)
# data["__vk"] = data["variety_fk"].astype("string").str.strip().str.casefold()
# data["__ps"] = data["principal_stage"].astype("string").str.strip().str.casefold()
# data["__ss"] = data["sub_stage"].astype("string").str.strip().str.casefold()
# data = (data.sort_values(["__vk", "__ps", "__ss"], kind="mergesort")
#             .drop_duplicates(subset=["__vk", "__ps", "__ss"], keep="first")
#             .reset_index(drop=True))
# after = len(data)
# if after < before:
#     print(f"‚ÑπÔ∏è Substage dedupe kept {after}/{before} rows.")

# # --- Resolve stages_uuid by joining to varieties_stages on (variety_fk, principal_stage) ---
# # We assume varieties_stages already has those varieties uploaded and
# # principal_stage strings are normalized (lowercased).
# with engine.begin() as conn:
#     rows = conn.execute(text(f"""
#         SELECT uuid, variety_fk, principal_stage
#         FROM "{PG_SCHEMA}".varieties_stages
#     """)).mappings().all()

# map_df = pd.DataFrame(rows)
# if map_df.empty:
#     raise RuntimeError("No rows found in varieties_stages; upload stages first.")

# map_df["variety_fk"]      = map_df["variety_fk"].astype("string").str.strip().str.lower()
# map_df["principal_stage"] = map_df["principal_stage"].astype("string").str.strip().str.lower()

# data["variety_fk"]      = data["variety_fk"].astype("string").str.strip().str.lower()
# data["principal_stage"] = data["principal_stage"].astype("string").str.strip().str.lower()

# # IMPORTANT: drop any pre-existing stage_uuid to avoid _x/_y suffixes
# if "stage_uuid" in data.columns:
#     data = data.drop(columns=["stage_uuid"])

# # left join to pick up parent uuid
# data = data.merge(
#     map_df.rename(columns={"uuid": "stage_uuid"}),
#     on=["variety_fk", "principal_stage"],
#     how="left",
# )

# missing = data["stage_uuid"].isna().sum()
# if missing:
#     bad = data.loc[data["stage_uuid"].isna(), ["variety_fk", "principal_stage"]].drop_duplicates()
#     raise RuntimeError(
#         f"‚ùå {missing} substage row(s) did not resolve to a parent stage. "
#         f"Upload/normalize stages first or fix names.\nMissing combos:\n{bad}"
#     )

# # --- Clean up helper columns ---
# data = data.drop(columns=["__vk", "__ps", "__ss"])


# # ---- drop any columns you don‚Äôt want in DB (already handled above if needed) ----
# for drop_col in [
#     "crop_name",
#     "variety_fk",
#     "principal_stage",
#     "main_stage",
#     "start_day",
#     "end_day",
#     "daily_n_kg_ha",
#     "daily_p_kg_ha",
#     "daily_k_kg_ha",
#     "crop_coefficient",
#     "k_ext_par_",
#     "dm_fm",
# ]:
#     if drop_col in data.columns:
#         print(f"üßπ Dropping '{drop_col}' from DataFrame before upload")
#         data = data.drop(columns=[drop_col])

# # --- NULLs to None for SQLAlchemy ---
# data = data.where(pd.notna(data), None)

# # --- Upload ---
# data.to_sql(
#     name=TABLE_NAME,
#     con=engine,
#     schema=PG_SCHEMA,
#     if_exists="append",
#     index=False,
#     chunksize=10_000,
#     method="multi",
#     dtype=dtype_map,
# )

# print(f"‚úÖ Uploaded {len(data)} substages into {PG_SCHEMA}.{TABLE_NAME}.")

In [24]:
# --- VARIETY FILTER (CANONICAL, LOWERCASE) ---
ALLOWED_VARIETIES = ['fh 331', 'hks 278', 'hysun 33', 'hysun 34', 'nk 265', 'aguara 4', 'pi 6480', 'sf 187', 't-40318', 'nk armani', 's-278', 'us444', 'parson3', 'oxsen 5270', 'oxsen 5264', 'hsf 350', 'orisun516', 'orisun648', 'orisun701', 'us666']

ALLOWED_VARIETIES = [v.strip().lower() for v in ALLOWED_VARIETIES]


In [25]:
# --------------------------------------------------
# BLOCK 8 (FILTERED)
# Fetch CANONICAL parent stages (only allowed varieties)
# --------------------------------------------------

from sqlalchemy import text
import pandas as pd

with engine.connect() as conn:
    parent_stages = pd.DataFrame(
        conn.execute(text(f"""
            SELECT DISTINCT ON (variety_fk, principal_stage)
                uuid AS stage_uuid,
                variety_fk,
                principal_stage
            FROM "{PG_SCHEMA}".varieties_stages
            ORDER BY variety_fk, principal_stage, created_at DESC
        """)).mappings().all()
    )

if parent_stages.empty:
    raise RuntimeError("‚ùå varieties_stages table is empty")

# ---- normalize join keys ----
parent_stages["variety_fk"] = (
    parent_stages["variety_fk"]
    .astype("string").str.strip().str.lower()
)

parent_stages["principal_stage"] = (
    parent_stages["principal_stage"]
    .astype("string").str.strip().str.lower()
)

# ---- FILTER TO ALLOWED VARIETIES ----
parent_stages = parent_stages[
    parent_stages["variety_fk"].isin(ALLOWED_VARIETIES)
].reset_index(drop=True)

if parent_stages.empty:
    raise RuntimeError("‚ùå No parent stages found for allowed varieties")

print(
    f"‚úÖ Filtered parent stages: {len(parent_stages)} rows "
    f"({parent_stages['stage_uuid'].nunique()} unique stage UUIDs) "
    f"for {parent_stages['variety_fk'].nunique()} varieties"
)

display(
    parent_stages[["variety_fk", "principal_stage"]]
    .drop_duplicates()
    .sort_values(["variety_fk", "principal_stage"])
)


‚úÖ Filtered parent stages: 160 rows (160 unique stage UUIDs) for 20 varieties


Unnamed: 0,variety_fk,principal_stage
0,aguara 4,development of fruit
1,aguara 4,flowering
2,aguara 4,germination
3,aguara 4,inflorescence emergence
4,aguara 4,leaf development
...,...,...
155,us666,inflorescence emergence
156,us666,leaf development
157,us666,ripening
158,us666,senescence


In [12]:
# counts = data.groupby("stage_uuid").size()
# print(counts.describe())

# if not (counts == 43).all():
#     raise RuntimeError(
#         f"‚ùå Substage count mismatch per stage_uuid:\n{counts}"
#     )

# print("‚úÖ Each stage has exactly 43 substages")


In [26]:
# --------------------------------------------------
# BLOCK 9 (NEW)
# Expand substages √ó parent stages and upload
# --------------------------------------------------

from uuid import uuid4

# Expand: parent stages √ó substages
expanded = parent_stages.merge(
    substage_template,
    on="principal_stage",
    how="inner"
)

print(f"‚úÖ Expanded rows: {len(expanded)}")
# expected: 20 √ó 44 = 880

# ---- FINAL STRUCTURE (LOCKED TO TABLE) ----
final_df = pd.DataFrame({
    "uuid":        [uuid4() for _ in range(len(expanded))],
    "stage_uuid":  expanded["stage_uuid"],
    "sub_stage":   expanded["sub_stage"],
    "start_gdd":   expanded["start_gdd"],
    "end_gdd":     expanded["end_gdd"],
    "created_at":  pd.Timestamp.now(tz="UTC"),
    "updated_at":  pd.Timestamp.now(tz="UTC"),
})

print("‚úÖ Final columns:", final_df.columns.tolist())
print("‚úÖ Final row count:", len(final_df))

# ---- UPLOAD ----
final_df.to_sql(
    name=TABLE_NAME,
    con=engine,
    schema=PG_SCHEMA,
    if_exists="append",
    index=False,
    chunksize=10_000,
    method="multi",
    dtype=dtype_map,
)

print(
    f"‚úÖ Uploaded {len(final_df)} rows into "
    f"{PG_SCHEMA}.{TABLE_NAME}"
)


‚úÖ Expanded rows: 880
‚úÖ Final columns: ['uuid', 'stage_uuid', 'sub_stage', 'start_gdd', 'end_gdd', 'created_at', 'updated_at']
‚úÖ Final row count: 880
‚úÖ Uploaded 880 rows into lgs2.varieties_substages
