In [361]:
!pip install pandas openpyxl sqlalchemy psycopg2-binary python-dotenv --quiet


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [362]:
import re
import os
from pathlib import Path
from datetime import datetime
import unicodedata
from sqlalchemy import text



import pandas as pd
from sqlalchemy import create_engine, text
from dotenv import load_dotenv

In [363]:
# =========================
# CONFIG
# =========================
# Source workbook read by the RUN cell.
# NOTE(review): absolute, machine-specific path — consider a configurable base directory.
FILE_PATH = r"C:\Users\cauai.Capozzoli\Desktop\DB\estufas\inventario\Historico\dados\Invent√°rio Estufas - 2025 - S51.xlsx"
# Sheets are processed only when their name starts with "S" followed by two
# digits; the captured digits become the inventory week number.
SHEET_WEEK_PATTERN = r"^S(\d{2})"  # e.g. S01, S50

# Output directory and file name used by the (optional) XLSX export cell.
BRONZE_XLSX_DIR = r"C:\Users\cauai.Capozzoli\Desktop\repo-local\app_estufas\app_estufa\notebooks\bronze"
BRONZE_XLSX_NAME = "bronze_inventario_estufas.xlsx"

In [364]:
# =========================
# HELPERS (poucas e simples)
# =========================
def norm(x):
    """Return ``x`` as stripped, lower-cased text; missing values become ""."""
    if pd.isna(x):
        return ""
    text_value = str(x)
    return text_value.strip().lower()

def find_row_with_cell_equal(raw, target, max_rows=700):
    """Return the position of the first row with a cell equal to ``target``.

    Cells are compared after ``norm`` (stripped, lower-cased); ``target`` is
    stripped and lower-cased the same way. Only the first ``max_rows`` rows
    are scanned. Returns ``None`` when no row matches.
    """
    wanted = target.strip().lower()
    limit = min(len(raw), max_rows)
    for pos in range(limit):
        cells = raw.iloc[pos].tolist()
        if wanted in [norm(cell) for cell in cells]:
            return pos
    return None

def find_row_contains_tokens(raw, start, tokens, lookahead=60):
    """Return the first row at or after ``start`` whose cells include ALL tokens.

    Each token must match a whole normalised cell (see ``norm``). At most
    ``lookahead`` rows are inspected, starting at ``start``. Returns ``None``
    when no such row exists in that window.
    """
    required = {token.lower() for token in tokens}
    stop = min(start + lookahead, len(raw))
    for pos in range(start, stop):
        present = {norm(cell) for cell in raw.iloc[pos].tolist()}
        if required <= present:
            return pos
    return None

def find_header_main_table(raw, max_rows=350):
    """Return the position of the first row holding both a "bloco" and a "naves" cell.

    Cells are normalised with ``norm`` before comparison and only the first
    ``max_rows`` rows are scanned. Returns ``None`` when no row qualifies.
    """
    wanted = ("bloco", "naves")
    for pos in range(min(len(raw), max_rows)):
        cells = {norm(cell) for cell in raw.iloc[pos].tolist()}
        if all(name in cells for name in wanted):
            return pos
    return None

def slice_from_header(raw, header_row):
    """Slice the table that starts right below ``header_row``.

    Column names are taken from ``header_row`` and normalised to ASCII
    snake_case (accents removed, "/" and whitespace turned into "_", any other
    character dropped). The table ends at the first row whose ``bloco`` value
    is missing or not numeric; that row and everything after it are discarded.

    Parameters
    ----------
    raw : pandas.DataFrame
        Sheet content read without a header row.
    header_row : int
        Position (``iloc``) of the row holding the column titles.

    Returns
    -------
    pandas.DataFrame
        Data rows with normalised column names and a fresh 0..n-1 index.
    """
    def clean_header(h):
        h = str(h).strip().lower() if pd.notna(h) else ""

        # Ordinal markers must be handled BEFORE the NFKD step: NFKD folds the
        # masculine ordinal indicator (U+00BA) into a plain "o", after which it
        # can no longer be told apart from a real letter.
        h = re.sub("n[\u00ba\u00b0]", "n_", h)   # "n<ordinal>" / "n<degree>" -> "n_"
        h = re.sub("[\u00ba\u00b0]", "", h)      # stray ordinal / degree sign

        # strip accents
        h = unicodedata.normalize("NFKD", h)
        h = "".join(ch for ch in h if not unicodedata.combining(ch))

        # "no" typed with plain letters also means "number", but only as a
        # standalone word; a substring replace would corrupt e.g. "ano".
        h = re.sub(r"\bno\b", "n_", h)

        h = h.replace("/", "_")
        h = h.replace("\n", " ")
        h = h.replace(" ", "_")

        # keep only lowercase alphanumerics and underscores
        h = re.sub(r"[^a-z0-9_]+", "", h)

        # collapse the double underscore produced by "n_" + "_"
        h = h.replace("n__", "n_").strip("_")
        return h

    header = [clean_header(v) for v in raw.iloc[header_row].tolist()]

    df = raw.iloc[header_row + 1:].copy()
    df.columns = header
    df = df.dropna(how="all")

    # End of table: first row whose "bloco" is not numeric. The cut is done by
    # position so it does not rely on the index holding consecutive integers.
    if "bloco" in df.columns:
        is_blank = pd.to_numeric(df["bloco"], errors="coerce").isna().to_numpy()
        if is_blank.any():
            df = df.iloc[: int(is_blank.argmax())]

    df = df.dropna(how="all").reset_index(drop=True)
    return df

In [365]:
# =========================
# EXTRATOR MAIN (tabela principal)
# =========================
def extract_main_table(raw: pd.DataFrame) -> pd.DataFrame:
    """Extract the main inventory table of one sheet in the bronze layout.

    The header row is located with ``find_header_main_table`` and the rows
    below it are cut with ``slice_from_header``. Source columns are mapped to
    the bronze column names (tolerating a few spelling variants), numeric
    columns are coerced, and rows without a numeric ``bloco`` are dropped.

    Parameters
    ----------
    raw : pandas.DataFrame
        Sheet content read without a header row.

    Returns
    -------
    pandas.DataFrame
        One row per table line with the bronze columns; an empty DataFrame
        when no header is found or the table has no rows.
    """
    hdr = find_header_main_table(raw)
    if hdr is None:
        return pd.DataFrame()

    df = slice_from_header(raw, hdr)
    if df.empty:
        return pd.DataFrame()

    def column(*names):
        """Return the first column of ``df`` named in ``names``, else None."""
        for name in names:
            if name in df.columns:
                return df[name]
        return None

    # map source columns to the bronze schema (tolerant to header variants)
    out = pd.DataFrame({
        "bloco": column("bloco"),
        "naves": column("naves"),
        "cultura": column("cultura"),
        "idade": column("idade"),
        "semana_plantio": column("semana_do_plantio", "semana_plantio"),
        "data_plantio": column("data_do_plantio", "data_plantio"),
        "semana_colheita": column("semana_de_colheita", "semana_colheita"),
        "n_naves": column("n_naves", "n_nave", "nnaves"),
        "area_nave": column("area_nave", "area_naveha", "areanave", "area_por_nave"),
        "area_total": column("area_total", "areatotal", "area_totalha"),
    })

    # Lettuce-only columns do not exist in the main table. They are created as
    # float NaN (not object None) so their dtype matches the numeric columns
    # produced by extract_alfaces_table when the frames are concatenated.
    for name in ("caipira_camalhoes", "roxa_camalhoes", "iceberg_camalhoes"):
        out[name] = float("nan")
    out["flag_alfaces"] = False

    # basic conversions
    out["bloco"] = pd.to_numeric(out["bloco"], errors="coerce")
    out = out.dropna(subset=["bloco"]).copy()
    out["bloco"] = out["bloco"].astype(int)

    out["idade"] = pd.to_numeric(out["idade"], errors="coerce")
    out["semana_plantio"] = pd.to_numeric(out["semana_plantio"], errors="coerce")

    # harvest week may carry text around the number; keep the first digit run
    out["semana_colheita"] = out["semana_colheita"].astype(str).str.extract(r"(\d+)", expand=False)
    out["semana_colheita"] = pd.to_numeric(out["semana_colheita"], errors="coerce")

    # The planting week can be embedded in the planting-date text as "(S45)".
    sem_from_text = (
        out["data_plantio"].astype(str)
        .str.extract(r"\(S(\d{1,2})\)", expand=False)
    )
    # only fill rows where semana_plantio is still missing
    out.loc[out["semana_plantio"].isna(), "semana_plantio"] = pd.to_numeric(sem_from_text, errors="coerce")

    for c in ["n_naves", "area_nave", "area_total"]:
        out[c] = pd.to_numeric(out[c], errors="coerce")

    # Light string clean-up. Missing values are kept missing: a plain
    # astype(str) would store them as the literal text "nan" / "None".
    for c in ["naves", "cultura"]:
        values = out[c]
        out[c] = values.where(values.isna(), values.astype(str).str.strip())

    return out


In [366]:
# =========================
# ALFACES EXTRACTOR (anchored on the "Alfaces" title cell)
# =========================
def extract_alfaces_table(raw: pd.DataFrame) -> pd.DataFrame:
    """Extract the lettuce ("Alfaces") table of one sheet in the bronze layout.

    The table is anchored on a cell equal to "alfaces". Below it, a header row
    (containing "bloco" and "naves") and a sub-header row (containing
    "caipira", "roxa" and "iceberg") are located; sub-header cells name the
    ``*_camalhoes`` columns, the remaining names come from the header row.
    The table ends at the first row whose ``bloco`` is not numeric.

    Parameters
    ----------
    raw : pandas.DataFrame
        Sheet content read without a header row.

    Returns
    -------
    pandas.DataFrame
        One row per table line with the bronze columns, ``cultura`` fixed to
        "Alface" and ``flag_alfaces`` True; an empty DataFrame when the title,
        header, sub-header or data rows are not found.
    """
    title_row = find_row_with_cell_equal(raw, "alfaces")
    if title_row is None:
        return pd.DataFrame()

    # header: first row within 50 rows of the title that has bloco + naves
    header_row = find_row_contains_tokens(raw, title_row, {"bloco", "naves"}, lookahead=50)
    if header_row is None:
        return pd.DataFrame()

    # sub-header: row with caipira/roxa/iceberg as whole cells
    subheader_row = find_row_contains_tokens(raw, header_row, {"caipira", "roxa", "iceberg"}, lookahead=8)
    if subheader_row is None:
        return pd.DataFrame()

    header_main = [norm(v) for v in raw.iloc[header_row].tolist()]
    header_sub = [norm(v) for v in raw.iloc[subheader_row].tolist()]

    varieties = ("caipira", "roxa", "iceberg")
    cols = [
        f"{sub}_camalhoes" if sub in varieties else main
        for main, sub in zip(header_main, header_sub)
    ]
    cols = [
        re.sub(r"[^a-z0-9_]+", "", c.replace(" ", "_")) if c else ""
        for c in cols
    ]

    df = raw.iloc[subheader_row + 1:].copy()
    df.columns = cols
    df = df.dropna(how="all")

    # End of table: first row whose "bloco" is not numeric (cut by position).
    if "bloco" in df.columns:
        is_blank = pd.to_numeric(df["bloco"], errors="coerce").isna().to_numpy()
        if is_blank.any():
            df = df.iloc[: int(is_blank.argmax())]

    df = df.dropna(how="all").reset_index(drop=True)
    if df.empty:
        return pd.DataFrame()

    def column(*names):
        """Return the first column of ``df`` named in ``names``, else None."""
        for name in names:
            if name in df.columns:
                return df[name]
        return None

    # Map to the bronze schema. Passing the index lets scalar values broadcast
    # even if none of the looked-up columns exists. Columns that only exist in
    # the main table are float NaN so dtypes agree with extract_main_table.
    out = pd.DataFrame({
        "bloco": column("bloco"),
        "naves": column("naves"),
        "cultura": "Alface",
        "idade": column("idade"),
        "semana_plantio": column("semana_do_plantio", "semana_plantio"),
        "data_plantio": column("data_do_plantio", "data_plantio"),
        "semana_colheita": column("semana_de_colheita", "semana_colheita"),
        "n_naves": float("nan"),
        "area_nave": float("nan"),
        "area_total": float("nan"),
        "caipira_camalhoes": column("caipira_camalhoes"),
        "roxa_camalhoes": column("roxa_camalhoes"),
        "iceberg_camalhoes": column("iceberg_camalhoes"),
        "flag_alfaces": True,
    }, index=df.index)

    # conversions
    out["bloco"] = pd.to_numeric(out["bloco"], errors="coerce")
    out = out.dropna(subset=["bloco"]).copy()
    out["bloco"] = out["bloco"].astype(int)

    out["idade"] = pd.to_numeric(out["idade"], errors="coerce")
    out["semana_plantio"] = pd.to_numeric(out["semana_plantio"], errors="coerce")

    out["semana_colheita"] = out["semana_colheita"].astype(str).str.extract(r"(\d+)", expand=False)
    out["semana_colheita"] = pd.to_numeric(out["semana_colheita"], errors="coerce")

    # Same rule as extract_main_table: a "(S45)"-style marker inside the
    # planting-date text supplies the planting week when that column is empty.
    # NOTE(review): assumes lettuce dates may carry the same marker as the
    # main table — harmless when they do not.
    week_from_text = pd.to_numeric(
        out["data_plantio"].astype(str).str.extract(r"\(S(\d{1,2})\)", expand=False),
        errors="coerce",
    )
    out.loc[out["semana_plantio"].isna(), "semana_plantio"] = week_from_text

    # Remove the week marker from text values before parsing, otherwise the
    # whole value would be coerced to NaT. Non-text cells are passed through.
    week_marker = re.compile(r"\s*\(S\d{1,2}\)")
    date_values = out["data_plantio"].map(
        lambda v: week_marker.sub("", v).strip() if isinstance(v, str) else v
    )
    out["data_plantio"] = pd.to_datetime(date_values, errors="coerce", dayfirst=True)

    for c in ["caipira_camalhoes", "roxa_camalhoes", "iceberg_camalhoes"]:
        out[c] = pd.to_numeric(out[c], errors="coerce")

    # Trim text but keep missing values missing (astype(str) alone would
    # store them as the literal text "nan" / "None").
    naves = out["naves"]
    out["naves"] = naves.where(naves.isna(), naves.astype(str).str.strip())

    return out

In [367]:
# =========================
# RUN - monta df_bronze
# =========================
# Open the workbook once and reuse the handle for every sheet (the original
# re-opened the file from its path for each sheet). The context manager closes
# the file even if an extractor raises.
arquivo_origem = Path(FILE_PATH).name
parts = []

with pd.ExcelFile(FILE_PATH) as xls:
    for sheet in xls.sheet_names:
        # only sheets named like "S50. ..." are weekly inventories
        m = re.search(SHEET_WEEK_PATTERN, sheet.strip(), flags=re.IGNORECASE)
        if not m:
            continue

        semana = int(m.group(1))
        raw = pd.read_excel(xls, sheet_name=sheet, header=None, dtype=object)

        # main table first, then the lettuce table, each tagged with its origin
        for table in (extract_main_table(raw), extract_alfaces_table(raw)):
            if table.empty:
                continue
            table["semana_inventario"] = semana
            table["aba_origem"] = sheet
            table["arquivo_origem"] = arquivo_origem
            parts.append(table)

# Fail with a clear message instead of a KeyError further down when nothing
# could be extracted.
if not parts:
    raise ValueError(
        f"No rows extracted from {FILE_PATH!r}: no sheet matching "
        f"{SHEET_WEEK_PATTERN!r} produced a table."
    )

df_bronze = pd.concat(parts, ignore_index=True)
# NOTE(review): the year is fixed here and not derived from the file or sheet name.
df_bronze["ano"] = 2025

print("df_bronze shape:", df_bronze.shape)
print("Alfaces linhas:", (df_bronze["cultura"].astype(str).str.lower() == "alface").sum())

display(df_bronze.head(20))

df_bronze shape: (1882, 18)
Alfaces linhas: 529


  df_bronze = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame()


Unnamed: 0,bloco,naves,cultura,idade,semana_plantio,data_plantio,semana_colheita,n_naves,area_nave,area_total,caipira_camalhoes,roxa_camalhoes,iceberg_camalhoes,flag_alfaces,semana_inventario,aba_origem,arquivo_origem,ano
0,1,1 a 23,Flores,,,,,14.0,0.057,0.798,,,,False,50,S50. Resumo Idades (2025),Invent√°rio Estufas - 2025 - S51.xlsx,2025
1,2,1 a 19,Flores,,,,,19.0,0.057,1.083,,,,False,50,S50. Resumo Idades (2025),Invent√°rio Estufas - 2025 - S51.xlsx,2025
2,3,1 a 19,Opera√ß√µes Campo Aberto e Caf√©,,,,,19.0,0.057,1.083,,,,False,50,S50. Resumo Idades (2025),Invent√°rio Estufas - 2025 - S51.xlsx,2025
3,4,1 a 9,Alface,,,,,9.0,0.057,0.513,,,,False,50,S50. Resumo Idades (2025),Invent√°rio Estufas - 2025 - S51.xlsx,2025
4,4,10 a 17,Opera√ß√µes Campo Aberto e Caf√©,,,,,8.0,0.057,0.456,,,,False,50,S50. Resumo Idades (2025),Invent√°rio Estufas - 2025 - S51.xlsx,2025
5,4,18 e 19,Phisallis,184.0,22.0,31/05/2022 (S22),,2.0,0.057,0.114,,,,False,50,S50. Resumo Idades (2025),Invent√°rio Estufas - 2025 - S51.xlsx,2025
6,5,1 a 10,Alface,,,,,10.0,0.06,0.6,,,,False,50,S50. Resumo Idades (2025),Invent√°rio Estufas - 2025 - S51.xlsx,2025
7,5,16 a 22,Em limpeza.,,,,,12.0,0.06,0.72,,,,False,50,S50. Resumo Idades (2025),Invent√°rio Estufas - 2025 - S51.xlsx,2025
8,6,1 a 22,Em preparo de Solo (Tomate),,,,,22.0,0.06,1.32,,,,False,50,S50. Resumo Idades (2025),Invent√°rio Estufas - 2025 - S51.xlsx,2025
9,7,1 a 22,Tomate,5.0,45.0,04/11/2025 (S45),,22.0,0.06,1.32,,,,False,50,S50. Resumo Idades (2025),Invent√°rio Estufas - 2025 - S51.xlsx,2025


In [None]:
# =========================
# SAVE XLSX (optional)
# =========================
# The export used to be disabled by wrapping the code in a string literal.
# An explicit flag keeps the code parseable and the intent visible; set it to
# True to write the bronze frame to BRONZE_XLSX_DIR / BRONZE_XLSX_NAME.
SAVE_BRONZE_XLSX = False

if SAVE_BRONZE_XLSX:
    out_dir = Path(BRONZE_XLSX_DIR)
    out_dir.mkdir(parents=True, exist_ok=True)
    out_file = out_dir / BRONZE_XLSX_NAME
    df_bronze.to_excel(out_file, index=False)
    print("‚úÖ Bronze salva em:", out_file)



‚úÖ Bronze salva em: C:\Users\cauai.Capozzoli\Desktop\repo-local\app_estufas\app_estufa\notebooks\bronze\bronze_inventario_estufas.xlsx


In [369]:
# Load DATABASE_URL from the environment; load_dotenv() also reads a local .env file.
load_dotenv()

# No fallback URL: a default containing a username and password must not live
# in the notebook source. Fail early with an actionable message instead.
DATABASE_URL = os.getenv("DATABASE_URL")
if not DATABASE_URL:
    raise RuntimeError(
        "DATABASE_URL is not set. Define it in the environment or in a .env file, "
        "e.g. postgresql://<user>:<password>@<host>:5432/<database>"
    )

engine = create_engine(DATABASE_URL)



In [370]:
# 1) List the distinct (ano, semana_inventario) pairs present in this load
semanas_carga = df_bronze.loc[:, ["ano", "semana_inventario"]].dropna()
semanas_carga = semanas_carga.drop_duplicates().astype(int)
semanas_carga = semanas_carga.sort_values(by=["ano", "semana_inventario"])

print("Semanas encontradas na carga:")
print(semanas_carga.to_string(index=False))

Semanas encontradas na carga:
 ano  semana_inventario
2025                  1
2025                  2
2025                  3
2025                  4
2025                  5
2025                  6
2025                  7
2025                  8
2025                  9
2025                 10
2025                 11
2025                 12
2025                 13
2025                 14
2025                 15
2025                 16
2025                 17
2025                 20
2025                 21
2025                 22
2025                 23
2025                 24
2025                 25
2025                 26
2025                 27
2025                 28
2025                 29
2025                 30
2025                 31
2025                 32
2025                 33
2025                 34
2025                 35
2025                 36
2025                 37
2025                 38
2025                 39
2025                 40
2025                 41
2025      

In [371]:
# Make sure the target schema and table exist before loading.
# Both statements use IF NOT EXISTS, so they do not fail when the objects are
# already there; an existing table is not altered by this cell.
with engine.begin() as conn:
    conn.execute(text("CREATE SCHEMA IF NOT EXISTS bronze;"))

    # Columns mirror df_bronze plus the load metadata (carga_id, carga_ts)
    # added by the insert cell.
    # NOTE(review): data_plantio is TEXT here, while extract_alfaces_table
    # converts it with pd.to_datetime — confirm the intended type.
    conn.execute(text("""
    CREATE TABLE IF NOT EXISTS bronze.inventario_estufas_bronze (
        bloco INT,
        naves TEXT,
        cultura TEXT,
        idade NUMERIC,
        semana_plantio INT,
        data_plantio TEXT,
        semana_colheita INT,
        n_naves NUMERIC,
        area_nave NUMERIC,
        area_total NUMERIC,
        caipira_camalhoes NUMERIC,
        roxa_camalhoes NUMERIC,
        iceberg_camalhoes NUMERIC,
        flag_alfaces BOOLEAN,
        semana_inventario INT,
        aba_origem TEXT,
        arquivo_origem TEXT,
        ano INT,
        carga_id TEXT,
        carga_ts TIMESTAMP
    );
    """))

print("‚úÖ Schema/tabela bronze.inventario_estufas_bronze garantidos.")

‚úÖ Schema/tabela bronze.inventario_estufas_bronze garantidos.


In [372]:
# 2) Delete from the database only the (ano, semana_inventario) pairs of this
#    load, instead of truncating the whole table.
# NOTE(review): this block opens its own engine.begin() transaction, separate
# from the cell that inserts the rows — confirm that is acceptable if the
# insert can fail.
with engine.begin() as conn:
    # one parameterised DELETE per (ano, semana) row of semanas_carga
    for _, r in semanas_carga.iterrows():
        conn.execute(
            text("""
                DELETE FROM bronze.inventario_estufas_bronze
                WHERE ano = :ano AND semana_inventario = :semana
            """),
            {"ano": int(r["ano"]), "semana": int(r["semana_inventario"])}
        )

print(f"üßπ Limpei no banco {len(semanas_carga)} semanas desta carga.")

üßπ Limpei no banco 47 semanas desta carga.


In [373]:
# 3) Insert (append) with load metadata.
# A single timestamp feeds both carga_id and carga_ts so they always agree.
load_started = datetime.now()
carga_id = load_started.strftime("%Y%m%d_%H%M%S")

df_up = df_bronze.copy()
df_up["carga_id"] = carga_id
df_up["carga_ts"] = pd.Timestamp(load_started)

# Trim text columns (helps avoid duplicates caused by stray spaces) while
# keeping missing values missing; astype(str) alone would write the literal
# text "nan" / "None" to the database.
for _col in ("naves", "cultura"):
    _values = df_up[_col]
    df_up[_col] = _values.where(_values.isna(), _values.astype(str).str.strip())

# (ano, semana_inventario) pairs contained in this load
_week_params = [
    {"ano": int(_ano), "semana": int(_semana)}
    for _ano, _semana in (
        df_up[["ano", "semana_inventario"]]
        .dropna()
        .drop_duplicates()
        .itertuples(index=False, name=None)
    )
]

# Delete and insert share ONE transaction: a failed insert rolls the delete
# back, and re-running this cell replaces the weeks instead of duplicating them.
with engine.begin() as conn:
    if _week_params:
        conn.execute(
            text("""
                DELETE FROM bronze.inventario_estufas_bronze
                WHERE ano = :ano AND semana_inventario = :semana
            """),
            _week_params,
        )

    df_up.to_sql(
        name="inventario_estufas_bronze",
        con=conn,
        schema="bronze",
        if_exists="append",
        index=False,
        method="multi",
        chunksize=2000
    )

print("‚úÖ Carga conclu√≠da | carga_id:", carga_id)

‚úÖ Carga conclu√≠da | carga_id: 20251215_120229
