In [1]:
from pathlib import Path
import sys

# The project root is the parent of the current working directory; it is appended
# to sys.path once (presumably so the project's own packages become importable).
PROJECT_ROOT = Path.cwd().resolve().parent
if sys.path.count(str(PROJECT_ROOT)) == 0:
    sys.path.append(str(PROJECT_ROOT))

In [2]:
import sqlite3
from pathlib import Path

import pandas as pd

from spark_jobs.config import GOLD_DIR, SILVER_PATH, ensure_data_dirs

In [3]:
# NOTE(review): ensure_data_dirs() presumably creates the data folders — confirm in spark_jobs.config.
ensure_data_dirs()

# The SQLite file lives two directory levels above the silver parquet path.
DATA_DIR = Path(SILVER_PATH).resolve().parent.parent
db_path = DATA_DIR.joinpath('pipeline_agricultura.db')

# This connection is reused by the following cells and closed in the last one.
conn = sqlite3.connect(db_path)
print(f'Connected to {db_path}')

Connected to /home/jovyan/work/data/pipeline_agricultura.db


In [4]:
# Read the silver dataset into memory and report its (rows, columns) shape.
df_silver = pd.read_parquet(path=SILVER_PATH)
print(f'Silver loaded: {df_silver.shape}')

Silver loaded: (28908, 33)


In [5]:
def _load_dimension(source, column, table, id_label, connection):
    """Build one dimension table from a column of ``source`` and persist it.

    The dimension holds the distinct values of ``column`` sorted ascending. Each
    row's 0-based position is its surrogate key, written to the database under
    the column name ``id_label``. An existing table with the same name is
    replaced.

    Args:
        source: DataFrame that may contain the natural-key column.
        column: Name of the natural-key column to deduplicate.
        table: Name of the table to (re)create.
        id_label: Column name given to the surrogate key in the table.
        connection: Open database connection passed to ``DataFrame.to_sql``.

    Returns:
        A ``(dimension, mapping)`` tuple, where ``mapping`` maps every
        natural-key value to its surrogate key. When ``column`` is not present
        in ``source`` nothing is written and ``(None, {})`` is returned.
    """
    if column not in source.columns:
        return None, {}

    dimension = (
        source[[column]]
        .drop_duplicates()
        .sort_values(column)
        .reset_index(drop=True)
    )
    dimension.to_sql(table, connection, if_exists='replace', index_label=id_label)

    # After reset_index the index is 0..n-1, i.e. exactly the ids to_sql wrote.
    return dimension, dict(zip(dimension[column], dimension.index))


# One call per dimension; an empty mapping tells the fact-table cell that the
# corresponding natural key is not available in the silver data.
dim_regiao, map_regiao = _load_dimension(
    df_silver, 'state_district', 'dim_regiao', 'id_regiao', conn
)
dim_cultura, map_cultura = _load_dimension(
    df_silver, 'crop', 'dim_cultura', 'id_cultura', conn
)
dim_temporada, map_temporada = _load_dimension(
    df_silver, 'season', 'dim_temporada', 'id_temporada', conn
)

In [6]:
# (natural-key column, surrogate-key column, value -> id lookup) for each dimension.
key_swaps = (
    ('state_district', 'id_regiao', map_regiao),
    ('crop', 'id_cultura', map_cultura),
    ('season', 'id_temporada', map_temporada),
)

# Work on a copy so the silver frame is left untouched.
df_fact = df_silver.copy()

# Add a surrogate-key column for every dimension that produced a non-empty lookup.
for natural_key, surrogate_key, lookup in key_swaps:
    if lookup:
        df_fact[surrogate_key] = df_fact[natural_key].map(lookup)

# Remove the natural-key columns; errors='ignore' tolerates columns that are absent.
natural_keys = [swap[0] for swap in key_swaps]
df_fact = df_fact.drop(columns=natural_keys, errors='ignore')

# Replace any existing fact table with the freshly built one.
df_fact.to_sql('fato_producao', conn, if_exists='replace', index=False)
print(f'fato_producao loaded with {len(df_fact)} rows')

fato_producao loaded with 28908 rows


In [7]:
# Copy every gold parquet file into a table named after the file stem, replacing
# any table that already exists under that name.
# sorted() fixes the processing order: Path.glob yields entries in whatever order
# the filesystem returns them, which would make the load/print order vary by machine.
for parquet_file in sorted(Path(GOLD_DIR).glob('*.parquet')):
    df_gold = pd.read_parquet(parquet_file)
    table_name = parquet_file.stem
    df_gold.to_sql(table_name, conn, if_exists='replace', index=False)
    print(f' - Loaded {table_name}')

 - Loaded analise_sazonal_clima
 - Loaded benchmark_regional_rendimento
 - Loaded desempenho_regiao_cultura
 - Loaded perfil_climatico_regiao_cultura
 - Loaded producao_anual_cultura
 - Loaded tendencia_anual_rendimento
 - Loaded volatilidade_rendimento_regiao


In [8]:
# List every table recorded in sqlite_master as a final check of what was loaded.
cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table';")
print('Tables in database:')
for row in cursor:
    # Each row is a 1-tuple holding the table name.
    print(f' - {row[0]}')

# Close the connection opened earlier in the notebook; it is not needed past this point.
conn.close()
print('SQLite load completed.')

Tables in database:
 - dim_regiao
 - dim_cultura
 - dim_temporada
 - fato_producao
 - analise_sazonal_clima
 - benchmark_regional_rendimento
 - desempenho_regiao_cultura
 - perfil_climatico_regiao_cultura
 - producao_anual_cultura
 - tendencia_anual_rendimento
 - volatilidade_rendimento_regiao
SQLite load completed.
