In [1]:
from pathlib import Path
import sys

# Register the parent of the current working directory as an import root
# (presumably the repository root that holds the project packages).
PROJECT_ROOT = Path.cwd().resolve().parent
if sys.path.count(str(PROJECT_ROOT)) == 0:
    # Append only once so re-running the cell does not grow sys.path.
    sys.path.append(str(PROJECT_ROOT))

In [2]:
import kagglehub
from pyspark.sql import functions as F

from spark_jobs.spark_session_manager import get_spark_session
from spark_jobs.config import BRONZE_PATH, ensure_data_dirs

In [3]:
from pathlib import Path

def load_kaggle_csv(spark, handle: str, csv_name: str | None = None):
    '''Download a Kaggle dataset with kagglehub and load a CSV as a Spark DataFrame.'''
    dataset_dir = Path(kagglehub.dataset_download(handle))
    candidates = []

    if csv_name:
        target = Path(csv_name)
        candidates.extend(
            [
                dataset_dir / target,
                dataset_dir / target.name,
                dataset_dir / 'data' / target.name,
            ]
        )
    else:
        candidates.extend(list(dataset_dir.glob('*.csv')))
        data_dir = dataset_dir / 'data'
        if data_dir.exists():
            candidates.extend(list(data_dir.glob('*.csv')))

    csv_path = next((path for path in candidates if path.exists()), None)
    if csv_path is None:
        print(f'No CSV found for {handle}')
        return None

    print(f'Reading {csv_path.name} from {csv_path.parent}')
    return (
        spark.read.option('header', True)
        .option('inferSchema', True)
        .option('mode', 'DROPMALFORMED')
        .csv(str(csv_path))
    )

In [4]:
ensure_data_dirs()
spark = get_spark_session('Bronze Layer')

# Rainfall and temperature are two files of the same Kaggle dataset.
CLIMATE_HANDLE = 'swarooprangle/indian-agriculture-and-climate-dataset-1961-2018'

df_stats = load_kaggle_csv(spark, 'nikhilmahajan29/crop-production-statistics-india')
df_rain = load_kaggle_csv(spark, CLIMATE_HANDLE, 'data/rainfall.csv')
df_temp = load_kaggle_csv(spark, CLIMATE_HANDLE, 'data/temperature.csv')

# load_kaggle_csv returns None when no CSV was found; stop here and name the
# frames that failed to load instead of failing later with an AttributeError.
missing_sources = [
    label
    for label, frame in (('df_stats', df_stats), ('df_rain', df_rain), ('df_temp', df_temp))
    if frame is None
]
if missing_sources:
    raise RuntimeError(
        'Missing source datasets for bronze step: ' + ', '.join(missing_sources)
    )

Downloading from https://www.kaggle.com/api/v1/datasets/download/nikhilmahajan29/crop-production-statistics-india?dataset_version_number=3...


100%|██████████| 3.29M/3.29M [00:02<00:00, 1.62MB/s]

Extracting files...





Reading APY.csv from /home/jovyan/.cache/kagglehub/datasets/nikhilmahajan29/crop-production-statistics-india/versions/3
Downloading from https://www.kaggle.com/api/v1/datasets/download/swarooprangle/indian-agriculture-and-climate-dataset-1961-2018?dataset_version_number=1...


100%|██████████| 90.7k/90.7k [00:00<00:00, 684kB/s]

Extracting files...
Reading rainfall.csv from /home/jovyan/.cache/kagglehub/datasets/swarooprangle/indian-agriculture-and-climate-dataset-1961-2018/versions/1/data





Reading temperature.csv from /home/jovyan/.cache/kagglehub/datasets/swarooprangle/indian-agriculture-and-climate-dataset-1961-2018/versions/1/data


In [5]:
# Temperature columns (upper-cased source name -> bronze name). The TEMP_
# prefix keeps them distinct from the rainfall columns after the join.
rename_climate = {
    'JAN-FEB': 'TEMP_JAN_FEB',
    'MAR-MAY': 'TEMP_MAR_MAY',
    'JUN-SEP': 'TEMP_JUN_SEP',
    'OCT-DEC': 'TEMP_OCT_DEC',
    'ANNUAL': 'TEMP_ANNUAL',
}


def _normalize_year_column(df):
    '''Return `df` with its year column named exactly 'YEAR'.

    The lookup ignores capitalisation ('Year', 'year', ...). `df` is returned
    unchanged when it already has 'YEAR' or has no such column.
    '''
    if 'YEAR' in df.columns:
        return df
    found = next((name for name in df.columns if name.upper() == 'YEAR'), None)
    return df if found is None else df.withColumnRenamed(found, 'YEAR')


# Normalize YEAR column across sources
df_stats = _normalize_year_column(df_stats)
df_rain = _normalize_year_column(df_rain)
df_temp = _normalize_year_column(df_temp)

# Crop statistics quirks: a header with a trailing space ('Area ') and a year
# column that may be called Crop_Year instead of YEAR.
if 'Area ' in df_stats.columns:
    df_stats = df_stats.withColumnRenamed('Area ', 'Area')
if 'Crop_Year' in df_stats.columns and 'YEAR' not in df_stats.columns:
    df_stats = df_stats.withColumnRenamed('Crop_Year', 'YEAR')

# YEAR is the join key for every source; report exactly which frame lacks it.
sources_without_year = [
    label
    for label, frame in (('df_stats', df_stats), ('df_rain', df_rain), ('df_temp', df_temp))
    if 'YEAR' not in frame.columns
]
if sources_without_year:
    raise RuntimeError(
        'Column YEAR not found in one of the source datasets after rename: '
        + ', '.join(sources_without_year)
    )

# Rename temperature columns before the join to avoid name collisions
upper_to_original = {col_name.upper(): col_name for col_name in df_temp.columns}
df_temp_clean = df_temp
for old_upper, new_name in rename_climate.items():
    if old_upper in upper_to_original:
        df_temp_clean = df_temp_clean.withColumnRenamed(upper_to_original[old_upper], new_name)

# Any other temperature column that still shares a name with a rainfall column
# (ignoring case) would be duplicated in the joined frame; prefix it as well.
rain_upper = {col_name.upper() for col_name in df_rain.columns}
for col_name in df_temp_clean.columns:
    if col_name != 'YEAR' and col_name.upper() in rain_upper:
        df_temp_clean = df_temp_clean.withColumnRenamed(col_name, f'TEMP_{col_name}')

# Inner joins: only years present in rainfall, temperature and crop data survive.
df_climate = df_rain.join(df_temp_clean, on=['YEAR'], how='inner')
df_combined = df_stats.join(df_climate, on=['YEAR'], how='inner')

# Stamp every row with the ingestion time.
df_bronze = df_combined.withColumn('data_ingestao', F.current_timestamp())

In [6]:
# Persist the bronze table, then print its schema and a 5-row preview.
try:
    df_bronze.write.mode('overwrite').parquet(str(BRONZE_PATH))
    print(f'Bronze saved to {BRONZE_PATH}')
    df_bronze.printSchema()
    df_bronze.show(5)
finally:
    # Stop the session even when the write or the preview raises, so a failed
    # run does not leave it running.
    spark.stop()

Bronze saved to /home/jovyan/work/data/bronze/dados_brutos.parquet
root
 |-- YEAR: integer (nullable = true)
 |-- State: string (nullable = true)
 |-- District : string (nullable = true)
 |-- Crop: string (nullable = true)
 |-- Season: string (nullable = true)
 |-- Area: double (nullable = true)
 |-- Production: integer (nullable = true)
 |-- Yield: double (nullable = true)
 |-- JAN: double (nullable = true)
 |-- FEB: double (nullable = true)
 |-- MAR: double (nullable = true)
 |-- APR: double (nullable = true)
 |-- MAY: double (nullable = true)
 |-- JUN: double (nullable = true)
 |-- JUL: double (nullable = true)
 |-- AUG: double (nullable = true)
 |-- SEP: double (nullable = true)
 |-- OCT: double (nullable = true)
 |-- NOV: double (nullable = true)
 |-- DEC: double (nullable = true)
 |-- ANN: double (nullable = true)
 |-- Jan-Feb: double (nullable = true)
 |-- Mar-May: double (nullable = true)
 |-- Jun-Sep: double (nullable = true)
 |-- Oct-Dec: double (nullable = true)
 |-- TEMP_AN