In [0]:
from kaggle.api.kaggle_api_extended import KaggleApi
import os
from pyspark.sql.functions import current_timestamp, input_file_name

# Pipeline configuration
KAGGLE_DATASET = "imankity/san-francisco-fire-department-public-dataset"
S3_BUCKET = "s3a://data-files-rjx"  # Replace with your own mounted bucket.

# Storage locations derived from the bucket root
raw_path = S3_BUCKET + "/raw"
bronze_path = S3_BUCKET + "/bronze/fire_calls"

def download_from_kaggle():
    """Download the Kaggle dataset and move its CSV files to the S3 raw path.

    The dataset named by ``KAGGLE_DATASET`` is downloaded and unzipped into a
    fresh temporary directory on the local filesystem. Every top-level file
    whose name ends in ``.csv`` is then moved to ``raw_path`` with
    ``dbutils.fs.mv``. The temporary directory is removed when the step
    finishes (successfully or not), so leftovers from an earlier run are
    never uploaded by a later one.

    Returns:
        bool: ``True`` when the download and upload complete.

    Raises:
        Exception: Any error raised while authenticating, downloading, or
            moving files is printed and then re-raised unchanged.
    """
    import tempfile  # Local import: only this function needs it.

    try:
        # Unique, self-cleaning scratch directory instead of a fixed /tmp
        # path, so stale files from a previous (possibly failed) run cannot
        # leak into this upload.
        with tempfile.TemporaryDirectory(prefix="kaggle_download_") as temp_dir:
            # Kaggle authentication
            api = KaggleApi()
            api.authenticate()

            # Download and unzip
            api.dataset_download_files(
                KAGGLE_DATASET,
                path=temp_dir,
                unzip=True,
                force=True,
            )

            # Upload to S3: only top-level CSV files are moved.
            for f in os.listdir(temp_dir):
                if f.endswith('.csv'):
                    dbutils.fs.mv(f"file:{temp_dir}/{f}", f"{raw_path}/{f}")

        print(f"✅ 数据下载完成: {dbutils.fs.ls(raw_path)}")
        return True
    except Exception as e:
        # Report the failure, then let the caller see the original error.
        print(f"❌ 下载失败: {e}")
        raise

def raw_to_bronze():
    """Convert the raw CSV into the Bronze layer as Parquet.

    Picks the first entry listed under ``raw_path`` whose name ends in
    ``.csv``, reads it with the first row as the header, adds the
    ``_ingest_time`` and ``_source_file`` columns, and overwrites
    ``bronze_path`` with the result in Parquet format.

    Returns:
        bool: ``True`` when the Bronze data has been written.

    Raises:
        FileNotFoundError: If no ``.csv`` file is listed under ``raw_path``.
        Exception: Any other error raised while listing, reading, or writing
            is printed and then re-raised unchanged.
    """
    try:
        # Use a default with next() so a missing CSV produces a clear error
        # instead of a message-less StopIteration.
        csv_path = next(
            (f.path for f in dbutils.fs.ls(raw_path) if f.name.endswith('.csv')),
            None,
        )
        if csv_path is None:
            raise FileNotFoundError(f"No .csv file found under {raw_path}")

        (spark.read.csv(csv_path, header=True)
            .withColumn("_ingest_time", current_timestamp())
            .withColumn("_source_file", input_file_name())
            .write.mode("overwrite")
            .parquet(bronze_path))

        print(f"✅ Bronze层生成: {dbutils.fs.ls(bronze_path)}")
        return True
    except Exception as e:
        # Report the failure, then let the caller see the original error.
        print(f"❌ 转换失败: {e}")
        raise

# Execute: make sure the raw landing directory exists, then run the pipeline.
dbutils.fs.mkdirs(raw_path)

# Run the Bronze conversion only after the download step reports success.
# An explicit `if` replaces the short-circuit `and` used purely for its
# side effects.
if download_from_kaggle():
    raw_to_bronze()

In [0]:
# Load the Parquet data stored at the Bronze-layer path.
bronze_df = spark.read.parquet(bronze_path)

# Show at most the first 100 rows as a quick preview.
display(bronze_df.limit(100))
