Functions for reading data from the dataset files and saving it to tables


In [0]:
def ingestion_func(schema: str, load_url: str):
    """Read a CSV file into a Spark DataFrame using an explicit schema.

    Relies on the notebook-provided ``spark`` session being in scope.

    Args:
        schema: Schema applied to the file instead of inferring one. The
            callers in this notebook pass a DDL-formatted string such as
            ``"order_id string, price decimal(10,2)"``; the value is handed
            unchanged to ``DataFrameReader.schema``.
        load_url: Path of the CSV file to load.

    Returns:
        The DataFrame produced by ``spark.read...load(load_url)``.
    """
    df = (
        spark.read.format("csv")
        .schema(schema=schema)
        # The first line of the file is a header row, not data.
        .option("header", "true")
        .load(load_url)
    )

    return df

In [0]:
def save_to_table(df, table_name: str, mode: str):
    """Write a DataFrame to a Delta table, reporting success or failure.

    This is a best-effort helper: it never raises. Callers must check the
    return value to find out whether the table was actually written.

    Args:
        df: DataFrame to persist (anything exposing the ``df.write`` API).
        table_name: Name of the target table, passed to ``saveAsTable``.
        mode: Save mode passed to ``.mode()`` (e.g. ``"overwrite"``).

    Returns:
        ``"Okay"`` when the write succeeded, ``None`` when it failed (the
        error is printed, including the table name and exception type).
    """
    try:
        (
            df.write.format("delta")
            .mode(mode)
            # Let the write replace the existing table schema as well.
            .option("overwriteSchema", "true")
            .saveAsTable(table_name)
        )
    except Exception as e:
        # Name the table and exception type so a failure inside a multi-table
        # loop can be traced back to its source.
        print(f"Error saving table {table_name} ({type(e).__name__}): {e}")
        return None
    return "Okay"

Mapping of each table to its source file path and schema

In [0]:
# Maps each bronze table name to the CSV file it is loaded from ("path") and
# the DDL schema string applied when reading that file ("schema").
ingestion_config = {
    "customers": {
        "path": "/Volumes/golden_360/staging/datasets/olist_customers_dataset.csv",
        "schema": "customer_id string, customer_unique_id string,customer_zip_code_prefix string,customer_city string,customer_state string"
    },
    "geolocation": {
        "path": "/Volumes/golden_360/staging/datasets/olist_geolocation_dataset.csv",
        "schema": "geolocation_zip_code_prefix string,geolocation_lat double,geolocation_lng double,geolocation_city string,geolocation_state string"
    },
    "order_item": {
        "path": "/Volumes/golden_360/staging/datasets/olist_order_items_dataset.csv",
        "schema": "order_id string,order_item_id int,product_id string,seller_id string,shipping_limit_date timestamp,price decimal(10,2),freight_value decimal(10,2)"
    },
    "order_payment": {
        # Payments come from their own file; this entry previously pointed at
        # the order-items file, which does not match the payment schema.
        "path": "/Volumes/golden_360/staging/datasets/olist_order_payments_dataset.csv",
        "schema": "order_id string,payment_sequential int,payment_type string,payment_installments int,payment_value decimal(10,2)"
    },
    "order_review": {
        "path": "/Volumes/golden_360/staging/datasets/olist_order_reviews_dataset.csv",
        "schema": "review_id string,order_id string,review_score int,review_comment_title string,review_comment_message string,review_creation_date timestamp,review_answer_timestamp timestamp"
    },
    "orders": {
        "path": "/Volumes/golden_360/staging/datasets/olist_orders_dataset.csv",
        "schema": "order_id string, customer_id string,order_status string,order_purchase_timestamp timestamp,order_approved_at timestamp,order_delivered_carrier_date timestamp,order_delivered_customer_date timestamp,order_estimated_delivery_date timestamp"
    },
    "products": {
        "path": "/Volumes/golden_360/staging/datasets/olist_products_dataset.csv",
        # "lenght" is spelled this way on purpose in the schema below.
        # NOTE(review): presumably mirrors the source file's header - confirm.
        "schema": "product_id string,product_category_name string,product_name_lenght int,product_description_lenght int,product_photos_qty int,product_weight_g int,product_length_cm int,product_height_cm int,product_width_cm int"
    },
    "sellers": {
        "path": "/Volumes/golden_360/staging/datasets/olist_sellers_dataset.csv",
        "schema": "seller_id string, seller_zip_code_prefix string,seller_city string,seller_state string"
    },
    "product_category": {
        "path": "/Volumes/golden_360/staging/datasets/product_category_name_translation.csv",
        # NOTE(review): this schema declares three columns. Confirm the staged
        # file really contains a middle "product_category" column; if it only
        # has the name and its English translation, values will land in the
        # wrong columns.
        "schema": "product_category_name string,product_category string,product_category_name_english string"
    },
}

Using a for loop to read each dataset and save it as a table

In [0]:
# Ingest every configured dataset into the bronze schema. Every table is
# attempted even if an earlier save fails; failures are collected and raised
# at the end so a partially ingested run is not reported as a success.
failed_tables = []

for table_name, config in ingestion_config.items():
    print(f"Starting ingestion for: {table_name}")

    df = ingestion_func(schema=config["schema"], load_url=config["path"])

    full_table_name = f"golden_360.bronze.{table_name}"
    status = save_to_table(df=df, table_name=full_table_name, mode="overwrite")

    # save_to_table does not raise; anything other than "Okay" means the
    # write failed and the table must not be reported as finished.
    if status == "Okay":
        print(f"Finished {table_name}: {status}")
    else:
        failed_tables.append(table_name)
        print(f"FAILED {table_name}: {full_table_name} was not written")

if failed_tables:
    raise RuntimeError(
        f"Ingestion failed for {len(failed_tables)} table(s): "
        + ", ".join(failed_tables)
    )