In [0]:
def ingestion_func(schema: str, load_url: str, **reader_options):
    """Read a CSV file with a header row into a DataFrame using an explicit schema.

    Relies on a ``spark`` session object being available in the notebook's
    global scope; it is not created here.

    Args:
        schema: Column definitions passed unchanged to ``DataFrameReader.schema``.
            Every caller in this notebook supplies a DDL string such as
            ``"order_id string,price decimal(10,2)"``.
        load_url: Path of the CSV data to load.
        **reader_options: Optional extra CSV reader options (for example
            ``multiLine="true"``). They are applied after the default
            ``header`` option, so they may also override it.

    Returns:
        The DataFrame returned by the reader's ``load`` call.
    """
    reader = (
        spark.read.format("csv")
        .schema(schema)
        .option("header", "true")
    )
    # Only touch the reader again when the caller actually asked for extras,
    # so the default call path stays exactly as before.
    if reader_options:
        reader = reader.options(**reader_options)

    return reader.load(load_url)

In [0]:
def save_to_table(df, table_name: str, mode: str):
    """Write ``df`` to the table ``table_name`` using the given save mode.

    Args:
        df: DataFrame to persist; its ``write`` interface is used.
        table_name: Name of the target table passed to ``saveAsTable``.
        mode: Save mode passed to ``DataFrameWriter.mode`` (the notebook uses
            ``"overwrite"``).

    Returns:
        The string ``"Okay"`` when the write succeeds.

    Raises:
        Exception: Whatever the write raised. The error is printed first and
            then re-raised so a failed load stops the run instead of letting
            later cells continue as if the table had been written.
    """
    try:
        df.write.mode(mode).saveAsTable(table_name)
    except Exception as e:
        print(f"Error: {e}")
        raise
    return "Okay"


Ingesting Customers Dataset from Olist

In [0]:
# DDL column list for the customers CSV; every column is read as a string.
customers_schema = (
    "customer_id string, "
    "customer_unique_id string,"
    "customer_zip_code_prefix string,"
    "customer_city string,"
    "customer_state string"
)
# Path of the customers CSV file.
load_url = "/Volumes/golden_360/staging/datasets/olist_customers_dataset.csv"

In [0]:
# Load the customers CSV with the explicit schema, then print a preview.
customer_df = ingestion_func(
    schema=customers_schema,
    load_url=load_url,
)
customer_df.show()

Ingesting Geolocation Dataset from Olist

In [0]:
# DDL column list for the geolocation CSV; latitude/longitude are doubles,
# the remaining columns are strings.
geolocation_schema = (
    "geolocation_zip_code_prefix string,"
    "geolocation_lat double,"
    "geolocation_lng double,"
    "geolocation_city string,"
    "geolocation_state string"
)
# Path of the geolocation CSV file.
geolocation_load_url = "/Volumes/golden_360/staging/datasets/olist_geolocation_dataset.csv"

In [0]:
# Load the geolocation CSV with the explicit schema, then print a preview.
geolocation_df = ingestion_func(
    schema=geolocation_schema,
    load_url=geolocation_load_url,
)
geolocation_df.show()

Ingesting Order Items Dataset from Olist

In [0]:
# DDL column list for the order items CSV, assembled from one entry per column.
order_item_id_schema = ",".join(
    [
        "order_id string",
        "order_item_id int",
        "product_id string",
        "seller_id string",
        "shipping_limit_date timestamp",
        "price decimal(10,2)",
        "freight_value decimal(10,2)",
    ]
)
# Path of the order items CSV file.
order_item_id_load_url = "/Volumes/golden_360/staging/datasets/olist_order_items_dataset.csv"

In [0]:
# Load the order items CSV with the explicit schema, then print a preview.
# Note: despite its name, `order_df` holds order *items*; the orders dataset
# itself is loaded into `orders_df` further down.
order_df = ingestion_func(
    schema=order_item_id_schema,
    load_url=order_item_id_load_url,
)
order_df.show()

Ingesting Order Payments Dataset from Olist

In [0]:
# DDL column list for the order payments CSV, assembled from one entry per column.
order_payments_schema = ",".join(
    [
        "order_id string",
        "payment_sequential int",
        "payment_type string",
        "payment_installments int",
        "payment_value decimal(10,2)",
    ]
)
# Path of the order payments CSV file.
order_payments_url = "/Volumes/golden_360/staging/datasets/olist_order_payments_dataset.csv"

In [0]:
# Load the order payments CSV with the explicit schema, then print a preview.
order_payment_df = ingestion_func(
    schema=order_payments_schema,
    load_url=order_payments_url,
)
order_payment_df.show()

Ingesting Order Reviews Dataset from Olist

In [0]:
# Path of the order reviews CSV file.
order_reviews_url = "/Volumes/golden_360/staging/datasets/olist_order_reviews_dataset.csv"
# DDL column list for the order reviews CSV, assembled from one entry per column.
order_reviews_schema = ",".join(
    [
        "review_id string",
        "order_id string",
        "review_score int",
        "review_comment_title string",
        "review_comment_message string",
        "review_creation_date timestamp",
        "review_answer_timestamp timestamp",
    ]
)


In [0]:
# The reviews file is read directly rather than through ingestion_func because
# its free-text comment columns can hold quoted values that span several lines.
# The default line-by-line CSV parsing would split those into bogus records
# (shifted columns, NULL timestamps), so multi-line parsing is enabled here.
order_review_df = (
    spark.read.format("csv")
    .schema(order_reviews_schema)
    .option("header", "true")
    .option("multiLine", "true")  # keep quoted line breaks inside one record
    .option("escape", '"')  # quotes inside quoted text are written doubled ("")
    .load(order_reviews_url)
)
order_review_df.show()

Ingesting Orders Dataset from Olist

In [0]:
# Path of the orders CSV file.
orders_url = "/Volumes/golden_360/staging/datasets/olist_orders_dataset.csv"
# DDL column list for the orders CSV, assembled from one entry per column.
orders_schema = ",".join(
    [
        "order_id string",
        "customer_id string",
        "order_status string",
        "order_purchase_timestamp timestamp",
        "order_approved_at timestamp",
        "order_delivered_carrier_date timestamp",
        "order_delivered_customer_date timestamp",
        "order_estimated_delivery_date timestamp",
    ]
)


In [0]:
# Load the orders CSV with the explicit schema, then print a preview.
orders_df = ingestion_func(
    schema=orders_schema,
    load_url=orders_url,
)
orders_df.show()

Ingesting Products Dataset from Olist

In [0]:
# Path of the products CSV file.
products_url = "/Volumes/golden_360/staging/datasets/olist_products_dataset.csv"
# DDL column list for the products CSV, assembled from one entry per column.
# NOTE(review): the "lenght" spelling presumably mirrors the header of the
# source CSV — confirm before renaming these columns.
products_schema = ",".join(
    [
        "product_id string",
        "product_category_name string",
        "product_name_lenght int",
        "product_description_lenght int",
        "product_photos_qty int",
        "product_weight_g int",
        "product_length_cm int",
        "product_height_cm int",
        "product_width_cm int",
    ]
)


In [0]:
# Load the products CSV with the explicit schema (no preview is printed here).
product_df = ingestion_func(
    schema=products_schema,
    load_url=products_url,
)

Ingesting Sellers Dataset from Olist

In [0]:
# Path of the sellers CSV file.
sellers_url = "/Volumes/golden_360/staging/datasets/olist_sellers_dataset.csv"
# DDL column list for the sellers CSV; every column is read as a string.
sellers_schema = ",".join(
    [
        "seller_id string",
        "seller_zip_code_prefix string",
        "seller_city string",
        "seller_state string",
    ]
)


In [0]:
# Load the sellers CSV with the explicit schema, then print a preview.
sellers_df = ingestion_func(
    schema=sellers_schema,
    load_url=sellers_url,
)
sellers_df.show()

Ingesting Product Category Name Translation Dataset from Olist

In [0]:
# DDL column list for the category-name translation CSV.
# The Olist translation file carries exactly two columns: the original
# category name and its English translation. Because the explicit schema is
# applied by position, declaring an extra middle column would load the
# English names under the wrong column name and leave the last column NULL.
product_cat_schema = (
    "product_category_name string,"
    "product_category_name_english string"
)
# Path of the category-name translation CSV file.
product_cat_url = "/Volumes/golden_360/staging/datasets/product_category_name_translation.csv"

In [0]:
# Load the category-name translation CSV with the explicit schema.
# Note: `products_df` holds the category translation data; the products
# dataset itself lives in the separately named `product_df`.
products_df = ingestion_func(
    schema=product_cat_schema,
    load_url=product_cat_url,
)

Save to Table - Customers

In [0]:
# Write customer_df to golden_360.bronze.customers in overwrite mode.
save_to_table(
    df=customer_df,
    table_name="golden_360.bronze.customers",
    mode="overwrite",
)

Save to Table - Geolocation

In [0]:
# Write geolocation_df to golden_360.bronze.geolocation in overwrite mode.
save_to_table(
    df=geolocation_df,
    table_name="golden_360.bronze.geolocation",
    mode="overwrite",
)

Save to Table - Order Items

In [0]:
# Write order_df (the order items data) to golden_360.bronze.order_item in overwrite mode.
save_to_table(
    df=order_df,
    table_name="golden_360.bronze.order_item",
    mode="overwrite",
)

Save to Table - Order Payments

In [0]:
# Write order_payment_df to golden_360.bronze.order_payment in overwrite mode.
save_to_table(
    df=order_payment_df,
    table_name="golden_360.bronze.order_payment",
    mode="overwrite",
)

Save to Table - Order Reviews

In [0]:
# Write order_review_df to golden_360.bronze.order_review in overwrite mode.
save_to_table(
    df=order_review_df,
    table_name="golden_360.bronze.order_review",
    mode="overwrite",
)

Save to Table - Orders

In [0]:
# Write orders_df to golden_360.bronze.orders in overwrite mode.
save_to_table(
    df=orders_df,
    table_name="golden_360.bronze.orders",
    mode="overwrite",
)

Save to Table - Products

In [0]:
# Write product_df (the products dataset) to golden_360.bronze.products in overwrite mode.
save_to_table(
    df=product_df,
    table_name="golden_360.bronze.products",
    mode="overwrite",
)

Save to Table - Sellers

In [0]:
# Write sellers_df to golden_360.bronze.sellers in overwrite mode.
save_to_table(
    df=sellers_df,
    table_name="golden_360.bronze.sellers",
    mode="overwrite",
)

Save to Table - Product Category

In [0]:
# Write products_df (the category translation data, not the products dataset)
# to golden_360.bronze.product_category in overwrite mode.
save_to_table(
    df=products_df,
    table_name="golden_360.bronze.product_category",
    mode="overwrite",
)