In [None]:
import pandas as pd
from pathlib import Path

# ======================================================
# Google Drive (Colab)
# ======================================================
# Colab-only dependency: this import is available inside Google Colab runtimes.
# Mounting makes the user's Google Drive visible under /content/drive, which is
# the root of every path used by the rest of this script.
from google.colab import drive
drive.mount("/content/drive")

# ======================================================
# Paths
# ======================================================
# Project root inside the mounted Drive.
BASE = Path("/content/drive/MyDrive/dadosfera")

# RAW holds the input CSV files; CDM receives the generated CDM tables.
RAW, CDM = BASE.joinpath("bronze"), BASE.joinpath("cdm_silver")

# Only the output folder is created here (including missing parents);
# RAW is not created by this script.
CDM.mkdir(parents=True, exist_ok=True)

# Echo both locations so the notebook output shows where data is read/written.
for _label, _folder in (("RAW:", RAW), ("CDM:", CDM)):
    print(_label, _folder)

# ======================================================
# IO helpers
# ======================================================
def read_csv(name: str, **read_kwargs) -> pd.DataFrame:
    """Read one CSV file from the RAW folder into a DataFrame.

    Args:
        name: File name (including extension), resolved relative to RAW.
        **read_kwargs: Optional keyword arguments forwarded unchanged to
            ``pandas.read_csv`` (for example ``dtype`` or ``parse_dates``).
            When none are given the call is exactly ``pd.read_csv(path)``,
            so existing callers are unaffected.

    Returns:
        The parsed file as a DataFrame.

    Raises:
        FileNotFoundError: Propagated from pandas when the file is missing.
    """
    # NOTE(review): with default type inference, all-digit columns are parsed
    # as integers, so codes written with leading zeros lose them. Presumably
    # relevant for the zip-code prefix columns -- confirm, and pass
    # dtype={...: str} for those columns if the zeros must be kept.
    path = RAW / name
    print(f"Lendo: {path}")
    return pd.read_csv(path, **read_kwargs)

def write_csv(df: pd.DataFrame, name: str):
    """Save *df* as ``<name>.csv`` inside the CDM folder and print a summary.

    Args:
        df: Table to persist; its index is not written to the file.
        name: Output file name without the ``.csv`` extension.
    """
    target = CDM / "{}.csv".format(name)
    df.to_csv(target, index=False)
    row_count = len(df)
    print(f"Escrito: {target} | linhas: {row_count}")

# ======================================================
# Load raw datasets
# ======================================================
# File names of the raw tables, listed in the order they are read.
_raw_files = (
    "olist_orders_dataset.csv",
    "olist_order_items_dataset.csv",
    "olist_order_payments_dataset.csv",
    "olist_order_reviews_dataset.csv",
    "olist_customers_dataset.csv",
    "olist_sellers_dataset.csv",
    "olist_products_dataset.csv",
    "product_category_name_translation.csv",
    "olist_geolocation_dataset.csv",
)

# Each target below receives the DataFrame for the file at the same position
# in _raw_files; read_csv is called once per file, in order.
(
    orders,
    items,
    payments,
    reviews,
    customers,
    sellers,
    products,
    cat_tr,
    geo,
) = map(read_csv, _raw_files)

# ======================================================
# CDM: customer
# ======================================================
# Source column -> CDM column, in output order. Columns mapped to themselves
# are kept under their original name.
_customer_columns = {
    "customer_id": "customer_id",
    "customer_unique_id": "customer_natural_id",
    "customer_zip_code_prefix": "zip_prefix",
    "customer_city": "city",
    "customer_state": "state",
}

# Rename first, then keep only the CDM columns in the order declared above.
cdm_customer = customers.rename(columns=_customer_columns)[
    list(_customer_columns.values())
]

# ======================================================
# CDM: seller
# ======================================================
# Source column -> CDM column, in output order. Columns mapped to themselves
# are kept under their original name.
_seller_columns = {
    "seller_id": "seller_id",
    "seller_zip_code_prefix": "zip_prefix",
    "seller_city": "city",
    "seller_state": "state",
}

# Rename first, then keep only the CDM columns in the order declared above.
cdm_seller = sellers.rename(columns=_seller_columns)[
    list(_seller_columns.values())
]

# ======================================================
# CDM: product
# ======================================================
# Attach the English category name to every product. The left join keeps
# products whose category has no row in the translation table (their
# product_category_name_english is left missing).
# validate="many_to_one" makes pandas raise MergeError if the translation
# table ever contains a repeated product_category_name, instead of silently
# duplicating product rows in the output.
prod = products.merge(
    cat_tr,
    how="left",
    on="product_category_name",
    validate="many_to_one",
)

# Shorten the column names and keep only the CDM product columns, in order.
cdm_product = prod.rename(columns={
    "product_category_name": "category_pt",
    "product_category_name_english": "category_en",
    "product_weight_g": "weight_g",
    "product_length_cm": "length_cm",
    "product_height_cm": "height_cm",
    "product_width_cm": "width_cm",
})[[
    "product_id",
    "category_pt",
    "category_en",
    "weight_g",
    "length_cm",
    "height_cm",
    "width_cm",
]]

# ======================================================
# CDM: order
# ======================================================
# Source column -> CDM column, in output order. Columns mapped to themselves
# are kept under their original name.
_order_columns = {
    "order_id": "order_id",
    "customer_id": "customer_id",
    "order_status": "status",
    "order_purchase_timestamp": "created_at",
    "order_approved_at": "approved_at",
    "order_delivered_carrier_date": "delivered_carrier_at",
    "order_delivered_customer_date": "delivered_customer_at",
    "order_estimated_delivery_date": "estimated_delivery_at",
}

# Rename first, then keep only the CDM columns in the order declared above.
cdm_order = orders.rename(columns=_order_columns)[
    list(_order_columns.values())
]

# ======================================================
# CDM: order_item
# ======================================================
# Source column -> CDM column, in output order. Columns mapped to themselves
# are kept under their original name.
_order_item_columns = {
    "order_id": "order_id",
    "order_item_id": "order_item_id",
    "product_id": "product_id",
    "seller_id": "seller_id",
    "shipping_limit_date": "shipping_limit_at",
    "price": "item_price",
    "freight_value": "freight_value",
}

# Rename first, then keep only the CDM columns in the order declared above.
cdm_order_item = items.rename(columns=_order_item_columns)[
    list(_order_item_columns.values())
]

# ======================================================
# CDM: payment
# ======================================================
# Source column -> CDM column, in output order. Columns mapped to themselves
# are kept under their original name.
_payment_columns = {
    "order_id": "order_id",
    "payment_sequential": "payment_seq",
    "payment_type": "payment_type",
    "payment_installments": "installments",
    "payment_value": "payment_value",
}

# Rename first, then keep only the CDM columns in the order declared above.
cdm_payment = payments.rename(columns=_payment_columns)[
    list(_payment_columns.values())
]

# ======================================================
# CDM: review
# ======================================================
# Source column -> CDM column, in output order. Columns mapped to themselves
# are kept under their original name.
_review_columns = {
    "order_id": "order_id",
    "review_id": "review_id",
    "review_score": "score",
    "review_comment_title": "comment_title",
    "review_comment_message": "comment_message",
    "review_creation_date": "created_at",
    "review_answer_timestamp": "answered_at",
}

# Rename first, then keep only the CDM columns in the order declared above.
cdm_review = reviews.rename(columns=_review_columns)[
    list(_review_columns.values())
]

# ======================================================
# CDM: geo_zip
# ======================================================
def _most_frequent(values: pd.Series):
    """Return the most frequent non-null value of *values*, or None.

    ``Series.mode()`` is evaluated once; when several values tie, the first
    entry of its (sorted) result is returned. None is returned when the
    group has no non-null value.
    """
    modes = values.mode()
    return None if modes.empty else modes.iloc[0]


# One row per zip-code prefix: mean latitude/longitude of all rows sharing the
# prefix, plus the most frequent city and state for that prefix.
geo_zip = (
    geo
    .groupby("geolocation_zip_code_prefix", as_index=False)
    .agg(
        lat_avg=("geolocation_lat", "mean"),
        lng_avg=("geolocation_lng", "mean"),
        city_mode=("geolocation_city", _most_frequent),
        state_mode=("geolocation_state", _most_frequent),
    )
    # Same key name as the zip_prefix column of the customer/seller tables.
    .rename(columns={"geolocation_zip_code_prefix": "zip_prefix"})
)

# ======================================================
# Persist CDM Bronze
# ======================================================
# (DataFrame, output file stem) pairs, written in this order.
_cdm_outputs = (
    (cdm_customer, "customer"),
    (cdm_seller, "seller"),
    (cdm_product, "product"),
    (cdm_order, "order"),
    (cdm_order_item, "order_item"),
    (cdm_payment, "payment"),
    (cdm_review, "review"),
    (geo_zip, "geo_zip"),
)

for _frame, _table in _cdm_outputs:
    write_csv(_frame, _table)

# NOTE(review): the message says "Bronze" while CDM points at the
# "cdm_silver" folder -- confirm which layer name is intended.
print("\nCDM Bronze gerado com sucesso em:", CDM)
