# In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

from google.colab import drive
# Mount Google Drive at /content/drive; every path below lives under it.
drive.mount("/content/drive")

# Root folder of the project data on the mounted drive.
BASE = Path("/content/drive/MyDrive/dadosfera")

# Input folder used by read_bronze() and output folder used by write_silver().
BRONZE = BASE.joinpath("cdm_bronze")
SILVER = BASE.joinpath("cdm_silver")

# Create the output folder (and any missing parents); no error if it exists.
SILVER.mkdir(parents=True, exist_ok=True)

def read_bronze(name: str) -> pd.DataFrame:
    """Load one table from the bronze folder.

    Args:
        name: Table name; the file read is ``BRONZE / "<name>.csv"``.

    Returns:
        The CSV contents as a DataFrame with pandas-inferred dtypes.
    """
    source = BRONZE.joinpath(f"{name}.csv")
    # low_memory=False parses the file in a single pass instead of in
    # chunks, which avoids mixed-type inference within a column.
    return pd.read_csv(source, low_memory=False)

def write_silver(df: pd.DataFrame, name: str):
    """Save a table to the silver folder as CSV.

    Args:
        df: Table to save.
        name: Table name; the file written is ``SILVER / "<name>.csv"``.
    """
    target = SILVER.joinpath(f"{name}.csv")
    # index=False keeps the row index out of the file.
    df.to_csv(target, index=False)

def normalize_text(s: pd.Series) -> pd.Series:
    """Return a trimmed, lower-cased copy of a text column.

    Values are cast to the pandas ``string`` dtype, surrounding whitespace
    is stripped, and letters are lower-cased. Entries that end up as the
    empty string are replaced with ``pd.NA``.

    Args:
        s: Column to normalize.

    Returns:
        A new Series of ``string`` dtype; ``s`` itself is not modified.
    """
    text = s.astype("string")
    cleaned = text.str.strip().str.lower()
    # Blank entries carry no information, so mark them as missing.
    return cleaned.replace("", pd.NA)

# Load every bronze table, in this order, into its working DataFrame.
# map() is consumed left to right by the unpacking, so the read order
# matches the order of the names on the left-hand side.
customer, seller, product, order, order_item, payment, review, geo = map(
    read_bronze,
    (
        "customer",
        "seller",
        "product",
        "order",
        "order_item",
        "payment",
        "review",
        "geo_zip",  # bound to the shorter name `geo`
    ),
)

# Timestamp columns of the order table.
date_cols_order = [
    "created_at",
    "approved_at",
    "delivered_carrier_at",
    "delivered_customer_at",
    "estimated_delivery_at",
]

# (table, timestamp columns) pairs, processed in this order.
datetime_targets = (
    (order, date_cols_order),
    (review, ["created_at", "answered_at"]),
)

# errors="coerce" turns values that cannot be parsed into NaT instead of
# raising, so bad rows stay in the table with a missing timestamp.
for frame, cols in datetime_targets:
    for col in cols:
        frame[col] = pd.to_datetime(frame[col], errors="coerce")

# (table, numeric columns) pairs, processed in this order.
numeric_targets = (
    (order_item, ["item_price", "freight_value"]),
    (payment, ["payment_value", "installments"]),
    (product, ["weight_g", "length_cm", "height_cm", "width_cm"]),
)

# errors="coerce" turns values that cannot be parsed into NaN instead of
# raising.
for frame, cols in numeric_targets:
    for col in cols:
        frame[col] = pd.to_numeric(frame[col], errors="coerce")

# Free-text columns to clean with normalize_text(), as (table, column) pairs.
text_targets = (
    (customer, "city"),
    (seller, "city"),
    (product, "category_pt"),
    (product, "category_en"),
    (review, "comment_title"),
    (review, "comment_message"),
)

for frame, text_col in text_targets:
    frame[text_col] = normalize_text(frame[text_col])

# Keep a single row per identifier in each entity table. keep="first" (the
# pandas default, spelled out here) retains the first occurrence in file
# order and discards later rows with the same id.
customer = customer.drop_duplicates(subset="customer_id", keep="first")
seller = seller.drop_duplicates(subset="seller_id", keep="first")
product = product.drop_duplicates(subset="product_id", keep="first")
order = order.drop_duplicates(subset="order_id", keep="first")
review = review.drop_duplicates(subset="review_id", keep="first")

# Delivery delay: actual delivery minus estimated delivery, reduced to the
# whole-day component of the timedelta. Negative values mean the order
# arrived before the estimate; rows with a missing timestamp yield NaN.
delivery_gap = order["delivered_customer_at"] - order["estimated_delivery_at"]
order["delivery_delay_days"] = delivery_gap.dt.days

# Product volume as length x height x width.
base_area = product["length_cm"] * product["height_cm"]
product["volume_cm3"] = base_area * product["width_cm"]

# Per-item totals and log-scaled copies of the two money columns.
price, freight = order_item["item_price"], order_item["freight_value"]
order_item["total_item_value"] = price + freight

# log1p(x) = log(1 + x), so a value of 0 maps to 0 instead of -inf.
order_item["item_price_log"] = np.log1p(price)
order_item["freight_log"] = np.log1p(freight)

# Persist every table to the silver folder, in this order. payment and geo
# are written exactly as loaded/cast above; the others carry the cleaning
# and derived columns added earlier in the script.
silver_outputs = (
    (customer, "customer"),
    (seller, "seller"),
    (product, "product"),
    (order, "order"),
    (order_item, "order_item"),
    (payment, "payment"),
    (review, "review"),
    (geo, "geo_zip"),
)

for table, table_name in silver_outputs:
    write_silver(table, table_name)
