# Data Silver

**Capa:** Silver

---


### Step 1
**Objetivo:** Cargar los datos en las tablas de la capa Silver a partir de los datos crudos de la capa Bronze.

**Resumen Tablas a Poblar:**
- airbnb.silver.hosts
- airbnb.silver.listings
- airbnb.silver.address
- airbnb.silver.amenities
- airbnb.silver.listing_amenities
- airbnb.silver.reviews

---



In [0]:
-- Remove previously loaded rows from every Silver table populated by this cell.
-- The amenities, listing_amenities and reviews loads in this file append with INSERT INTO,
-- so they rely on these truncates to avoid duplicated rows when the cell is re-run.
TRUNCATE TABLE airbnb.silver.hosts;
TRUNCATE TABLE airbnb.silver.listings;
TRUNCATE TABLE airbnb.silver.address;
TRUNCATE TABLE airbnb.silver.amenities;
TRUNCATE TABLE airbnb.silver.listing_amenities;
TRUNCATE TABLE airbnb.silver.reviews;

-- =========================
-- 1) SILVER.HOSTS
-- =========================
-- Rebuilds airbnb.silver.hosts from the `host` sub-document of the bronze listing JSON.
-- Every bronze row carries its own copy of the host, so a host that appears on several
-- rows would otherwise be inserted several times and make every later join on bk_host_id
-- fan out. Exactly one row per bk_host_id is kept: the one with the lowest listing id.
-- Rows without a host id are skipped.
WITH host_docs AS (
  -- Extract the business keys once so they can be reused for filtering and ranking.
  SELECT
    COALESCE(
      get_json_object(b.json_raw, '$.host.host_id'),
      get_json_object(b.json_raw, '$.host.id')
    )                                                                                        AS bk_host_id,
    COALESCE(
      get_json_object(b.json_raw, '$._id'),
      get_json_object(b.json_raw, '$.id')
    )                                                                                        AS bk_listing_id,
    b.json_raw
  FROM airbnb.bronze.bronze_listings_raw b
),
ranked_hosts AS (
  -- rn = 1 marks the single document that represents each host.
  SELECT
    d.bk_host_id,
    d.json_raw,
    ROW_NUMBER() OVER (PARTITION BY d.bk_host_id ORDER BY d.bk_listing_id)                   AS rn
  FROM host_docs d
  WHERE d.bk_host_id IS NOT NULL
)
INSERT OVERWRITE TABLE airbnb.silver.hosts(
  bk_host_id, host_name, host_response_time, host_response_rate,
  host_is_superhost, host_total_listings_count, host_identity_verified, host_location, ingest_ts
)
SELECT
  r.bk_host_id                                                                               AS bk_host_id,
  get_json_object(r.json_raw, '$.host.host_name')                                            AS host_name,
  get_json_object(r.json_raw, '$.host.host_response_time')                                   AS host_response_time,
  -- Numeric fields: prefer the wrapped {"$numberInt": ...} form, otherwise keep only the digits.
  TRY_CAST(
    COALESCE(
      get_json_object(get_json_object(r.json_raw, '$.host.host_response_rate'), '$.$numberInt'),
      regexp_replace(get_json_object(r.json_raw, '$.host.host_response_rate'), '[^0-9]', '')
    ) AS INT
  )                                                                                          AS host_response_rate,
  -- TRY_CAST (as used for the numeric fields) yields NULL instead of failing on a malformed flag.
  TRY_CAST(get_json_object(r.json_raw, '$.host.host_is_superhost') AS BOOLEAN)               AS host_is_superhost,
  TRY_CAST(
    COALESCE(
      get_json_object(get_json_object(r.json_raw, '$.host.host_total_listings_count'), '$.$numberInt'),
      regexp_replace(get_json_object(r.json_raw, '$.host.host_total_listings_count'), '[^0-9]', '')
    ) AS INT
  )                                                                                          AS host_total_listings_count,
  TRY_CAST(get_json_object(r.json_raw, '$.host.host_identity_verified') AS BOOLEAN)          AS host_identity_verified,
  get_json_object(r.json_raw, '$.host.host_location')                                        AS host_location,
  current_timestamp()                                                                        AS ingest_ts
FROM ranked_hosts r
WHERE r.rn = 1;


-- =========================
-- 2) SILVER.LISTINGS
-- =========================
-- Rebuilds airbnb.silver.listings from the bronze listing JSON, one output row per bronze row.
-- host_sk is resolved through airbnb.silver.hosts, so the hosts load must run first.
-- The host lookup is collapsed to one host_sk per bk_host_id before joining: joining the
-- hosts table directly would multiply a listing by the number of rows sharing its bk_host_id.
WITH listing_docs AS (
  -- Extract the listing and host business keys once.
  SELECT
    COALESCE(get_json_object(b.json_raw, '$._id'), get_json_object(b.json_raw, '$.id'))                 AS bk_listing_id,
    COALESCE(get_json_object(b.json_raw, '$.host.host_id'), get_json_object(b.json_raw, '$.host.id'))   AS bk_host_id,
    b.json_raw
  FROM airbnb.bronze.bronze_listings_raw b
),
host_lookup AS (
  -- At most one surrogate key per host business key.
  SELECT
    hs.bk_host_id,
    MIN(hs.host_sk) AS host_sk
  FROM airbnb.silver.hosts hs
  GROUP BY hs.bk_host_id
)
INSERT OVERWRITE airbnb.silver.listings (
  bk_listing_id, name, summary, description, property_type, room_type,
  accommodates, bedrooms, beds, bathrooms,
  price, minimum_nights, maximum_nights, instant_bookable,
  bk_host_id, host_sk,
  first_review_date, last_review_date, number_of_reviews, review_scores_rating,
  latitude, longitude, neighborhood, city, country,
  ingest_ts
)
SELECT
  d.bk_listing_id                                                                               AS bk_listing_id,
  get_json_object(d.json_raw, '$.name')                                                         AS name,
  get_json_object(d.json_raw, '$.summary')                                                      AS summary,
  get_json_object(d.json_raw, '$.description')                                                  AS description,
  get_json_object(d.json_raw, '$.property_type')                                                AS property_type,
  get_json_object(d.json_raw, '$.room_type')                                                    AS room_type,
  -- Numeric fields: prefer the wrapped {"$numberInt"/"$numberDecimal": ...} form,
  -- otherwise strip every non-numeric character from the plain value.
  TRY_CAST(COALESCE(get_json_object(get_json_object(d.json_raw,'$.accommodates'), '$.$numberInt'),
                    regexp_replace(get_json_object(d.json_raw,'$.accommodates'), '[^0-9]', '')) AS INT) AS accommodates,
  TRY_CAST(COALESCE(get_json_object(get_json_object(d.json_raw,'$.bedrooms'), '$.$numberInt'),
                    regexp_replace(get_json_object(d.json_raw,'$.bedrooms'), '[^0-9]', '')) AS INT)     AS bedrooms,
  TRY_CAST(COALESCE(get_json_object(get_json_object(d.json_raw,'$.beds'), '$.$numberInt'),
                    regexp_replace(get_json_object(d.json_raw,'$.beds'), '[^0-9]', '')) AS INT)         AS beds,
  TRY_CAST(COALESCE(get_json_object(get_json_object(d.json_raw,'$.bathrooms'), '$.$numberDecimal'),
                    regexp_replace(get_json_object(d.json_raw,'$.bathrooms'), '[^0-9\\.]', '')) AS DECIMAL(3,1)) AS bathrooms,

  TRY_CAST(COALESCE(get_json_object(get_json_object(d.json_raw,'$.price'), '$.$numberDecimal'),
                    regexp_replace(get_json_object(d.json_raw,'$.price'), '[^0-9\\.]', '')) AS DECIMAL(10,2)) AS price,
  TRY_CAST(get_json_object(d.json_raw, '$.minimum_nights') AS INT)                                           AS minimum_nights,
  TRY_CAST(get_json_object(d.json_raw, '$.maximum_nights') AS INT)                                           AS maximum_nights,
  -- A missing or malformed flag is stored as FALSE; TRY_CAST keeps a bad value from failing the load.
  COALESCE(TRY_CAST(get_json_object(d.json_raw, '$.instant_bookable') AS BOOLEAN), FALSE)                    AS instant_bookable,

  d.bk_host_id                                                                                               AS bk_host_id,
  h.host_sk                                                                                                  AS host_sk,

  -- Review dates: try the epoch-millisecond encodings first (three extraction strategies),
  -- then an ISO string under $date, then the field itself as a parseable string.
  COALESCE(
    TO_TIMESTAMP(FROM_UNIXTIME(TRY_CAST(get_json_object(d.json_raw, "$.first_review['$date']['$numberLong']") AS DOUBLE)/1000.0)),
    TO_TIMESTAMP(FROM_UNIXTIME(TRY_CAST(from_json(d.json_raw,'struct<first_review:struct<`$date`:struct<`$numberLong`:string>>>').first_review.`$date`.`$numberLong` AS DOUBLE)/1000.0)),
    TO_TIMESTAMP(FROM_UNIXTIME(TRY_CAST(regexp_extract(d.json_raw,'"first_review".*?"\\$numberLong"\\s*:\\s*"([0-9]+)"',1) AS DOUBLE)/1000.0)),
    TRY_TO_TIMESTAMP(get_json_object(get_json_object(d.json_raw, '$.first_review'), '$.$date')),
    TRY_TO_TIMESTAMP(get_json_object(d.json_raw, '$.first_review'))
  )                                                                                                         AS first_review_date,
  COALESCE(
    TO_TIMESTAMP(FROM_UNIXTIME(TRY_CAST(get_json_object(d.json_raw, "$.last_review['$date']['$numberLong']") AS DOUBLE)/1000.0)),
    TO_TIMESTAMP(FROM_UNIXTIME(TRY_CAST(from_json(d.json_raw,'struct<last_review:struct<`$date`:struct<`$numberLong`:string>>>').last_review.`$date`.`$numberLong` AS DOUBLE)/1000.0)),
    TO_TIMESTAMP(FROM_UNIXTIME(TRY_CAST(regexp_extract(d.json_raw,'"last_review".*?"\\$numberLong"\\s*:\\s*"([0-9]+)"',1) AS DOUBLE)/1000.0)),
    TRY_TO_TIMESTAMP(get_json_object(get_json_object(d.json_raw, '$.last_review'), '$.$date')),
    TRY_TO_TIMESTAMP(get_json_object(d.json_raw, '$.last_review'))
  )                                                                                                         AS last_review_date,

  TRY_CAST(COALESCE(get_json_object(get_json_object(d.json_raw, '$.number_of_reviews'), '$.$numberInt'),
                    regexp_replace(get_json_object(d.json_raw,'$.number_of_reviews'), '[^0-9]', '')) AS INT) AS number_of_reviews,
  TRY_CAST(COALESCE(get_json_object(get_json_object(d.json_raw, '$.review_scores.review_scores_rating'), '$.$numberInt'),
                    regexp_replace(get_json_object(d.json_raw,'$.review_scores.review_scores_rating'), '[^0-9]', '')) AS INT) AS review_scores_rating,

  -- The coordinates array is read as [longitude, latitude]: index 1 feeds latitude, index 0 longitude.
  TRY_CAST(COALESCE(get_json_object(get_json_object(d.json_raw,'$.address.location.coordinates[1]'), '$.$numberDouble'),
                    regexp_replace(get_json_object(d.json_raw,'$.address.location.coordinates[1]'), '[^0-9\\.-]', '')) AS DECIMAL(9,6)) AS latitude,
  TRY_CAST(COALESCE(get_json_object(get_json_object(d.json_raw,'$.address.location.coordinates[0]'), '$.$numberDouble'),
                    regexp_replace(get_json_object(d.json_raw,'$.address.location.coordinates[0]'), '[^0-9\\.-]', '')) AS DECIMAL(9,6)) AS longitude,
  COALESCE(get_json_object(d.json_raw, '$.address.government_area'), get_json_object(d.json_raw, '$.neighborhood')) AS neighborhood,
  COALESCE(get_json_object(d.json_raw, '$.address.market'), get_json_object(d.json_raw, '$.address.city'))           AS city,
  get_json_object(d.json_raw, '$.address.country')                                                                    AS country,

  current_timestamp()                                                                                                AS ingest_ts
FROM listing_docs d
-- LEFT JOIN keeps listings whose host is missing from silver.hosts (host_sk stays NULL).
LEFT JOIN host_lookup h
  ON h.bk_host_id = d.bk_host_id;


-- =========================
-- 3) SILVER.ADDRESS
-- =========================
-- Rebuilds airbnb.silver.address: one address row for every bronze document whose
-- listing id matches a row already loaded into airbnb.silver.listings.
WITH address_docs AS (
  -- Listing business key plus the raw document it came from.
  SELECT
    COALESCE(
      get_json_object(raw.json_raw, '$._id'),
      get_json_object(raw.json_raw, '$.id')
    ) AS bk_listing_id,
    raw.json_raw
  FROM airbnb.bronze.bronze_listings_raw raw
)
INSERT OVERWRITE airbnb.silver.address (
  listing_sk, bk_listing_id, street, neighborhood, city, state, country, country_code, latitude, longitude, ingest_ts
)
SELECT
  lst.listing_sk,
  lst.bk_listing_id,
  get_json_object(doc.json_raw, '$.address.street') AS street,
  COALESCE(
    get_json_object(doc.json_raw, '$.address.government_area'),
    get_json_object(doc.json_raw, '$.neighborhood')
  ) AS neighborhood,
  COALESCE(
    get_json_object(doc.json_raw, '$.address.market'),
    get_json_object(doc.json_raw, '$.address.city')
  ) AS city,
  -- NOTE(review): `state` is populated from the `suburb` field — confirm this mapping is intended.
  get_json_object(doc.json_raw, '$.address.suburb') AS state,
  get_json_object(doc.json_raw, '$.address.country') AS country,
  get_json_object(doc.json_raw, '$.address.country_code') AS country_code,
  -- Index 1 of the coordinates array feeds latitude and index 0 feeds longitude.
  TRY_CAST(
    COALESCE(
      get_json_object(get_json_object(doc.json_raw,'$.address.location.coordinates[1]'), '$.$numberDouble'),
      regexp_replace(get_json_object(doc.json_raw,'$.address.location.coordinates[1]'), '[^0-9\\.-]', '')
    ) AS DECIMAL(9,6)
  ) AS latitude,
  TRY_CAST(
    COALESCE(
      get_json_object(get_json_object(doc.json_raw,'$.address.location.coordinates[0]'), '$.$numberDouble'),
      regexp_replace(get_json_object(doc.json_raw,'$.address.location.coordinates[0]'), '[^0-9\\.-]', '')
    ) AS DECIMAL(9,6)
  ) AS longitude,
  current_timestamp() AS ingest_ts
FROM address_docs doc
INNER JOIN airbnb.silver.listings lst
  ON lst.bk_listing_id = doc.bk_listing_id;


-- =========================
-- 4) SILVER.AMENITIES
-- =========================
-- Appends the distinct set of amenity names found in the `amenities` array of the bronze
-- documents. Names are lower-cased and trimmed; NULL and blank entries are ignored.
-- This is an append (INSERT INTO), so the target must be empty to avoid duplicates.
WITH amenity_items AS (
  -- One row per array element; OUTER also emits a NULL row for documents without
  -- amenities, which the filter below discards.
  SELECT
    item AS amenity
  FROM airbnb.bronze.bronze_listings_raw raw
  LATERAL VIEW OUTER explode(
    from_json(raw.json_raw, 'struct<amenities:array<string>>').amenities
  ) exploded AS item
)
INSERT INTO airbnb.silver.amenities(
  bk_amenity_name, active, ingest_ts
)
SELECT DISTINCT
  TRIM(LOWER(ai.amenity)) AS bk_amenity_name,
  TRUE                    AS active,
  current_timestamp()     AS ingest_ts
FROM amenity_items ai
WHERE ai.amenity IS NOT NULL
  AND TRIM(ai.amenity) <> '';


-- =========================
-- 5) SILVER.LISTING_AMENITIES
-- =========================
-- Appends the listing-to-amenity bridge rows. Each amenity of each bronze document is
-- matched to its listing (by listing business key) and to its amenity (by normalized name).
-- NOTE(review): the insert has no column list, so values are bound by position —
-- the SELECT order below must match the target table's column order; confirm against the DDL.
WITH listing_amenity_src AS (
  -- Explode the amenities array; OUTER keeps documents without the array as a NULL
  -- amenity row, which the final filter removes.
  SELECT
    COALESCE(
      get_json_object(raw.json_raw, '$._id'),
      get_json_object(raw.json_raw, '$.id')
    )                 AS bk_listing_id,
    TRIM(LOWER(item)) AS amenity_norm,
    item              AS amenity_raw
  FROM airbnb.bronze.bronze_listings_raw raw
  LATERAL VIEW OUTER explode(
    from_json(raw.json_raw, 'struct<amenities:array<string>>').amenities
  ) exploded AS item
)
INSERT INTO airbnb.silver.listing_amenities
SELECT DISTINCT
  lst.listing_sk,        -- 1) listing surrogate key
  amn.amenity_sk,        -- 2) amenity surrogate key
  lst.bk_listing_id,     -- 3) listing business key
  amn.bk_amenity_name,   -- 4) amenity business key (normalized name)
  las.amenity_raw        -- 5) amenity text exactly as it appeared in the source
FROM listing_amenity_src las
INNER JOIN airbnb.silver.listings lst
  ON lst.bk_listing_id = las.bk_listing_id
INNER JOIN airbnb.silver.amenities amn
  ON amn.bk_amenity_name = las.amenity_norm
WHERE las.amenity_norm IS NOT NULL
  AND las.amenity_norm <> '';


-- =========================
-- 6) SILVER.REVIEWS
-- =========================
-- Appends one row per review embedded in the `reviews` array of the bronze documents.
-- A review is attached to the listing named by its own `listing_id`, falling back to the
-- id of the document that contains it; reviews without an `_id` are skipped.
-- This is an append (INSERT INTO), so the target must be empty to avoid duplicates.
WITH src AS (
  -- One row per element of the `reviews` array, each element kept as JSON text.
  SELECT
    COALESCE(get_json_object(b.json_raw, '$._id'), get_json_object(b.json_raw, '$.id')) AS bk_listing_id,
    rev_json
  FROM airbnb.bronze.bronze_listings_raw b
  LATERAL VIEW OUTER explode(
    from_json(get_json_object(b.json_raw, '$.reviews'), 'array<string>')
  ) e AS rev_json
),
proj AS (
  SELECT
    bk_listing_id,
    get_json_object(rev_json, '$._id')               AS bk_review_id,
    get_json_object(rev_json, '$.listing_id')        AS rv_listing_id,
    get_json_object(rev_json, '$.reviewer_id')       AS reviewer_id,
    get_json_object(rev_json, '$.reviewer_name')     AS reviewer_name,
    get_json_object(rev_json, '$.comments')          AS comments,

    -- Epoch milliseconds, when present.
    TRY_CAST(
      COALESCE(
        get_json_object(rev_json, '$.date.$date.$numberLong'),
        -- regexp_extract returns '' (not NULL) when the pattern does not match; NULLIF turns
        -- that into NULL so COALESCE can still reach the plain-number fallback below.
        NULLIF(regexp_extract(rev_json, '"date"\\s*:\\s*\\{[^}]*"\\$numberLong"\\s*:\\s*"([0-9]+)"', 1), ''),
        get_json_object(rev_json, '$.date')   -- date given as a plain number
      ) AS BIGINT
    )                                                AS ms_epoch,

    -- ISO-style date string, used when no epoch value is available.
    COALESCE(
      get_json_object(get_json_object(rev_json, '$.date'), '$.$date'),
      get_json_object(rev_json, '$.date.$date'),
      get_json_object(rev_json, '$.date_iso')
    )                                                AS date_iso,

    rev_json                                         AS rev_json_raw
  FROM src
)
INSERT INTO airbnb.silver.reviews(
  bk_review_id, listing_sk, bk_listing_id, reviewer_id, reviewer_name,
  review_date, comments, comments_len, ingest_ts
)
SELECT
  p.bk_review_id                                       AS bk_review_id,
  l.listing_sk                                         AS listing_sk,
  l.bk_listing_id                                      AS bk_listing_id,
  p.reviewer_id                                        AS reviewer_id,
  p.reviewer_name                                      AS reviewer_name,
  -- Date resolution order: epoch milliseconds, then the ISO string, then the raw
  -- `date` field if it happens to be directly parseable.
  COALESCE(
    TO_TIMESTAMP(FROM_UNIXTIME(p.ms_epoch / 1000.0)),
    TRY_TO_TIMESTAMP(p.date_iso),
    TRY_TO_TIMESTAMP(get_json_object(p.rev_json_raw, '$.date'))
  )                                                    AS review_date,
  p.comments                                           AS comments,
  LENGTH(p.comments)                                   AS comments_len,
  current_timestamp()                                  AS ingest_ts
FROM proj p
INNER JOIN airbnb.silver.listings l
  ON l.bk_listing_id = COALESCE(p.rv_listing_id, p.bk_listing_id)
WHERE p.bk_review_id IS NOT NULL;