### Transforming Customer Data
1. Remove records with NULL customer_id
2. Remove duplicate records
3. Remove duplicate records based on created_timestamp
4. CAST the columns to the correct DataType
5. Write transformed data to the Silver Schema

#### 1 & 2. Remove records with NULL customer_id and remove duplicate records

In [0]:
-- Temporary view over the bronze customer records that:
--   1. drops rows whose customer_id is NULL, and
--   2. collapses rows that are identical in every column (SELECT DISTINCT *).
-- No ORDER BY in the view: row order is not guaranteed to queries that read a view,
-- so a sort here would only add cost. Sort in the consuming query when order matters.

CREATE OR REPLACE TEMP VIEW v_customer_distinct
AS
SELECT DISTINCT *
FROM gizmobox.bronze.v_customers
WHERE customer_id IS NOT NULL;

In [0]:
-- For each customer_id, find the most recent created_timestamp.
-- When a customer appears more than once, this value identifies the newest of the duplicates.

SELECT
    customer_id,
    MAX(created_timestamp) AS Latest_record
FROM v_customer_distinct
GROUP BY customer_id

customer_id,Latest_record
1211,2024-12-16 16:54:10
1987,2024-11-16 01:47:25
2054,2024-11-03 17:22:56
2141,2024-12-17 15:05:15
2344,2024-10-26 15:14:33
2639,2024-12-21 07:08:28
2703,2024-11-07 22:20:00
3084,2024-12-20 12:32:29
3295,2025-01-17 07:50:48
3409,2024-10-02 14:53:40


#### 3. Remove duplicate records based on created_timestamp

In [0]:
-- Keep only the newest record per customer: a CTE computes each customer's latest
-- created_timestamp, and joining it back to the distinct records filters out older rows.

WITH latest_per_customer AS (
    SELECT
        customer_id,
        MAX(created_timestamp) AS latest_created_timestamp
    FROM v_customer_distinct
    GROUP BY customer_id
)
SELECT c.*
FROM v_customer_distinct AS c
INNER JOIN latest_per_customer AS l
    ON c.customer_id = l.customer_id
   AND c.created_timestamp = l.latest_created_timestamp

created_timestamp,customer_id,customer_name,date_of_birth,email,member_since,telephone,file_path
2024-10-24 13:03:13,9706,Jennifer Haas,2001-04-03,benjamin55@mail.com,2024-10-05,+1 4725460000,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-26 15:14:33,2344,Martin Thomas DDS,2002-05-01,billy42@outlook.com,2024-10-13,+1 0201457151,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-08 22:49:25,9263,Joseph Keller,2003-02-11,,2024-10-04,+1 3817867756,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-28 00:57:09,9247,Dr. Daniel Hall,2004-03-21,tamara30@gmail.com,2024-10-03,+1 7544137170,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-17 16:12:27,9179,Richard Cox,1996-10-25,devon84@mail.com,2024-09-26,+1 6680703335,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-22 05:52:57,6627,Candice Rosales,2004-04-03,amber79@example.org,2024-10-15,+1 4788451559,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-12 06:02:27,8539,Lori Mason,2002-11-01,stephanie7@mail.com,2024-09-12,+1 0498301620,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-06 19:55:52,5028,Jessica Harris,2004-04-19,,2024-09-10,+1 8604009935,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-23 22:03:08,7207,Billy Scott,1997-03-17,christopher30@mail.com,2024-09-23,+1 5544387564,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-01 00:50:29,4858,Carla Morton,2004-06-21,joseph88@mail.com,2024-09-15,+1 8616454195,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json


#### 4. CAST the columns to the correct DataType

In [0]:
-- Same latest-record-per-customer query as before, now with explicit column typing:
-- CAST() converts created_timestamp to TIMESTAMP and the two date columns to DATE.
-- The AS aliases keep the original column names; they could also be used to rename columns.

WITH latest_per_customer AS (
    SELECT
        customer_id,
        MAX(created_timestamp) AS latest_created_timestamp
    FROM v_customer_distinct
    GROUP BY customer_id
)
SELECT
    CAST(c.created_timestamp AS TIMESTAMP) AS created_timestamp,
    c.customer_id,
    c.customer_name,
    CAST(c.date_of_birth AS DATE) AS date_of_birth,
    c.email,
    CAST(c.member_since AS DATE) AS member_since,
    c.telephone,
    c.file_path
FROM v_customer_distinct AS c
INNER JOIN latest_per_customer AS l
    ON c.customer_id = l.customer_id
   AND c.created_timestamp = l.latest_created_timestamp

created_timestamp,customer_id,customer_name,date_of_birth,email,member_since,telephone,file_path
2024-10-24T13:03:13Z,9706,Jennifer Haas,2001-04-03,benjamin55@mail.com,2024-10-05,+1 4725460000,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-26T15:14:33Z,2344,Martin Thomas DDS,2002-05-01,billy42@outlook.com,2024-10-13,+1 0201457151,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-08T22:49:25Z,9263,Joseph Keller,2003-02-11,,2024-10-04,+1 3817867756,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-28T00:57:09Z,9247,Dr. Daniel Hall,2004-03-21,tamara30@gmail.com,2024-10-03,+1 7544137170,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-17T16:12:27Z,9179,Richard Cox,1996-10-25,devon84@mail.com,2024-09-26,+1 6680703335,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-22T05:52:57Z,6627,Candice Rosales,2004-04-03,amber79@example.org,2024-10-15,+1 4788451559,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-12T06:02:27Z,8539,Lori Mason,2002-11-01,stephanie7@mail.com,2024-09-12,+1 0498301620,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-06T19:55:52Z,5028,Jessica Harris,2004-04-19,,2024-09-10,+1 8604009935,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-23T22:03:08Z,7207,Billy Scott,1997-03-17,christopher30@mail.com,2024-09-23,+1 5544387564,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-01T00:50:29Z,4858,Carla Morton,2004-06-21,joseph88@mail.com,2024-09-15,+1 8616454195,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json


#### 5. Write transformed data to the Silver Schema
- Now, using CTAS, we populate the data in the silver schema.
- CTAS stands for Create Table As Select. It's a SQL command used to create a new table and populate it with data from a SELECT query—all in one step.

In [0]:
-- Materialize the cleaned customer data into the silver schema with CTAS.
-- CREATE OR REPLACE makes this cell re-runnable: a plain CREATE TABLE fails once the
-- table already exists, whereas this rebuilds it from the current bronze data.
-- The SELECT keeps the latest record per customer_id and casts created_timestamp to
-- TIMESTAMP and date_of_birth / member_since to DATE.
CREATE OR REPLACE TABLE gizmobox.silver.customers
AS
WITH cte_max AS (
    -- Latest created_timestamp seen for each customer.
    SELECT
        customer_id,
        MAX(created_timestamp) AS Latest_record
    FROM v_customer_distinct
    GROUP BY customer_id
)
SELECT
    CAST(V.created_timestamp AS TIMESTAMP) AS created_timestamp,
    V.customer_id,
    V.customer_name,
    CAST(V.date_of_birth AS DATE) AS date_of_birth,
    V.email,
    CAST(V.member_since AS DATE) AS member_since,
    V.telephone,
    V.file_path
FROM v_customer_distinct AS V
INNER JOIN cte_max AS M
    ON V.customer_id = M.customer_id
   AND V.created_timestamp = M.Latest_record

num_affected_rows,num_inserted_rows


In [0]:
-- Inspect the rows written to the silver customers table.
SELECT *
FROM gizmobox.silver.customers

created_timestamp,customer_id,customer_name,date_of_birth,email,member_since,telephone,file_path
2024-10-24T13:03:13Z,9706,Jennifer Haas,2001-04-03,benjamin55@mail.com,2024-10-05,+1 4725460000,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-26T15:14:33Z,2344,Martin Thomas DDS,2002-05-01,billy42@outlook.com,2024-10-13,+1 0201457151,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-08T22:49:25Z,9263,Joseph Keller,2003-02-11,,2024-10-04,+1 3817867756,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-28T00:57:09Z,9247,Dr. Daniel Hall,2004-03-21,tamara30@gmail.com,2024-10-03,+1 7544137170,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-17T16:12:27Z,9179,Richard Cox,1996-10-25,devon84@mail.com,2024-09-26,+1 6680703335,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-22T05:52:57Z,6627,Candice Rosales,2004-04-03,amber79@example.org,2024-10-15,+1 4788451559,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-12T06:02:27Z,8539,Lori Mason,2002-11-01,stephanie7@mail.com,2024-09-12,+1 0498301620,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-06T19:55:52Z,5028,Jessica Harris,2004-04-19,,2024-09-10,+1 8604009935,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-23T22:03:08Z,7207,Billy Scott,1997-03-17,christopher30@mail.com,2024-09-23,+1 5544387564,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-01T00:50:29Z,4858,Carla Morton,2004-06-21,joseph88@mail.com,2024-09-15,+1 8616454195,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json


In [0]:
-- Show the column names and data types (plus extended table metadata) of the silver table.
DESCRIBE TABLE EXTENDED gizmobox.silver.customers

col_name,data_type,comment
created_timestamp,timestamp,
customer_id,bigint,
customer_name,string,
date_of_birth,date,
email,string,
member_since,date,
telephone,string,
file_path,string,
,,
# Delta Statistics Columns,,


####