In [0]:
%sql
-- Point the session at the lab catalog and schema, then list the tables there.
USE CATALOG `get_started`;
USE SCHEMA `labuser`;

SHOW TABLES;

In [0]:
%sql
-- Start from a clean slate: remove any previous copy of the bronze table.
DROP TABLE IF EXISTS users_bronze;

-- Recreate users_bronze as a Delta table from the raw CSV in the volume.
-- The first CSV row is treated as the header and column types are inferred.
CREATE TABLE users_bronze
USING DELTA
AS
SELECT *
FROM read_files(
  '/Volumes/get_started/labuser/myfiles/users.csv',
  format      => 'csv',
  header      => true,
  inferSchema => true
);

-- List the tables now present in the current schema.
SHOW TABLES;

In [0]:
%sql
-- Quick look at a handful of bronze rows.
SELECT *
FROM users_bronze
LIMIT 5;

In [0]:
%sql
-- Target silver table. It is created only when absent, so an existing
-- users_silver (and the rows already in it) is left untouched.
CREATE TABLE IF NOT EXISTS users_silver (
  user_id                    STRING,
  user_first_touch_timestamp BIGINT,
  email                      STRING,
  updated_at                 TIMESTAMP,
  first_touch                TIMESTAMP,
  first_touch_date           DATE,
  first_touch_time           STRING,
  email_domain               STRING
);

-- Working copy of the bronze data; replaced wholesale on every run so the
-- cleaning steps that follow never touch users_bronze itself.
CREATE OR REPLACE TABLE users_silver_working AS
SELECT *
FROM users_bronze;

-- Preview the working copy.
SELECT *
FROM users_silver_working
LIMIT 10;

### DATA PROFILE

In [0]:
%sql
-- Return every row of the working table (input for the data profile view).
SELECT *
FROM users_silver_working;

Databricks data profile. Run in Databricks to view.

Missing data

In [0]:
%sql
-- Show the rows that have no user_id.
SELECT *
FROM users_silver_working
WHERE user_id IS NULL;

In [0]:
%sql
-- Remove every row whose user_id is missing.
DELETE FROM users_silver_working
WHERE user_id IS NULL;

In [0]:
%sql
-- Switch the working table to name-based column mapping; Delta needs this
-- before a column can be dropped from the table.
ALTER TABLE users_silver_working SET TBLPROPERTIES (
  'delta.columnMapping.mode' = 'name'
);

In [0]:
%sql
-- Remove the _rescued_data column from the working table.
-- IF EXISTS keeps this cell re-runnable: a second run finds the column
-- already gone and does nothing instead of failing.
ALTER TABLE users_silver_working
DROP COLUMN IF EXISTS _rescued_data;


In [0]:
%sql
-- Sample a few rows of the working table.
SELECT *
FROM users_silver_working
LIMIT 5;

In [0]:
%sql
-- Collapse rows that are identical in every column by rewriting the table
-- with its own distinct rows.
INSERT OVERWRITE users_silver_working
SELECT DISTINCT *
FROM users_silver_working;

Deduplicate rows based on key columns (user_id, user_first_touch_timestamp)

In [0]:
%sql
-- Keep one row per (user_id, user_first_touch_timestamp) pair.
-- Within each pair, email and updated_at are each reduced with MAX, which
-- ignores NULLs, so a non-NULL value is kept whenever one exists.
INSERT OVERWRITE users_silver_working
SELECT
  user_id,
  user_first_touch_timestamp,
  MAX(email)      AS email,
  MAX(updated_at) AS updated_at
FROM users_silver_working
WHERE user_id IS NOT NULL
GROUP BY
  user_id,
  user_first_touch_timestamp;

-- Row count after deduplication.
SELECT COUNT(*)
FROM users_silver_working;

VALIDATE DATASETS

In [0]:
%sql
-- Validation: no user_id may occur on more than one row.
-- Returns true when the largest per-id row count is at most 1.
WITH rows_per_user AS (
  SELECT
    user_id,
    COUNT(*) AS row_count
  FROM users_silver_working
  GROUP BY user_id
)
SELECT MAX(row_count) <= 1 AS no_duplicates_ids
FROM rows_per_user;

In [0]:
%sql
-- Validation: each non-NULL email should belong to at most one user_id.
-- Returns true when no email has more than one non-NULL user_id row.
WITH ids_per_email AS (
  SELECT
    email,
    COUNT(user_id) AS user_id_count
  FROM users_silver_working
  WHERE email IS NOT NULL
  GROUP BY email
)
SELECT MAX(user_id_count) <= 1 AS at_most_one_id
FROM ids_per_email;

DATE FORMAT AND REGULAR EXPRESSIONS (REGEX)

In [0]:
%sql
-- Load the cleaned working rows into users_silver, adding derived columns:
--   first_touch      : user_first_touch_timestamp / 1e6 cast to TIMESTAMP
--                      (presumably the source value is microseconds since
--                      the epoch -- TODO confirm against the data source)
--   first_touch_date : calendar date of first_touch
--   first_touch_time : time of day of first_touch, formatted as HH:mm:ss
--   email_domain     : the text that follows the first '@' in email
-- Columns are listed explicitly so the load does not depend on the column
-- order of either table.
-- NOTE(review): INSERT INTO appends, and users_silver is created with
-- IF NOT EXISTS, so re-running this cell adds the same rows again. Consider
-- INSERT OVERWRITE if the load is meant to be repeatable.
INSERT INTO users_silver (
  user_id,
  user_first_touch_timestamp,
  email,
  updated_at,
  first_touch,
  first_touch_date,
  first_touch_time,
  email_domain
)
SELECT
  user_id,
  user_first_touch_timestamp,
  email,
  updated_at,
  first_touch,
  CAST(first_touch AS DATE)            AS first_touch_date,
  date_format(first_touch, 'HH:mm:ss') AS first_touch_time,
  -- Group 1 is the capture after '@'. Group 0 would return the whole match,
  -- leading '@' included.
  regexp_extract(email, '@(.*)', 1)    AS email_domain
FROM (
  SELECT
    user_id,
    user_first_touch_timestamp,
    email,
    updated_at,
    CAST(user_first_touch_timestamp / 1e6 AS TIMESTAMP) AS first_touch
  FROM users_silver_working
) AS with_first_touch;


In [0]:
%sql
-- Inspect the final silver table.
SELECT *
FROM users_silver;