#count distinct

In [0]:
%sql
-- Preview of the sample table: return at most 10 rows from samples.tpch.orders.
-- There is no ORDER BY, so which 10 rows come back is not guaranteed.
SELECT *
FROM samples.tpch.orders
LIMIT 10

In [0]:
-- Estimated number of distinct o_clerk values in the sample orders table.
-- The result is an approximation and may differ from the exact COUNT(DISTINCT ...).
SELECT
    approx_count_distinct(o_clerk)
FROM samples.tpch.orders

In [0]:
-- Exact number of distinct o_clerk values in the sample orders table.
SELECT
    count(DISTINCT o_clerk)
FROM samples.tpch.orders

#Group by clause with All 

In [0]:
-- Demo view: ten hard-coded order rows exposed under named columns.
-- order_date values are plain 'YYYY-MM-DD' string literals; no cast is applied.
CREATE OR REPLACE VIEW orders_view AS
SELECT
    order_id,
    customer_id,
    order_date,
    product_id,
    quantity,
    unit_price,
    discount,
    total_amount,
    order_status,
    payment_method
FROM VALUES
    (1, 101, '2025-01-05', 201, 2, 500.00, 50.00, 950.00, 'DELIVERED', 'CARD'),
    (2, 102, '2025-01-06', 202, 1, 1200.00, 0.00, 1200.00, 'DELIVERED', 'UPI'),
    (3, 103, '2025-01-07', 203, 3, 300.00, 30.00, 870.00, 'CANCELLED', 'CASH'),
    (4, 101, '2025-01-08', 204, 5, 150.00, 75.00, 675.00, 'DELIVERED', 'CARD'),
    (5, 104, '2025-01-09', 205, 2, 800.00, 0.00, 1600.00, 'PENDING', 'UPI'),
    (6, 105, '2025-01-10', 206, 4, 250.00, 100.00, 900.00, 'DELIVERED', 'CARD'),
    (7, 102, '2025-01-11', 207, 1, 2000.00, 200.00, 1800.00, 'DELIVERED', 'NETBANKING'),
    (8, 106, '2025-01-12', 208, 6, 100.00, 60.00, 540.00, 'PENDING', 'CASH'),
    (9, 107, '2025-01-13', 209, 3, 450.00, 0.00, 1350.00, 'DELIVERED', 'UPI'),
    (10, 108, '2025-01-14', 210, 2, 700.00, 70.00, 1330.00, 'RETURNED', 'CARD')
AS src (
    order_id,
    customer_id,
    order_date,
    product_id,
    quantity,
    unit_price,
    discount,
    total_amount,
    order_status,
    payment_method
);

In [0]:
-- Total amount per order, with every non-aggregated column listed explicitly in GROUP BY.
SELECT
    order_id,
    customer_id,
    order_date,
    product_id,
    order_status,
    payment_method,
    SUM(total_amount) AS total_amount
FROM orders_view
GROUP BY
    order_id,
    customer_id,
    order_date,
    product_id,
    order_status,
    payment_method

In [0]:
-- Same aggregation as the explicit version, but GROUP BY ALL groups by
-- every non-aggregated expression in the SELECT list.
SELECT
    order_id,
    customer_id,
    order_date,
    product_id,
    order_status,
    payment_method,
    SUM(total_amount) AS total_amount
FROM orders_view
GROUP BY ALL


#Group by rollup vs cube vs grouping sets

In [0]:
-- Demo table: one row per (region, product) with its sales amount.
-- All string values use single quotes. Double-quoted text is parsed as an
-- identifier (not a string) when ANSI double-quoted identifiers are enabled,
-- which would make this statement fail.
CREATE OR REPLACE TABLE hive_metastore.demo.sales AS
VALUES
    ('East', 'Phone', 100),
    ('East', 'Laptop', 200),
    ('West', 'Phone', 150),
    ('West', 'Laptop', 300)
AS sales(region, product, amount);

In [0]:
-- Show the demo sales rows.
-- The table name is fully qualified so the query does not depend on the
-- session's current catalog/schema.
SELECT *
FROM hive_metastore.demo.sales

In [0]:
-- ROLLUP (region, product): totals per (region, product), subtotals per region,
-- and a grand total. Rolled-up columns appear as NULL in the subtotal rows.
-- The table name is fully qualified so the query does not depend on the
-- session's current catalog/schema.
SELECT
    region,
    product,
    SUM(amount) AS total_sales
FROM hive_metastore.demo.sales
GROUP BY ROLLUP (region, product);

In [0]:
-- CUBE (region, product): totals for every combination of the grouping columns:
-- (region, product), (region), (product) and the grand total.
-- The table name is fully qualified so the query does not depend on the
-- session's current catalog/schema.
SELECT
    region,
    product,
    SUM(amount) AS total_sales
FROM hive_metastore.demo.sales
GROUP BY CUBE (region, product);


In [0]:
-- GROUPING SETS: only the listed groupings are computed, here the
-- (region, product) detail rows plus per-region subtotals (no grand total).
-- The table name is fully qualified so the query does not depend on the
-- session's current catalog/schema.
SELECT
    region,
    product,
    SUM(amount) AS total_sales
FROM hive_metastore.demo.sales
GROUP BY GROUPING SETS (
    (region, product),
    (region)
);


In [0]:
-- ROLLUP with GROUPING() flags: grp_region / grp_product are 1 when that column
-- has been rolled up (aggregated away) in the row and 0 otherwise, which
-- distinguishes subtotal rows from rows whose value is genuinely NULL.
-- The table name is fully qualified so the query does not depend on the
-- session's current catalog/schema.
SELECT
    region,
    product,
    SUM(amount) AS total_sales,
    GROUPING(region) AS grp_region,
    GROUPING(product) AS grp_product
FROM hive_metastore.demo.sales
GROUP BY ROLLUP (region, product);

#max by and min by

In [0]:
-- Switch the session to hive_metastore.demo so that unqualified table names
-- in the statements that follow resolve against that schema.
USE hive_metastore.demo

In [0]:
-- Demo table: four orders across products 'A' and 'B'.
-- order_date is stored as a 'YYYY-MM-DD' string literal; no cast is applied.
CREATE OR REPLACE TABLE hive_metastore.demo.order_data AS
SELECT
    order_id,
    product,
    order_date,
    amount
FROM VALUES
    (1, 'A', '2024-01-01', 100),
    (2, 'A', '2024-01-05', 150),
    (3, 'B', '2024-01-02', 200),
    (4, 'B', '2024-01-06', 180)
AS src (order_id, product, order_date, amount);

In [0]:
%sql
-- Show every row of the demo order_data table.
SELECT *
FROM order_data

In [0]:
-- Latest amount per product, the ROW_NUMBER way: number each product's rows
-- from newest to oldest order_date and keep only the first one.
WITH numbered_orders AS (
    SELECT
        product,
        amount,
        ROW_NUMBER() OVER (PARTITION BY product ORDER BY order_date DESC) AS rn
    FROM order_data
)
SELECT
    product,
    amount AS latest_amount
FROM numbered_orders
WHERE rn = 1;


In [0]:
-- Latest amount per product, the MAX_BY way: for each product return the
-- amount taken from the row with the greatest order_date.
SELECT product,
       MAX_BY(amount, order_date) AS latest_amount
FROM order_data
GROUP BY product;


#qualify clause

In [0]:
-- Show the input rows before filtering them with QUALIFY.
SELECT *
FROM order_data

In [0]:
-- Most recent order(s) per product: QUALIFY filters directly on the window
-- function result, so no subquery or CTE is needed.
-- RANK() keeps all rows that tie on the newest order_date.
SELECT *
FROM order_data
QUALIFY RANK() OVER (
    PARTITION BY product
    ORDER BY order_date DESC
) = 1

In [0]:
-- Same result as the QUALIFY query, written without QUALIFY: compute the rank
-- in a CTE, then filter on it in the outer query.
WITH ranked_orders AS (
    SELECT
        order_id,
        product,
        order_date,
        amount,
        RANK() OVER (
            PARTITION BY product
            ORDER BY order_date DESC
        ) AS rnk
    FROM order_data
)
SELECT
    order_id,
    product,
    order_date,
    amount
FROM ranked_orders
WHERE rnk = 1

#minus and except

In [0]:
-- Left-hand input for the set-difference demo: orders 1 through 4.
CREATE OR REPLACE TABLE hive_metastore.demo.order_data_1 AS
SELECT
    order_id,
    product,
    order_date,
    amount
FROM VALUES
    (1, 'A', '2024-01-01', 100),
    (2, 'A', '2024-01-05', 150),
    (3, 'B', '2024-01-02', 200),
    (4, 'B', '2024-01-06', 180)
AS src (order_id, product, order_date, amount);

-- Right-hand input for the set-difference demo: same rows as order_data_1
-- except that order 3 is missing.
CREATE OR REPLACE TABLE hive_metastore.demo.order_data_2 AS
SELECT
    order_id,
    product,
    order_date,
    amount
FROM VALUES
    (1, 'A', '2024-01-01', 100),
    (2, 'A', '2024-01-05', 150),
    (4, 'B', '2024-01-06', 180)
AS src (order_id, product, order_date, amount);


In [0]:
-- order_id values present in order_data_1 but not in order_data_2.
-- MINUS is a synonym for EXCEPT; duplicates are removed from the result.
SELECT order_id
FROM order_data_1
MINUS
SELECT order_id
FROM order_data_2

In [0]:
-- Same set difference written as an anti-join: keep the order_data_1 rows
-- for which the LEFT JOIN found no matching order_id in order_data_2.
SELECT
    a.order_id
FROM order_data_1 AS a
LEFT JOIN order_data_2 AS b
    ON a.order_id = b.order_id
WHERE b.order_id IS NULL

#lead and lag

In [0]:
-- For each order, fetch the amount of the same product's next order by date.
-- LEAD returns NULL for the last order of each product.
SELECT
    order_id,
    product,
    order_date,
    amount,
    LEAD(amount) OVER (PARTITION BY product ORDER BY order_date) AS next_amount
FROM order_data;


In [0]:
-- The same "next amount" lookup without LEAD: number each product's orders by
-- date, then self-join every row to the row holding the following number.
-- Rows with no successor keep next_amount = NULL because of the LEFT JOIN.
WITH numbered_orders AS (
    SELECT
        order_id,
        product,
        order_date,
        amount,
        ROW_NUMBER() OVER (PARTITION BY product ORDER BY order_date) AS rn
    FROM order_data
)
SELECT
    cur.order_id,
    cur.product,
    cur.order_date,
    cur.amount,
    nxt.amount AS next_amount
FROM numbered_orders AS cur
LEFT JOIN numbered_orders AS nxt
    ON nxt.product = cur.product
    AND nxt.rn = cur.rn + 1
ORDER BY
    cur.product,
    cur.order_date;
