**Sales Details Window Functions Tutorial**

In [1]:
-- Preview every column of the line-item table used in the examples below.
SELECT sd.*
FROM SalesDetails AS sd;

SalesDetailsId,SalesId,SalesDate,ProductId,Price,Quantity,LineTotal
1,1,2020-01-05,6,5.99,2,11.98
2,1,2020-01-05,5,4.5,1,4.5
3,1,2020-01-05,4,17.99,4,71.96
4,2,2020-01-07,2,2.99,2,5.98
5,2,2020-01-07,3,11.4,1,11.4
6,3,2020-01-07,6,5.99,4,23.96
7,3,2020-01-07,2,2.99,2,5.98
8,3,2020-01-07,3,11.4,1,11.4
9,3,2020-01-07,9,6.29,4,25.16
10,4,2020-01-08,9,6.29,2,12.58


Find the sum of the line totals for each sale (SalesId)

In [2]:
-- One row per sale: the sum of LineTotal across that sale's line items.
-- GROUP BY collapses the detail rows, so only the grouping key and aggregates are returned.
SELECT
    SalesId,
    SUM(LineTotal) AS SalesTotal
FROM dbo.SalesDetails
GROUP BY
    SalesId; -- same casing as the SELECT list, so the query also runs under a case-sensitive collation

SalesId,SalesTotal
1,88.44
2,17.38
3,66.5
4,106.06
5,71.96
6,100.9


We can join this back to the original table to show each sale's total alongside its detail rows...

In [3]:
-- Attach each sale's total to its detail rows by aggregating in a CTE
-- and joining the result back to the detail table on SalesId.
WITH SalesTotals AS (
    -- One row per sale with the sum of its line totals.
    SELECT
        SalesId,
        SUM(LineTotal) AS SalesTotal
    FROM dbo.SalesDetails
    GROUP BY
        SalesId
)
SELECT
    d.SalesDetailsId,
    d.SalesId,
    d.SalesDate,
    d.ProductId,
    d.Price,
    d.Quantity,
    d.LineTotal,
    t.SalesTotal
FROM dbo.SalesDetails AS d -- same schema-qualified table the CTE aggregates
    INNER JOIN SalesTotals AS t
        ON d.SalesId = t.SalesId;


SalesDetailsId,SalesId,SalesDate,ProductId,Price,Quantity,LineTotal,SalesTotal
1,1,2020-01-05,6,5.99,2,11.98,88.44
2,1,2020-01-05,5,4.5,1,4.5,88.44
3,1,2020-01-05,4,17.99,4,71.96,88.44
4,2,2020-01-07,2,2.99,2,5.98,17.38
5,2,2020-01-07,3,11.4,1,11.4,17.38
6,3,2020-01-07,6,5.99,4,23.96,66.5
7,3,2020-01-07,2,2.99,2,5.98,66.5
8,3,2020-01-07,3,11.4,1,11.4,66.5
9,3,2020-01-07,9,6.29,4,25.16,66.5
10,4,2020-01-08,9,6.29,2,12.58,106.06


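With the CTE approach above, the table is read twice (once to aggregate and once for the detail rows), whereas a window function can typically compute the same result in a single pass over the data.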

  

With Window functions, we don't lose the detail from the query:

In [4]:
-- Same result as the CTE join: every detail row plus the total of its sale,
-- computed with a window aggregate partitioned by SalesId.
SELECT
    sd.SalesDetailsId,
    sd.SalesId,
    sd.SalesDate,
    sd.ProductId,
    sd.Price,
    sd.Quantity,
    sd.LineTotal,
    SUM(sd.LineTotal) OVER (
        PARTITION BY sd.SalesId
    ) AS SalesTotal
FROM SalesDetails AS sd;

SalesDetailsId,SalesId,SalesDate,ProductId,Price,Quantity,LineTotal,SalesTotal
1,1,2020-01-05,6,5.99,2,11.98,88.44
2,1,2020-01-05,5,4.5,1,4.5,88.44
3,1,2020-01-05,4,17.99,4,71.96,88.44
4,2,2020-01-07,2,2.99,2,5.98,17.38
5,2,2020-01-07,3,11.4,1,11.4,17.38
6,3,2020-01-07,6,5.99,4,23.96,66.5
7,3,2020-01-07,2,2.99,2,5.98,66.5
8,3,2020-01-07,3,11.4,1,11.4,66.5
9,3,2020-01-07,9,6.29,4,25.16,66.5
10,4,2020-01-08,9,6.29,2,12.58,106.06


Adding additional aggregates to the query

In [8]:
-- Several window aggregates side by side, each with its own partitioning.
SELECT
    sd.SalesDetailsId,
    sd.SalesId,
    sd.SalesDate,
    sd.ProductId,
    sd.Price,
    sd.Quantity,
    sd.LineTotal,
    -- total and number of line items per sale
    SUM(sd.LineTotal)        OVER (PARTITION BY sd.SalesId) AS SalesTotal,
    COUNT(sd.SalesDetailsId) OVER (PARTITION BY sd.SalesId) AS SalesCount,
    -- total per day, and per product within a day
    SUM(sd.LineTotal) OVER (PARTITION BY sd.SalesDate)               AS DailyTotal,
    SUM(sd.LineTotal) OVER (PARTITION BY sd.SalesDate, sd.ProductId) AS DailyProductSales,
    -- empty OVER(): no partition, so the window is the whole result set
    SUM(sd.LineTotal) OVER () AS SalesGrandTotal
FROM SalesDetails AS sd;

SalesDetailsId,SalesId,SalesDate,ProductId,Price,Quantity,LineTotal,SalesTotal,SalesCount,DailyTotal,DailyProductSales,SalesGrandTotal
3,1,2020-01-05,4,17.99,4,71.96,88.44,3,88.44,71.96,451.24
2,1,2020-01-05,5,4.5,1,4.5,88.44,3,88.44,4.5,451.24
1,1,2020-01-05,6,5.99,2,11.98,88.44,3,88.44,11.98,451.24
4,2,2020-01-07,2,2.99,2,5.98,17.38,2,83.88,11.96,451.24
7,3,2020-01-07,2,2.99,2,5.98,66.5,4,83.88,11.96,451.24
8,3,2020-01-07,3,11.4,1,11.4,66.5,4,83.88,22.8,451.24
5,2,2020-01-07,3,11.4,1,11.4,17.38,2,83.88,22.8,451.24
6,3,2020-01-07,6,5.99,4,23.96,66.5,4,83.88,23.96,451.24
9,3,2020-01-07,9,6.29,4,25.16,66.5,4,83.88,25.16,451.24
12,4,2020-01-08,1,13.25,4,53.0,106.06,5,106.06,53.0,451.24


Creating some more interesting calculations...

In [9]:
-- Share of the grand total contributed by each sale, repeated on every line item of that sale.
SELECT
    SalesDetailsId,
    SalesId,
    SalesDate,
    ProductId,
    Price,
    Quantity,
    LineTotal,
    -- (total of this SalesId) / (grand total of all rows), as a percentage.
    -- NULLIF returns NULL instead of raising a divide-by-zero error when the grand total is 0.
    100.00 * SUM(LineTotal) OVER(PARTITION BY SalesId)
        / NULLIF(SUM(LineTotal) OVER(), 0) AS PercentTotal
FROM SalesDetails;

SalesDetailsId,SalesId,SalesDate,ProductId,Price,Quantity,LineTotal,PercentTotal
1,1,2020-01-05,6,5.99,2,11.98,19.599326300859847
2,1,2020-01-05,5,4.5,1,4.5,19.599326300859847
3,1,2020-01-05,4,17.99,4,71.96,19.599326300859847
4,2,2020-01-07,2,2.99,2,5.98,3.851608899920219
5,2,2020-01-07,3,11.4,1,11.4,3.851608899920219
6,3,2020-01-07,6,5.99,4,23.96,14.737168690718908
7,3,2020-01-07,2,2.99,2,5.98,14.737168690718908
8,3,2020-01-07,3,11.4,1,11.4,14.737168690718908
9,3,2020-01-07,9,6.29,4,25.16,14.737168690718908
10,4,2020-01-08,9,6.29,2,12.58,23.504121975002217


Next, looking at Lag and Lead

In [14]:
-- For each sale, show the customer's previous and next sale amount by date.
-- The first row of a partition has no previous value and the last has no next value, so those are NULL.
SELECT
    s.Sales_Customer_Id,
    s.Sales_Date,
    LAG(s.Sales_Amount) OVER (
        PARTITION BY s.Sales_Customer_Id
        ORDER BY s.Sales_Date
    ) AS PrevValue,
    s.Sales_Amount,
    LEAD(s.Sales_Amount) OVER (
        PARTITION BY s.Sales_Customer_Id
        ORDER BY s.Sales_Date
    ) AS NextValue
FROM Sales AS s


Sales_Customer_Id,Sales_Date,PrevValue,Sales_Amount,NextValue
1,2018-01-02 00:00:00.0000000,,54.99,72.99
1,2018-01-03 00:00:00.0000000,54.99,72.99,34.99
1,2018-01-04 00:00:00.0000000,72.99,34.99,29.99
1,2018-01-15 00:00:00.0000000,34.99,29.99,67.0
1,2018-01-21 00:00:00.0000000,29.99,67.0,


We can also introduce an offset (the second argument) to look more than one row back or ahead

In [15]:
-- Same as the basic LAG/LEAD query, but looking two rows back and two rows ahead.
SELECT
    s.Sales_Customer_Id,
    s.Sales_Date,
    LAG(s.Sales_Amount, 2) OVER (
        PARTITION BY s.Sales_Customer_Id
        ORDER BY s.Sales_Date
    ) AS PrevValue,
    s.Sales_Amount,
    LEAD(s.Sales_Amount, 2) OVER (
        PARTITION BY s.Sales_Customer_Id
        ORDER BY s.Sales_Date
    ) AS NextValue
FROM Sales AS s

Sales_Customer_Id,Sales_Date,PrevValue,Sales_Amount,NextValue
1,2018-01-02 00:00:00.0000000,,54.99,34.99
1,2018-01-03 00:00:00.0000000,,72.99,29.99
1,2018-01-04 00:00:00.0000000,54.99,34.99,67.0
1,2018-01-15 00:00:00.0000000,72.99,29.99,
1,2018-01-21 00:00:00.0000000,34.99,67.0,


Supplying a third parameter sets the default value returned when the offset row does not exist (instead of NULL) - in this case I'll set it to 0

In [16]:
-- Two-row LAG/LEAD with a default of 0 where the offset row does not exist.
SELECT
    s.Sales_Customer_Id,
    s.Sales_Date,
    LAG(s.Sales_Amount, 2, 0) OVER (
        PARTITION BY s.Sales_Customer_Id
        ORDER BY s.Sales_Date
    ) AS PrevValue,
    s.Sales_Amount,
    LEAD(s.Sales_Amount, 2, 0) OVER (
        PARTITION BY s.Sales_Customer_Id
        ORDER BY s.Sales_Date
    ) AS NextValue
FROM Sales AS s

Sales_Customer_Id,Sales_Date,PrevValue,Sales_Amount,NextValue
1,2018-01-02 00:00:00.0000000,0.0,54.99,34.99
1,2018-01-03 00:00:00.0000000,0.0,72.99,29.99
1,2018-01-04 00:00:00.0000000,54.99,34.99,67.0
1,2018-01-15 00:00:00.0000000,72.99,29.99,0.0
1,2018-01-21 00:00:00.0000000,34.99,67.0,0.0


To find the running difference (the change from the previous row), subtract the LAG value from the current value within each partition; this is also the basis for the average running difference

In [23]:
-- Change in sale amount versus the same customer's previous sale.
-- The first sale per customer has no previous row, so PrevValue and RunningDifference are NULL.
SELECT
    s.Sales_Customer_Id,
    s.Sales_Date,
    s.Sales_Amount,
    LAG(s.Sales_Amount) OVER (
        PARTITION BY s.Sales_Customer_Id
        ORDER BY s.Sales_Date
    ) AS PrevValue,
    s.Sales_Amount
        - LAG(s.Sales_Amount) OVER (
            PARTITION BY s.Sales_Customer_Id
            ORDER BY s.Sales_Date
        ) AS RunningDifference
FROM Sales AS s


Sales_Customer_Id,Sales_Date,Sales_Amount,PrevValue,RunningDifference
1,2018-01-02 00:00:00.0000000,54.99,,
1,2018-01-03 00:00:00.0000000,72.99,54.99,18.0
1,2018-01-04 00:00:00.0000000,34.99,72.99,-38.0
1,2018-01-15 00:00:00.0000000,29.99,34.99,-5.0
1,2018-01-21 00:00:00.0000000,67.0,29.99,37.01


To get the average of the running difference, wrap the previous query in a CTE and aggregate over it

In [24]:
-- Average change between consecutive sales, per customer.
WITH RunningDiffs AS (
    -- Per-sale difference versus the same customer's previous sale (NULL for the first sale).
    SELECT
        s.Sales_Customer_Id,
        s.Sales_Date,
        s.Sales_Amount,
        LAG(s.Sales_Amount) OVER (
            PARTITION BY s.Sales_Customer_Id
            ORDER BY s.Sales_Date
        ) AS PrevValue,
        s.Sales_Amount
            - LAG(s.Sales_Amount) OVER (
                PARTITION BY s.Sales_Customer_Id
                ORDER BY s.Sales_Date
            ) AS RunningDifference
    FROM Sales AS s
)
SELECT
    rd.Sales_Customer_Id,
    AVG(rd.RunningDifference) AS Avg_Difference -- AVG ignores the NULL from each customer's first sale
FROM RunningDiffs AS rd
GROUP BY
    rd.Sales_Customer_Id
ORDER BY
    Avg_Difference DESC;

Sales_Customer_Id,Avg_Difference
1,3.0025


Calculating Rolling Totals using Window Functions

In [25]:
-- Running total of sales by date.
-- With ORDER BY and no explicit frame, the default frame is
-- RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW.
SELECT
    s.*,
    SUM(s.Sales_Amount) OVER (
        ORDER BY s.Sales_Date
    ) AS Total
FROM Sales AS s
ORDER BY
    s.Sales_Date

Sales_Id,Sales_Customer_Id,Sales_Date,Sales_Amount,Total
1,1,2018-01-02 00:00:00.0000000,54.99,54.99
2,1,2018-01-03 00:00:00.0000000,72.99,127.98
3,1,2018-01-04 00:00:00.0000000,34.99,162.97
4,1,2018-01-15 00:00:00.0000000,29.99,192.96
5,1,2018-01-21 00:00:00.0000000,67.0,259.96


We can change the frame which we want to look at (Window).

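The following produces the same result here because every Sales_Date is unique. (With ORDER BY and no explicit frame, the default is RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, which sums rows with tied Sales_Date values together; ROWS does not.)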

This is a Running Total, as it sums from the first row up to and including the current row.

In [26]:
-- Running total with the frame spelled out: from the first row up to the current row.
SELECT
    s.*,
    SUM(s.Sales_Amount) OVER (
        ORDER BY s.Sales_Date
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS Total
FROM Sales AS s
ORDER BY
    s.Sales_Date

Sales_Id,Sales_Customer_Id,Sales_Date,Sales_Amount,Total
1,1,2018-01-02 00:00:00.0000000,54.99,54.99
2,1,2018-01-03 00:00:00.0000000,72.99,127.98
3,1,2018-01-04 00:00:00.0000000,34.99,162.97
4,1,2018-01-15 00:00:00.0000000,29.99,192.96
5,1,2018-01-21 00:00:00.0000000,67.0,259.96


Adding Bounds changes the way it is processed

This is a Rolling Total, as it only looks at the last 3 rows (the current row and the 2 preceding rows)

In [28]:
-- Rolling total over a three-row frame: the current row plus the two rows before it.
SELECT
    s.*,
    SUM(s.Sales_Amount) OVER (
        ORDER BY s.Sales_Date
        ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
    ) AS Total
FROM Sales AS s
ORDER BY
    s.Sales_Date

Sales_Id,Sales_Customer_Id,Sales_Date,Sales_Amount,Total
1,1,2018-01-02 00:00:00.0000000,54.99,54.99
2,1,2018-01-03 00:00:00.0000000,72.99,127.98
3,1,2018-01-04 00:00:00.0000000,34.99,162.97
4,1,2018-01-15 00:00:00.0000000,29.99,137.97
5,1,2018-01-21 00:00:00.0000000,67.0,131.98


Can also look at future rows...

In [29]:
-- Two frames over the same ordering:
--   Total   = current row plus the 2 preceding rows
--   Forward = current row plus the 3 following rows
SELECT
    s.*,
    SUM(s.Sales_Amount) OVER (
        ORDER BY s.Sales_Date
        ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
    ) AS Total,
    SUM(s.Sales_Amount) OVER (
        ORDER BY s.Sales_Date
        ROWS BETWEEN CURRENT ROW AND 3 FOLLOWING
    ) AS Forward
FROM Sales AS s
ORDER BY
    s.Sales_Date

Sales_Id,Sales_Customer_Id,Sales_Date,Sales_Amount,Total,Forward
1,1,2018-01-02 00:00:00.0000000,54.99,54.99,192.96
2,1,2018-01-03 00:00:00.0000000,72.99,127.98,204.97
3,1,2018-01-04 00:00:00.0000000,34.99,162.97,131.98
4,1,2018-01-15 00:00:00.0000000,29.99,137.97,96.99
5,1,2018-01-21 00:00:00.0000000,67.0,131.98,67.0


Finding the Top 3 in a table using the Rank Functions

In [32]:
-- Preview the first five employees.
-- TOP without ORDER BY returns an arbitrary set of rows; ordering by emp_ID makes the preview repeatable.
SELECT TOP (5) *
FROM employee
ORDER BY emp_ID;

emp_ID,emp_NAME,DEPT_NAME,SALARY
101,Mohan,Admin,4000
102,Rajkumar,HR,3000
103,Akbar,IT,4000
104,Dorvin,Finance,6500
105,Rohit,HR,3000


Find the Max Salary per department - this can be done simply using aggregates

In [35]:
-- Highest salary in each department, largest first.
-- GROUP BY returns one row per department, so the employee-level columns are not available.
SELECT
    e.dept_name,
    MAX(e.salary) AS Max_Salary
FROM employee AS e
GROUP BY
    e.dept_name
ORDER BY
    Max_Salary DESC;

dept_name,Max_Salary
IT,11000
HR,8000
Finance,6500
Admin,5000


A window function is more appropriate when we want the aggregate alongside the other columns, because it does not drop the row-level detail

In other words, it keeps every original row and adds a new column holding the aggregate, without needing a GROUP BY (which would force every other selected column to be grouped or aggregated)

In [37]:
-- Every employee row plus the single highest salary in the whole table.
SELECT
    e.*,
    -- empty OVER(): no partitioning, so the window covers every row
    MAX(Salary) OVER () AS Max_Salary
FROM employee AS e;

emp_ID,emp_NAME,DEPT_NAME,SALARY,Max_Salary
101,Mohan,Admin,4000,11000
102,Rajkumar,HR,3000,11000
103,Akbar,IT,4000,11000
104,Dorvin,Finance,6500,11000
105,Rohit,HR,3000,11000
106,Rajesh,Finance,5000,11000
107,Preet,HR,7000,11000
108,Maryam,Admin,4000,11000
109,Sanjay,IT,6500,11000
110,Vasudha,IT,7000,11000


Now find the highest salary per department - this only requires adding PARTITION BY to the OVER clause

In [38]:
-- Every employee row plus the highest salary within that employee's department.
SELECT 
    e.*,
    MAX(Salary) OVER(PARTITION BY dept_name) AS Max_Salary -- PARTITION BY splits the window by department, so the max is computed per dept_name rather than over all rows
FROM employee e;

emp_ID,emp_NAME,DEPT_NAME,SALARY,Max_Salary
101,Mohan,Admin,4000,5000
108,Maryam,Admin,4000,5000
113,Gautham,Admin,2000,5000
120,Monica,Admin,5000,5000
106,Rajesh,Finance,5000,6500
116,Satya,Finance,6500,6500
118,Tejaswi,Finance,5500,6500
104,Dorvin,Finance,6500,6500
105,Rohit,HR,3000,8000
102,Rajkumar,HR,3000,8000


The ROW\_NUMBER function assigns a sequential number to each row, which gives every row a unique value if the table does not already have one

In [41]:
-- Number every employee row sequentially in emp_id order.
SELECT
    e.*,
    -- in T-SQL, ROW_NUMBER() requires an ORDER BY inside OVER
    ROW_NUMBER() OVER (
        ORDER BY emp_id
    ) AS rn
FROM employee AS e;

emp_ID,emp_NAME,DEPT_NAME,SALARY,rn
101,Mohan,Admin,4000,1
102,Rajkumar,HR,3000,2
103,Akbar,IT,4000,3
104,Dorvin,Finance,6500,4
105,Rohit,HR,3000,5
106,Rajesh,Finance,5000,6
107,Preet,HR,7000,7
108,Maryam,Admin,4000,8
109,Sanjay,IT,6500,9
110,Vasudha,IT,7000,10


When adding the partition by within the ROW\_NUMBER OVER, each partition resets at row 1. 

This can allow us to get first n of each partition, like finding the top 3 salaries from each department

In [44]:
-- Number employees within each department from highest to lowest salary.
-- Numbering restarts at 1 for every dept_name; employees with equal salaries
-- still receive distinct numbers, and the order among them is not specified.
SELECT
    e.*,
    ROW_NUMBER() OVER (
        PARTITION BY dept_name
        ORDER BY salary DESC
    ) AS rn
FROM employee AS e;

emp_ID,emp_NAME,DEPT_NAME,SALARY,rn
120,Monica,Admin,5000,1
101,Mohan,Admin,4000,2
108,Maryam,Admin,4000,3
113,Gautham,Admin,2000,4
116,Satya,Finance,6500,1
104,Dorvin,Finance,6500,2
118,Tejaswi,Finance,5500,3
106,Rajesh,Finance,5000,4
119,Cory,HR,8000,1
107,Preet,HR,7000,2


If I want the top 3 employees from each department, I can wrap the query in a subquery and filter on the row number in the outer WHERE (a window function cannot be referenced directly in the WHERE clause of the same query)

In [49]:
-- Top three earners per department: number the rows in a derived table,
-- then keep row numbers 1 to 3 in the outer query.
SELECT numbered.*
FROM (
    SELECT
        e.*,
        ROW_NUMBER() OVER (
            PARTITION BY dept_name
            ORDER BY salary DESC
        ) AS rn
    FROM employee AS e
) AS numbered
WHERE numbered.rn <= 3

emp_ID,emp_NAME,DEPT_NAME,SALARY,rn
120,Monica,Admin,5000,1
101,Mohan,Admin,4000,2
108,Maryam,Admin,4000,3
116,Satya,Finance,6500,1
104,Dorvin,Finance,6500,2
118,Tejaswi,Finance,5500,3
119,Cory,HR,8000,1
107,Preet,HR,7000,2
117,Adarsh,HR,3500,3
124,Dheeraj,IT,11000,1


Alternatively, I can use the RANK() or DENSE\_RANK() functions

In [54]:
-- Rank employees within each department by salary, highest first.
-- Equal salaries share a rank, and RANK() skips the following number(s) after a tie.
SELECT
    e.*,
    RANK() OVER (
        PARTITION BY dept_name
        ORDER BY salary DESC
    ) AS rnk
FROM employee AS e;

emp_ID,emp_NAME,DEPT_NAME,SALARY,rnk
120,Monica,Admin,5000,1
101,Mohan,Admin,4000,2
108,Maryam,Admin,4000,2
113,Gautham,Admin,2000,4
116,Satya,Finance,6500,1
104,Dorvin,Finance,6500,1
118,Tejaswi,Finance,5500,3
106,Rajesh,Finance,5000,4
119,Cory,HR,8000,1
107,Preet,HR,7000,2


In [56]:
-- Employees whose salary rank within their department is 1, 2 or 3.
-- Because RANK() leaves gaps after ties, a tie can push later employees past the cut-off.
SELECT ranked.*
FROM (
    SELECT
        e.*,
        RANK() OVER (
            PARTITION BY dept_name
            ORDER BY salary DESC
        ) AS rnk
    FROM employee AS e
) AS ranked
WHERE ranked.rnk <= 3;

emp_ID,emp_NAME,DEPT_NAME,SALARY,rnk
120,Monica,Admin,5000,1
101,Mohan,Admin,4000,2
108,Maryam,Admin,4000,2
116,Satya,Finance,6500,1
104,Dorvin,Finance,6500,1
118,Tejaswi,Finance,5500,3
119,Cory,HR,8000,1
107,Preet,HR,7000,2
117,Adarsh,HR,3500,3
124,Dheeraj,IT,11000,1


DENSE\_RANK can return more records for the same cut-off because, unlike RANK, it does not skip numbers after ties.  

Rank:   1, 2, 2, 4

Dense Rank:  1, 2, 2, 3

In [57]:
-- Employees holding one of the three highest distinct salaries in their department.
-- DENSE_RANK() does not leave gaps after ties, so ranks run 1, 2, 3 without skipping.
SELECT ranked.*
FROM (
    SELECT
        e.*,
        DENSE_RANK() OVER (
            PARTITION BY dept_name
            ORDER BY salary DESC
        ) AS rnk
    FROM employee AS e
) AS ranked
WHERE ranked.rnk <= 3;

emp_ID,emp_NAME,DEPT_NAME,SALARY,rnk
120,Monica,Admin,5000,1
101,Mohan,Admin,4000,2
108,Maryam,Admin,4000,2
113,Gautham,Admin,2000,3
116,Satya,Finance,6500,1
104,Dorvin,Finance,6500,1
118,Tejaswi,Finance,5500,2
106,Rajesh,Finance,5000,3
119,Cory,HR,8000,1
107,Preet,HR,7000,2
