In [1]:
import mysql.connector
import pandas as pd

def show(db, query):
    con = mysql.connector.connect(
        host = 'localhost',
        user = 'root',
        passwd = 'admin1234',
        database = db
    )

    executor = con.cursor()
    executor.execute(query)

    table = executor.fetchall()

    con.close()
    return pd.DataFrame(data=table, columns=executor.column_names)

### Data Understanding

The OnlineRetail dataset contains 12858 rows of retail transactions data. 

Each row represents a purchase made by a customer and includes information such as:
* Invoice date 
* Invoice number
* Stock code
* Quantity 
* Price
* Customer ID 
* Country

In [2]:
q = '''SELECT * FROM tableRetail LIMIT 5'''

show(db='retail', query=q)

Unnamed: 0,Invoice,StockCode,Quantity,InvoiceDate,Price,Customer_ID,Country
0,537215,85124C,12,12/5/2010 15:38,2.55,12747,United Kingdom
1,537215,85124B,6,12/5/2010 15:38,2.55,12747,United Kingdom
2,537215,84879,16,12/5/2010 15:38,1.69,12747,United Kingdom
3,537215,85062,24,12/5/2010 15:38,1.65,12747,United Kingdom
4,537215,85064,6,12/5/2010 15:38,5.45,12747,United Kingdom


### Data Exploration

In [3]:
q = '''SELECT CONCAT('$', FORMAT(SUM(quantity*price), 2)) AS Revenue FROM tableRetail'''

show(db='retail', query=q)

Unnamed: 0,Revenue
0,"$255,718.38"


#### QUESTION #1

    Find the total revenue per month.

This information is important to provide insights into how revenue varies from month to month in the period from December 2010 to December 2011. 
* It can help identify any trends or patterns that may impact business decisions.

In [4]:
q = '''
WITH Revenue_per_month AS (
	SELECT 
		DISTINCT DATE_FORMAT(STR_TO_DATE(InvoiceDate, '%m/%d/%Y'), '%Y-%m') AS Month,
		SUM(Price * Quantity) OVER(PARTITION BY DATE_FORMAT(STR_TO_DATE(InvoiceDate, '%m/%d/%Y'), '%Y-%m')) AS Revenue
	FROM 
		tableRetail
	ORDER BY 1 DESC
)

SELECT *,
	AVG(Revenue) OVER (ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS Moving_Average
FROM
	Revenue_per_month
'''

trend_revenue = show(db='retail', query=q)
trend_revenue.head()

Unnamed: 0,Month,Revenue,Moving_Average
0,2011-12,11124.129918,11124.129918
1,2011-11,45633.379786,28378.754852
2,2011-10,19735.069968,25497.526557
3,2011-09,27853.819879,31074.089877
4,2011-08,38374.640226,28654.510024


#### QUESTION #2

    Find the top 5 customer with the highest total revenue, and their percentage of the total revenue.

This information is important to understand the contribution of each customer to the overall revenue. 
* Identify its most valuable customers and focuses its marketing strategies and promotions to retain them. 

In [5]:
q = '''
SELECT 
	DISTINCT Customer_ID,
	SUM(Price * Quantity) OVER(PARTITION BY Customer_ID) AS Total_Revenue,
    SUM(Price * Quantity) OVER(PARTITION BY Customer_ID) / SUM(Price * Quantity) OVER () * 100 AS Percentage
FROM 
	tableRetail
ORDER BY Total_Revenue DESC LIMIT 5
'''

pct = show(db='retail', query=q)
pct

Unnamed: 0,Customer_ID,Total_Revenue,Percentage
0,12931,42055.960337,16.446202
1,12748,33719.729785,13.186275
2,12901,17654.539694,6.903899
3,12921,16587.08993,6.486468
4,12939,11581.800232,4.529123


#### QUESTION #3

    Find the monthly revenue for the top 5 customers.

This information is important to provide insight into the monthly revenue generated by the top 5 customers of a business.
* Businesses can identify trends and patterns in high-value customer behavior.

In [6]:
q = '''
WITH Customer_Revenue AS (
	SELECT 
		DISTINCT Customer_ID,
        SUM(Price * Quantity) OVER(PARTITION BY Customer_ID) AS Total_Revenue,
		DATE_FORMAT(STR_TO_DATE(InvoiceDate, '%m/%d/%Y'), '%Y-%m') AS Order_Month,
        SUM(Price * Quantity) OVER(PARTITION BY Customer_ID, DATE_FORMAT(STR_TO_DATE(InvoiceDate, '%m/%d/%Y'), '%Y-%m')) AS Revenue_per_Month
	FROM 
		tableRetail
	ORDER BY Total_Revenue DESC)

SELECT * FROM Customer_Revenue
'''

trend_per_cust = show(db='retail', query=q)
trend_per_cust

Unnamed: 0,Customer_ID,Total_Revenue,Order_Month,Revenue_per_Month
0,12931,42055.960337,2010-12,177.000005
1,12931,42055.960337,2011-02,1696.400006
2,12931,42055.960337,2011-03,62.500000
3,12931,42055.960337,2011-04,1488.000011
4,12931,42055.960337,2011-05,496.800018
...,...,...,...,...
347,12929,117.849998,2011-02,117.849998
348,12938,114.140000,2011-11,114.140000
349,12956,108.070001,2011-02,108.070001
350,12821,92.720000,2011-05,92.720000


In [7]:
q = '''
WITH Customer_Revenue AS (
	SELECT 
		DISTINCT Customer_ID,
        SUM(Price * Quantity) OVER(PARTITION BY Customer_ID) AS Total_Revenue,
		DATE_FORMAT(STR_TO_DATE(InvoiceDate, '%m/%d/%Y'), '%Y-%m') AS Order_Month,
		SUM(Price * Quantity) OVER(PARTITION BY Customer_ID, DATE_FORMAT(STR_TO_DATE(InvoiceDate, '%m/%d/%Y'), '%Y-%m')) AS Revenue_per_Month
	FROM 
		tableRetail
	ORDER BY Total_Revenue DESC),
    
    Top_Customers AS (
	SELECT *,
        DENSE_RANK() OVER(ORDER BY Total_Revenue DESC) AS ranking
	FROM Customer_Revenue)

SELECT Customer_ID, Order_Month, Revenue_per_Month
FROM Top_Customers WHERE ranking <= 5
'''

trend_per_cust = show(db='retail', query=q)
trend_per_cust.head()

Unnamed: 0,Customer_ID,Order_Month,Revenue_per_Month
0,12931,2010-12,177.000005
1,12931,2011-02,1696.400006
2,12931,2011-03,62.5
3,12931,2011-04,1488.000011
4,12931,2011-05,496.800018


#### QUESTION #4

    Find the average basket size (average number of items purchased per transaction) for top 5 customers.

This information is important to analyze customer purchasing behavior and preferences. 
* It helps predict future sales and improve customer experience.

In [8]:
q = '''
SELECT 
	DISTINCT Customer_ID,
	COUNT(*) OVER(PARTITION BY Customer_ID) AS Number_of_Orders,
	SUM(Quantity) OVER(PARTITION BY Customer_ID) AS Total_Sold_Quantities, 
    AVG(Quantity) OVER(PARTITION BY Customer_ID) AS Avg_Basket_Size
FROM 
	tableRetail
ORDER BY Total_sold_quantities DESC LIMIT 5;
'''

show(db='retail', query=q)

Unnamed: 0,Customer_ID,Number_of_Orders,Total_Sold_Quantities,Avg_Basket_Size
0,12931,82,28004,341.5122
1,12748,4596,25748,5.6023
2,12901,116,23075,198.9224
3,12830,38,9848,259.1579
4,12921,720,9526,13.2306


#### QUESTION #5

    Find their most popular product and its revenue.

This information is important to help the business identify the most profitable customers and the products that are raising their revenu.
* Provide insights into customer preferences and demand.
* Pricing strategies.
* Finding similar products that can attract them.

In [9]:
q = '''
WITH Customer_Revenue AS (
	SELECT 
		Customer_ID, 
		DENSE_RANK() OVER(ORDER BY SUM(Price * Quantity) DESC) AS ranking
	FROM 
		tableRetail
	GROUP BY Customer_ID),

	Customer_Product AS (
	SELECT 
		DISTINCT Customer_ID, 
        StockCode AS Product, 
        SUM(Quantity) AS Total_Quantity,
		SUM(Quantity * Price) AS Product_Revenue 
	FROM 
		tableRetail
	GROUP BY Customer_ID, Product),
    
	Ranked_Products AS (
	SELECT *, 
		RANK() OVER(PARTITION BY Customer_ID ORDER BY Total_Quantity DESC) AS ranking
	FROM Customer_Product)
    
SELECT 
    Customer_Revenue.Customer_ID,
    Ranked_Products.Product AS Most_Popular_Product,
    Ranked_Products.Total_Quantity,
    Ranked_Products.Product_Revenue
FROM
    Customer_Revenue JOIN Ranked_Products USING (Customer_ID)
WHERE
    Ranked_Products.ranking = 1 AND Customer_Revenue.ranking <= 5
ORDER BY Customer_Revenue.ranking
'''

show(db='retail', query=q)

Unnamed: 0,Customer_ID,Most_Popular_Product,Total_Quantity,Product_Revenue
0,12931,22197,5340,3844.800153
1,12748,21135,595,147.310003
2,12901,84077,6768,1482.719954
3,12921,84879,320,540.800018
4,12939,22570,624,2115.360065
