In [1]:
# Load the "sql" IPython extension so the %sql and %%sql magics used in the
# cells below are available.
%load_ext sql

In [2]:
from urllib.parse import quote

from decouple import config
import pandas as pd

In [3]:
# Look up the four database connection settings via decouple's config(),
# in the same order as the names on the left-hand side.
host, database, user, password = (
    config(setting)
    for setting in ('HOST', 'SQL_DATABASE', 'SQL_USER', 'SQL_PASSWORD')
)

In [4]:
# Build the PostgreSQL connection URL.
# The user name and password are percent-encoded so that reserved URL
# characters in them (for example "@", ":" or "/") cannot be mistaken for
# URL delimiters. Plain alphanumeric credentials are left unchanged.
# NOTE(review): assumes the configured values are stored raw, not already
# percent-encoded - confirm against the settings file.
connection_string = (
    f"postgresql://{quote(user, safe='')}:{quote(password, safe='')}"
    f"@{host}/{database}"
)

In [5]:
# Hand the connection URL to the %sql magic ($name is expanded by IPython).
# Presumably this opens the connection that the later %sql cells reuse -
# their output echoes the same URL.
%sql $connection_string

In [6]:
# Fetch the whole store_cities table through the %sql magic, convert the
# result set to a pandas DataFrame, and print its column summary.
store_cities = %sql SELECT * FROM store_cities
store_cities_df = store_cities.DataFrame()
store_cities_df.info()

 * postgresql://postgres:***@localhost/sales
144 rows affected.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144 entries, 0 to 143
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   store_id      144 non-null    object 
 1   storetype_id  144 non-null    object 
 2   store_size    144 non-null    float64
 3   city_id       144 non-null    object 
dtypes: float64(1), object(3)
memory usage: 4.6+ KB


In [7]:
# Fetch the whole product table through the %sql magic, convert the result
# set to a pandas DataFrame, and print its column summary.
product = %sql SELECT * FROM product
product_df = product.DataFrame()
product_df.info()

 * postgresql://postgres:***@localhost/sales
699 rows affected.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   product_id      699 non-null    object 
 1   product_length  681 non-null    float64
 2   product_depth   683 non-null    float64
 3   product_width   683 non-null    float64
 4   cluster_id      649 non-null    object 
 5   hierarchy1_id   699 non-null    object 
 6   hierarchy2_id   699 non-null    object 
 7   hierarchy3_id   699 non-null    object 
 8   hierarchy4_id   699 non-null    object 
 9   hierarchy5_id   699 non-null    object 
dtypes: float64(3), object(7)
memory usage: 54.7+ KB


In [8]:
# Kept commented out: fetches the entire sales table through %sql and
# converts it to a DataFrame. Presumably disabled because the table is very
# large - TODO confirm.
# sales = %sql SELECT * FROM sales
# sales_df = sales.DataFrame()
# sales_df.info()

In [9]:
# Kept commented out: writes sales_df to sales_df.pickle and then drops the
# in-memory frame. Presumably this is how the pickle file read later in the
# notebook was produced - TODO confirm.
# sales_df.to_pickle('sales_df.pickle')
# del sales_df

In [10]:
# Load the cached sales table from disk. When the cache file is missing
# (for example on a fresh checkout) rebuild it from the database and write
# the cache, so the notebook still runs top to bottom.
# coerce_float=False keeps non-float values as returned by the driver;
# presumably this matches the dtypes of the original cache - TODO confirm.
# NOTE(review): unpickling can execute arbitrary code; only load a pickle
# file that this notebook itself produced.
try:
    loaded_sales_df = pd.read_pickle('sales_df.pickle')
except FileNotFoundError:
    loaded_sales_df = pd.read_sql(
        'SELECT * FROM sales', connection_string, coerce_float=False
    )
    loaded_sales_df.to_pickle('sales_df.pickle')
loaded_sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19454838 entries, 0 to 19454837
Data columns (total 13 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   product_id             object 
 1   store_id               object 
 2   date                   object 
 3   sales                  float64
 4   revenue                float64
 5   stock                  float64
 6   price                  float64
 7   promo_type_1           object 
 8   promo_bin_1            object 
 9   promo_type_2           object 
 10  promo_bin_2            object 
 11  promo_discount_2       object 
 12  promo_discount_type_2  object 
dtypes: float64(4), object(9)
memory usage: 1.9+ GB


## Average Revenue

### 1.0 Average revenue for all the stores 

In [11]:
# Mean of sales.revenue over the whole table, cast to two decimal places
# inside the query.
# NOTE(review): DECIMAL (5,2) allows at most 999.99 - confirm the average
# can never reach 1000.
avg_revenue = %sql SELECT CAST(AVG(revenue) AS DECIMAL (5,2)) FROM sales
avg_revenue
# No Python-side rounding is needed: the CAST above already limits the value
# to two decimals.

 * postgresql://postgres:***@localhost/sales
1 rows affected.


avg
2.06


### 2.0 Average revenue for individual stores

In [12]:
avg_revenue_by_store = %sql SELECT store_id, CAST(AVG(revenue) AS DECIMAL (5,2)) FROM sales GROUP BY store_id
avg_revenue_by_store

 * postgresql://postgres:***@localhost/sales
144 rows affected.


store_id,avg
S0001,3.18
S0002,2.12
S0003,1.9
S0004,1.37
S0005,1.49
S0006,0.89
S0007,1.74
S0008,1.38
S0009,0.94
S0010,2.67


### 3.0 Average revenue for selected stores

### One store_id

In [13]:
%%sql store_avg_revenue << SELECT store_id, CAST(AVG(revenue) AS DECIMAL (5,2))
-- Mean revenue of store S0100 only; the store filter is applied before grouping.
FROM sales
WHERE store_id = 'S0100'
GROUP BY store_id

 * postgresql://postgres:***@localhost/sales
1 rows affected.
Returning data to local variable store_avg_revenue


In [14]:
# Display the result set stored in store_avg_revenue by the %%sql cell above.
store_avg_revenue

store_id,avg
S0100,2.65


### Two store_ids

In [15]:
# Store ids used by the IN filter of the following %%sql cell.
select_store_id = ('S0100', 'S0101')

In [16]:
%%sql select_store_avg_revenue << SELECT store_id, CAST(AVG(revenue) AS DECIMAL (5,2))
-- Mean revenue for each store listed in the Python tuple select_store_id.
-- The tuple is passed as a bound parameter instead of being pasted into the
-- SQL text, so a one-element tuple still produces valid SQL and the values
-- are escaped by the driver.
FROM sales
WHERE store_id IN :select_store_id
GROUP BY store_id
ORDER BY store_id

 * postgresql://postgres:***@localhost/sales
2 rows affected.
Returning data to local variable select_store_avg_revenue


In [17]:
# Display the result set stored in select_store_avg_revenue by the %%sql cell above.
select_store_avg_revenue

store_id,avg
S0100,2.65
S0101,2.41


### Range of store_ids selected from table

In [18]:
%%sql select_range_store_avg_revenue << SELECT store_id, CAST(AVG(revenue) AS DECIMAL (5,2))
-- Mean revenue per store for store ids from S0140 through S0144 inclusive.
-- The range filter runs before grouping, and ORDER BY makes the row order
-- deterministic.
FROM sales
WHERE store_id BETWEEN 'S0140' AND 'S0144'
GROUP BY store_id
ORDER BY store_id

 * postgresql://postgres:***@localhost/sales
5 rows affected.
Returning data to local variable select_range_store_avg_revenue


In [19]:
# Display the result set stored in select_range_store_avg_revenue by the %%sql cell above.
select_range_store_avg_revenue

store_id,avg
S0140,1.29
S0141,0.73
S0142,0.98
S0143,0.86
S0144,0.92


In [20]:
%%sql revenue_by_city_id << SELECT sc.city_id, s.revenue
-- One output row per matching sales row, tagged with the city of its store.
-- Review note, the whole joined row set is transferred to the client, while
-- the visible cells only preview it and count distinct cities.
FROM sales AS s
JOIN store_cities AS sc ON sc.store_id = s.store_id

 * postgresql://postgres:***@localhost/sales
19454838 rows affected.
Returning data to local variable revenue_by_city_id


In [21]:
# Convert the revenue_by_city_id result set to a pandas DataFrame and
# preview its first rows.
revenue_by_city_id_df = revenue_by_city_id.DataFrame()
revenue_by_city_id_df.head()

Unnamed: 0,city_id,revenue
0,C007,0.0
1,C005,5.3
2,C026,10.59
3,C008,0.0
4,C024,0.0


In [22]:
# Number of distinct city_id values; dropna=False counts a missing value as
# its own entry.
revenue_by_city_id_df['city_id'].nunique(dropna=False)

37

In [23]:
%%sql total_revenue_by_city_id << SELECT city_id, CAST(SUM(revenue) AS DECIMAL (10,2)) 
-- Total revenue per city, attributing each sales row to the city of its store.
-- ORDER BY makes the row order deterministic for later previews.
FROM sales 
INNER JOIN store_cities ON sales.store_id=store_cities.store_id
GROUP BY city_id
ORDER BY city_id

 * postgresql://postgres:***@localhost/sales
37 rows affected.
Returning data to local variable total_revenue_by_city_id


In [47]:
# Convert the total_revenue_by_city_id result set to a pandas DataFrame and
# preview its first rows.
total_revenue_by_city_id_df = total_revenue_by_city_id.DataFrame()
total_revenue_by_city_id_df.head()

Unnamed: 0,city_id,sum
0,C001,223137.25
1,C002,680987.33
2,C003,103252.45
3,C004,1027630.08
4,C005,417360.83


In [37]:
# Keep only the row(s) whose total revenue equals the column maximum.
total_revenue_by_city_id_df.loc[
    lambda frame: frame['sum'] == frame['sum'].max()
]

Unnamed: 0,city_id,sum
13,C014,12760439.56


In [45]:
%%sql city_max_revenue << SELECT city_id, CAST(SUM(revenue) AS DECIMAL (10,2)) 
-- City with the highest total revenue.
-- PostgreSQL sorts NULL first in descending order, so NULLS LAST keeps a
-- city whose total is NULL from being returned as the maximum.
FROM sales 
INNER JOIN store_cities ON sales.store_id=store_cities.store_id
GROUP BY city_id
ORDER BY sum DESC NULLS LAST
LIMIT 1

 * postgresql://postgres:***@localhost/sales
1 rows affected.
Returning data to local variable city_max_revenue


In [46]:
# Display the result set stored in city_max_revenue by the %%sql cell above.
city_max_revenue

city_id,sum
C014,12760439.56


In [52]:
%%sql total_revenue_by_product_id << SELECT sales.product_id, CAST(SUM(revenue) AS DECIMAL (10,2)) 
-- Total revenue per product, limited to products present in the product table.
-- ORDER BY makes the row order deterministic for later previews.
FROM sales 
INNER JOIN product ON sales.product_id=product.product_id
GROUP BY sales.product_id
ORDER BY sales.product_id

 * postgresql://postgres:***@localhost/sales
649 rows affected.
Returning data to local variable total_revenue_by_product_id


In [53]:
# Convert the total_revenue_by_product_id result set to a pandas DataFrame
# and preview its first rows.
total_revenue_by_product_id_df = total_revenue_by_product_id.DataFrame()
total_revenue_by_product_id_df.head()

Unnamed: 0,product_id,sum
0,P0001,21128.44
1,P0002,9776.39
2,P0004,1631.29
3,P0005,25408.62
4,P0006,2550.17


In [54]:
# Keep only the row(s) whose total revenue equals the column maximum.
total_revenue_by_product_id_df.loc[
    lambda frame: frame['sum'] == frame['sum'].max()
]

Unnamed: 0,product_id,sum
92,P0103,2670199.66


In [69]:
%%sql product_max_revenue << SELECT sales.product_id, CAST(SUM(revenue) AS DECIMAL (10,2))
-- Product with the highest total revenue, ignoring products whose total is NULL.
-- Sorting and LIMIT are applied in the same query level, because the order
-- of a sorted subquery is not guaranteed to survive an outer SELECT.
FROM sales
INNER JOIN product ON product.product_id=sales.product_id
GROUP BY sales.product_id
HAVING SUM(revenue) IS NOT NULL
ORDER BY sum DESC
LIMIT 1


 * postgresql://postgres:***@localhost/sales
1 rows affected.
Returning data to local variable product_max_revenue


In [70]:
# Display the result set stored in product_max_revenue by the %%sql cell above.
product_max_revenue

product_id,sum
P0103,2670199.66
