In [83]:
# Introduce Data in Context

orders = [
 (9423517, '2021-08-04', 9001),
 (4626232, '2021-08-04', 9003),
 (9423534, '2021-08-04', 9001),
 (9423679, '2021-08-05', 9002),
 (4626377, '2021-08-05', 9003),
 (4626412, '2021-08-05', 9004),
 (9423783, '2021-08-06', 9002),
 (4626490, '2021-08-06', 9004)
]

details = [
 (9423517, 'Jeans', 'Rip Curl', 87.0, 1),
 (9423517, 'Jacket', 'The North Face', 112.0, 1),
 (4626232, 'Socks', 'Vans', 15.0, 1),
 (4626232, 'Jeans', 'Quiksilver', 82.0, 1),
 (9423534, 'Socks', 'DC', 10.0, 2),
 (9423534, 'Socks', 'Quiksilver', 12.0, 2),
 (9423679, 'T-shirt', 'Patagonia', 35.0, 1),
 (4626377, 'Hoody', 'Animal', 44.0, 1),
 (4626377, 'Cargo Shorts', 'Animal', 38.0, 1),
 (4626412, 'Shirt', 'Volcom', 78.0, 1),
 (9423783, 'Boxer Shorts', 'Superdry', 30.0, 2),
 (9423783, 'Shorts', 'Globe', 26.0, 1),
 (4626490, 'Cargo Shorts', 'Billabong', 54.0, 1),
 (4626490, 'Sweater', 'Dickies', 56.0, 1)
]

emps = [
 (9001, 'Jeff Russell', 'LA'),
 (9002, 'Nick Boorman', 'San Francisco'),
 (9003, 'Tom Heints', 'NYC'),
 (9004, 'Maya Silver', 'Philadelphia')
]

locations = [
 ('LA', 'West'),
 ('San Francisco', 'West'),
 ('NYC', 'East'),
 ('Philadelphia', 'East')
]

In [84]:
import pandas as pd

# Turn each collection into a Pandas DataFrame

df_orders = pd.DataFrame(orders, columns = ['OrderNo', 'Date', 'Empno'])
df_details = pd.DataFrame(details, columns = ['OrderNo', 'Item', 'Brand', 'Price', 'Quantity'])
df_emps = pd.DataFrame(emps, columns = ['Empno', 'Empname', 'Location'])
df_locations = pd.DataFrame(locations, columns = ['Location', 'Region'])

With every `DataFrame` in place aggregation can be applied based on our queries.
The goal is to generate the sums of sales by region and date.

In [85]:
# In order to have Sales details, Orders and Order Details are merged into a single DataFrame, generating the Sales DataFrame.
# Given that both DataFrames, Orders and Details, has the column `OrderNo`, this column is used as Foreign Key when merging both.
df_sales = df_orders.merge(df_details)

display(df_sales)

Unnamed: 0,OrderNo,Date,Empno,Item,Brand,Price,Quantity
0,9423517,2021-08-04,9001,Jeans,Rip Curl,87.0,1
1,9423517,2021-08-04,9001,Jacket,The North Face,112.0,1
2,4626232,2021-08-04,9003,Socks,Vans,15.0,1
3,4626232,2021-08-04,9003,Jeans,Quiksilver,82.0,1
4,9423534,2021-08-04,9001,Socks,DC,10.0,2
5,9423534,2021-08-04,9001,Socks,Quiksilver,12.0,2
6,9423679,2021-08-05,9002,T-shirt,Patagonia,35.0,1
7,4626377,2021-08-05,9003,Hoody,Animal,44.0,1
8,4626377,2021-08-05,9003,Cargo Shorts,Animal,38.0,1
9,4626412,2021-08-05,9004,Shirt,Volcom,78.0,1


As result, 2 new issues arise:

1. More than 1 row represent the same order, given that order details are per-item.
2. The total price per order is not present, only the unitary price.

In [86]:
# A new column `Total` is added to the Sales DataFrame in order to have the total value from an order line.

df_sales['Total'] = df_sales['Price'] * df_sales['Quantity']

display(df_sales)

Unnamed: 0,OrderNo,Date,Empno,Item,Brand,Price,Quantity,Total
0,9423517,2021-08-04,9001,Jeans,Rip Curl,87.0,1,87.0
1,9423517,2021-08-04,9001,Jacket,The North Face,112.0,1,112.0
2,4626232,2021-08-04,9003,Socks,Vans,15.0,1,15.0
3,4626232,2021-08-04,9003,Jeans,Quiksilver,82.0,1,82.0
4,9423534,2021-08-04,9001,Socks,DC,10.0,2,20.0
5,9423534,2021-08-04,9001,Socks,Quiksilver,12.0,2,24.0
6,9423679,2021-08-05,9002,T-shirt,Patagonia,35.0,1,35.0
7,4626377,2021-08-05,9003,Hoody,Animal,44.0,1,44.0
8,4626377,2021-08-05,9003,Cargo Shorts,Animal,38.0,1,38.0
9,4626412,2021-08-05,9004,Shirt,Volcom,78.0,1,78.0


In [87]:
# A new DataFrame grouping by Order can be created to sum the total for each order using `groupby`

df_order_totals = df_sales.groupby(['OrderNo', 'Empno', 'Date']).sum()

display(df_order_totals)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Item,Brand,Price,Quantity,Total
OrderNo,Empno,Date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
4626232,9003,2021-08-04,SocksJeans,VansQuiksilver,97.0,2,97.0
4626377,9003,2021-08-05,HoodyCargo Shorts,AnimalAnimal,82.0,2,82.0
4626412,9004,2021-08-05,Shirt,Volcom,78.0,1,78.0
4626490,9004,2021-08-06,Cargo ShortsSweater,BillabongDickies,110.0,2,110.0
9423517,9001,2021-08-04,JeansJacket,Rip CurlThe North Face,199.0,2,199.0
9423534,9001,2021-08-04,SocksSocks,DCQuiksilver,22.0,4,44.0
9423679,9002,2021-08-05,T-shirt,Patagonia,35.0,1,35.0
9423783,9002,2021-08-06,Boxer ShortsShorts,SuperdryGlobe,56.0,3,86.0


The columns for Item, Brand, Quantity and Price doest seem relevant when we want to know the total value per-order.
So we can drop those columns.

First we must ensure indexes are set as expected by reseting them.

In [88]:
df_order_totals = df_order_totals.reset_index().reindex(columns = ['OrderNo', 'Empno', 'Date', 'Item', 'Brand', 'Price', 'Quantity', 'Total'])

display(df_order_totals)

Unnamed: 0,OrderNo,Empno,Date,Item,Brand,Price,Quantity,Total
0,4626232,9003,2021-08-04,SocksJeans,VansQuiksilver,97.0,2,97.0
1,4626377,9003,2021-08-05,HoodyCargo Shorts,AnimalAnimal,82.0,2,82.0
2,4626412,9004,2021-08-05,Shirt,Volcom,78.0,1,78.0
3,4626490,9004,2021-08-06,Cargo ShortsSweater,BillabongDickies,110.0,2,110.0
4,9423517,9001,2021-08-04,JeansJacket,Rip CurlThe North Face,199.0,2,199.0
5,9423534,9001,2021-08-04,SocksSocks,DCQuiksilver,22.0,4,44.0
6,9423679,9002,2021-08-05,T-shirt,Patagonia,35.0,1,35.0
7,9423783,9002,2021-08-06,Boxer ShortsShorts,SuperdryGlobe,56.0,3,86.0


Finally an array with the desired column indexes is passed to filter out the undesired columns.

In [89]:
df_order_totals = df_order_totals[['OrderNo', 'Empno', 'Date', 'Total']]

display(df_order_totals)

Unnamed: 0,OrderNo,Empno,Date,Total
0,4626232,9003,2021-08-04,97.0
1,4626377,9003,2021-08-05,82.0
2,4626412,9004,2021-08-05,78.0
3,4626490,9004,2021-08-06,110.0
4,9423517,9001,2021-08-04,199.0
5,9423534,9001,2021-08-04,44.0
6,9423679,9002,2021-08-05,35.0
7,9423783,9002,2021-08-06,86.0


Going back to our first query: _Generate the sums of sales by region and date._

- The `df_emps` contains data of each Employee Number, Name and City
- The `df_locations` contains data of each City and Regions

Taking advantage of `Empno` in `df_emps` and `Location ` in `df_locations` we can end up with a DataFrame that includes the desired "Region" and its order total per employee.

In [90]:
df_region_sales = df_order_totals.merge(df_emps)

display(df_region_sales)

Unnamed: 0,OrderNo,Empno,Date,Total,Empname,Location
0,4626232,9003,2021-08-04,97.0,Tom Heints,NYC
1,4626377,9003,2021-08-05,82.0,Tom Heints,NYC
2,4626412,9004,2021-08-05,78.0,Maya Silver,Philadelphia
3,4626490,9004,2021-08-06,110.0,Maya Silver,Philadelphia
4,9423517,9001,2021-08-04,199.0,Jeff Russell,LA
5,9423534,9001,2021-08-04,44.0,Jeff Russell,LA
6,9423679,9002,2021-08-05,35.0,Nick Boorman,San Francisco
7,9423783,9002,2021-08-06,86.0,Nick Boorman,San Francisco


In [91]:
df_region_sales = df_region_sales.merge(df_locations)

display(df_region_sales)

Unnamed: 0,OrderNo,Empno,Date,Total,Empname,Location,Region
0,4626232,9003,2021-08-04,97.0,Tom Heints,NYC,East
1,4626377,9003,2021-08-05,82.0,Tom Heints,NYC,East
2,4626412,9004,2021-08-05,78.0,Maya Silver,Philadelphia,East
3,4626490,9004,2021-08-06,110.0,Maya Silver,Philadelphia,East
4,9423517,9001,2021-08-04,199.0,Jeff Russell,LA,West
5,9423534,9001,2021-08-04,44.0,Jeff Russell,LA,West
6,9423679,9002,2021-08-05,35.0,Nick Boorman,San Francisco,West
7,9423783,9002,2021-08-06,86.0,Nick Boorman,San Francisco,West


Great! Now every order also has the "Region", "Location" and the `Empname` as a side-effect.

Using `groupby`, we can join rows that has the same `Region`.

In [92]:
df_region_sales = df_region_sales.reset_index().reindex(columns = ['Region', 'Date', 'OrderNo', 'Empno', 'Total', 'Empname', 'Location'])

display(df_region_sales)

Unnamed: 0,Region,Date,OrderNo,Empno,Total,Empname,Location
0,East,2021-08-04,4626232,9003,97.0,Tom Heints,NYC
1,East,2021-08-05,4626377,9003,82.0,Tom Heints,NYC
2,East,2021-08-05,4626412,9004,78.0,Maya Silver,Philadelphia
3,East,2021-08-06,4626490,9004,110.0,Maya Silver,Philadelphia
4,West,2021-08-04,9423517,9001,199.0,Jeff Russell,LA
5,West,2021-08-04,9423534,9001,44.0,Jeff Russell,LA
6,West,2021-08-05,9423679,9002,35.0,Nick Boorman,San Francisco
7,West,2021-08-06,9423783,9002,86.0,Nick Boorman,San Francisco


Finally, lets drop undesired columns from our `DataFrame`.

In [93]:
df_region_sales = df_region_sales[['Date', 'Region', 'Total']]
df_region_sales = df_region_sales.groupby(['Region', 'Date']).sum()

display(df_region_sales)

Unnamed: 0_level_0,Unnamed: 1_level_0,Total
Region,Date,Unnamed: 2_level_1
East,2021-08-04,97.0
East,2021-08-05,160.0
East,2021-08-06,110.0
West,2021-08-04,243.0
West,2021-08-05,35.0
West,2021-08-06,86.0


> Aggregation order matters!

In [94]:
df_date_region_agg = df_region_sales.groupby(['Date', 'Region']).sum()
display(df_date_region_agg)

Unnamed: 0_level_0,Unnamed: 1_level_0,Total
Date,Region,Unnamed: 2_level_1
2021-08-04,East,97.0
2021-08-04,West,243.0
2021-08-05,East,160.0
2021-08-05,West,35.0
2021-08-06,East,110.0
2021-08-06,West,86.0


Given that both `Date` and `Region` are indexes, together these form a _hirerachical index_ also known as _multilevel index_ (a.k.a. `MultiIndex`)

`MultiIndex`es make it possible to work with data that has an arbitrary number of dimensions.

In our example we have 3 axes being `Date`, `Region` and `Aggregation`. This is also known as a 3D Dataset.

Axis | Coordinates
--- | ---
`Date` | `2022-02-04`,`2021-08-05`, `2021-08-06`
`Region` | `West`, `East`
`Aggregation` | `Total`

In [95]:
# Retrieve all available indexes in a data frame
print(df_date_region_agg.index)

MultiIndex([('2021-08-04', 'East'),
            ('2021-08-04', 'West'),
            ('2021-08-05', 'East'),
            ('2021-08-05', 'West'),
            ('2021-08-06', 'East'),
            ('2021-08-06', 'West')],
           names=['Date', 'Region'])


In [96]:
# Knowing indexes we can access the Total for a `Date` and `Region`
df_date_region_agg[df_date_region_agg.index.isin([('2021-08-05', 'West')])]

Unnamed: 0_level_0,Unnamed: 1_level_0,Total
Date,Region,Unnamed: 2_level_1
2021-08-05,West,35.0


In [97]:
# Creates a series from a sum on the total
grand_total_series = df_date_region_agg.sum(axis = 0)
print(grand_total_series)

Total    731.0
dtype: float64


In [98]:
# Set a name for the series
grand_total_series.name = ('Grand Total', 'Grand Total')

In [99]:
df_date_region_total = pd.concat([df_date_region_agg, pd.DataFrame([grand_total_series])], ignore_index=True)

display(df_date_region_total)

Unnamed: 0,Total
0,97.0
1,243.0
2,160.0
3,35.0
4,110.0
5,86.0
6,731.0


In [101]:
df_date_region_total[df_date_region_total.index.isin([('Grand Total', 'Grand Total')])]

Unnamed: 0,Total
