# Constructing aggregation matrix

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

import logging
logging.basicConfig(level=logging.DEBUG)

logger = logging.getLogger(__name__)
logger.debug('hello')   # Logger's method

plt.rcParams['figure.figsize'] = [16, 8]

DEBUG:__main__:hello


In [2]:
# debugging function

DEBUG = True

def log(*s):
    if DEBUG:
        print(*s)

log("hello world", np.ones(2), ["a", "b"])

print("abc")

hello world [1. 1.] ['a', 'b']
abc


In [3]:
# Loading dataset

df = pd.read_csv('../data/dataset-02.csv')
df

Unnamed: 0.1,time,Unnamed: 0,year,month,week,province,city,store,category,department,class,vendor,size,totalQuantity,totalSales,Holiday
0,2012-01-08,0,2012,1,1,province3,city31,store40,category1,department1,class7,vendor2,size26,1.0,424.971081,True
1,2012-01-08,1,2012,1,1,province2,city15,store33,category1,department1,class7,vendor2,size29,-1.0,-424.971081,True
2,2012-01-08,3,2012,1,1,province1,city18,store51,category1,department1,class7,vendor2,size29,1.0,424.971081,True
3,2012-01-08,4,2012,1,1,province3,city31,store62,category1,department1,class7,vendor2,size29,1.0,424.971081,True
4,2012-01-08,5,2012,1,1,province3,city31,store40,category1,department1,class7,vendor2,size33,1.0,424.971081,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2348925,2018-01-07,2639097,2018,1,1,province2,city12,store31,category2,department12,class41,vendor11,size1,2.0,85.708453,True
2348926,2018-01-07,2639098,2018,1,1,province1,city39,store43,category2,department12,class41,vendor11,size1,2.0,85.708453,True
2348927,2018-01-07,2639099,2018,1,1,province1,city23,store56,category2,department12,class41,vendor11,size1,1.0,42.854227,True
2348928,2018-01-07,2639100,2018,1,1,province1,city35,store45,category2,department12,class41,vendor84,size1,1.0,103.564381,True


## Constructing aggregation matrix

In [4]:
## Iterative implementation for space heirarchy

# deprecated in favor of generic recursive function


### Refactoring into generic _function_

In [5]:

# recursive implementation for constructing summation matrix over a given hierarchy

from scipy.linalg import block_diag # nparray
import scipy.sparse as sp # sparse matrix

def sum_group(df, columns):
    """
    Inputs
    df: dataframe of data to aggregate
    columns: list if columns from the dataframe df that represent the hierarchy to traverse (higher to lower)

    Output
    S: aggregation matrix for df over all hierarchies defined in columns (sparse matrix COO format)
        -COO is a fast format for constructing sparse matrices
        -Once a matrix has been constructed, convert to CSR or CSC format for fast arithmetic and matrix vector operations
        dimensions = (all combination of nodes x all leaves) in the hierarchy
    col_names: names/labels in order the leaves(columns) are represented in S
    row_names: names/labels in order the aggregations(rows) are represented in S 
               (does not include the name of the aggregation above columns[0])
    """
    
    assert(len(columns) >= 2)
    
    if len(columns) == 2: # last level of hierarchy = leaf and its parent
        log('Processing Leaf')
        
        parent_field = columns[0]
        leaf_field = columns[1]
        
        submatrix = []
        col_names = []
        row_names = []
        
        for subname, subgroup in df.groupby(parent_field):
            log(parent_field, subname, subgroup[leaf_field].unique())

            leaves = subgroup[leaf_field].unique()
            
            ident = sp.eye(len(leaves), dtype=int) # identity matrix for all the leaves
            ones = np.ones(len(leaves), dtype=int) # row of ones to aggregate the leaves
            ones = sp.coo_matrix(ones, dtype=int) # Sparse matrix representation of ones()

            #S = np.row_stack((ones, ident))
            S = sp.vstack([ones, ident])

            submatrix.append(S)
            
            col_names.extend(leaves)
            row_names.extend([subname] + leaves.tolist())
            
        S = combine(submatrix)
         
        log('Number of leaves: %d'%S.shape[1]) # columns of matrix
          
        return S, (row_names, col_names)

    group_on = columns.pop(0)
    log('Processing Children of:', group_on) # Traversing hierarchy in DFS order
    
    matrix = []
    col_names = []
    row_names = []
    for name, group in df.groupby(group_on): # 0 = first level (highest) of hierarchy
        log(name, columns)
        
        S, names = sum_group(group, columns) # recurse over remaining levels in the heirarchy and retrieve their summation matrix
        matrix.append(S)
        
        row, col = names
        row_names.extend([name] + row)
        col_names.extend(col)
        log()
     
    S = combine(matrix)
    log('Number of nodes in %s: %d'%(group_on, S.shape[1])) # columns of matrix
    log()
    
    assert(len(col_names) == S.shape[1])
    assert(len(row_names) == S.shape[0]-1) # -1 because the top row is the aggregation of all leaves in the hierarchy tree whose name is known outside the hierachy
    
    return S, (row_names, col_names)

def combine(submatrix):
    """
    Inputs
    submatrix: list of submatrices to be used to construct the block diagonal matrix
    adds row of ones to the top to represent aggregation of submatrices
    
    Output
    S: aggregation matrix (sparse matrix COO format)
        -COO is a fast format for constructing sparse matrices
        -Once a matrix has been constructed, convert to CSR or CSC format for fast arithmetic and matrix vector operations
    """
    
    #S = block_diag(*matrix)
    S = sp.block_diag(tuple(submatrix)) # Sparse matrix implementation
    cols = S.shape[1]
    
    ones = np.ones(cols, dtype=int)    
    ones = sp.coo_matrix(ones, dtype=int) # Sparse matrix representation of ones()

    #S = np.row_stack((ones,S))
    S = sp.vstack([ones, S]) # Sparse matrix version of row_stack() 
    
    assert(S.getrow(0).sum() == cols) # top row must be all 1's representing aggregation of submatrix
    
    return S


## *spatial* hierarchy

- **Dense** matrix representation is *shape* = (105, 57), *nbytes* = 47880 ~46.88MB

- **Sparse** matrix representation is *shape* = (105, 57), *nbytes* = 1824 ~1.8MB


In [6]:
hierarchy = ['province','city','store']

space_S, space_names = sum_group(df, hierarchy)
log('is_sparse:', sp.issparse(space_S))
    
# Regular matrix (np.array)
# space_S.shape, space_S.nbytes #'shape = (105, 57), nbytes=47880' ~46.88MB

#Sparse matrix:
print('shape', space_S.shape, 'bytes', space_S.data.nbytes) # 'shape = (105, 57), nbytes=1824' ~1.8MB

if sp.issparse(space_S):
    space_S = space_S.tocsr()



Processing Children of: province
province1 ['city', 'store']
Processing Leaf
city city11 ['store47' 'store57']
city city13 ['store41']
city city18 ['store51']
city city20 ['store60']
city city23 ['store56']
city city3 ['store53']
city city30 ['store49']
city city35 ['store45']
city city37 ['store48']
city city38 ['store50']
city city39 ['store43']
city city45 ['store52']
Number of leaves: 13

province2 ['city', 'store']
Processing Leaf
city city10 ['store12']
city city12 ['store31']
city city14 ['store29' 'store37']
city city15 ['store33' 'store38']
city city16 ['store2']
city city19 ['store7']
city city2 ['store14']
city city21 ['store4']
city city22 ['store35']
city city25 ['store15']
city city26 ['store30']
city city27 ['store13']
city city28 ['store20']
city city29 ['store24']
city city32 ['store28']
city city36 ['store19']
city city4 ['store1' 'store10' 'store5' 'store6' 'store11']
city city41 ['store34']
city city42 ['store27']
city city43 ['store26']
city city44 ['store17' 'stor

### Visualize *spatial* aggregation matrix

In [7]:
# Visualize space

print('row names:\n', space_names[0])
print('column names:\n', space_names[1])
names = (['sales'] + space_names[0], space_names[1])

print(len(names[0]), len(names[1]), space_S.shape)

pd.DataFrame(space_S.todense(), index=names[0], columns=names[1])

row names:
 ['province1', 'city11', 'store47', 'store57', 'city13', 'store41', 'city18', 'store51', 'city20', 'store60', 'city23', 'store56', 'city3', 'store53', 'city30', 'store49', 'city35', 'store45', 'city37', 'store48', 'city38', 'store50', 'city39', 'store43', 'city45', 'store52', 'province2', 'city10', 'store12', 'city12', 'store31', 'city14', 'store29', 'store37', 'city15', 'store33', 'store38', 'city16', 'store2', 'city19', 'store7', 'city2', 'store14', 'city21', 'store4', 'city22', 'store35', 'city25', 'store15', 'city26', 'store30', 'city27', 'store13', 'city28', 'store20', 'city29', 'store24', 'city32', 'store28', 'city36', 'store19', 'city4', 'store1', 'store10', 'store5', 'store6', 'store11', 'city41', 'store34', 'city42', 'store27', 'city43', 'store26', 'city44', 'store17', 'store18', 'store22', 'city46', 'store36', 'city48', 'store23', 'city5', 'store9', 'city6', 'store16', 'city7', 'store25', 'city8', 'store32', 'city9', 'store8', 'province3', 'city31', 'store40', 'sto

Unnamed: 0,store47,store57,store41,store51,store60,store56,store53,store49,store45,store48,...,store32,store8,store40,store62,store63,store65,store61,store64,store39,store46
sales,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
province1,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
city11,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
store47,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
store57,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
store64,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
city40,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
store39,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
city49,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


## *product* hierarchy

- **Dense** matrix representation is *shape* = (61, 44), *nbytes* = 21472 ~21.5MB

- **Sparse** matrix representation is *shape* = (61, 44), *nbytes* = 1408 ~1.4MB


In [8]:
hierarchy = ['category','department','class']

prod_S, prod_names = sum_group(df, hierarchy)
log('is_sparse:', sp.issparse(prod_S))

# Regular matrix (np.array)
# prod_S.shape, prod_S.nbytes #'shape=(61, 44), nbytes=21472' ~21.5 MB

#Sparse matrix:
print('shape', prod_S.shape, 'bytes', prod_S.data.nbytes) #'shape=(61, 44), nbytes=1408' ~1.4MB

if sp.issparse(prod_S):
    prod_S = prod_S.tocsr()

Processing Children of: category
category1 ['department', 'class']
Processing Leaf
department department1 ['class7' 'class14' 'class15']
department department2 ['class1' 'class2' 'class3' 'class4' 'class5' 'class6']
department department3 ['class8' 'class9' 'class10' 'class11' 'class12' 'class13']
department department4 ['class16']
department department5 ['class17' 'class18']
department department6 ['class19']
department department7 ['class20' 'class21']
Number of leaves: 21

category2 ['department', 'class']
Processing Leaf
department department10 ['class35' 'class26']
department department11 ['class33' 'class34' 'class32' 'class27' 'class29' 'class28' 'class30'
 'class31']
department department12 ['class36' 'class37' 'class38' 'class39' 'class40' 'class41']
department department13 ['class42' 'class43']
department department14 ['class44']
department department8 ['class22' 'class24' 'class23']
department department9 ['class25']
Number of leaves: 23

Number of nodes in category: 44

is_

### Visualize *product* aggregation matrix

In [9]:
# Visualize product

print('row names:\n', prod_names[0])
print('column names:\n', prod_names[1])
names = (['sales'] + prod_names[0], prod_names[1])

pd.DataFrame(prod_S.todense(), index=names[0], columns=names[1])

row names:
 ['category1', 'department1', 'class7', 'class14', 'class15', 'department2', 'class1', 'class2', 'class3', 'class4', 'class5', 'class6', 'department3', 'class8', 'class9', 'class10', 'class11', 'class12', 'class13', 'department4', 'class16', 'department5', 'class17', 'class18', 'department6', 'class19', 'department7', 'class20', 'class21', 'category2', 'department10', 'class35', 'class26', 'department11', 'class33', 'class34', 'class32', 'class27', 'class29', 'class28', 'class30', 'class31', 'department12', 'class36', 'class37', 'class38', 'class39', 'class40', 'class41', 'department13', 'class42', 'class43', 'department14', 'class44', 'department8', 'class22', 'class24', 'class23', 'department9', 'class25']
column names:
 ['class7', 'class14', 'class15', 'class1', 'class2', 'class3', 'class4', 'class5', 'class6', 'class8', 'class9', 'class10', 'class11', 'class12', 'class13', 'class16', 'class17', 'class18', 'class19', 'class20', 'class21', 'class35', 'class26', 'class33', 

Unnamed: 0,class7,class14,class15,class1,class2,class3,class4,class5,class6,class8,...,class39,class40,class41,class42,class43,class44,class22,class24,class23,class25
sales,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
category1,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
department1,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
class7,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
class14,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
class22,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
class24,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
class23,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
department9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


# *Deprecated*

## Iterative implementation for space heirarchy

In [10]:
# deprecated in favor of recursive function

%%deprecated # Will stop execution of this cell, comment if execution desired

from scipy.linalg import block_diag

grouped = df.groupby(['province'])
# grouped = grouped.reset_index()
matrix = []
for name, group in df.groupby(['province']):
#     print(name)
    submatrix = []
    for subname, subgroup in group.groupby('city'):
        print(name, subname, subgroup['store'].unique())

        store = subgroup['store'].unique()
        
        ident = np.eye(len(store), dtype=int)
        ones = np.ones(len(store), dtype=int)
        S = np.row_stack((ones,ident))
        
        submatrix.append(S)
        
    Z = block_diag(*submatrix)
    cols = Z.shape[1]
    print(cols)
   
    ones = np.ones(cols, dtype=int)
    Z = np.row_stack((ones,Z))
    
    matrix.append(Z)

S = block_diag(*matrix)
cols = S.shape[1]
print(cols)
ones = np.ones(cols, dtype=int)
S = np.row_stack((ones,S))

S.shape



UsageError: Line magic function `%%deprecated` not found.
