# Constructing aggregation matrix
`Execute notebook once:`
- Will construct 2 aggregation matrices for `totalSales` in each of `spatial` and `product` dimensions.
- Will construct **combined** aggregation matrices for `totalSales` in both `spatial` and `product` dimensions (cross product terms included).
- Create pickle files for each matrix as a dataframe. Files are created if they do not already exist in `../data/` directory.

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

import pickle

import logging
logging.basicConfig(level=logging.DEBUG)

logger = logging.getLogger(__name__)
logger.debug('hello')   # Logger's method

plt.rcParams['figure.figsize'] = [16, 8]

DEBUG:__main__:hello


In [2]:
# debugging function

DEBUG = False

def log(*s):
    if DEBUG:
        print(*s)

log("Debugging ON:", np.ones(2, dtype=int), ["a", "b"])

print("Starting...")

Starting...


In [3]:
# Loading dataset

df = pd.read_csv('../data/dataset-02.csv')
df

Unnamed: 0.1,time,Unnamed: 0,year,month,week,province,city,store,category,department,class,vendor,size,totalQuantity,totalSales,Holiday
0,2012-01-08,0,2012,1,1,province3,city31,store40,category1,department1,class7,vendor2,size26,1.0,424.971081,True
1,2012-01-08,1,2012,1,1,province2,city15,store33,category1,department1,class7,vendor2,size29,-1.0,-424.971081,True
2,2012-01-08,3,2012,1,1,province1,city18,store51,category1,department1,class7,vendor2,size29,1.0,424.971081,True
3,2012-01-08,4,2012,1,1,province3,city31,store62,category1,department1,class7,vendor2,size29,1.0,424.971081,True
4,2012-01-08,5,2012,1,1,province3,city31,store40,category1,department1,class7,vendor2,size33,1.0,424.971081,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2348925,2018-01-07,2639097,2018,1,1,province2,city12,store31,category2,department12,class41,vendor11,size1,2.0,85.708453,True
2348926,2018-01-07,2639098,2018,1,1,province1,city39,store43,category2,department12,class41,vendor11,size1,2.0,85.708453,True
2348927,2018-01-07,2639099,2018,1,1,province1,city23,store56,category2,department12,class41,vendor11,size1,1.0,42.854227,True
2348928,2018-01-07,2639100,2018,1,1,province1,city35,store45,category2,department12,class41,vendor84,size1,1.0,103.564381,True


## Constructing aggregation matrix

In [4]:
## Iterative implementation for space heirarchy

# deprecated in favor of generic recursive function


In [5]:
space_order = ['province', 'city', 'store']
product_order = ['category', 'department', 'class']


### Refactoring into generic _functions_

In [6]:

# recursive implementation for constructing summation matrix over a given hierarchy

from scipy.linalg import block_diag # nparray
import scipy.sparse as sp # sparse matrix

def sum_group(df, columns, path=""):
    """
    Inputs
    df: dataframe of data to aggregate
    columns: list if columns from the dataframe df that represent the hierarchy to traverse (higher to lower)

    Output
    S: aggregation matrix for df over all hierarchies defined in columns (sparse matrix COO format)
        -COO is a fast format for constructing sparse matrices
        -Once a matrix has been constructed, convert to CSR or CSC format for fast arithmetic and matrix vector operations
        dimensions = (all combination of nodes x all leaves) in the hierarchy
        if a joint-combined hierachy is provided, futher processing is needed to groupby duplicate rows,
        use aggregate_group()
    col_names: names/labels in order the leaves(columns) are represented in S
    row_names: names/labels in order the aggregations(rows) are represented in S 
               (does not include the name of the aggregation above columns[0])
    """
    
    assert len(columns) >= 2, 'Hierarchy must contain atleast one level (parent->child)'
    
    if len(columns) == 2: # last level of hierarchy = leaf and its parent
        log('Processing Leaf')
        
        parent_field = columns[0]
        leaf_field = columns[1]
        
        submatrix = []
        col_names = []
        row_names = []
        
        for subname, subgroup in df.groupby(parent_field):
            log(parent_field, subname, subgroup[leaf_field].unique())

            leaves = subgroup[leaf_field].unique()
            
            ident = sp.eye(len(leaves), dtype=bool) # identity matrix for all the leaves
            ones = np.ones(len(leaves), dtype=bool) # row of ones to aggregate the leaves
            ones = sp.coo_matrix(ones, dtype=bool) # Sparse matrix representation of ones()

            #S = np.row_stack((ones, ident))
            S = sp.vstack([ones, ident])
            log('Number of elements in %s: %d'%(subname, S.shape[1]))

            submatrix.append(S)
            
            group_path = "+".join([path, subname])
            leaf_path = ["+".join([group_path, l]) for l in leaves]
            col_names.extend(leaf_path)
            row_names.extend([subname] + leaves.tolist())
            #row_names.extend([subname] + leaf_path) # possibly deprecated
            
            
        S = combine(submatrix) 
        log('Number of leaves in %s+%s: %d'%(path, parent_field, S.shape[1]) ) # columns of matrix
          
        return S, (row_names, col_names)

    group_on = columns[0]
    log('Processing Children of:', group_on) # Traversing hierarchy in DFS order
    
    matrix = []
    col_names = []
    row_names = []
    for name, group in df.groupby(group_on): # 0 = first level (highest) of hierarchy
        log(name, columns)
        
        group_path = "+".join([path, name])
        
        S, names = sum_group(group, columns[1:], group_path) # recurse over remaining levels in the heirarchy and retrieve their summation matrix
        
        log('Number of elements in %s: %d'%(name, S.shape[1]))
        matrix.append(S)
        
        row, col = names
        row_names.extend([name] + row)
        col_names.extend(col)
        log()
     
    S = combine(matrix)
    log('Number of nodes in %s+%s: %d'%(path, group_on, S.shape[1])) # columns of matrix
    log()
    
    assert len(col_names) == S.shape[1], 'Number of columns:%d, Columns in Matrix: %d'(len(col_names),S.shape[1])
    assert len(row_names) == S.shape[0]-1, 'Number of rows:%d, Rows in Matrix: %d'(len(col_names),S.shape[0]-1) # -1 because the top row is the aggregation of all leaves in the hierarchy tree whose name is known outside the hierachy
    
    return S, (row_names, col_names)

def combine(submatrix):
    """
    Inputs
    submatrix: list of submatrices to be used to construct the block diagonal matrix
    adds row of ones to the top to represent aggregation of submatrices
    
    Output
    S: aggregation matrix (sparse matrix COO format)
        -COO is a fast format for constructing sparse matrices
        -Once a matrix has been constructed, convert to CSR or CSC format for fast arithmetic and matrix vector operations
    """
    
    #S = block_diag(*matrix)
    S = sp.block_diag(tuple(submatrix)) # Sparse matrix implementation
    cols = S.shape[1]
    
    ones = np.ones(cols, dtype=bool)    
    ones = sp.coo_matrix(ones, dtype=bool) # Sparse matrix representation of ones()

    #S = np.row_stack((ones,S))
    S = sp.vstack([ones, S]) # Sparse matrix version of row_stack() 
    
    assert S.getrow(0).sum() == cols # top row must be all 1's representing aggregation of submatrix
    
    return S


In [34]:
def post_process(combined_S, grouped_names, root="sales"):
    """
    Inputs
        combined_S: Matrix to be used for aggregation that 
                    needs post-processing if a joint-combined hierarchy was used
        grouped_names: tuple(list, list), 
            grouped_names[0]: row names of the nodes enountered during forming aggregation matrix
            grouped_names[1]: column names of the leaves enountered during forming aggregation matrix
        root: str, label for the top level of aggregation

    Outputs
        combined_df: dataframe, for aggregation matrix for all nodes of hierarchy tree
                    (does not include cross-terms, use cross_term() for that)
    """
    
    if sp.issparse(combined_S):
        combined_S = combined_S.tocsr()

    # print('row names:\n', grouped_names[0])
    # print('column names:\n', grouped_names[1])
    names = ([root] + grouped_names[0], grouped_names[1]) # Add label for target row (row of ones)

    combined_df = pd.DataFrame(combined_S.todense(), index=names[0], columns=names[1])

    combined_df[combined_df.index.duplicated(keep=False)] # keep=False: means keep all the duplicates

    print('Unique rows in grouping:',len(combined_df.index.unique()))

    combined_df = combined_df.groupby(combined_df.index, sort=False).sum() # duplicate indices are aggregated (since entries are 0/1 summing will also do)

    return combined_df





## *spatial* hierarchy

- **Dense** `int` matrix representation is *shape* = (105, 57), *nbytes* = 47880 ~46.88MB

- **Sparse** `int` matrix representation is *shape* = (105, 57), *nbytes* = 1824 ~1.8MB

- **Sparse** `bool` matrix representation is *shape* = (105, 57), *nbytes* = 228


In [7]:
space_order = ['province', 'city', 'store']

space_S, space_names = sum_group(df, space_order, "sales")
log('is_sparse:', sp.issparse(space_S))
    
# Regular matrix (np.array)
# space_S.shape, space_S.nbytes #'shape = (105, 57), nbytes = 47880' ~46.88MB

#Sparse matrix:
print('shape', space_S.shape, 'bytes', space_S.data.nbytes) # 'shape = (105, 57), nbytes = 1824' ~1.8MB

if sp.issparse(space_S):
    space_S = space_S.tocsr()



shape (105, 57) bytes 228


### Visualize *spatial* aggregation matrix

In [8]:
# Visualize space

# print('row names:\n', space_names[0])
# print('column names:\n', space_names[1])
names = (['sales'] + space_names[0], space_names[1])

print(len(names[0]), len(names[1]), space_S.shape)

space = pd.DataFrame(space_S.todense(), index=names[0], columns=names[1])
space

105 57 (105, 57)


Unnamed: 0,sales+province1+city11+store47,sales+province1+city11+store57,sales+province1+city13+store41,sales+province1+city18+store51,sales+province1+city20+store60,sales+province1+city23+store56,sales+province1+city3+store53,sales+province1+city30+store49,sales+province1+city35+store45,sales+province1+city37+store48,...,sales+province2+city8+store32,sales+province2+city9+store8,sales+province3+city31+store40,sales+province3+city31+store62,sales+province3+city31+store63,sales+province3+city31+store65,sales+province3+city33+store61,sales+province3+city33+store64,sales+province3+city40+store39,sales+province3+city49+store46
sales,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
province1,True,True,True,True,True,True,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
city11,True,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
store47,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
store57,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
store64,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
city40,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
store39,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
city49,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [39]:
combined_S, grouped_names = sum_group(df, space_order, "sales")

S = post_process(combined_S, grouped_names, "sales")
S

Unique rows in grouping: 105


Unnamed: 0,sales+province1+city11+store47,sales+province1+city11+store57,sales+province1+city13+store41,sales+province1+city18+store51,sales+province1+city20+store60,sales+province1+city23+store56,sales+province1+city3+store53,sales+province1+city30+store49,sales+province1+city35+store45,sales+province1+city37+store48,...,sales+province2+city8+store32,sales+province2+city9+store8,sales+province3+city31+store40,sales+province3+city31+store62,sales+province3+city31+store63,sales+province3+city31+store65,sales+province3+city33+store61,sales+province3+city33+store64,sales+province3+city40+store39,sales+province3+city49+store46
sales,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
province1,True,True,True,True,True,True,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
city11,True,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
store47,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
store57,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
store64,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
city40,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
store39,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
city49,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [38]:
S.equals(space)

True

## *product* hierarchy

- **Dense** `int` matrix representation is *shape* = (61, 44), *nbytes* = 21472 ~21.5MB

- **Sparse** `int` matrix representation is *shape* = (61, 44), *nbytes* = 1408 ~1.4MB

- **Sparse** `bool` matrix representation is *shape* = (61, 44), *nbytes* = 176




In [9]:
product_order = ['category', 'department', 'class']

prod_S, prod_names = sum_group(df, product_order, "sales")

# Regular matrix (np.array)
# prod_S.shape, prod_S.nbytes #'shape = (61, 44), nbytes = 21472' ~21.5 MB

#Sparse matrix:
print('shape', prod_S.shape, 'bytes', prod_S.data.nbytes) #'shape = (61, 44), nbytes = 1408' ~1.4MB

log('is_sparse:', sp.issparse(prod_S))
if sp.issparse(prod_S):
    prod_S = prod_S.tocsr()

shape (61, 44) bytes 176


### Visualize *product* aggregation matrix

In [10]:
# Visualize product

# print('row names:\n', prod_names[0])
# print('column names:\n', prod_names[1])
names = (['sales'] + prod_names[0], prod_names[1])

prod = pd.DataFrame(prod_S.todense(), index=names[0], columns=names[1])
prod

Unnamed: 0,sales+category1+department1+class7,sales+category1+department1+class14,sales+category1+department1+class15,sales+category1+department2+class1,sales+category1+department2+class2,sales+category1+department2+class3,sales+category1+department2+class4,sales+category1+department2+class5,sales+category1+department2+class6,sales+category1+department3+class8,...,sales+category2+department12+class39,sales+category2+department12+class40,sales+category2+department12+class41,sales+category2+department13+class42,sales+category2+department13+class43,sales+category2+department14+class44,sales+category2+department8+class22,sales+category2+department8+class24,sales+category2+department8+class23,sales+category2+department9+class25
sales,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
category1,True,True,True,True,True,True,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
department1,True,True,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
class7,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
class14,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
class22,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
class24,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
class23,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
department9,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [41]:
combined_S, grouped_names = sum_group(df, product_order, "sales")

S = post_process(combined_S, grouped_names, "sales")
S


Unique rows in grouping: 61


Unnamed: 0,sales+category1+department1+class7,sales+category1+department1+class14,sales+category1+department1+class15,sales+category1+department2+class1,sales+category1+department2+class2,sales+category1+department2+class3,sales+category1+department2+class4,sales+category1+department2+class5,sales+category1+department2+class6,sales+category1+department3+class8,...,sales+category2+department12+class39,sales+category2+department12+class40,sales+category2+department12+class41,sales+category2+department13+class42,sales+category2+department13+class43,sales+category2+department14+class44,sales+category2+department8+class22,sales+category2+department8+class24,sales+category2+department8+class23,sales+category2+department9+class25
sales,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
category1,True,True,True,True,True,True,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
department1,True,True,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
class7,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
class14,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
class22,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
class24,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
class23,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
department9,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [42]:
S.equals(prod)

True

## Grouped hierarchy: combined *space* and *product*



**np** matrix:
- *space*->*product*: _shape_ = (3077, 2217), _nbytes_ = n/a
- *product*->*space* _shape_ = (4145, 2217), _nbytes_ = n/a

**Sparse** matrix:
- *space*->*product*:
- - _shape_ = (3077, 2217), _nbytes_ = 124152  ~124MB (`int`) 
- - _nbytes_ = 15519 ~15MB (`bool`)
- *product*->*space* 
- - _shape_ = (4145, 2217), _nbytes_ = 124152 (`int`)
- - _nbytes_ = 15519 ~15MB (`bool`)




In [11]:
combo1 = space_order + product_order
combo2 = product_order + space_order

combined_S, grouped_names = sum_group(df, combo1, "sales")
log('is_sparse:', sp.issparse(combined_S))
    
# Regular matrix (np.array)
# grouped_S.shape, grouped_S.nbytes 
# space->product: shape = (3077, 2217), nbytes = n/a
# product->space shape: (4145, 2217), nbytes = same as before

#Sparse matrix:
print('shape', combined_S.shape, 'bytes', combined_S.data.nbytes) 
# space->product: shape = (3077, 2217), nbytes = 124152 ~124MB
#                              using bool dtype  15519 ~15MB
# product->space shape: (4145, 2217), nbytes = 124152
#                              using bool dtype  15519 ~15MB




shape (3077, 2217) bytes 15519


In [32]:
# Further process to obtain cross-terms
def cross_terms(combined_df, space_order, product_order, root="sales"):
    """
    Inputs:
        combined_df: dataframe, 
        space_order: 
        product_order:
        root: str, label for the top level of aggregation

    Outputs:
        S: dataframe, aggregation matrix containing entries for all terms (including cross terms)
    """
    S = combined_df.copy()

    ind = S.index.tolist()

    all_spaces = [i for i in ind for t in space_order if i.startswith(t)]
    all_products = [i for i in ind for t in product_order if i.startswith(t)]

    # Separate in two hierarchical groups, 'sales' row is excluded
    S.loc[all_spaces]
    S.loc[all_products]

    # cartesian product
    S['key'] = 0
    S = pd.merge(S.loc[all_spaces].reset_index(), S.loc[all_products].reset_index(), 
                 suffixes=(':x',':y'), 
                 on='key',sort=False)

    def remove_suffix(x):
        return x.split(":")[0]

    S = S.rename(columns = remove_suffix).drop(columns='key')

    # There exist two 'index' columns, one from each hierarchy group, combine using "+"

    S['index'] = S['index'].agg('+'.join, axis=1) # aggregation will apply to both columns, duplicate not removed
    final_index = pd.Index(S['index'].iloc[:,0], name="node") # new index for final matrix (0: duplicate removed)
    S = S.drop(columns='index')

    S = S.groupby(level=0, axis=1).agg('any').set_index(final_index) # groupby all remaining columns(level 0, axis=1) and aggregate any ones
    S.loc[root] = 1

    return S



In [45]:
combined_S, grouped_names = sum_group(df, combo1, "sales")

combined_df = post_process(combined_S, grouped_names, "sales")

agg_combo = cross_terms(combined_df, space_order, product_order, "sales")
agg_combo


Unique rows in grouping: 165


Unnamed: 0_level_0,sales+province1+city11+store47+category1+department1+class14,sales+province1+city11+store47+category1+department1+class15,sales+province1+city11+store47+category1+department1+class7,sales+province1+city11+store47+category1+department2+class1,sales+province1+city11+store47+category1+department2+class2,sales+province1+city11+store47+category1+department2+class3,sales+province1+city11+store47+category1+department2+class4,sales+province1+city11+store47+category1+department2+class5,sales+province1+city11+store47+category1+department2+class6,sales+province1+city11+store47+category1+department3+class10,...,sales+province3+city49+store46+category2+department12+class36,sales+province3+city49+store46+category2+department12+class37,sales+province3+city49+store46+category2+department12+class38,sales+province3+city49+store46+category2+department12+class39,sales+province3+city49+store46+category2+department12+class40,sales+province3+city49+store46+category2+department12+class41,sales+province3+city49+store46+category2+department13+class42,sales+province3+city49+store46+category2+department8+class22,sales+province3+city49+store46+category2+department8+class24,sales+province3+city49+store46+category2+department9+class25
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
province1+category1,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
province1+department1,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
province1+class7,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
province1+class14,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
province1+class15,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
store46+class23,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
store46+class43,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
store46+department14,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
store46+class44,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1


### Save variables

In [14]:
import os.path

if not os.path.isfile('../data/agg_space.pickle'):
    space.to_pickle('../data/agg_space.pickle')

if not os.path.isfile('../data/agg_prod.pickle'):
    prod.to_pickle('../data/agg_prod.pickle')
    
if not os.path.isfile('../data/agg_combo.pickle'):
    agg_combo.to_pickle('../data/agg_combo.pickle')


In [15]:
%%save_read_vars # hack. Will stop execution of this cell, comment if execution desired

import pickle

# Save Variable
df.to_pickle('../data/my_df.pickle')

with open('../data/agg_space.pickle', 'wb') as f:
    pickle.dump(some_obj, f)
    
with open('../data/agg_product.pickle', 'wb') as f:
    pickle.dump(some_obj, f)

# Read Variable
with open('../data/agg_space.pickle', 'rb') as f:
    some_obj = pickle.load(f)
    
with open('../data/agg_product.pickle', 'rb') as f:
    some_obj = pickle.load(f)
    
pd.read_pickle('../data/my_df.pickle')


UsageError: Cell magic `%%save_read_vars` not found.


# <font color="red">*Deprecated...*</font>

## Iterative implementation for space heirarchy

In [None]:
# deprecated in favor of recursive function

%%deprecated # hack. Will stop execution of this cell, comment if execution desired

from scipy.linalg import block_diag

grouped = df.groupby(['province'])
# grouped = grouped.reset_index()
matrix = []
for name, group in df.groupby(['province']):
#     print(name)
    submatrix = []
    for subname, subgroup in group.groupby('city'):
        print(name, subname, subgroup['store'].unique())

        store = subgroup['store'].unique()
        
        ident = np.eye(len(store), dtype=int)
        ones = np.ones(len(store), dtype=int)
        S = np.row_stack((ones,ident))
        
        submatrix.append(S)
        
    Z = block_diag(*submatrix)
    cols = Z.shape[1]
    print(cols)
   
    ones = np.ones(cols, dtype=int)
    Z = np.row_stack((ones,Z))
    
    matrix.append(Z)

S = block_diag(*matrix)
cols = S.shape[1]
print(cols)
ones = np.ones(cols, dtype=int)
S = np.row_stack((ones,S))

S.shape

