In [3]:
import pandas as pd
import numpy as np

# Hierarchical Indexing

In [4]:
#Older versions of Pandas provided Panel and Panel4D objects that natively handled
#three-dimensional and four-dimensional data (see “Panel Data” on page 141), but these
#have since been removed from the library. A far more common pattern in practice is to
#make use of hierarchical indexing (also known as multi-indexing) to incorporate multiple
#index levels within a single index. In this way, higher-dimensional data can be compactly
#represented within the familiar one-dimensional Series and two-dimensional DataFrame objects.

# A Multiply Indexed Series

In [5]:
#Let’s start by considering how we might represent two-dimensional data within a
#one-dimensional Series . For concreteness, we will consider a series of data where
#each point has a character and numerical key.

# The bad way

In [6]:
#Suppose you would like to track data about states from two different years. Using the
#Pandas tools we’ve already covered, you might be tempted to simply use Python
#tuples as keys

In [7]:
# Naive approach: use plain (state, year) tuples as the labels of a flat index.
index = [
    (state, year)
    for state in ('California', 'New York', 'Texas')
    for year in (2000, 2010)
]
populations = [
    33871648, 37253956,  # California: 2000, 2010
    18976457, 19378102,  # New York:   2000, 2010
    20851820, 25145561,  # Texas:      2000, 2010
]
pop = pd.Series(data=populations, index=index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [9]:
#With this indexing scheme, you can straightforwardly index or slice the series based
#on this multiple index

In [10]:
pop[('California', 2010):('Texas', 2000)]

(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
dtype: int64

In [11]:
#But the convenience ends there. For example, if you need to select all values from
#2010, you’ll need to do some messy (and potentially slow) munging to make it
#happen

In [12]:
# Keep only the tuple keys whose second element (the year) is 2010, then select those labels.
pop[list(filter(lambda key: key[1] == 2010, pop.index))]

(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64

# The better way: Pandas MultiIndex

In [13]:
#Fortunately, Pandas provides a better way. Our tuple-based indexing is essentially a
#rudimentary multi-index, and the Pandas MultiIndex type gives us the type of
#operations we wish to have. We can create a multi-index from the tuples as follows:

In [14]:
index

[('California', 2000),
 ('California', 2010),
 ('New York', 2000),
 ('New York', 2010),
 ('Texas', 2000),
 ('Texas', 2010)]

In [15]:
# Rebind `index`: the plain list of (state, year) tuples is replaced by a MultiIndex built from it.
index = pd.MultiIndex.from_tuples(tuples=index)

In [16]:
index

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

In [17]:
#If we reindex our series with this MultiIndex , we see the hierarchical representation
#of the data:

In [18]:
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [27]:
# Swap the flat tuple labels for the MultiIndex; the values are matched up by label.
pop = pop.reindex(index=index)
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [24]:
#Here the first two columns of the Series representation show the multiple index
#values, while the third column shows the data. Notice that some entries are missing in
#the first column: in this multi-index representation, any blank entry indicates the
#same value as the line above it.

In [25]:
#Now to access all data for which the second index is 2010, we can simply use the
#Pandas slicing notation:

In [26]:
pop[:,2010]

California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [28]:
#The result is a singly indexed array with just the keys we’re interested in. This syntax
#is much more convenient (and the operation is much more efficient!) than the
#homespun tuple-based multi-indexing solution that we started with.

# MultiIndex as extra dimension

In [29]:
#You might notice something else here: we could easily have stored the same data
#using a simple DataFrame with index and column labels. In fact, Pandas is built with
#this equivalence in mind. The unstack() method will quickly convert a multiply-
#indexed Series into a conventionally indexed DataFrame

In [30]:
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [31]:
pop_df=pop.unstack()

In [33]:
pop_df #dataframe

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [34]:
#Naturally, the stack() method provides the opposite operation:
pop_df.stack()

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

# Methods of MultiIndex Creation

In [35]:
#The most straightforward way to construct a multiply indexed Series or DataFrame
#is to simply pass a list of two or more index arrays to the constructor. For example:

In [39]:
# Passing a list of parallel label arrays as `index` lets pandas build the MultiIndex itself.
df = pd.DataFrame(
    data=np.random.rand(4, 2),
    index=[list('aabb'), [1, 2, 1, 3]],
    columns=list('AB'),
)

In [40]:
df

Unnamed: 0,Unnamed: 1,A,B
a,1,0.61726,0.511034
a,2,0.771062,0.464462
b,1,0.019111,0.182737
b,3,0.076758,0.644105


In [41]:
#The work of creating the MultiIndex is done in the background.
#Similarly, if you pass a dictionary with appropriate tuples as keys, Pandas will
#automatically recognize this and use a MultiIndex by default:

In [42]:
# Keys are (state, year) tuples; pd.Series turns such keys into a two-level MultiIndex.
data = dict([
    (('California', 2000), 33871648),
    (('California', 2010), 37253956),
    (('Texas', 2000), 20851820),
    (('Texas', 2010), 25145561),
    (('New York', 2000), 18976457),
    (('New York', 2010), 19378102),
])
pd.Series(data)

California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
New York    2000    18976457
            2010    19378102
dtype: int64

In [43]:
#Nevertheless, it is sometimes useful to explicitly create a MultiIndex; we’ll see a
#couple of these methods here.

# Explicit MultiIndex constructors

In [44]:
index=pd.MultiIndex.from_arrays([['a','a','b','c'],[1,2,1,3]])

In [45]:
index

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('c', 3)],
           )

In [47]:
pd.MultiIndex.from_tuples([(1,'a'),(1,'b'),(2,'b'),(3,'c')])

MultiIndex([(1, 'a'),
            (1, 'b'),
            (2, 'b'),
            (3, 'c')],
           )

In [48]:
#You can even construct it from a Cartesian product of single indices

In [49]:
pd.MultiIndex.from_product([['a','b'],['1','2']])

MultiIndex([('a', '1'),
            ('a', '2'),
            ('b', '1'),
            ('b', '2')],
           )

# MultiIndex level names

In [53]:
#Sometimes it is convenient to name the levels of the MultiIndex . You can accomplish
#this by passing the names argument to any of the above MultiIndex constructors, or
#by setting the names attribute of the index after the fact

In [54]:
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [55]:
pop.index

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

In [56]:
# Label the two index levels by assigning to `names` on pop's existing index object;
# `pop` itself is not rebound, and later displays of it show the State/Year header.
pop.index.names=['State','Year']

In [57]:
pop

State       Year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

# MultiIndex for columns

In [58]:
#In a DataFrame, the rows and columns are completely symmetric, and just as the rows
#can have multiple levels of indices, the columns can have multiple levels as well.
#Consider the following, which is a mock-up of some (somewhat realistic) medical data:

In [59]:
# Row labels: every (year, visit) combination.
index = pd.MultiIndex.from_product(
    iterables=[[2013, 2014], [1, 2]],
    names=['year', 'visit'],
)

# Column labels: every (subject, measurement type) combination.
columns = pd.MultiIndex.from_product(
    iterables=[['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
    names=['subject', 'type'],
)

In [60]:
index

MultiIndex([(2013, 1),
            (2013, 2),
            (2014, 1),
            (2014, 2)],
           names=['year', 'visit'])

In [61]:
columns

MultiIndex([(  'Bob',   'HR'),
            (  'Bob', 'Temp'),
            ('Guido',   'HR'),
            ('Guido', 'Temp'),
            (  'Sue',   'HR'),
            (  'Sue', 'Temp')],
           names=['subject', 'type'])

In [66]:
# Simulated measurements: a 4x6 block of standard-normal draws, rounded to one decimal place.
data = np.random.randn(4, 6).round(decimals=1)
data

array([[ 1.4,  0.1,  0.6,  0.8,  0.5,  0.5],
       [ 1. ,  0.7,  0.1,  0.7, -0.7, -0.8],
       [-0.4,  1.2, -0.3, -0.6, -1.1,  0.1],
       [-1.2,  0.4,  0. ,  0.6,  0.9,  0.1]])

In [67]:
# Multiply every second column (positions 0, 2, 4) by 10, in place.
# Running this cell again scales those columns again.
data[:,::2]*=10

In [68]:
data

array([[ 14. ,   0.1,   6. ,   0.8,   5. ,   0.5],
       [ 10. ,   0.7,   1. ,   0.7,  -7. ,  -0.8],
       [ -4. ,   1.2,  -3. ,  -0.6, -11. ,   0.1],
       [-12. ,   0.4,   0. ,   0.6,   9. ,   0.1]])

In [69]:
# Add 37 to every entry, in place. Running this cell again shifts the values again.
data+=37

In [71]:
data

array([[51. , 37.1, 43. , 37.8, 42. , 37.5],
       [47. , 37.7, 38. , 37.7, 30. , 36.2],
       [33. , 38.2, 34. , 36.4, 26. , 37.1],
       [25. , 37.4, 37. , 37.6, 46. , 37.1]])

In [72]:
#create dataframe

In [73]:
# Combine the mock values with the hierarchical row labels and hierarchical column labels.
health_data = pd.DataFrame(
    data=data,
    index=index,
    columns=columns,
)

In [74]:
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,51.0,37.1,43.0,37.8,42.0,37.5
2013,2,47.0,37.7,38.0,37.7,30.0,36.2
2014,1,33.0,38.2,34.0,36.4,26.0,37.1
2014,2,25.0,37.4,37.0,37.6,46.0,37.1


In [75]:
#Here we see where the multi-indexing for both rows and columns can come in very
#handy. This is fundamentally four-dimensional data, where the dimensions are the
#subject, the measurement type, the year, and the visit number. With this in place we
#can, for example, index the top-level column by the person’s name and get a full
#DataFrame containing just that person’s information:

In [76]:
health_data['Guido']

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,43.0,37.8
2013,2,38.0,37.7
2014,1,34.0,36.4
2014,2,37.0,37.6


In [77]:
#For complicated records containing multiple labeled measurements across multiple
#times for many subjects (people, countries, cities, etc.), use of hierarchical rows and
#columns can be extremely convenient!

# Indexing and Slicing a MultiIndex

# Multiply indexed Series

In [78]:
pop

State       Year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [93]:
pop['California']

Year
2000    33871648
2010    37253956
dtype: int64

In [94]:
pop['California',2010]

37253956

In [95]:
pop['California':'Texas'] #explicit slicing

State       Year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [96]:
pop[:,2000]

State
California    33871648
New York      18976457
Texas         20851820
dtype: int64

In [97]:
pop[pop>22000000] #using boolean masks

State       Year
California  2000    33871648
            2010    37253956
Texas       2010    25145561
dtype: int64

In [98]:
pop[['California','Texas','New York']] #fancy indexing

State       Year
California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
New York    2000    18976457
            2010    19378102
dtype: int64

# Multiply indexed DataFrames

In [99]:
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,51.0,37.1,43.0,37.8,42.0,37.5
2013,2,47.0,37.7,38.0,37.7,30.0,36.2
2014,1,33.0,38.2,34.0,36.4,26.0,37.1
2014,2,25.0,37.4,37.0,37.6,46.0,37.1


In [101]:
#Remember that columns are primary in a DataFrame, and the syntax used for multiply
#indexed Series applies to the columns. For example, we can recover Guido’s heart
#rate data with a simple operation:

In [102]:
health_data['Guido','HR']

year  visit
2013  1        43.0
      2        38.0
2014  1        34.0
      2        37.0
Name: (Guido, HR), dtype: float64

In [103]:
#Also, as with the single-index case, we can use the loc and iloc indexers

In [104]:
health_data.iloc[:2,:2]

Unnamed: 0_level_0,subject,Bob,Bob
Unnamed: 0_level_1,type,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,51.0,37.1
2013,2,47.0,37.7


In [105]:
health_data.loc[:,('Bob','HR')]

year  visit
2013  1        51.0
      2        47.0
2014  1        33.0
      2        25.0
Name: (Bob, HR), dtype: float64

In [106]:
#Working with slices within these index tuples is not especially convenient; trying to
#create a slice within a tuple will lead to a syntax error

In [107]:
#You could get around this by building the desired slice explicitly using Python’s built-
#in slice() function, but a better way in this context is to use an IndexSlice object,
#which Pandas provides for precisely this situation. For example:

In [108]:
idx=pd.IndexSlice

In [109]:
health_data.loc[idx[:,1],idx[:,'HR']]

Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,51.0,43.0,42.0
2014,1,33.0,34.0,26.0


# Rearranging Multi-Indices

# Sorted and unsorted indices

In [110]:
#Many of the MultiIndex slicing operations will fail if the index is not sorted.
#Let’s take a look at this here.
#We’ll start by creating some simple multiply indexed data where the indices are not
#lexicographically sorted:

In [111]:
# The outer labels are given as a, c, b on purpose, so the resulting index is not sorted.
index = pd.MultiIndex.from_product(
    iterables=[list('acb'), [1, 2]],
)
data = pd.Series(data=np.random.rand(6), index=index)
data.index.names = ['char', 'int']
data

char  int
a     1      0.425649
      2      0.808210
c     1      0.486956
      2      0.528178
b     1      0.703190
      2      0.906993
dtype: float64

In [113]:
#If we try to take a partial slice of this index, it will result in an error

In [115]:
# Attempt a partial slice on the unsorted index and show the resulting error
# (its type, then its message) instead of letting the traceback stop the notebook.
try:
    data['a':'b']
except KeyError as err:
    print(type(err), err, sep='\n')

<class 'pandas.errors.UnsortedIndexError'>
'Key length (1) was greater than MultiIndex lexsort depth (0)'


In [116]:
#Although it is not entirely clear from the error message, this is the result of the
#MultiIndex not being sorted. For various reasons, partial slices and other similar
#operations require the levels in the MultiIndex to be in sorted (i.e., lexicographical) order.
#Pandas provides convenience routines to perform this type of sorting, such as the
#sort_index() method of the DataFrame and Series (the older sortlevel() method has been
#removed from Pandas). We’ll use sort_index() here:

In [117]:
data=data.sort_index()

In [118]:
data

char  int
a     1      0.425649
      2      0.808210
b     1      0.703190
      2      0.906993
c     1      0.486956
      2      0.528178
dtype: float64

In [119]:
data['a':'b']

char  int
a     1      0.425649
      2      0.808210
b     1      0.703190
      2      0.906993
dtype: float64

# Stacking and unstacking indices

In [120]:
#It is possible to convert a dataset from a stacked multi-index
#to a simple two-dimensional representation, optionally specifying the level to use:

In [121]:
pop

State       Year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [122]:
pop.unstack(level=0)

State,California,New York,Texas
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,33871648,18976457,20851820
2010,37253956,19378102,25145561


In [123]:
pop.unstack(level=1)

Year,2000,2010
State,Unnamed: 1_level_1,Unnamed: 2_level_1
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [124]:
#The opposite of unstack() is stack() , which here can be used to recover the original series:

In [126]:
pop.unstack().stack()

State       Year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

# Index setting and resetting

In [127]:
#Another way to rearrange hierarchical data is to turn the index labels into columns;
#this can be accomplished with the reset_index method. Calling this on the
#population Series will result in a DataFrame with a State and Year column holding the
#information that was formerly in the index. For clarity, we can optionally specify the
#name of the data for the column representation:

In [128]:
pop_flat=pop.reset_index(name='Population')

In [129]:
pop_flat

Unnamed: 0,State,Year,Population
0,California,2000,33871648
1,California,2010,37253956
2,New York,2000,18976457
3,New York,2010,19378102
4,Texas,2000,20851820
5,Texas,2010,25145561


In [130]:
#Often when you are working with data in the real world, the raw input data looks like
#this and it’s useful to build a MultiIndex from the column values. This can be done
#with the set_index method of the DataFrame, which returns a multiply indexed
#DataFrame:

In [131]:
pop_flat.set_index(['State','Year'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Population
State,Year,Unnamed: 2_level_1
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


# Data Aggregations on Multi-Indices

In [132]:
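#We’ve previously seen that Pandas has built-in data aggregation methods, such as
#mean(), sum(), and max(). For hierarchically indexed data, these can be computed on a
#subset of the data defined by an index level: older Pandas versions accepted a level
#parameter on the aggregation itself, while current versions group on the level first
#with groupby(level=...) and then aggregate.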

In [133]:
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,51.0,37.1,43.0,37.8,42.0,37.5
2013,2,47.0,37.7,38.0,37.7,30.0,36.2
2014,1,33.0,38.2,34.0,36.4,26.0,37.1
2014,2,25.0,37.4,37.0,37.6,46.0,37.1


In [134]:
#average out the measurements in the two visits each year

In [136]:
# Mean of the two visits within each year; rows (axis=0) are grouped by the "year" index level.
# The `level=` keyword of DataFrame.mean() was deprecated in pandas 1.3 and removed in 2.0,
# so group on the index level explicitly instead.
health_data.groupby(level="year").mean()

subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,49.0,37.4,40.5,37.75,36.0,36.85
2014,29.0,37.8,35.5,37.0,36.0,37.1


In [137]:
#By further making use of the axis keyword, we can take the mean among levels on
#the columns as well

In [138]:
# Mean across subjects for each measurement type (the "type" column level).
# DataFrame.mean(axis=1, level=...) was removed in pandas 2.0 and groupby(axis=1) is deprecated,
# so transpose, group the rows by "type", average, and transpose back.
health_data.T.groupby(level="type").mean().T

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,45.333333,37.466667
2013,2,38.333333,37.2
2014,1,31.0,37.233333
2014,2,36.0,37.366667


# Thank You