In [1]:
import pandas as pd
import numpy as np

## a bad way multiply indexed series

In [3]:
index = [('California', 2000), ('California', 2010),
          ('New York', 2000), ('New York', 2010),
          ('Texas', 2000), ('Texas', 2010)]
populations = [33871648, 37253956,
               18976457, 19378102,
               20851820, 25145561]       
pop = pd.Series(populations, index=index)
pop


(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [5]:
pop[('California',2010)]

37253956

In [6]:
#only find the values on : 2010
pop[[i for i in pop.index if i[1]==2010]]

(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64

## the better way : Pandas MultiIndex

In [10]:
index = [('California', 2000), ('California', 2010),
          ('New York', 2000), ('New York', 2010),
          ('Texas', 2000), ('Texas', 2010)]

In [8]:
index=pd.MultiIndex.from_tuples(index)
index

MultiIndex(levels=[['California', 'New York', 'Texas'], [2000, 2010]],
           codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])

In [12]:
pop=pop.reindex(index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

## Methods of multi index creation

In [21]:
df=pd.DataFrame(np.random.rand(4,2),
               index=[['a','a','b','b'],[1,2,1,2]],
               columns=['data1','data2'])
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.71053,0.532026
a,2,0.555156,0.402782
b,1,0.740561,0.058639
b,2,0.865454,0.351866


## Explicit multi index constructor

In [23]:
pd.MultiIndex.from_arrays([['a','a','b','b'],[1,2,1,2]])


MultiIndex(levels=[['a', 'b'], [1, 2]],
           codes=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [24]:
pd.MultiIndex.from_product([['a','b'],[1,2] ])

MultiIndex(levels=[['a', 'b'], [1, 2]],
           codes=[[0, 0, 1, 1], [0, 1, 0, 1]])

## multi index for columns

In [29]:
index=pd.MultiIndex.from_product([ [2013,2014],[1,2]],
                                names=['year','visit'] )
index

MultiIndex(levels=[[2013, 2014], [1, 2]],
           codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
           names=['year', 'visit'])

In [38]:
# 2013 x 1 , 2013 x 2 ,2014 x 1 ,2014 x 2
#index position for 2013 :[0,0] ,2014 :[1,1] ->[0,0,1,1]
# index for 1,2 :[0,1,0,1] 

In [31]:
columns=pd.MultiIndex.from_product([['Bob','Guido','Sue'],
                                   ['HR','Temp'] ],
                                  names=['subject','type'])
columns

MultiIndex(levels=[['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
           codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]],
           names=['subject', 'type'])

In [41]:
data=np.round(np.random.randn(4,6),1)
data

array([[-0.1,  1.6, -0. ,  0.9, -1.6, -0.4],
       [ 1.5,  0.1,  0.5, -1.9,  0.3, -1. ],
       [-0.4, -1.4, -0.8,  0.1,  0.1,  0.4],
       [-0.9, -1.4,  2. ,  1.9, -1.2,  0.7]])

In [44]:
data[:,::2] *= 10
data

array([[-1.0e+01,  1.6e+00, -0.0e+00,  9.0e-01, -1.6e+02, -4.0e-01],
       [ 1.5e+02,  1.0e-01,  5.0e+01, -1.9e+00,  3.0e+01, -1.0e+00],
       [-4.0e+01, -1.4e+00, -8.0e+01,  1.0e-01,  1.0e+01,  4.0e-01],
       [-9.0e+01, -1.4e+00,  2.0e+02,  1.9e+00, -1.2e+02,  7.0e-01]])

In [46]:
data +=37
data

array([[ 64. ,  75.6,  74. ,  74.9, -86. ,  73.6],
       [224. ,  74.1, 124. ,  72.1, 104. ,  73. ],
       [ 34. ,  72.6,  -6. ,  74.1,  84. ,  74.4],
       [-16. ,  72.6, 274. ,  75.9, -46. ,  74.7]])

In [47]:
# Ctrate the data frame
health_data=pd.DataFrame(data,index=index,columns=columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,64.0,75.6,74.0,74.9,-86.0,73.6
2013,2,224.0,74.1,124.0,72.1,104.0,73.0
2014,1,34.0,72.6,-6.0,74.1,84.0,74.4
2014,2,-16.0,72.6,274.0,75.9,-46.0,74.7


In [48]:
health_data['Guido']

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,74.0,74.9
2013,2,124.0,72.1
2014,1,-6.0,74.1
2014,2,274.0,75.9


In [49]:
health_data['Guido','HR']

year  visit
2013  1         74.0
      2        124.0
2014  1         -6.0
      2        274.0
Name: (Guido, HR), dtype: float64

# Data Aggregations on Multi indices

In [51]:
data_mean=health_data.mean(level='year')
data_mean

subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,144.0,74.85,99.0,73.5,9.0,73.3
2014,9.0,72.6,134.0,75.0,19.0,74.55


In [53]:
data_mean.mean(level='type',axis=1)

type,HR,Temp
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,84.0,73.883333
2014,54.0,74.05
