## Hierarchical indexing

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Population of three states in two census years.
# Every observation is labelled with a plain (state, year) tuple.

index = [(state, year)
         for state in ('California', 'New York', 'Texas')
         for year in (2000, 2010)]
populations = [
    33871648, 37253956,   # California: 2000, 2010
    18976457, 19378102,   # New York:   2000, 2010
    20851820, 25145561,   # Texas:      2000, 2010
]
pop = pd.Series(data=populations, index=index)


In [3]:
# Promote the list of (state, year) tuples to a two-level MultiIndex
index = pd.MultiIndex.from_tuples(index)

# Re-align the series onto that MultiIndex so each level can be addressed on its own
pop = pop.reindex(index=index)

# Full slice on the first level (state), scalar on the second (year):
# this picks out every state's figure for the year 2000
pop[:, 2000]

California    33871648
New York      18976457
Texas         20851820
dtype: int64

In [4]:
# unstack() turns a multi-indexed series into a conventional DataFrame
pop_unstacked = pop.unstack()
pop_unstacked

# stack() goes the other way: the unstacked DataFrame becomes
# a multi-indexed series again
pop_stacked = pop_unstacked.stack()

# Build a two-column frame on the same MultiIndex: the totals plus an under-18 count
pop_stacked = pd.DataFrame(
    {
        'total': pop_stacked,
        'under_18': [
            9267089, 9284094,
            4687374, 4318033,
            5906301, 6879014,
        ],
    }
)
# 'divide' holds the fraction of each total that is under 18
pop_stacked = pop_stacked.assign(
    divide=pop_stacked['under_18'].div(pop_stacked['total'])
)
pop_stacked

# When the keys of a dict are tuples, pandas builds a MultiIndex from them automatically
data = {
    (state, year): count
    for state, year, count in [
        ('California', 2000, 33871648),
        ('California', 2010, 37253956),
        ('Texas', 2000, 20851820),
        ('Texas', 2010, 25145561),
        ('New York', 2000, 18976457),
        ('New York', 2010, 19378102),
    ]
}
data_series = pd.Series(data)

In [5]:
# A further hierarchical-indexing example:
# a DataFrame whose rows AND columns both carry a two-level index.
# The level names are supplied at construction time.

examp = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=pd.MultiIndex.from_arrays(
        [list('aaab'), [1, 3, 1, 3]],
        names=['key1', 'key2'],
    ),
    columns=pd.MultiIndex.from_arrays(
        [['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
        names=['State', 'Color'],
    ),
)


In [6]:
# Explicit MultiIndex construction

# from_arrays: one array per level, matched up position by position
pd.MultiIndex.from_arrays([list('aab'), [1, 2, 3]])

# from_tuples: one tuple per entry, holding that entry's label on every level
pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 2), ('b', 1)])

# from_product: the Cartesian product of the labels given for each level
pd.MultiIndex.from_product([list('abc'), [1, 2]])

# Hierarchical row labels (year, visit) and column labels (subject, type)
index = pd.MultiIndex.from_product(
    [[2013, 2014], [1, 2]],
    names=['year', 'visit'],
)
column = pd.MultiIndex.from_product(
    [['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
    names=['subject', 'type'],
)

# Mock readings drawn from a seeded generator so they are reproducible
rng = np.random.RandomState(0)
data = rng.randn(4, 6).round(1)
data[:, ::2] *= 10   # even-numbered columns are the 'HR' ones: widen their spread
data += 37           # shift every reading up by 37

# Assemble the frame with both hierarchical axes
health_data = pd.DataFrame(data=data, index=index, columns=column)
health_data

# Practice on imaginary data using multi-level indices.
# Rows are (Country, Data) pairs; columns are (Job_Titles, Gender) pairs.
index = pd.MultiIndex.from_product([['Nigeria', 'USA', 'South Africa', 'Italy'],
                                    ['Avg_Age', "Total_num(x 1000)", 'Avg_Salary($)']],
                                   names=['Country', 'Data'])
column = pd.MultiIndex.from_product([['Lawyer', 'Developer', 'Doctor', 'Accountants'],
                                     ['Male', 'Female']],
                                    names=['Job_Titles', 'Gender'])

# `rng` is the seeded generator created earlier in this cell
data = rng.randint(20, 70, size=(12, 8)) * 1.2
data[2::3, :] *= 1000  # every third row, starting at row 2, is an 'Avg_Salary($)' row
occu_data = pd.DataFrame(data, index=index, columns=column)

# Named `idx` rather than `id`, so the builtin id() is not shadowed
# for the rest of the kernel session.
idx = pd.IndexSlice

# Adjust only the 'Total_num(x 1000)' rows, separately for each gender
occu_data.loc[idx[:, 'Total_num(x 1000)'], idx[:, 'Female']] -= 20.5
occu_data.loc[idx[:, 'Total_num(x 1000)'], idx[:, 'Male']] += 15
occu_data


Unnamed: 0_level_0,Job_Titles,Lawyer,Lawyer,Developer,Developer,Doctor,Doctor,Accountants,Accountants
Unnamed: 0_level_1,Gender,Male,Female,Male,Female,Male,Female,Male,Female
Country,Data,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Nigeria,Avg_Age,57.6,64.8,24.0,24.0,67.2,30.0,69.6,72.0
Nigeria,Total_num(x 1000),59.4,21.5,43.8,52.7,89.4,40.7,40.2,4.7
Nigeria,Avg_Salary($),70800.0,73200.0,66000.0,69600.0,37200.0,79200.0,45600.0,56400.0
USA,Avg_Age,24.0,40.8,66.0,38.4,74.4,48.0,37.2,28.8
USA,Total_num(x 1000),46.2,8.3,95.4,7.1,53.4,46.7,87.0,20.3
USA,Avg_Salary($),42000.0,48000.0,66000.0,51600.0,42000.0,39600.0,49200.0,81600.0
South Africa,Avg_Age,82.8,30.0,73.2,66.0,24.0,61.2,30.0,60.0
South Africa,Total_num(x 1000),39.0,62.3,82.2,44.3,96.6,38.3,42.6,44.3
South Africa,Avg_Salary($),74400.0,39600.0,81600.0,70800.0,49200.0,34800.0,24000.0,36000.0
Italy,Avg_Age,75.6,51.6,26.4,64.8,66.0,60.0,27.6,45.6


In [16]:
# Rearranging multi-indices: sorted and unsorted indices

# The outer level is deliberately listed as 'a', 'c', 'b', so the resulting
# MultiIndex is NOT lexicographically sorted.
index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]])

# A seeded generator keeps the values identical across kernel restarts
# (the global, unseeded np.random.rand gave different numbers on every run).
data = pd.Series(np.random.RandomState(0).rand(6), index=index)

# Partial slicing such as data['a':'b'] yields an error while the index is
# unsorted; sorting the index first makes the slice valid.
data = data.sort_index()
data['a':'b']

a  1    0.899673
   2    0.936794
b  1    0.438620
   2    0.094177
dtype: float64

In [27]:
# Stacking and unstacking index levels

# Pivot the second (year) level of the row index out into columns
new = pop.unstack(1)

# stack() reverses that pivot and gives back the multi-indexed series
new.stack()

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [31]:
# The levels of a MultiIndex can be exchanged.
# Applied here to the `examp` frame built earlier in the notebook.

examp.swaplevel(i='key2', j='key1')


In [34]:
# Data aggregation on multi-indices

# Still working with health_data.
# DataFrame.mean(level=...) was deprecated in pandas 1.3 and removed in 2.0,
# so the aggregation is expressed as an explicit groupby on the index level.

# Average the visits within each year
mean_data = health_data.groupby(level='year').mean()

# Average over subjects for each measurement type. groupby(axis=1) is itself
# deprecated, so transpose, group the (former) column level, and transpose back.
mean_data.T.groupby(level='type').mean().T

type,HR,Temp
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,46.5,37.55
2014,39.166667,36.883333
