In [1]:
import pandas as pd

# Population figures keyed by three-element tuples (state, year, running
# number). Passing a plain list of tuples as the index gives the Series a
# flat index whose labels are the tuples themselves.
records = {
    ('California', 2000, 1): 33871648,
    ('California', 2010, 2): 37253956,
    ('New York', 2000, 3): 18976457,
    ('New York', 2010, 4): 19378102,
    ('Texas', 2000, 5): 20851820,
    ('Texas', 2010, 6): 25145561,
}
index = list(records)
populations = list(records.values())
pop = pd.Series(populations, index=index)
pop

(California, 2000, 1)    33871648
(California, 2010, 2)    37253956
(New York, 2000, 3)      18976457
(New York, 2010, 4)      19378102
(Texas, 2000, 5)         20851820
(Texas, 2010, 6)         25145561
dtype: int64

In [11]:
# Built from a plain list of tuples, the index is a flat object-dtype Index
# whose labels are tuples.
pop.index

Index([('California', 2000, 1), ('California', 2010, 2),
         ('New York', 2000, 3),   ('New York', 2010, 4),
            ('Texas', 2000, 5),      ('Texas', 2010, 6)],
      dtype='object')

In [12]:
# Same population figures, now keyed by (state, year) tuples only.
index = [
    ('California', 2000), ('California', 2010),
    ('New York', 2000), ('New York', 2010),
    ('Texas', 2000), ('Texas', 2010),
]
populations = [
    33871648, 37253956,
    18976457, 19378102,
    20851820, 25145561,
]
pop = pd.Series(data=populations, index=index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [13]:
# Look up a single value by its full (state, year) tuple label.
pop["California",2010]

37253956

In [14]:
# With tuple labels, selecting every 2010 entry needs a manual scan of the index.
labels_2010 = [label for label in pop.index if label[1] == 2010]
pop[labels_2010]

(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64

In [15]:
# Convert the list of (state, year) tuples into a hierarchical MultiIndex.
# Note: this rebinds `index` from a list to a MultiIndex.
index = pd.MultiIndex.from_tuples(index)

In [17]:
# Re-label the Series with the MultiIndex; the values stay the same, but the
# index is now displayed as two levels (state, year).
pop = pop.reindex(index)
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [18]:
# Inspect the MultiIndex: the distinct values of each level, plus the integer
# positions that map every entry onto those values.
pop.index

MultiIndex(levels=[['California', 'New York', 'Texas'], [2000, 2010]],
           labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])

In [20]:
# A (state, year) pair still selects a single value on the MultiIndex.
pop["California",2010]
# Each level added to the index is displayed as one more
# index column in the printed table.

37253956

In [124]:
# Partial indexing: giving only the first level (state) returns the Series
# over the remaining level (year).
pop["California"]

2000    33871648
2010    37253956
dtype: int64

In [125]:
# An empty slice on the first level plus a year selects that year for every state.
pop[:,2000]

California    33871648
New York      18976457
Texas         20851820
dtype: int64

In [59]:
# NOTE(review): this passes three indexers, and the recorded output shows a
# three-level index, so this cell appears to have been run against the
# three-level `pop` built in the first cell. After the reindex above, `pop`
# has only two levels -- confirm the intended data before re-running.
pop.loc[:,2010,2:6]
# pop.loc is clearer at this point

California  2010  2    37253956
New York    2010  4    19378102
Texas       2010  6    25145561
dtype: int64

In [126]:
# Select the 2010 value for every state.
pop[:,2010]
# TODO: ask how to select the last column

California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [22]:
# Pivot the inner index level (year) into columns, turning the Series into a
# DataFrame with one row per state.
pop_df=pop.unstack()
pop_df

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [23]:
# stack() reverses unstack(): the year columns move back into an inner index level.
pop_df.stack()

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [25]:
# Build a DataFrame with one column per measurement; the rows take their
# (state, year) labels from `pop`.
under18_counts = [9267089, 9284094, 4687374, 4318033, 5906301, 6879014]
pop_df = pd.DataFrame({'total': pop, 'under18': under18_counts})
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2000,33871648,9267089
California,2010,37253956,9284094
New York,2000,18976457,4687374
New York,2010,19378102,4318033
Texas,2000,20851820,5906301
Texas,2010,25145561,6879014


In [27]:
# Fraction of each (state, year) population that is under 18, displayed with
# the years pivoted into columns.
u18_percent = pop_df['under18'].div(pop_df['total'])
u18_percent.unstack()

Unnamed: 0,2000,2010
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273568


### Hierarchical DataFrames
### 1. Pass a list of lists as the index or columns when creating a DataFrame

In [28]:
import numpy as np
# A list of two parallel label lists as `index` produces a two-level row index.
outer_labels = ['a', 'a', 'b', 'b']
inner_labels = [1, 2, 1, 2]
df = pd.DataFrame(
    np.random.rand(4, 2),
    index=[outer_labels, inner_labels],
    columns=['data1', 'data2'],
)
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.288496,0.621091
a,2,0.843534,0.262985
b,1,0.384316,0.193174
b,2,0.628506,0.298087


### 2. Use tuples as the keys of a dictionary when creating a Series

In [39]:
# A dict keyed by tuples becomes a Series whose MultiIndex is built from the keys.
data = {
    ('California', 2000, 1): 33871648,
    ('California', 2010, 2): 37253956,
    ('Texas', 2000, 1): 20851820,
    ('Texas', 2010, 3): 25145561,
    ('New York', 2000, 4): 18976457,
    ('New York', 2010, 5): 19378102,
}
A = pd.Series(data)
A.index

MultiIndex(levels=[['California', 'New York', 'Texas'], [2000, 2010], [1, 2, 3, 4, 5]],
           labels=[[0, 0, 2, 2, 1, 1], [0, 1, 0, 1, 0, 1], [0, 1, 0, 2, 3, 4]])

In [41]:
# Display the Series: each position in the key tuples became one index level.
A

California  2000  1    33871648
            2010  2    37253956
Texas       2000  1    20851820
            2010  3    25145561
New York    2000  4    18976457
            2010  5    19378102
dtype: int64

### 3. Explicitly use a MultiIndex constructor

In [42]:
# Build a MultiIndex from parallel arrays, one array per level.
pd.MultiIndex.from_arrays([['a', 'a', 'b',
'b'], [1, 2, 1, 2]])

MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [43]:
# Build a MultiIndex from a list of tuples, one tuple per entry.
pd.MultiIndex.from_tuples([('a', 1), ('a', 2),
('b', 1), ('b', 2)])

MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [44]:
# Build a MultiIndex from the Cartesian product of the per-level values.
pd.MultiIndex.from_product([['a', 'b'], [1,
2]])

MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [111]:
# naming index levels
# NOTE(review): the recorded output shows a flat index of three-element
# tuples, which does not match the two-level MultiIndex `pop` built earlier.
# It looks like stale output from out-of-order execution; re-run to confirm.
pop

(California, 2000, 1)    33871648
(California, 2010, 2)    37253956
(New York, 2000, 3)      18976457
(New York, 2010, 4)      19378102
(Texas, 2000, 5)         20851820
(Texas, 2010, 6)         25145561
dtype: int64

In [1]:
# pop.index.names = ["state", "year"]  # assigns a name to each index level

In [105]:
# try it

In [49]:
# Row labels: every (year, visit) combination.
index = pd.MultiIndex.from_product(
    [[2013, 2014], [1, 2]],
    names=['year', 'visit'],
)
# Column labels: every (subject, type) combination.
columns = pd.MultiIndex.from_product(
    [['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
    names=['subject', 'type'],
)

In [50]:
# mock some data
# Draw a 4x6 array of standard-normal values, rounded to one decimal.
# NOTE(review): no random seed is set in the visible cells, so a re-run will
# not reproduce the recorded output.
data = np.round(np.random.randn(4, 6), 1)
# Scale every other column by 10 in place (the 'HR' columns, given the
# column order built above) ...
data[:, ::2] *= 10
# ... then shift every value by 37. The order of these two steps matters.
data += 37
# create the DataFrame
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,44.0,40.2,48.0,38.8,40.0,37.3
2013,2,28.0,38.2,38.0,36.9,53.0,37.4
2014,1,39.0,36.4,35.0,36.9,31.0,36.7
2014,2,20.0,38.1,31.0,36.9,33.0,38.6


### The indexing operations shown for multi-indexed Series also apply to the columns of a DataFrame

In [51]:
# Selecting on the outer column level returns that subject's sub-frame.
health_data['Bob']

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,44.0,40.2
2013,2,28.0,38.2
2014,1,39.0,36.4
2014,2,20.0,38.1


In [52]:
# A (subject, type) pair selects a single column as a Series.
health_data['Bob', 'HR']

year  visit
2013  1        44.0
      2        28.0
2014  1        39.0
      2        20.0
Name: (Bob, HR), dtype: float64

In [53]:
# iloc selects by position: the first two rows and the first two columns.
health_data.iloc[:2, :2]

Unnamed: 0_level_0,subject,Bob,Bob
Unnamed: 0_level_1,type,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,44.0,40.2
2013,2,28.0,38.2


### Each individual index in loc or iloc can be passed a tuple of multiple
### indices:

In [54]:
# In loc, a tuple in the column position addresses one column across all rows.
health_data.loc[:,('Sue', 'Temp')]

year  visit
2013  1        37.3
      2        37.4
2014  1        36.7
      2        38.6
Name: (Sue, Temp), dtype: float64

In [70]:
# Tuples in both positions address a single cell: row (2013, 2), column ('Sue', 'Temp').
health_data.loc[(2013,2),('Sue', 'Temp')]

37.4

In [147]:
## Do not create a slice within a tuple: it is a syntax error.
# The invalid expression is kept as a string and compiled on purpose, so the
# SyntaxError is caught and displayed instead of aborting a "Run All".
bad_expr = "health_data.loc[(:,1), (:, 'HR')]"
try:
    compile(bad_expr, "<slice-in-tuple>", "eval")
except SyntaxError as err:
    print("SyntaxError:", err)

SyntaxError: invalid syntax (<ipython-input-147-69cdd420837e>, line 2)

In [75]:
# Redisplay the full frame for reference.
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,44.0,40.2,48.0,38.8,40.0,37.3
2013,2,28.0,38.2,38.0,36.9,53.0,37.4
2014,1,39.0,36.4,35.0,36.9,31.0,36.7
2014,2,20.0,38.1,31.0,36.9,33.0,38.6


In [78]:
# pd.IndexSlice allows slice syntax inside a multi-level selection:
# rows whose second level (visit) is 1, columns whose second level (type) is 'HR'.
idx = pd.IndexSlice
visit_one_rows = idx[:, 1]
hr_columns = idx[:, 'HR']
health_data.loc[visit_one_rows, hr_columns]

Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,44.0,48.0,40.0
2014,1,39.0,35.0,31.0


In [87]:
# Sort the columns (axis=1) by their inner level (level=1), so all 'HR'
# columns come before all 'Temp' columns.
health_data.sort_index(level = 1,axis= 1)

Unnamed: 0_level_0,subject,Bob,Guido,Sue,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR,Temp,Temp,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,44.0,48.0,40.0,40.2,38.8,37.3
2013,2,28.0,38.0,53.0,38.2,36.9,37.4
2014,1,39.0,35.0,31.0,36.4,36.9,36.7
2014,2,20.0,31.0,33.0,38.1,36.9,38.6


In [89]:
# 6x3 frame holding the integers 0..17, with two-level row and column labels.
# The row labels are not in sorted order.
row_labels = [['a', 'a', 'c', 'c', 'b', 'b'], [1, 2, 2, 1, 1, 2]]
column_labels = [['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']]
frame = pd.DataFrame(
    np.arange(18).reshape((6, 3)),
    index=row_labels,
    columns=column_labels,
)
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
c,2,6,7,8
c,1,9,10,11
b,1,12,13,14
b,2,15,16,17


In [91]:
# Sort the rows by the inner index level (1 before 2); rows with the same
# inner label are then ordered by the outer level.
frame.sort_index(level=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
b,1,12,13,14
c,1,9,10,11
a,2,3,4,5
b,2,15,16,17
c,2,6,7,8


In [None]:
# `labels` lists, for each level, the position of every entry within that
# level's values: first the outer level (0, 1, 2, ...), then the inner level
# (0, 1) for the sub-columns.

In [92]:
# Swap the two row-index levels: the number becomes the outer level and the
# letter the inner one; the row order is unchanged.
frame.swaplevel(0,1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
1,a,0,1,2
2,a,3,4,5
2,c,6,7,8
1,c,9,10,11
1,b,12,13,14
2,b,15,16,17


In [99]:
# frame.index.names = ["key1", "key2"]  # levels are named by assigning to `names`, not by calling `name(...)`

In [107]:
# pop_df.reset_index("under18")  # NOTE(review): "under18" is a column of pop_df, not an index level, so this call would fail

In [111]:
# Use the 'total' column as the index; drop=False keeps it as a regular column too.
pop_df.set_index("total",drop=False)

Unnamed: 0_level_0,total,under18
total,Unnamed: 1_level_1,Unnamed: 2_level_1
33871648,33871648,9267089
37253956,37253956,9284094
18976457,18976457,4687374
19378102,19378102,4318033
20851820,20851820,5906301
25145561,25145561,6879014


In [113]:
# Aggregate over one index level: group the rows by that level, then apply
# the reduction, i.e. data_frame.groupby(level=level_name).method().
# (The `level=` argument of DataFrame.mean was deprecated in pandas 1.3 and
# removed in pandas 2.0, so the groupby form is used here.)
health_data.groupby(level="year").mean()

subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,36.0,39.2,43.0,37.85,46.5,37.35
2014,29.5,37.25,33.0,36.9,32.0,37.65
