In [1]:
import numpy as np, pandas as pd

# Index keys: one (state, year) tuple per observation, states outermost.
index = [
    (state, year)
    for state in ("California", "New York", "Texas")
    for year in (2000, 2010)
]

# Population counts, listed in the same order as the tuples in `index`.
populations = [
    33871648, 37253956,  # California: 2000, 2010
    18976457, 19378102,  # New York:   2000, 2010
    20851820, 25145561,  # Texas:      2000, 2010
]

# Series keyed by the plain tuples above.
pop = pd.Series(data=populations, index=index)

print(pop)


(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64


In [3]:
# Build a MultiIndex from the list of (state, year) tuples. This rebinds the
# name `index` to the resulting MultiIndex, which is then displayed.
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

In [4]:
# Reindex the Series by the MultiIndex so that the state and year labels
# become two separate index levels (see the displayed result).
pop = pop.reindex(index)
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [11]:
# Two partial selections on the MultiIndexed Series, printed with a divider:
#   pop[:, 2010]          -> the 2010 value for every first-level label
#   pop[("California",)]  -> every year for California
print(
    pop[:, 2010],
    "\n-------------------------------------\n",
    pop[("California",)],
    sep="\n",
)

California    37253956
New York      19378102
Texas         25145561
dtype: int64

-------------------------------------

2000    33871648
2010    37253956
dtype: int64


In [12]:
# unstack: pivot the innermost index level (the year) into columns,
# turning the Series into a DataFrame.
pop_df = pop.unstack(level=-1)
print(pop_df)

                2000      2010
California  33871648  37253956
New York    18976457  19378102
Texas       20851820  25145561


In [13]:
# stack: fold the columns back into the innermost index level, which
# reverses the unstack that produced pop_df.
pop_stack = pop_df.stack(level=-1)
print(pop_stack)

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64


In [18]:
# Build a DataFrame whose rows carry a two-level index.

print(np.random.rand(4, 2))  # sample 4x2 array of uniform draws (not used below)

# Parallel label lists, one entry per row: row i is keyed by
# (outer_labels[i], inner_labels[i]).
outer_labels = [letter for letter in "ABC" for _ in range(3)]
inner_labels = [1, 2, 3] * 3

df = pd.DataFrame(
    np.random.rand(9, 3),  # 9 rows x 3 columns
    index=[outer_labels, inner_labels],
    columns=["X", "Y", "Z"],
)
print(df)
print(df.unstack())

[[0.14215676 0.12074468]
 [0.36901676 0.81713123]
 [0.5327172  0.32076164]
 [0.83001416 0.35605361]]
            X         Y         Z
A 1  0.336536  0.889826  0.434002
  2  0.112206  0.255963  0.253268
  3  0.200350  0.073074  0.638359
B 1  0.459190  0.735313  0.642241
  2  0.744598  0.456900  0.419964
  3  0.181240  0.401190  0.982894
C 1  0.319773  0.285484  0.594799
  2  0.553834  0.101952  0.507852
  3  0.790725  0.306459  0.166001
          X                             Y                             Z  \
          1         2         3         1         2         3         1   
A  0.336536  0.112206  0.200350  0.889826  0.255963  0.073074  0.434002   
B  0.459190  0.744598  0.181240  0.735313  0.456900  0.401190  0.642241   
C  0.319773  0.553834  0.790725  0.285484  0.101952  0.306459  0.594799   

                       
          2         3  
A  0.253268  0.638359  
B  0.419964  0.982894  
C  0.507852  0.166001  


In [19]:
# Explicitly construct a MultiIndex
# from parallel arrays: one list of labels per level
print(
    pd.MultiIndex.from_arrays(
        [list("AABBCCDD"), [6, 9] * 4]
    )
)

MultiIndex([('A', 6),
            ('A', 9),
            ('B', 6),
            ('B', 9),
            ('C', 6),
            ('C', 9),
            ('D', 6),
            ('D', 9)],
           )


In [20]:
# from a list of tuples: one tuple of level labels per entry
print(
    pd.MultiIndex.from_tuples(
        [(letter, number) for letter in "AB" for number in (1, 3, 5)]
    )
)

MultiIndex([('A', 1),
            ('A', 3),
            ('A', 5),
            ('B', 1),
            ('B', 3),
            ('B', 5)],
           )


In [22]:
# from a Cartesian product: every pairing of the two label lists
print(
    pd.MultiIndex.from_product(
        [list("ABCDE"), list(range(3, 8))]
    )
)

MultiIndex([('A', 3),
            ('A', 4),
            ('A', 5),
            ('A', 6),
            ('A', 7),
            ('B', 3),
            ('B', 4),
            ('B', 5),
            ('B', 6),
            ('B', 7),
            ('C', 3),
            ('C', 4),
            ('C', 5),
            ('C', 6),
            ('C', 7),
            ('D', 3),
            ('D', 4),
            ('D', 5),
            ('D', 6),
            ('D', 7),
            ('E', 3),
            ('E', 4),
            ('E', 5),
            ('E', 6),
            ('E', 7)],
           )


In [25]:
# Show the installed pandas version for reference.
print(pd.__version__)

1.4.4


In [26]:
# Name the two levels of df's row index (assigned in place on the existing
# index); the names are shown above the index columns when the frame is printed.
df.index.names=["Upper Letter","Number"]
print(df)

                            X         Y         Z
Upper Letter Number                              
A            1       0.336536  0.889826  0.434002
             2       0.112206  0.255963  0.253268
             3       0.200350  0.073074  0.638359
B            1       0.459190  0.735313  0.642241
             2       0.744598  0.456900  0.419964
             3       0.181240  0.401190  0.982894
C            1       0.319773  0.285484  0.594799
             2       0.553834  0.101952  0.507852
             3       0.790725  0.306459  0.166001


In [29]:
# A MultiIndex can label columns as well as rows.

# Row labels: every (year, visit) pair.
index = pd.MultiIndex.from_product(
    [[2013, 2014], [1, 2]],
    names=["year", "visit"],
)

# Column labels: every (subject, type) pair.
columns = pd.MultiIndex.from_product(
    [["Bob", "Guido", "Sue"], ["HR", "Temp"]],
    names=["subject", "type"],
)

# Mock readings: standard-normal draws rounded to one decimal; every other
# column starting at 0 (the "HR" columns) is scaled by 10, then all values
# are shifted by 37.
data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] = data[:, ::2] * 10
data = data + 37

health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,40.0,38.1,27.0,36.7,29.0,38.2
2013,2,39.0,37.6,47.0,35.9,22.0,35.7
2014,1,21.0,38.8,25.0,37.5,43.0,37.4
2014,2,34.0,37.4,27.0,37.9,30.0,37.5


In [30]:
# Indexing with a top-level column label returns the sub-frame holding that
# subject's columns (HR and Temp for Bob).
health_data["Bob"]

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,40.0,38.1
2013,2,39.0,37.6
2014,1,21.0,38.8
2014,2,34.0,37.4


In [31]:
# Print the MultiIndexed population Series again as a reference for the
# selection examples that follow.
print(pop)

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64


In [32]:
# Selecting from a MultiIndex: giving a label for every level returns a
# single element.
pop["California",2000]

33871648

In [33]:
# Partial indexing: giving only the first-level label returns the sub-Series
# keyed by the remaining level (the years).
pop["California"]

2000    33871648
2010    37253956
dtype: int64

In [35]:
# A full slice on the first level plus a second-level label selects that
# year's value for every first-level entry.
pop[:,2000]

California    33871648
New York      18976457
Texas         20851820
dtype: int64

In [36]:
# Same selection pattern for the other second-level label, 2010.
pop[:,2010]

California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [38]:
# DataFrame MultiIndex selection: display the full frame first as a reference
# for the column selections that follow.
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,40.0,38.1,27.0,36.7,29.0,38.2
2013,2,39.0,37.6,47.0,35.9,22.0,35.7
2014,1,21.0,38.8,25.0,37.5,43.0,37.4
2014,2,34.0,37.4,27.0,37.9,30.0,37.5


In [39]:
# A top-level column label selects the sub-frame of that subject's columns.
health_data["Sue"]

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,29.0,38.2
2013,2,22.0,35.7
2014,1,43.0,37.4
2014,2,30.0,37.5


In [40]:
# Giving a label for both column levels selects one column, returned as a
# Series named by the (subject, type) pair.
health_data["Sue","Temp"]

year  visit
2013  1        38.2
      2        35.7
2014  1        37.4
      2        37.5
Name: (Sue, Temp), dtype: float64

In [55]:
# Unsorted vs. sorted index: label slicing on a MultiIndex needs sorted labels.
# The first-level labels here are deliberately out of order ("a", "c", "b").
index = pd.MultiIndex.from_product([["a", "c", "b"], [1, 2, 3]])
print(index)
print("\n-----------------------------------------------------------\n")
data = pd.Series(np.random.rand(9), index=index)
data.index.names = ["lower", "num"]
print(data)

# Attempt a label slice for each pair of bounds; when the lookup raises a
# KeyError (or a subclass of it), print the exception's type and message.
for _lower, _upper in (("a", "b"), ("b", "c")):
    print("\n-----------------------------------------------------------\n")
    try:
        data[_lower:_upper]
    except KeyError as err:
        print(type(err))
        print(err)

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('c', 1),
            ('c', 2),
            ('c', 3),
            ('b', 1),
            ('b', 2),
            ('b', 3)],
           )

-----------------------------------------------------------

lower  num
a      1      0.436889
       2      0.858677
       3      0.652154
c      1      0.434584
       2      0.297850
       3      0.608629
b      1      0.005528
       2      0.139458
       3      0.012884
dtype: float64

-----------------------------------------------------------

<class 'pandas.errors.UnsortedIndexError'>
'Key length (1) was greater than MultiIndex lexsort depth (0)'

-----------------------------------------------------------

<class 'pandas.errors.UnsortedIndexError'>
'Key length (1) was greater than MultiIndex lexsort depth (0)'


In [58]:
# Sort the index so its labels are in order; the two label slices below
# then succeed.
data = data.sort_index()
print(
    data["a":"b"],
    "\n-----------------------------------------------------------\n",
    data["b":"c"],
    sep="\n",
)


lower  num
a      1      0.436889
       2      0.858677
       3      0.652154
b      1      0.005528
       2      0.139458
       3      0.012884
dtype: float64

-----------------------------------------------------------

lower  num
b      1      0.005528
       2      0.139458
       3      0.012884
c      1      0.434584
       2      0.297850
       3      0.608629
dtype: float64


In [59]:
# Print the population Series again as a reference for the unstack/stack
# examples that follow.
print(pop)

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64


In [62]:
# Pivot index level 0 (the state) into columns, then stack it back; after
# the round trip the year is the outer index level.
by_year = pop.unstack(level=0)
print(
    by_year,
    "\n-----------------------------------------------\n",
    by_year.stack(),
    sep="\n",
)

      California  New York     Texas
2000    33871648  18976457  20851820
2010    37253956  19378102  25145561

-----------------------------------------------

2000  California    33871648
      New York      18976457
      Texas         20851820
2010  California    37253956
      New York      19378102
      Texas         25145561
dtype: int64


In [63]:
# Pivot index level 1 (the year) into columns, then stack it back; this
# round trip reproduces the original (state, year) layout.
by_state = pop.unstack(level=1)
print(
    by_state,
    "\n-----------------------------------------------\n",
    by_state.stack(),
    sep="\n",
)

                2000      2010
California  33871648  37253956
New York    18976457  19378102
Texas       20851820  25145561

-----------------------------------------------

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64


In [71]:
# reset_index / set_index: move data between the index and ordinary columns.
divider = "\n-----------------------------------------------\n"

# Name the index levels so reset_index can use the names as column headers.
pop.index.names = ["state", "year"]
print(pop)
print(divider)

# Flatten both index levels into columns; the values go in "population".
pop_flat = pop.reset_index(name="population")
print(pop_flat)

# Rebuild a two-level index from each chosen pair of columns.
for key_columns in (
    ["state", "population"],
    ["state", "year"],
    ["year", "population"],
):
    print(divider)
    print(pop_flat.set_index(key_columns))

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

-----------------------------------------------

        state  year  population
0  California  2000    33871648
1  California  2010    37253956
2    New York  2000    18976457
3    New York  2010    19378102
4       Texas  2000    20851820
5       Texas  2010    25145561

-----------------------------------------------

                       year
state      population      
California 33871648    2000
           37253956    2010
New York   18976457    2000
           19378102    2010
Texas      20851820    2000
           25145561    2010

-----------------------------------------------

                 population
state      year            
California 2000    33871648
           2010    37253956
New York   2000    18976457
           2010    19378102
Texas      2000    20851820
  

In [79]:
# Aggregating along one level of a MultiIndex, shown here with mean
# (the same groupby pattern applies to max, min and sum).

print(health_data)
print("\n------------------------------------\n")

# Rows: average the visits within each year.
print(health_data.groupby(level="year").mean())
print("\n------------------------------------\n")

# Columns: average the HR and Temp columns within each subject.
# groupby(..., axis=1) is deprecated in newer pandas releases, so group the
# transposed frame by the "subject" level and transpose the result back.
# The printed frame is the same as with axis=1.
print(health_data.T.groupby(level="subject").mean().T)

subject      Bob       Guido         Sue      
type          HR  Temp    HR  Temp    HR  Temp
year visit                                    
2013 1      40.0  38.1  27.0  36.7  29.0  38.2
     2      39.0  37.6  47.0  35.9  22.0  35.7
2014 1      21.0  38.8  25.0  37.5  43.0  37.4
     2      34.0  37.4  27.0  37.9  30.0  37.5

------------------------------------

subject   Bob        Guido         Sue       
type       HR   Temp    HR  Temp    HR   Temp
year                                         
2013     39.5  37.85  37.0  36.3  25.5  36.95
2014     27.5  38.10  26.0  37.7  36.5  37.45

------------------------------------

subject       Bob  Guido    Sue
year visit                     
2013 1      39.05  31.85  33.60
     2      38.30  41.45  28.85
2014 1      29.90  31.25  40.20
     2      35.70  32.45  33.75
