# 数据规整：连接、联合与重塑

## 1、分层索引

In [1]:
import numpy as np
import pandas as pd
# Cap printed DataFrame/Series output at 10 rows.
pd.set_option("display.max_rows", 10)

In [15]:
# Hierarchical indexing: a single axis can carry more than one index level.
data = pd.Series(
    np.random.randn(9),
    index=[list('aaabbccdd'), [1, 2, 3, 1, 3, 1, 2, 2, 3]],
)
# The whole Series, printed with its two-level index.
print('1', data)
# The MultiIndex object behind it.
print('2', data.index)
# Partial indexing: every row whose outer label is 'b'.
print('3', data['b'])
# Label slice over the outer level.
print('4', data['b':'c'])
# All outer labels, inner label 2.
print('5', data.loc[:, 2])
# A single value addressed by (outer, inner) labels.
print('6', data.loc['b', 3])

1 a  1   -0.107627
   2    0.401281
   3    0.244178
b  1   -1.186420
   3   -0.431663
c  1   -2.008233
   2   -0.123997
d  2    0.309056
   3   -0.059214
dtype: float64
2 MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )
3 1   -1.186420
3   -0.431663
dtype: float64
4 b  1   -1.186420
   3   -0.431663
c  1   -2.008233
   2   -0.123997
dtype: float64
5 a    0.401281
c   -0.123997
d    0.309056
dtype: float64
6 -0.4316628196992452


In [16]:
# unstack moves the inner index level into the columns; stack reverses it.
unstacked = data.unstack()
print(unstacked)
print(unstacked.stack())

          1         2         3
a -0.107627  0.401281  0.244178
b -1.186420       NaN -0.431663
c -2.008233 -0.123997       NaN
d       NaN  0.309056 -0.059214
a  1   -0.107627
   2    0.401281
   3    0.244178
b  1   -1.186420
   3   -0.431663
c  1   -2.008233
   2   -0.123997
d  2    0.309056
   3   -0.059214
dtype: float64


In [6]:
# Either axis of a DataFrame can carry a hierarchical index.
frame = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=[list('aabb'), [1, 2] * 2],
    columns=[['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
)
print(frame)
# Rows under outer label 'a', restricted to the ('Ohio', 'Green') column.
print(frame.loc['a', ('Ohio', 'Green')])


# Index levels can be named through the `names` attribute.
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
print(frame)

     Ohio     Colorado
    Green Red    Green
a 1     0   1        2
  2     3   4        5
b 1     6   7        8
  2     9  10       11
1    0
2    3
Name: (Ohio, Green), dtype: int32
state      Ohio     Colorado
color     Green Red    Green
key1 key2                   
a    1        0   1        2
     2        3   4        5
b    1        6   7        8
     2        9  10       11


### 1.1、重排序和层级排序

In [14]:
# swaplevel exchanges two levels of a hierarchical index (the data is untouched).
row_index = pd.MultiIndex.from_arrays(
    [['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    names=['key1', 'key2'],
)
column_index = pd.MultiIndex.from_arrays(
    [['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
    names=['state', 'color'],
)
frame = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=row_index,
    columns=column_index,
)
print(frame)
# Swap the two row levels.
print(frame.swaplevel('key1', 'key2'))
# Swap the two column levels.
print(frame.swaplevel('state', 'color', axis=1))

state      Ohio     Colorado
color     Green Red    Green
key1 key2                   
a    1        0   1        2
     2        3   4        5
b    1        6   7        8
     2        9  10       11
state      Ohio     Colorado
color     Green Red    Green
key2 key1                   
1    a        0   1        2
2    a        3   4        5
1    b        6   7        8
2    b        9  10       11
color     Green  Red    Green
state      Ohio Ohio Colorado
key1 key2                    
a    1        0    1        2
     2        3    4        5
b    1        6    7        8
     2        9   10       11


In [16]:
# sort_index(axis=..., level=...): `axis` picks the axis to sort (axis 0 when
# omitted) and `level` picks the index level to sort on.
frame = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=[list('aabb'), [1, 2] * 2],
    columns=[['Ohio'] * 2 + ['Colorado'], ['Green', 'Red', 'Green']],
)
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
print(frame)
# Sort the rows on the second index level.
print(frame.sort_index(level=1))
# Swap the row levels first, then sort on the new outermost level.
print(frame.swaplevel('key1', 'key2').sort_index(level=0))

state      Ohio     Colorado
color     Green Red    Green
key1 key2                   
a    1        0   1        2
     2        3   4        5
b    1        6   7        8
     2        9  10       11
state      Ohio     Colorado
color     Green Red    Green
key1 key2                   
a    1        0   1        2
b    1        6   7        8
a    2        3   4        5
b    2        9  10       11
state      Ohio     Colorado
color     Green Red    Green
key2 key1                   
1    a        0   1        2
     b        6   7        8
2    a        3   4        5
     b        9  10       11


### 1.2、按层级进行汇总统计

In [42]:
# Per-group summary: group the rows on columns 'c' and 'd' and count the rows
# in every (c, d) pair. The result is a Series indexed by the two levels c, d.
# NOTE: grouping here is done on columns through groupby, not through a
# `level` argument of a summary function.
frame = pd.DataFrame({
    'a': range(7),
    'b': range(7, 0, -1),
    'c': ['one'] * 3 + ['two'] * 4,
    'd': [*range(3), *range(4)],
})
print(frame)
print(frame.groupby(['c', 'd']).size())

   a  b    c  d
0  0  7  one  0
1  1  6  one  1
2  2  5  one  2
3  3  4  two  0
4  4  3  two  1
5  5  2  two  2
6  6  1  two  3
c    d
one  0    1
     1    1
     2    1
two  0    1
     1    1
     2    1
     3    1
dtype: int64


### 1.3、使用DataFrame的列进行索引

In [34]:
frame = pd.DataFrame({
    'a': range(7),
    'b': range(7, 0, -1),
    'c': ['one'] * 3 + ['two'] * 4,
    'd': [*range(3), *range(4)],
})
print(frame)
# set_index turns existing columns into the (hierarchical) row index.
print(frame.set_index(['c', 'd']))
# drop=False keeps the indexed columns among the data columns as well.
print(frame.set_index(['c', 'd'], drop=False))
# reset_index is the inverse of set_index: the index levels move back into
# the data columns.
print(frame.set_index(['c', 'd']).reset_index())

   a  b    c  d
0  0  7  one  0
1  1  6  one  1
2  2  5  one  2
3  3  4  two  0
4  4  3  two  1
5  5  2  two  2
6  6  1  two  3
       a  b
c   d      
one 0  0  7
    1  1  6
    2  2  5
two 0  3  4
    1  4  3
    2  5  2
    3  6  1
       a  b    c  d
c   d              
one 0  0  7  one  0
    1  1  6  one  1
    2  2  5  one  2
two 0  3  4  two  0
    1  4  3  two  1
    2  5  2  two  2
    3  6  1  two  3
     c  d  a  b
0  one  0  0  7
1  one  1  1  6
2  one  2  2  5
3  two  0  3  4
4  two  1  4  3
5  two  2  5  2
6  two  3  6  1


## 2、联合与合并数据集