# 数据规整：连接、联合与重塑

## 1、分层索引

In [1]:
import numpy as np
import pandas as pd
pd.options.display.max_rows=10

In [15]:
# Hierarchical indexing: one axis can carry more than one index level.
outer_labels = list('aaabbccdd')
inner_labels = [1, 2, 3, 1, 3, 1, 2, 2, 3]
data = pd.Series(np.random.randn(9), index=[outer_labels, inner_labels])

# Each entry is printed behind its running number, starting at 1.
views = (
    data,               # the whole Series
    data.index,         # its two-level MultiIndex
    data['b'],          # partial indexing on the outer level
    data['b':'c'],      # slice over outer labels
    data.loc[:, 2],     # selection on the inner level
    data.loc['b', 3],   # a single value addressed through both levels
)
for number, view in enumerate(views, start=1):
    print(str(number), view)

1 a  1   -0.107627
   2    0.401281
   3    0.244178
b  1   -1.186420
   3   -0.431663
c  1   -2.008233
   2   -0.123997
d  2    0.309056
   3   -0.059214
dtype: float64
2 MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )
3 1   -1.186420
3   -0.431663
dtype: float64
4 b  1   -1.186420
   3   -0.431663
c  1   -2.008233
   2   -0.123997
dtype: float64
5 a    0.401281
c   -0.123997
d    0.309056
dtype: float64
6 -0.4316628196992452


In [16]:
# unstack moves the inner index level into the columns;
# stack folds the columns back into an inner index level.
wide = data.unstack()
print(wide)
print(wide.stack())

          1         2         3
a -0.107627  0.401281  0.244178
b -1.186420       NaN -0.431663
c -2.008233 -0.123997       NaN
d       NaN  0.309056 -0.059214
a  1   -0.107627
   2    0.401281
   3    0.244178
b  1   -1.186420
   3   -0.431663
c  1   -2.008233
   2   -0.123997
d  2    0.309056
   3   -0.059214
dtype: float64


In [6]:
# Either axis of a DataFrame may carry a hierarchical index.
row_labels = [['a', 'a', 'b', 'b'], [1, 2, 1, 2]]
column_labels = [['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']]
values = np.arange(12).reshape((4, 3))
frame = pd.DataFrame(values, index=row_labels, columns=column_labels)
print(frame)
print(frame.loc['a', ('Ohio', 'Green')])

# Index levels can be given names through the `names` attribute.
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
print(frame)

     Ohio     Colorado
    Green Red    Green
a 1     0   1        2
  2     3   4        5
b 1     6   7        8
  2     9  10       11
1    0
2    3
Name: (Ohio, Green), dtype: int32
state      Ohio     Colorado
color     Green Red    Green
key1 key2                   
a    1        0   1        2
     2        3   4        5
b    1        6   7        8
     2        9  10       11


### 1.1、重排序和层级排序

In [14]:
# swaplevel exchanges two levels of an index; the data values stay where they are.
row_index = pd.MultiIndex.from_arrays(
    [['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    names=['key1', 'key2'],
)
column_index = pd.MultiIndex.from_arrays(
    [['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
    names=['state', 'color'],
)
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
                     index=row_index, columns=column_index)
print(frame)

rows_swapped = frame.swaplevel('key1', 'key2')               # row levels
columns_swapped = frame.swaplevel('state', 'color', axis=1)  # column levels
for swapped in (rows_swapped, columns_swapped):
    print(swapped)

state      Ohio     Colorado
color     Green Red    Green
key1 key2                   
a    1        0   1        2
     2        3   4        5
b    1        6   7        8
     2        9  10       11
state      Ohio     Colorado
color     Green Red    Green
key2 key1                   
1    a        0   1        2
2    a        3   4        5
1    b        6   7        8
2    b        9  10       11
color     Green  Red    Green
state      Ohio Ohio Colorado
key1 key2                    
a    1        0    1        2
     2        3    4        5
b    1        6    7        8
     2        9   10       11


In [16]:
# sort_index(axis=..., level=...): `axis` picks the axis to sort (rows by
# default) and `level` picks the index level to sort on. When `level` is
# omitted, pandas sorts on all levels in order.
row_index = pd.MultiIndex.from_arrays(
    [['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    names=['key1', 'key2'],
)
column_index = pd.MultiIndex.from_arrays(
    [['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
    names=['state', 'color'],
)
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
                     index=row_index, columns=column_index)
print(frame)

# Sort the rows on the second level (key2) while keeping the level order.
sorted_on_inner = frame.sort_index(level=1)
print(sorted_on_inner)

# Swap the levels first, then sort on what is now the outermost level.
swapped_then_sorted = frame.swaplevel('key1', 'key2').sort_index(level=0)
print(swapped_then_sorted)

state      Ohio     Colorado
color     Green Red    Green
key1 key2                   
a    1        0   1        2
     2        3   4        5
b    1        6   7        8
     2        9  10       11
state      Ohio     Colorado
color     Green Red    Green
key1 key2                   
a    1        0   1        2
b    1        6   7        8
a    2        3   4        5
b    2        9  10       11
state      Ohio     Colorado
color     Green Red    Green
key2 key1                   
1    a        0   1        2
     b        6   7        8
2    a        3   4        5
     b        9  10       11


### 1.2、按层级进行汇总统计

In [42]:
# Summary statistics can be computed per key combination. Here the rows are
# grouped on columns 'c' and 'd' and the number of rows in each group is
# reported; the result is indexed by the two grouping keys.
# NOTE(review): the original note refers to a `level` option on summary
# methods; that option is reportedly gone in pandas 2.x - confirm before use.
columns = {
    'a': range(7),
    'b': range(7, 0, -1),
    'c': ['one'] * 3 + ['two'] * 4,
    'd': [0, 1, 2, 0, 1, 2, 3],
}
frame = pd.DataFrame(columns)
print(frame)

group_sizes = frame.groupby(['c', 'd']).size()
print(group_sizes)

   a  b    c  d
0  0  7  one  0
1  1  6  one  1
2  2  5  one  2
3  3  4  two  0
4  4  3  two  1
5  5  2  two  2
6  6  1  two  3
c    d
one  0    1
     1    1
     2    1
two  0    1
     1    1
     2    1
     3    1
dtype: int64


### 1.3、使用DataFrame的列进行索引

In [34]:
# Use existing columns of a DataFrame as its (hierarchical) row index.
columns = {
    'a': range(7),
    'b': range(7, 0, -1),
    'c': ['one'] * 3 + ['two'] * 4,
    'd': [0, 1, 2, 0, 1, 2, 3],
}
frame = pd.DataFrame(columns)
print(frame)

# set_index moves the named columns into the index.
indexed = frame.set_index(['c', 'd'])
print(indexed)

# With drop=False the columns are used as the index but also stay as data columns.
print(frame.set_index(['c', 'd'], drop=False))

# reset_index is the inverse of set_index: the index levels become columns again.
print(indexed.reset_index())

   a  b    c  d
0  0  7  one  0
1  1  6  one  1
2  2  5  one  2
3  3  4  two  0
4  4  3  two  1
5  5  2  two  2
6  6  1  two  3
       a  b
c   d      
one 0  0  7
    1  1  6
    2  2  5
two 0  3  4
    1  4  3
    2  5  2
    3  6  1
       a  b    c  d
c   d              
one 0  0  7  one  0
    1  1  6  one  1
    2  2  5  one  2
two 0  3  4  two  0
    1  4  3  two  1
    2  5  2  two  2
    3  6  1  two  3
     c  d  a  b
0  one  0  0  7
1  one  1  1  6
2  one  2  2  5
3  two  0  3  4
4  two  1  4  3
5  two  2  5  2
6  two  3  6  1


## 2、联合与合并数据集

In [10]:
# pandas.merge, pandas.concat and combine_first are the tools for combining data held in different pandas objects.

### 2.1 merge的数据库风格的DataFrame连接

In [16]:
# pandas.merge performs database-style joins: rows of two frames are matched on key values.
df1 = pd.DataFrame({'key': list('bbacaab'), 'data1': range(7)})
df2 = pd.DataFrame({'key': list('abd'), 'data2': range(3)})

# Without an explicit key, merge joins on the columns the two frames have in common.
merge1 = pd.merge(df1, df2)
# The join column can also be named explicitly.
merge2 = pd.merge(df1, df2, on='key')
print('1', merge1, merge2, sep='\n\n', end='\n\n')

df3 = pd.DataFrame({'lkey': list('bbacaab'), 'data1': range(7)})
df4 = pd.DataFrame({'rkey': list('abd'), 'data2': range(3)})

# pd.merge(df3, df4) cannot work here because the frames share no column name,
# so the key column of each side has to be named separately.
side_keys = {'left_on': 'lkey', 'right_on': 'rkey'}
merge4 = pd.merge(df3, df4, **side_keys)
print('2\n', merge4, end='\n\n')

# merge performs an inner join by default. The `how` argument selects the
# join type: 'left', 'right', 'outer' or 'inner'.
merge5 = pd.merge(df3, df4, how='outer', **side_keys)
print('3\n', merge5, end='\n\n')

# Several key columns can be used at once (a list of column names may be
# passed as `on`). Here the shared columns key1 and key2 are picked up by default.
df5 = pd.DataFrame({
    'key1': ['foo', 'foo', 'bar'],
    'key2': ['one', 'two', 'one'],
    'lval': [1, 2, 3],
})
df6 = pd.DataFrame({
    'key1': ['foo', 'foo', 'bar', 'bar'],
    'key2': ['one', 'one', 'one', 'two'],
    'rval': [4, 5, 6, 7],
})
merge6 = pd.merge(df5, df6, how='outer')
print('4\n', merge6, end='\n\n')

# `suffixes` sets the strings appended to column names that occur on both
# sides but are not join keys (the defaults produce key2_x / key2_y).
merge7 = pd.merge(df5, df6, on='key1')
merge8 = pd.merge(df5, df6, on='key1', suffixes=('_left', '_right'))
print('5', merge7, merge8, sep='\n\n')

1

  key  data1  data2
0   b      0      1
1   b      1      1
2   b      6      1
3   a      2      0
4   a      4      0
5   a      5      0

  key  data1  data2
0   b      0      1
1   b      1      1
2   b      6      1
3   a      2      0
4   a      4      0
5   a      5      0

2
   lkey  data1 rkey  data2
0    b      0    b      1
1    b      1    b      1
2    b      6    b      1
3    a      2    a      0
4    a      4    a      0
5    a      5    a      0

3
   lkey  data1 rkey  data2
0    b    0.0    b    1.0
1    b    1.0    b    1.0
2    b    6.0    b    1.0
3    a    2.0    a    0.0
4    a    4.0    a    0.0
5    a    5.0    a    0.0
6    c    3.0  NaN    NaN
7  NaN    NaN    d    2.0

4
   key1 key2  lval  rval
0  foo  one   1.0   4.0
1  foo  one   1.0   5.0
2  foo  two   2.0   NaN
3  bar  one   3.0   6.0
4  bar  two   NaN   7.0

5

  key1 key2_x  lval key2_y  rval
0  foo    one     1    one     4
1  foo    one     1    one     5
2  foo    two     2    one     4
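3  foo    two     2    one     5
4  bar    one     3    one     6
5  bar    one     3    two     7

  key1 key2_left  lval key2_right  rval
0  foo       one     1        one     4
1  foo       one     1        one     5
2  foo       two     2        one     4
3  foo       two     2        one     5
4  bar       one     3        one     6
5  bar       one     3        two     7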

### 2.2 根据索引合并

In [6]:
# Sometimes the join key of a frame is its index rather than a column. Pass
# left_index=True or right_index=True to say which side joins on its index.
left1 = pd.DataFrame({'key': list('abaabc'), 'value': range(6)})
right1 = pd.DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])

# Match the 'key' column of left1 against the index labels of right1.
join_spec = {'left_on': 'key', 'right_index': True}
merge1 = pd.merge(left1, right1, **join_spec)
print(merge1)

  key  value  group_val
0   a      0        3.5
2   a      2        3.5
3   a      3        3.5
1   b      1        7.0
4   b      4        7.0


In [18]:
# Joining against a hierarchical index: the left side names one column per
# index level of the right side.
left = pd.DataFrame({
    'key1': ['Ohio'] * 3 + ['Nevada'] * 2,
    'key2': [2000, 2001, 2002, 2001, 2002],
    'data': np.arange(5.),
})
right_index = [
    ['Nevada', 'Nevada', 'Ohio', 'Ohio', 'Ohio', 'Ohio'],
    [2001, 2000, 2000, 2000, 2001, 2002],
]
right = pd.DataFrame(np.arange(12).reshape((6, 2)),
                     index=right_index,
                     columns=['event1', 'event2'])

key_columns = ['key1', 'key2']
merge3 = pd.merge(left, right, left_on=key_columns, right_index=True)
print(left, right, merge3, sep='\n')

     key1  key2  data
0    Ohio  2000   0.0
1    Ohio  2001   1.0
2    Ohio  2002   2.0
3  Nevada  2001   3.0
4  Nevada  2002   4.0
             event1  event2
Nevada 2001       0       1
       2000       2       3
Ohio   2000       4       5
       2000       6       7
       2001       8       9
       2002      10      11
     key1  key2  data  event1  event2
0    Ohio  2000   0.0       4       5
0    Ohio  2000   0.0       6       7
1    Ohio  2001   1.0       8       9
2    Ohio  2002   2.0      10      11
3  Nevada  2001   3.0       0       1


In [15]:
# The indexes of both sides can serve as the join keys at the same time.
left2 = pd.DataFrame(np.arange(1., 7.).reshape(3, 2),
                     index=list('ace'),
                     columns=['Ohio', 'Nevada'])
right2 = pd.DataFrame(np.arange(7., 15.).reshape(4, 2),
                      index=list('bcde'),
                      columns=['Missouri', 'Alabama'])

index_join = {'left_index': True, 'right_index': True}
merge4 = pd.merge(left2, right2, how='outer', **index_join)
print(left2, right2, merge4, sep='\n\n')



   Ohio  Nevada
a   1.0     2.0
c   3.0     4.0
e   5.0     6.0

   Missouri  Alabama
b       7.0      8.0
c       9.0     10.0
d      11.0     12.0
e      13.0     14.0

   Ohio  Nevada  Missouri  Alabama
a   1.0     2.0       NaN      NaN
b   NaN     NaN       7.0      8.0
c   3.0     4.0       9.0     10.0
d   NaN     NaN      11.0     12.0
e   5.0     6.0      13.0     14.0


In [19]:
# DataFrame has a `join` method that merges on the index.
# Its default join type is a left join ('left').

# Same result as pd.merge(left2, right2, how='outer', left_index=True, right_index=True).
outer_joined = left2.join(right2, how='outer')
print(outer_joined)

# join can also match one column of the calling frame against the index of the other frame.
column_joined = left1.join(right1, on='key', how='right')
print('\n', left1, right1, column_joined, sep='\n')

# A list of DataFrames can be passed to join as well.
another = pd.DataFrame(
    [[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
    index=['a', 'c', 'e', 'f'],
    columns=['New York', 'Oregon'],
)
print(left2, right2, another, sep='\n', end='\n\n')

join_cases = (
    ([right2, another], {}),                 # default (left) join
    ([another, right2], {}),                 # same frames, other order
    ([right2, another], {'how': 'outer'}),   # outer join across all three
)
for others, options in join_cases:
    print(left2.join(others, **options))

   Ohio  Nevada  Missouri  Alabama
a   1.0     2.0       NaN      NaN
b   NaN     NaN       7.0      8.0
c   3.0     4.0       9.0     10.0
d   NaN     NaN      11.0     12.0
e   5.0     6.0      13.0     14.0


  key  value
0   a      0
1   b      1
2   a      2
3   a      3
4   b      4
5   c      5
   group_val
a        3.5
b        7.0
  key  value  group_val
0   a      0        3.5
2   a      2        3.5
3   a      3        3.5
1   b      1        7.0
4   b      4        7.0
   Ohio  Nevada
a   1.0     2.0
c   3.0     4.0
e   5.0     6.0
   Missouri  Alabama
b       7.0      8.0
c       9.0     10.0
d      11.0     12.0
e      13.0     14.0
   New York  Oregon
a       7.0     8.0
c       9.0    10.0
e      11.0    12.0
f      16.0    17.0

   Ohio  Nevada  Missouri  Alabama  New York  Oregon
a   1.0     2.0       NaN      NaN       7.0     8.0
c   3.0     4.0       9.0     10.0       9.0    10.0
e   5.0     6.0      13.0     14.0      11.0    12.0
   Ohio  Nevada  New York  Orego

### 2.3 沿轴向连接

In [20]:
# Everything above combines data by matching keys.
# The other way to combine data is concatenation (stacking) along an axis.
arr = np.arange(12).reshape((3, 4))

# NumPy arrays are stacked with np.concatenate; axis=1 places the copies side by side.
side_by_side = np.concatenate([arr, arr], axis=1)
print(side_by_side)

[[ 0  1  2  3  0  1  2  3]
 [ 4  5  6  7  4  5  6  7]
 [ 8  9 10 11  8  9 10 11]]


In [35]:
# pandas.concat does the same kind of stacking for pandas objects.
s1 = pd.Series([0, 1, 2], index=list('abc'))
s2 = pd.Series([2, 3, 4], index=list('cde'))
s3 = pd.Series([5, 6], index=list('fg'))

pieces = [s1, s2, s3]
print(pd.concat(pieces))                    # stacks along axis=0 by default
print(pd.concat(pieces, axis=1), end='\n\n')  # along axis=1 the result is a DataFrame

# concat uses an outer join by default; the `join` argument changes that.
s4 = pd.concat([s1, s3])
print(s4)
pair = [s1, s4]
print(pd.concat(pair, axis=1))
print(pd.concat(pair, axis=1, join='inner'))

# `keys` labels each piece: along axis=0 the labels form an outer index level,
# along axis=1 they become the column names.
labelled = [s1, s1, s3]
labels = ['one', 'two', 'three']
print(pd.concat(labelled, keys=labels))
print(pd.concat(labelled, keys=labels, axis=1))

a    0
b    1
c    2
c    2
d    3
e    4
f    5
g    6
dtype: int64
     0    1    2
a  0.0  NaN  NaN
b  1.0  NaN  NaN
c  2.0  2.0  NaN
d  NaN  3.0  NaN
e  NaN  4.0  NaN
f  NaN  NaN  5.0
g  NaN  NaN  6.0

a    0
b    1
c    2
f    5
g    6
dtype: int64
     0  1
a  0.0  0
b  1.0  1
c  2.0  2
f  NaN  5
g  NaN  6
   0  1
a  0  0
b  1  1
c  2  2
one    a    0
       b    1
       c    2
two    a    0
       b    1
       c    2
three  f    5
       g    6
dtype: int64
   one  two  three
a  0.0  0.0    NaN
b  1.0  1.0    NaN
c  2.0  2.0    NaN
f  NaN  NaN    5.0
g  NaN  NaN    6.0


In [2]:
# pandas.concat works on DataFrames in the same way.
df1 = pd.DataFrame(np.arange(6).reshape(3, 2),
                   index=list('abc'),
                   columns=['one', 'two'])
df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2),
                   index=list('ac'),
                   columns=['three', 'four'])

level_labels = ['level1', 'level2']
print(pd.concat([df1, df2], axis=1, keys=level_labels))

# A dict can be passed instead of a list; its keys are then used as `keys`.
by_label = {'level1': df1, 'level2': df2}
print(pd.concat(by_label, axis=1))

# The `names` argument names the resulting index levels.
print(pd.concat([df1, df2], axis=1, keys=level_labels, names=['upper', 'lower']))

  level1     level2     
     one two  three four
a      0   1    5.0  6.0
b      2   3    NaN  NaN
c      4   5    7.0  8.0
  level1     level2     
     one two  three four
a      0   1    5.0  6.0
b      2   3    NaN  NaN
c      4   5    7.0  8.0
upper level1     level2     
lower    one two  three four
a          0   1    5.0  6.0
b          2   3    NaN  NaN
c          4   5    7.0  8.0


In [5]:
# ignore_index: discard the row labels of the inputs and give the result a
# fresh index whose length is the total number of concatenated rows.
df1 = pd.DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])
print('df1:', df1, 'df2:', df2, sep='\n')

# ignore_index is a boolean flag, so pass True rather than the integer 1.
# Columns are aligned by name; df2 has no 'c', so its rows get NaN there.
print('df1+df2:', pd.concat([df1, df2], ignore_index=True), sep='\n')

df1:
          a         b         c         d
0 -0.099360 -0.822586 -0.633260 -0.160680
1 -0.540328 -0.474174 -1.008406 -0.539455
2 -0.476782 -2.086547 -1.892256 -1.230668
df2:
          b         d         a
0  0.626185 -0.199051 -1.471502
1 -0.431134  0.367462 -3.135466
df1+df2:
          a         b         c         d
0 -0.099360 -0.822586 -0.633260 -0.160680
1 -0.540328 -0.474174 -1.008406 -0.539455
2 -0.476782 -2.086547 -1.892256 -1.230668
3 -1.471502  0.626185       NaN -0.199051
4 -3.135466 -0.431134       NaN  0.367462


### 2.4 联合重叠数据