In [5]:
import numpy as np
import pandas as pd

### 1.层次化索引

In [43]:
# Series with a hierarchical (two-level) index: the outer level holds letters,
# the inner level holds integers.
# The values come from a locally seeded generator so that every
# "Restart Kernel -> Run All" produces exactly the same numbers.
ser = pd.Series(
    np.random.default_rng(42).standard_normal(9),
    index=[
        ['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
        [1, 2, 3, 1, 2, 1, 2, 2, 3],
    ],
)
ser

a  1   -1.402150
   2    1.031333
   3    0.035888
b  1    0.761838
   2   -0.114161
c  1    1.028216
   2    1.349692
d  2   -1.802057
   3    2.378148
dtype: float64

In [44]:
# Select everything under outer label "a"; the result is indexed by the inner level only.
ser.loc["a"]

1   -1.402150
2    1.031333
3    0.035888
dtype: float64

In [47]:
# Label slice on the outer level; both endpoints ("b" and "c") are included.
ser.loc["b":"c"]

b  1    0.761838
   2   -0.114161
c  1    1.028216
   2    1.349692
dtype: float64

In [48]:
# Pick several outer labels at once by passing a list; both index levels are kept.
ser.loc[["b", "d"]]

b  1    0.761838
   2   -0.114161
d  2   -1.802057
   3    2.378148
dtype: float64

In [49]:
# Ignore the outer level: take inner label 2 under every outer label.
ser.loc[:, 2]

a    1.031333
b   -0.114161
c    1.349692
d   -1.802057
dtype: float64

In [50]:
# Demo frame: "a" counts up, "b" counts down, and "c"/"d" are the columns
# that the following cells turn into a two-level row index.
frame = pd.DataFrame(dict(
    a=range(7),
    b=range(7, 0, -1),
    c=["one", "one", "one", "one", "two", "two", "two"],
    d=[0, 1, 2, 0, 1, 2, 3],
))
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,one,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [54]:
# Build a two-level row index from columns "c" and "d".
# drop=True (the default) removes those columns from the frame once they
# have become the index.
frame2 = frame.set_index(keys=["c", "d"], drop=True)
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
one,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [55]:
# Same index as above, but drop=False keeps "c" and "d" as ordinary columns too.
frame3 = frame.set_index(keys=["c", "d"], drop=False)
frame3

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
one,0,3,4,one,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [56]:
# reset_index is the inverse of set_index: the index levels "c" and "d" are moved
# back into regular columns (placed first) and a default integer index is restored.
frame2.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,one,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


### 2.数据连接

In [58]:
# Left-hand table for the join examples: payload columns "A" and "B"
# plus the join column "key" (K0..K3).
left = pd.DataFrame({
    "A": [f"A{i}" for i in range(4)],
    "B": [f"B{i}" for i in range(4)],
    "key": [f"K{i}" for i in range(4)],
})
left

Unnamed: 0,A,B,key
0,A0,B0,K0
1,A1,B1,K1
2,A2,B2,K2
3,A3,B3,K3


In [59]:
# Right-hand table for the join examples: payload columns "C" and "D"
# plus the same join column "key" (K0..K3).
right = pd.DataFrame({
    "C": [f"C{i}" for i in range(4)],
    "D": [f"D{i}" for i in range(4)],
    "key": [f"K{i}" for i in range(4)],
})
right

Unnamed: 0,C,D,key
0,C0,D0,K0
1,C1,D1,K1
2,C2,D2,K2
3,C3,D3,K3


#### merge的使用

In [60]:
# With no join key given, merge joins on the columns the two frames
# have in common (here only "key").
left.merge(right)

Unnamed: 0,A,B,key,C,D
0,A0,B0,K0,C0,D0
1,A1,B1,K1,C1,D1
2,A2,B2,K2,C2,D2
3,A3,B3,K3,C3,D3


In [61]:
# Name the join column explicitly with the `on` parameter.
left.merge(right, on="key")

Unnamed: 0,A,B,key,C,D
0,A0,B0,K0,C0,D0
1,A1,B1,K1,C1,D1
2,A2,B2,K2,C2,D2
3,A3,B3,K3,C3,D3


In [62]:
# Left table with a two-column key (k1, k2); written row by row so the
# literal mirrors the rendered table.
l2 = pd.DataFrame(
    [
        ("A0", "B0", "K0", "K0"),
        ("A1", "B1", "K0", "K1"),
        ("A2", "B2", "K2", "K2"),
        ("A3", "B3", "K3", "K3"),
    ],
    columns=["A", "B", "k1", "k2"],
)
l2

Unnamed: 0,A,B,k1,k2
0,A0,B0,K0,K0
1,A1,B1,K0,K1
2,A2,B2,K2,K2
3,A3,B3,K3,K3


In [65]:
# Right table with the same two-column key (k1, k2), written row by row.
r2 = pd.DataFrame(
    [
        ("C0", "D0", "K0", "K0"),
        ("C1", "D1", "K1", "K1"),
        ("C2", "D2", "K2", "K2"),
        ("C3", "D3", "K3", "K3"),
    ],
    columns=["C", "D", "k1", "k2"],
)
r2

Unnamed: 0,C,D,k1,k2
0,C0,D0,K0,K0
1,C1,D1,K1,K1
2,C2,D2,K2,K2
3,C3,D3,K3,K3


In [67]:
# Join on several key columns at once.
# Rows whose (k1, k2) pair has no match in the other table are dropped.
l2.merge(r2, on=["k1", "k2"])

Unnamed: 0,A,B,k1,k2,C,D
0,A0,B0,K0,K0,C0,D0
1,A2,B2,K2,K2,C2,D2
2,A3,B3,K3,K3,C3,D3


In [68]:
# Left table for the join-type (`how`) examples, written row by row.
l3 = pd.DataFrame(
    [
        ("A0", "B0", "K0", "K0"),
        ("A1", "B1", "K0", "K1"),
        ("A2", "B2", "K1", "K0"),
        ("A3", "B3", "K2", "K1"),
    ],
    columns=["A", "B", "k1", "k2"],
)
l3

Unnamed: 0,A,B,k1,k2
0,A0,B0,K0,K0
1,A1,B1,K0,K1
2,A2,B2,K1,K0
3,A3,B3,K2,K1


In [69]:
# Right table for the join-type (`how`) examples; the pair (K1, K0) appears twice.
r3 = pd.DataFrame(
    [
        ("C0", "D0", "K0", "K0"),
        ("C1", "D1", "K1", "K0"),
        ("C2", "D2", "K1", "K0"),
        ("C3", "D3", "K2", "K0"),
    ],
    columns=["C", "D", "k1", "k2"],
)
r3

Unnamed: 0,C,D,k1,k2
0,C0,D0,K0,K0
1,C1,D1,K1,K0
2,C2,D2,K1,K0
3,C3,D3,K2,K0


In [71]:
# The `how` parameter selects the join type.
# Left join: every row of l3 is kept; C and D are NaN where r3 has no matching (k1, k2).
l3.merge(r3, how="left", on=["k1", "k2"])

Unnamed: 0,A,B,k1,k2,C,D
0,A0,B0,K0,K0,C0,D0
1,A1,B1,K0,K1,,
2,A2,B2,K1,K0,C1,D1
3,A2,B2,K1,K0,C2,D2
4,A3,B3,K2,K1,,


In [72]:
# The `how` parameter selects the join type.
# Right join: every row of r3 is kept; A and B are NaN where l3 has no matching (k1, k2).
l3.merge(r3, how="right", on=["k1", "k2"])

Unnamed: 0,A,B,k1,k2,C,D
0,A0,B0,K0,K0,C0,D0
1,A2,B2,K1,K0,C1,D1
2,A2,B2,K1,K0,C2,D2
3,,,K2,K0,C3,D3


In [73]:
# The `how` parameter selects the join type.
# Inner join (what merge does when `how` is omitted): only key pairs present in both tables survive.
l3.merge(r3, how="inner", on=["k1", "k2"])

Unnamed: 0,A,B,k1,k2,C,D
0,A0,B0,K0,K0,C0,D0
1,A2,B2,K1,K0,C1,D1
2,A2,B2,K1,K0,C2,D2


In [74]:
# The `how` parameter selects the join type.
# Full outer join: rows from both tables are kept, with NaN on whichever side has no match.
l3.merge(r3, how="outer", on=["k1", "k2"])

Unnamed: 0,A,B,k1,k2,C,D
0,A0,B0,K0,K0,C0,D0
1,A1,B1,K0,K1,,
2,A2,B2,K1,K0,C1,D1
3,A2,B2,K1,K0,C2,D2
4,A3,B3,K2,K1,,
5,,,K2,K0,C3,D3


In [75]:
# Left table whose non-key column "data" has the same name as a column in r4.
l4 = pd.DataFrame(dict(
    data=[1, 7, 1, 2, 8, 5, 0],
    key=list("bbacaab"),
))
l4

Unnamed: 0,data,key
0,1,b
1,7,b
2,1,a
3,2,c
4,8,a
5,5,a
6,0,b


In [76]:
# Right table; it also has a "data" column, so a merge must rename the overlap.
r4 = pd.DataFrame(dict(
    data=[0, 4, 7],
    key=["a", "b", "d"],
))
r4

Unnamed: 0,data,key
0,0,a
1,4,b
2,7,d


In [77]:
# How overlapping (non-key) column names are handled:
# by default they get the suffixes "_x" (left) and "_y" (right).
l4.merge(r4, on="key")

Unnamed: 0,data_x,key,data_y
0,1,b,4
1,7,b,4
2,0,b,4
3,1,a,0
4,8,a,0
5,5,a,0


In [78]:
# Use the `suffixes` parameter to choose the suffixes appended to overlapping column names.
l4.merge(r4, on="key", suffixes=("_left", "_right"))

Unnamed: 0,data_left,key,data_right
0,1,b,4
1,7,b,4
2,0,b,4
3,1,a,0
4,8,a,0
5,5,a,0


In [79]:
# Left table for the column-to-index join; its "key" column is matched
# against the row index of r5.
l5 = pd.DataFrame(dict(
    data=[1, 7, 1, 2, 8, 5, 0],
    key=["b", "b", "a", "c", "a", "a", "b"],
))
l5

Unnamed: 0,data,key
0,1,b
1,7,b
2,1,a
3,2,c
4,8,a
5,5,a
6,0,b


In [80]:
# Right table whose join key lives in the row index ("a", "b", "d")
# rather than in a column.
r5 = pd.Series([0, 4, 7], index=["a", "b", "d"], name="data").to_frame()
r5

Unnamed: 0,data
a,0
b,4
d,7


In [82]:
# Join the "key" column of the left table against the row index of the right table.
# left_on names the left table's join column;
# right_index=True says the right table's join key is its row index.
l5.merge(r5, left_on="key", right_index=True)

Unnamed: 0,data_x,key,data_y
0,1,b,4
1,7,b,4
6,0,b,4
2,1,a,0
4,8,a,0
5,5,a,0
