# 4 Pandas的索引操作

In [1]:
import pandas as pd
import numpy as np

In [2]:
dict_data = {'A': 1,
             'B': pd.Timestamp('20190926'),
             'C': pd.Series(1, index=list(range(4)),dtype='float32'),
             'D': np.array([1,2,3,4],dtype='int32'),
             'E': ["Python","Java","C++","C"],
             'F': 'wangdao' }
df_obj2 = pd.DataFrame(dict_data)
print(df_obj2.index)

Index([0, 1, 2, 3], dtype='int64')


In [3]:
# 索引对象的值不可变（上面代码增加）
# df_obj2.index[0] = 2    # TypeError: Index does not support mutable operations

TypeError: Index does not support mutable operations

# 3 常见的Index种类
•Index，索引  可以是各种类型
•Int64Index，整数索引
•MultiIndex，层级索引，难度较大
•DatetimeIndex，时间戳类型

In [4]:
ser_obj = pd.Series(range(5), index = list("abcde"))
print(ser_obj)
ser_obj.index

a    0
b    1
c    2
d    3
e    4
dtype: int64


Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [5]:
# 行索引，不仅可以用索引名，可以用索引位置或来取
print(ser_obj['b']) # 索引名
print(ser_obj[2]) # 位置索引

1
2


  print(ser_obj[2]) # 位置索引


In [9]:
print(ser_obj.loc['b']) # 索引名
print(ser_obj.iloc[2]) # 位置索引

1
2


In [8]:
# 切片索引
print(ser_obj.iloc[1:3])  # 索引位置取数据，左闭右开
print(ser_obj.loc['b':'d'])  # 索引名  左闭右闭

b    1
c    2
dtype: int64
b    1
c    2
d    3
dtype: int64


In [7]:
# 不连续索引
print(ser_obj.iloc[[0, 2, 4]])
print(ser_obj.loc[['a', 'e']])

a    0
c    2
e    4
dtype: int64
a    0
e    4
dtype: int64


In [6]:
# 布尔索引
ser_bool = ser_obj > 2
print(ser_obj)
print(ser_bool)

a    0
b    1
c    2
d    3
e    4
dtype: int64
a    False
b    False
c    False
d     True
e     True
dtype: bool


In [10]:
print('-'*50)
print(ser_obj[ser_bool])

print(ser_obj[ser_obj > 2]) #取出大于2的元素

--------------------------------------------------
d    3
e    4
dtype: int64
d    3
e    4
dtype: int64


## 4.4 DataFrame索引

In [11]:
import numpy as np
df_obj = pd.DataFrame(np.random.randn(5,4),
                      columns = ['a', 'b', 'c', 'd'])
print(df_obj.head())

          a         b         c         d
0  0.490484  0.028650 -1.179762  1.017809
1  0.343683  0.783270  1.759397 -0.081700
2  1.787474  0.429684  0.537253 -0.204415
3 -0.763084 -1.415882 -0.231624  0.524957
4 -1.964341  0.642146 -0.404725 -1.477686


In [12]:
# 列索引
print(df_obj['a']) # 返回Series类型
print('-'*50)
print(df_obj[['a']]) # 返回DataFrame类型
print('-'*50)
print(type(df_obj[['a']])) # 返回DataFrame类型

0    0.490484
1    0.343683
2    1.787474
3   -0.763084
4   -1.964341
Name: a, dtype: float64
--------------------------------------------------
          a
0  0.490484
1  0.343683
2  1.787474
3 -0.763084
4 -1.964341
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>


# loc 标签索引(通过索引标签值获取数据)

In [13]:
# 标签索引 loc，建议使用loc，效率更高
# Series
print(ser_obj)
print(ser_obj['b':'d'])
print(ser_obj.loc['b':'d']) #前闭后闭
print('-'*50)

a    0
b    1
c    2
d    3
e    4
dtype: int64
b    1
c    2
d    3
dtype: int64
b    1
c    2
d    3
dtype: int64
--------------------------------------------------


In [14]:
# DataFrame
df_obj = pd.DataFrame(np.random.randn(5,4),
                      columns = list('abcd'),
                      index=list('abcde'))
print(df_obj)
print('-'*50)
print(df_obj['a'])  # 建议不用,拿的是列
print('-'*50)
print(df_obj.loc['a'])  #拿的是行
print('-'*50)

          a         b         c         d
a  0.076546  0.454789  0.195572 -0.450544
b  1.097000 -0.180090  1.585239  0.163251
c  0.352831  0.479101  2.489005  1.384063
d  0.490114  0.670316 -1.071153  0.440258
e -0.422647 -0.556205  0.136639 -0.707411
--------------------------------------------------
a    0.076546
b    1.097000
c    0.352831
d    0.490114
e   -0.422647
Name: a, dtype: float64
--------------------------------------------------
a    0.076546
b    0.454789
c    0.195572
d   -0.450544
Name: a, dtype: float64
--------------------------------------------------


In [15]:
# 第一个参数索引行，第二个参数是列,loc或者iloc效率高于直接用取下标的方式，前闭后闭
print(df_obj.loc['a':'c', 'b':'d']) #连续索引
print(df_obj.loc[['a','c'], ['b','d']]) #不连续索引
print(df_obj.loc[['c'],['b']]) #取一个值,返回的是DataFrame类型
print(df_obj.loc['c','b'])  #取一个值

          b         c         d
a  0.454789  0.195572 -0.450544
b -0.180090  1.585239  0.163251
c  0.479101  2.489005  1.384063
          b         d
a  0.454789 -0.450544
c  0.479101  1.384063
          b
c  0.479101
0.479100665405996


## iloc 位置索引

In [16]:
print('-'*50)
# Series
print(ser_obj[1:3])
print('-'*50)
print(ser_obj.iloc[1:3]) # 前闭后开[)，效率高

--------------------------------------------------
b    1
c    2
dtype: int64
--------------------------------------------------
b    1
c    2
dtype: int64


In [17]:
df_obj

Unnamed: 0,a,b,c,d
a,0.076546,0.454789,0.195572,-0.450544
b,1.097,-0.18009,1.585239,0.163251
c,0.352831,0.479101,2.489005,1.384063
d,0.490114,0.670316,-1.071153,0.440258
e,-0.422647,-0.556205,0.136639,-0.707411


In [18]:
# DataFrame，iloc是前闭后开[)
print(df_obj)
print('-'*50)
print(df_obj.iloc[0:2, 0:2]) 
print('-'*50)
print(df_obj.iloc[[0,2], [0,2]]) # 不连续索引
print('-'*50)
print(df_obj.iloc[0,0]) # 取一个值

          a         b         c         d
a  0.076546  0.454789  0.195572 -0.450544
b  1.097000 -0.180090  1.585239  0.163251
c  0.352831  0.479101  2.489005  1.384063
d  0.490114  0.670316 -1.071153  0.440258
e -0.422647 -0.556205  0.136639 -0.707411
--------------------------------------------------
          a         b
a  0.076546  0.454789
b  1.097000 -0.180090
--------------------------------------------------
          a         c
a  0.076546  0.195572
c  0.352831  2.489005
--------------------------------------------------
0.07654595904380475


In [19]:
# 没有设置行和列索引的DataFrame，iloc和loc的区别
df_obj2 = pd.DataFrame(np.random.randn(5,4))
print(df_obj2)
print('-'*50)
print(df_obj2.iloc[0:2]) #左闭右开 2行
print('-'*50)
print(df_obj2.loc[0:2]) #左闭右闭 3行

          0         1         2         3
0  0.503790 -1.538214 -0.413093  2.279199
1 -0.485180  1.115343 -1.224033  1.466597
2 -0.216464 -2.426223 -0.732580  0.109576
3 -0.239319 -0.284087  0.369931 -0.620841
4  1.188975 -1.456563  0.633115 -1.656135
--------------------------------------------------
         0         1         2         3
0  0.50379 -1.538214 -0.413093  2.279199
1 -0.48518  1.115343 -1.224033  1.466597
--------------------------------------------------
          0         1         2         3
0  0.503790 -1.538214 -0.413093  2.279199
1 -0.485180  1.115343 -1.224033  1.466597
2 -0.216464 -2.426223 -0.732580  0.109576


# 5.对齐运算

In [20]:
import pandas as pd
s1 = pd.Series(range(10, 20), index = range(10))
s2 = pd.Series(range(20, 25), index = range(5))
# Series 对齐运算
print('s1+s2: ')
s3=s1+s2
print(s3)  # 缺失数据默认是NaN  np.nan

s1+s2: 
0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
dtype: float64


In [21]:
# 两个长度不同的一维ndarray相加
a1 = np.array([1,2,3,4,5])
a2 = np.array([1]) # 长度为1
print(a2.shape)
print(a1+a2)    # 广播

(1,)
[2 3 4 5 6]


In [22]:
print(s2)
s1

0    20
1    21
2    22
3    23
4    24
dtype: int64


0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int64

In [23]:
print(np.isnan(s3[6]))
print('-'*50)
print(s2.add(s1, fill_value = 0))  #未对齐的数据将和填充值做运算
print(s2.sub(s1, fill_value = 0))

True
--------------------------------------------------
0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5    15.0
6    16.0
7    17.0
8    18.0
9    19.0
dtype: float64
0    10.0
1    10.0
2    10.0
3    10.0
4    10.0
5   -15.0
6   -16.0
7   -17.0
8   -18.0
9   -19.0
dtype: float64


In [24]:
# df的对齐运算
import numpy as np
df1 = pd.DataFrame(np.ones((2,2)), columns = ['a', 'b'])
df2 = pd.DataFrame(np.ones((3,3)), columns = ['a', 'b', 'c'])
print(df1)
print(df2)
print('-'*50)
print(df2.dtypes)
print(df1-df2)
print(df2.sub(df1, fill_value = 2)) #未对齐的数据将和填充值做运算

     a    b
0  1.0  1.0
1  1.0  1.0
     a    b    c
0  1.0  1.0  1.0
1  1.0  1.0  1.0
2  1.0  1.0  1.0
--------------------------------------------------
a    float64
b    float64
c    float64
dtype: object
     a    b   c
0  0.0  0.0 NaN
1  0.0  0.0 NaN
2  NaN  NaN NaN
     a    b    c
0  0.0  0.0 -1.0
1  0.0  0.0 -1.0
2 -1.0 -1.0 -1.0


# 总结：没对齐的元素，默认填充NaN，对齐运算时，fill_value参数可以指定填充值。