In [2]:
import numpy as np

In [3]:
import pandas as pd

In [4]:
# Function application and mapping

In [5]:
# Sample 4x3 frame of standard-normal draws. A dedicated, seeded RandomState keeps
# the values identical across Restart & Run All without touching NumPy's global RNG.
frame = pd.DataFrame(
    np.random.RandomState(0).randn(4, 3),
    columns=list('bde'),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
)

In [6]:
# Display the DataFrame built in the previous cell.
frame

Unnamed: 0,b,d,e
Utah,-1.1274,0.185173,-0.865019
Ohio,0.457054,-1.362572,-0.069823
Texas,-1.187746,1.950906,0.808646
Oregon,1.570549,-1.463818,0.209192


In [7]:
# NumPy ufuncs (element-wise array functions) also work on pandas objects.
np.absolute(frame)

Unnamed: 0,b,d,e
Utah,1.1274,0.185173,0.865019
Ohio,0.457054,1.362572,0.069823
Texas,1.187746,1.950906,0.808646
Oregon,1.570549,1.463818,0.209192


In [8]:
def f(x):
    """Return the spread of ``x``: its maximum minus its minimum."""
    return x.max() - x.min()

In [9]:
# f is called once per column (axis='index' is the default direction).
frame.apply(f, axis='index')

b    2.758295
d    3.414724
e    1.673665
dtype: float64

In [10]:
# f is called once per row.
frame.apply(f, axis=1)

Utah      1.312573
Ohio      1.819626
Texas     3.138653
Oregon    3.034366
dtype: float64

In [11]:
def f(x):
    """Return the minimum and maximum of ``x`` as a Series labelled 'min' and 'max'."""
    lowest, highest = x.min(), x.max()
    return pd.Series([lowest, highest], index=['min', 'max'])

In [12]:
# The function passed to apply may return a Series holding several values;
# each column then contributes one column of the resulting DataFrame.
frame.apply(f, axis=0)

Unnamed: 0,b,d,e
min,-1.187746,-1.463818,-0.865019
max,1.570549,1.950906,0.808646


In [13]:
# NOTE: this name shadows the built-in format() for the rest of the notebook;
# later cells refer to it as `format`, so the name is kept.
def format(x):
    """Render ``x`` as a string with two digits after the decimal point."""
    return '%.2f' % x

In [14]:
# Element-wise Python functions can be applied too. DataFrame.applymap was
# deprecated in pandas 2.1 in favour of DataFrame.map, so use map when this
# pandas provides it and fall back to applymap on older versions.
elementwise = frame.map if hasattr(pd.DataFrame, 'map') else frame.applymap
elementwise(format)

Unnamed: 0,b,d,e
Utah,-1.13,0.19,-0.87
Ohio,0.46,-1.36,-0.07
Texas,-1.19,1.95,0.81
Oregon,1.57,-1.46,0.21


In [15]:
# The DataFrame method was named applymap because Series already has an
# element-wise map method.
frame.loc[:, 'e'].map(format)

Utah      -0.87
Ohio      -0.07
Texas      0.81
Oregon     0.21
Name: e, dtype: object

In [16]:
# Sorting and ranking

In [17]:
# sort_index sorts lexicographically by row or column labels and returns a new, sorted object.

In [18]:
# Values 0..3 under a deliberately unsorted label index.
obj = pd.Series(data=range(4), index=list('dabc'))

In [19]:
# Sort by index label (ascending is the default).
obj.sort_index(ascending=True)

a    1
b    2
c    3
d    0
dtype: int64

In [20]:
# 2x4 frame of 0..7 whose row and column labels are both out of order.
frame = pd.DataFrame(
    np.arange(8).reshape(2, 4),
    index=['three', 'one'],
    columns=list('dabc'),
)

In [21]:
# Display the unsorted frame for comparison with the sorted results below.
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [22]:
# Sort the rows by their index labels.
frame.sort_index(axis=0)

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [23]:
# Sort the columns by their labels.
frame.sort_index(axis='columns')

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [24]:
# Sort the columns in descending label order; ascending is the default.
frame.sort_index(ascending=False, axis='columns')

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [25]:
# Small integer Series with the default 0..3 index.
obj = pd.Series(data=[4, 7, -3, 2])

In [26]:
# Sort the Series by its values rather than by its index.
obj.sort_values(ascending=True)

2   -3
3    2
0    4
1    7
dtype: int64

In [27]:
# Same idea as before, but with two missing values mixed in.
obj = pd.Series(
    data=[4, np.nan, 7, np.nan, -3, 2],
)

In [28]:
# By default, missing values are sorted to the end of the Series.
obj.sort_values(na_position='last')

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [29]:
# Two-column frame used to demonstrate sorting by column values.
frame = pd.DataFrame(
    data={
        'b': [4, 7, -3, 2],
        'a': [0, 1, 0, 1],
    },
)

In [30]:
# Display the frame before sorting.
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [31]:
# Pass one or more column names to `by` to choose the sort key(s).
frame.sort_values(by='b', ascending=True)

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [32]:
# Sort by column 'a' first, then by 'b' within equal values of 'a'.
frame.sort_values(['a', 'b'])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


In [33]:
# Ranking assigns ranks from 1 up to the number of valid data points in an array.

In [34]:
# Series containing tied values (two 7s, two 4s) to illustrate tie-breaking.
obj = pd.Series(data=[7, -5, 7, 4, 2, 0, 4])

In [35]:
# Display the Series that the ranking examples below operate on.
obj

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [36]:
# rank() breaks ties by giving every member of a tied group the mean of their ranks:
# the two 7s occupy positions 6 and 7, so each is ranked 6.5.
obj.rank(method='average')

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [37]:
# Tied values are ranked in the order they appear in the data
# (values are ranked smallest-first, ties then by position).
obj.rank(method='first', ascending=True)

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [38]:
# Every member of a tied group receives the group's highest rank.
obj.rank(axis=0, method='max')

0    7.0
1    1.0
2    7.0
3    5.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [39]:
# Every member of a tied group receives the group's lowest rank.
obj.rank(axis=0, method='min')

0    6.0
1    1.0
2    6.0
3    4.0
4    3.0
5    2.0
6    4.0
dtype: float64

In [40]:
# Like 'min', but the rank increases by exactly 1 from one group of equal
# values to the next, so no rank numbers are skipped.
obj.rank(axis=0, method='dense')

0    5.0
1    1.0
2    5.0
3    4.0
4    3.0
5    2.0
6    4.0
dtype: float64

In [41]:
# Rank in descending order (largest value first); ties take the group's highest rank.
obj.rank(method='max', ascending=False)

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [42]:
# Three-column frame used to demonstrate ranking across an axis.
frame = pd.DataFrame(
    data={
        'b': [4.3, 7, -3, 2],
        'a': [0, 1, 0, 1],
        'c': [-2, 5, 8, -2.5],
    },
)

In [43]:
# Display the frame before ranking.
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [44]:
# A DataFrame can be ranked over rows or over columns; here the values are
# ranked within each row.
frame.rank(axis=1)

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


In [45]:
# Axis indexes with duplicate labels

In [47]:
# Values 0..4 under an index in which 'a' and 'b' each appear twice.
obj = pd.Series(data=range(5), index=list('aabbc'))

In [48]:
# Display the Series with its duplicated index labels.
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [50]:
obj.index.is_unique       # whether every index label is unique

False

In [51]:
# 'a' labels more than one entry, so the lookup returns a Series.
obj.loc['a']

a    0
a    1
dtype: int64

In [52]:
# 'c' labels exactly one entry, so the lookup returns a scalar.
obj.loc['c']

4

In [53]:
# 4x3 frame of standard-normal draws under a deliberately duplicated row index.
# A dedicated, seeded RandomState makes the values reproducible on re-run
# without altering NumPy's global random state.
df = pd.DataFrame(
    np.random.RandomState(1).randn(4, 3),
    index=['a', 'a', 'b', 'b'],
)

In [54]:
# Display the frame with its duplicated row labels.
df

Unnamed: 0,0,1,2
a,0.653897,-0.756064,0.45803
a,1.261053,0.166349,0.892311
b,0.016842,0.67389,0.870424
b,0.529081,-1.112633,0.917324


In [55]:
# Selecting a duplicated row label returns every matching row, all columns.
df.loc['b', :]

Unnamed: 0,0,1,2
b,0.016842,0.67389,0.870424
b,0.529081,-1.112633,0.917324
