In [1]:
import numpy as np
import pandas as pd

### apply和applymap

#### 1.apply函数

apply可以将函数操作作用到具体的数据上，但是仅限于数据对象的axis轴方向上进行运算，不能作用到数据对象的每个元素上。例如：在二维数据的情况下，默认表示列方向的操作。

In [15]:
df = pd.DataFrame(np.random.randint(60, 100, size = (5, 4)), columns = ['Math', 'Chinese', 'English', 'PE'], index = list("ABCDE"))
df

Unnamed: 0,Math,Chinese,English,PE
A,62,90,84,79
B,99,97,68,74
C,97,76,98,63
D,90,73,70,95
E,64,73,69,99


In [16]:
# 求每门学科的最高分
_max = lambda x: x.max()

df.apply(_max, axis = 0)

Math       99
Chinese    97
English    98
PE         99
dtype: int64

In [17]:
# 求每位同学的总分
_sum = lambda x: x.sum()

df.apply(_sum, axis = 1)

A    315
B    338
C    334
D    328
E    305
dtype: int64

#### 2.applymap函数

applymap函数会将函数操作作用的数据对象的每个数据元素上

In [19]:
df2 = pd.DataFrame(np.random.randint(50, 100, size = (5, 4)), columns = ['Math', 'Chinese', 'English', 'PE'], index = list("ABCDE"))
df2

Unnamed: 0,Math,Chinese,English,PE
A,54,73,50,59
B,54,89,66,75
C,79,96,78,79
D,85,96,95,58
E,73,91,88,81


In [20]:
# 判断每位同学每门课程的成绩是否及格，>=60分通过+号表示；<60分通过-号表示
_sign = lambda s: '-' if s < 60 else '+'

df2.applymap(_sign)

Unnamed: 0,Math,Chinese,English,PE
A,-,+,-,-
B,-,+,+,+
C,+,+,+,+
D,+,+,+,-
E,+,+,+,+


### 排序

#### 1.索引排序

In [21]:
# Series排序
s = pd.Series(np.arange(4), index = list("dbac"))
s

d    0
b    1
a    2
c    3
dtype: int64

In [24]:
# 默认升序排序
s.sort_index()

a    2
b    1
c    3
d    0
dtype: int64

In [25]:
# 降序排序
s.sort_index(ascending = False)

d    0
c    3
b    1
a    2
dtype: int64

In [26]:
# DataFrame排序
df2 = pd.DataFrame(np.arange(12).reshape(4, 3), columns = list("BCA"), index = list("bacd"))
df2

Unnamed: 0,B,C,A
b,0,1,2
a,3,4,5
c,6,7,8
d,9,10,11


In [29]:
# DataFrame默认按行升序排序
df2.sort_index()

Unnamed: 0,B,C,A
a,3,4,5
b,0,1,2
c,6,7,8
d,9,10,11


In [28]:
# DataFrame按列排序
df2.sort_index(axis = 1)

Unnamed: 0,A,B,C
b,2,0,1
a,5,3,4
c,8,6,7
d,11,9,10


#### 2.按值排序

In [30]:
# Series排序
s = pd.Series(np.array([65, 60, 91, 98]), index = list("dbac"))
s

d    65
b    60
a    91
c    98
dtype: int64

In [31]:
# 降序排序
s.sort_values(ascending = False)

c    98
a    91
d    65
b    60
dtype: int64

In [32]:
# 当数据中有缺失值，默认排序最后
s[1] = np.nan
s.sort_values(ascending = False)

c    98.0
a    91.0
d    65.0
b     NaN
dtype: float64

In [33]:
# DataFrame排序
df3 = pd.DataFrame(np.arange(12).reshape(4, 3), columns = list("BCA"), index = list("bacd"))
df3

Unnamed: 0,B,C,A
b,0,1,2
a,3,4,5
c,6,7,8
d,9,10,11


In [34]:
# 按列排序
# 按照A列降序排序
df3.sort_values(by = "A", ascending = False)

Unnamed: 0,B,C,A
d,9,10,11
c,6,7,8
a,3,4,5
b,0,1,2


In [35]:
# 按多列排序
# 按照B列和A列降序排序
df3.sort_values(by = ["B", "A"], ascending = False)

Unnamed: 0,B,C,A
d,9,10,11
c,6,7,8
a,3,4,5
b,0,1,2


### 唯一值和成员属性

In [36]:
s2 = pd.Series([2, 3, 4, 2, 2, 1, 1, 2])
s2

0    2
1    3
2    4
3    2
4    2
5    1
6    1
7    2
dtype: int64

In [37]:
# Series去除重复元素
s2.unique()

array([2, 3, 4, 1])

In [38]:
# 计算Series每个元素个素
s2.value_counts()

2    4
1    2
3    1
4    1
Name: count, dtype: int64

In [41]:
# 判断Series中是否存在某个元素
s2.isin([2])

0     True
1    False
2    False
3     True
4     True
5    False
6    False
7     True
dtype: bool

In [42]:
# 判断Series中是否存在多个元素
s2.isin([1, 2, 3])

0     True
1     True
2    False
3     True
4     True
5     True
6     True
7     True
dtype: bool

In [43]:
# DataFrame是否存在某个值
df4 = pd.DataFrame({
    "a": [0, -1, 3, 4],
    "b": [9, 3, 1, 4], 
    "c": [1, 5, 3, 1]
})
df4

Unnamed: 0,a,b,c
0,0,9,1
1,-1,3,5
2,3,1,3
3,4,4,1


In [44]:
df4.isin([-1, 0])

Unnamed: 0,a,b,c
0,True,False,False
1,True,False,False
2,False,False,False
3,False,False,False


### 处理缺失数据

In [65]:
df5 = pd.DataFrame(np.random.randint(0, 10, size = (5, 4)))
df5.iloc[0, 0] = np.nan
df5.iloc[3, 2] = np.nan
df5.iloc[4, 3] = np.nan
df5

Unnamed: 0,0,1,2,3
0,,9,3.0,1.0
1,1.0,2,8.0,6.0
2,2.0,1,9.0,5.0
3,7.0,3,,4.0
4,0.0,6,9.0,


#### 1.判断是否存在缺失值

In [66]:
df5.isnull()

Unnamed: 0,0,1,2,3
0,True,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,False,False,False,True


#### 2.丢弃缺失值

In [69]:
# 默认丢弃缺失值所在的行
df5.dropna()

Unnamed: 0,0,1,2,3
1,1.0,2,8.0,6.0
2,2.0,1,9.0,5.0


In [68]:
# 丢弃缺失值所在的列
df5.dropna(axis = 1)

Unnamed: 0,1
0,9
1,2
2,1
3,3
4,6


#### 3.填充缺失值

In [71]:
df5.fillna(-1)

Unnamed: 0,0,1,2,3
0,-1.0,9,3.0,1.0
1,1.0,2,8.0,6.0
2,2.0,1,9.0,5.0
3,7.0,3,-1.0,4.0
4,0.0,6,9.0,-1.0
