In [1]:
import numpy as np
import pandas as pd

### 一.缺失值处理

#### 1.判断缺失值

In [2]:
s1 = pd.Series(["a", np.nan, "b", "c"])
s1

0      a
1    NaN
2      b
3      c
dtype: object

In [12]:
# [isnull函数]: 判断缺失值
s1.isnull()

0    False
1     True
2    False
3    False
dtype: bool

#### 2.过滤缺失值

In [10]:
# [dropna函数]: 过滤缺失值
s1.dropna()

0    a
2    b
3    c
dtype: object

In [11]:
# 通过notnull函数加上数据对象索引完成过滤缺失值 
# 改方法和dropna函数等价
s1[s1.notnull()]

0    a
2    b
3    c
dtype: object

In [15]:
df1 = pd.DataFrame(np.random.randint(0, 10, 12).reshape(3, 4))
df1.iloc[0, 0] = np.nan
df1.iloc[2] = np.nan
df1

Unnamed: 0,0,1,2,3
0,,0.0,7.0,7.0
1,9.0,0.0,1.0,7.0
2,,,,


In [16]:
# 过滤包含缺失值的行
df1.dropna(axis = 0)

Unnamed: 0,0,1,2,3
1,9.0,0.0,1.0,7.0


In [21]:
# 仅仅过滤全部为缺失值的行
# 参数how指定为all表示过滤全部为缺失值的行
df1.dropna(axis = 0, how = "all")

Unnamed: 0,0,1,2,3
0,,0.0,7.0,7.0
1,9.0,0.0,1.0,7.0


In [22]:
# 仅仅过滤全部为缺失值的列
# 参数how指定为all表示过滤全部为缺失值的列
df1.dropna(axis = 1, how = "all")

Unnamed: 0,0,1,2,3
0,,0.0,7.0,7.0
1,9.0,0.0,1.0,7.0
2,,,,


In [27]:
df2 = pd.DataFrame(np.random.randint(0, 10, 12).reshape(3, 4))
df2.iloc[:2, 2] = np.nan
df2.iloc[1:3, 1] = np.nan
df2.iloc[0, 0] = np.nan
df2

Unnamed: 0,0,1,2,3
0,,1.0,,9
1,6.0,,,4
2,0.0,,4.0,1


In [29]:
# 仅仅过滤每列有两个缺失值的列
# thresh参数控制缺失值所在的行或者列有几个缺失值的行或者列会被删除，这里删除了第1列和第2列
df2.dropna(axis = 1, thresh = 2)

Unnamed: 0,0,3
0,,9
1,6.0,4
2,0.0,1


#### 3.填充缺失值

In [30]:
df3 = pd.DataFrame(np.random.randint(0, 10, 12).reshape(3, 4))
df3.iloc[:2, 2] = np.nan
df3.iloc[1:3, 1] = np.nan
df3.iloc[0, 0] = np.nan
df3

Unnamed: 0,0,1,2,3
0,,8.0,,8
1,4.0,,,2
2,6.0,,6.0,6


In [31]:
# [fillna函数] 填充缺失值
# 将含有缺失值全部替换为0
df3.fillna(0)

Unnamed: 0,0,1,2,3
0,0.0,8.0,0.0,8
1,4.0,0.0,0.0,2
2,6.0,0.0,6.0,6


In [37]:
# 痛过字典进行缺失值的填充
# 将第0列的缺失值替换为-1
# 将第1列的缺失值替换为-2
df3.fillna({
    0: -1,
    1: -2
})

Unnamed: 0,0,1,2,3
0,-1.0,8.0,,8
1,4.0,-2.0,,2
2,6.0,-2.0,6.0,6


In [39]:
df4 = pd.DataFrame(np.random.randint(0, 10, 12).reshape(4, 3))
df4.iloc[1:, 1] = np.nan
df4.iloc[2:, 2] = np.nan
df4

Unnamed: 0,0,1,2
0,4,2.0,5.0
1,7,,8.0
2,5,,
3,1,,


In [41]:
# 以行为单位
# 将第1，2，3行，第1列的缺失值替换为第0行第1列的2
# 将第2，3行，  第2列的缺失值替换为第0行第2列的5
# 通过method参数设置为ffill替换策略
df4.fillna(method = "ffill")

Unnamed: 0,0,1,2
0,4,2.0,5.0
1,7,2.0,8.0
2,5,2.0,8.0
3,1,2.0,8.0


In [42]:
# 以行为单位
# 将第1，2，3行，第1列的缺失值替换为第0行第1列的2 但是按行替换1个
# 将第2，3行，  第2列的缺失值替换为第0行第2列的5  但是按行替换1个
# 通过limit参数设置最多替换几个缺失值
df4.fillna(method = "ffill", limit = 1)

Unnamed: 0,0,1,2
0,4,2.0,5.0
1,7,2.0,8.0
2,5,,8.0
3,1,,


### 2.数据转换

#### 1.移除重复数据

In [43]:
df5 = pd.DataFrame({
    "k1": ["one", "two"] * 3 + ["two"],
    "k2": [1, 1, 2, 3, 3, 4, 4]
})
df5

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [44]:
# [duplicated函数] 检查重复数据
# 检查重复的行
df5.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [47]:
# 去除重复行
# 方法一：索引选择
df5[~df5.duplicated()]

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [50]:
# 去除重复行
# 方法二：[drop_duplicates函数] 删除重复的行
df5.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [52]:
# 按照列进行去重
# 按照字段为k1进行去重
df5.drop_duplicates(["k1"])

Unnamed: 0,k1,k2
0,one,1
1,two,1


In [53]:
# 按照多列进行去重
# 按照字段为k1和k2进行去重
df5.drop_duplicates(["k1", "k2"])

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [54]:
df6 = pd.DataFrame({
    "k1": ["one", "two"] * 3,
    "_version": [1, 2, 2, 3, 3, 4]
})
df6

Unnamed: 0,k1,_version
0,one,1
1,two,2
2,one,2
3,two,3
4,one,3
5,two,4


In [56]:
# 去重重复的行，自上而下保存最下面的行
df6.drop_duplicates(["k1"], keep = "last")

Unnamed: 0,k1,_version
4,one,3
5,two,4


#### 2.利用函数和映射进行数据转换

In [57]:
df7 = pd.DataFrame({
    "food": ["Apple", "Banana", "Orange", "Apple", "Mango", "Tomato"],
    "price":[4, 3, 3.5, 6, 12, 3]
})
df7

Unnamed: 0,food,price
0,Apple,4.0
1,Banana,3.0
2,Orange,3.5
3,Apple,6.0
4,Mango,12.0
5,Tomato,3.0


In [58]:
# 通过字典映射规则进行转换
_mapping = {
    "Apple": "Fruit",
    "Banana": "Fruit",
    "Orange": "Fruit",
    "Mango": "Fruit",
    "Tomato": "Vagetables"
}

df7["class"] = df7["food"].map(_mapping)
df7

Unnamed: 0,food,price,class
0,Apple,4.0,Fruit
1,Banana,3.0,Fruit
2,Orange,3.5,Fruit
3,Apple,6.0,Fruit
4,Mango,12.0,Fruit
5,Tomato,3.0,Vagetables


In [62]:
# 通过函数进行转换
def _discount(x):
    if x > 10:
        return 0.50
    elif x > 5:
        return 0.80
    elif x > 3:
        return 0.95
    else:
        return 1.00

df7["discount"] = df7["price"].map(_discount)
df7

Unnamed: 0,food,price,class,discount
0,Apple,4.0,Fruit,0.95
1,Banana,3.0,Fruit,1.0
2,Orange,3.5,Fruit,0.95
3,Apple,6.0,Fruit,0.8
4,Mango,12.0,Fruit,0.5
5,Tomato,3.0,Vagetables,1.0


#### 3.替换值

In [64]:
s2 = pd.Series([1, -999, 2, -998, 10])
s2

0      1
1   -999
2      2
3   -998
4     10
dtype: int64

In [66]:
# [replace函数] 替换值
# 将-999和-998替换为-1
s2.replace([-999, -998], -1)

0     1
1    -1
2     2
3    -1
4    10
dtype: int64

In [68]:
# [replace函数] 替换值
# 通过字典方式进行替换
s2.replace({
    -999: 0,
    -998: 1
})

0     1
1     0
2     2
3     1
4    10
dtype: int64

#### 4.重命名轴索引

In [69]:
df8 = pd.DataFrame(np.random.randint(80, 100, 12).reshape(3, 4), 
                   index = ["beijing", "tokyo", "new york"],
                   columns = list("abcd"))
df8

Unnamed: 0,a,b,c,d
beijing,97,84,85,84
tokyo,92,83,83,97
new york,89,85,85,99


In [72]:
# 行索引重命名
_city_trans = {
    "beijing": "BJ",
    "tokyo": "TK",
    "new york": "NY"
}

# ⚠️ 会修改原始数据对象
df8.index = df8.index.map(_city_trans)
df8

Unnamed: 0,a,b,c,d
BJ,97,84,85,84
TK,92,83,83,97
NY,89,85,85,99


In [78]:
# [rename函数] 对DataFrame的行和列进行重命名
# ⚠️ 不会修改原始数据对象
df9 = df8.rename(index = str.lower, columns = str.upper)
df9

Unnamed: 0,A,B,C,D
bj,97,84,85,84
tk,92,83,83,97
ny,89,85,85,99


In [79]:
# 通过字典映射对行索引和列索引进行重命名
_index_mapping = {
    "bj": "China",
    "tk": "Japan",
    "ny": "US"
}

_column_mapping = {
    "A": "K98",
    "B": "H87",
    "C": "J45",
    "D": "R49",
}

df9.rename(index = _index_mapping, columns = _column_mapping)

Unnamed: 0,K98,H87,J45,R49
China,97,84,85,84
Japan,92,83,83,97
US,89,85,85,99


#### 5.离散化和面元划分

##### cut函数

In [91]:
# 设置区间段进行面元划分
# 模拟数据
age = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
# 定义面元
# 划定分层离散点
bins = [18, 25, 35, 60, 100]
# [cat函数] 进行面元划分
# 默认情况下每个面元区间是左开右闭
cuts = pd.cut(age, bins)
# 返回的每个元素都对应到给定面元的哪个区间内
cuts

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [92]:
# 面元划分之后的codes属性代表数据对象被划分到哪个区间内的一个编码
# (18, 25]  => 0
# (25, 35]  => 1 
# (35, 60]  => 2 
# (60, 100] => 3
cuts.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [93]:
# 面元划分之后的categories属性代表数据对象所划分的面元信息
cuts.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [94]:
# 查看面元划分之后每个面元区间的分布计数
pd.value_counts(cuts)

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
Name: count, dtype: int64

In [96]:
# 通过right参数指定面元区间是左开右闭，还是左闭右开
pd.cut(age, bins, right = False)

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64, left]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [98]:
# 通过labels参数指定每个面元区间对应的标签名称
# [18, 25)   => 少年
# [25, 35)   => 青年
# [35, 60)   => 中年
# [60, 100)  => 老年
pd.cut(age, bins, right = False, labels = ["少年", "青年", "中年", "老年"])

['少年', '少年', '青年', '青年', '少年', ..., '青年', '老年', '中年', '中年', '青年']
Length: 12
Categories (4, object): ['少年' < '青年' < '中年' < '老年']

In [103]:
# 通过指定划分面元个数自动设置每个面元区间
# 参数precision设置小数位数为两位小叔
pd.cut(np.random.rand(12), 4, precision = 2)

[(0.7, 0.93], (0.24, 0.47], (0.24, 0.47], (0.0027, 0.24], (0.0027, 0.24], ..., (0.24, 0.47], (0.47, 0.7], (0.47, 0.7], (0.24, 0.47], (0.7, 0.93]]
Length: 12
Categories (4, interval[float64, right]): [(0.0027, 0.24] < (0.24, 0.47] < (0.47, 0.7] < (0.7, 0.93]]

##### qcut函数

In [108]:
# 注意这个函数非常有用
# [qcut函数] 用于自动划分面元使得每个落在每个面元区间的元素格式大小相等
randoms = np.random.rand(1000)
qcuts = pd.qcut(randoms, 4, precision = 2)
qcuts

[(0.73, 1.0], (0.24, 0.46], (0.46, 0.73], (0.24, 0.46], (0.24, 0.46], ..., (-0.0082, 0.24], (0.24, 0.46], (-0.0082, 0.24], (0.73, 1.0], (0.73, 1.0]]
Length: 1000
Categories (4, interval[float64, right]): [(-0.0082, 0.24] < (0.24, 0.46] < (0.46, 0.73] < (0.73, 1.0]]

In [109]:
qcuts.value_counts()

(-0.0082, 0.24]    250
(0.24, 0.46]       250
(0.46, 0.73]       250
(0.73, 1.0]        250
Name: count, dtype: int64

In [117]:
# 如果用cut进行面元划分则每个面元区间的值不一定是相等的
xcuts = pd.cut(randoms, [0, 0.1, 0.5, 0.9, 1])
xcuts

[(0.9, 1.0], (0.1, 0.5], (0.5, 0.9], (0.1, 0.5], (0.1, 0.5], ..., (0.1, 0.5], (0.1, 0.5], (0.1, 0.5], (0.5, 0.9], (0.9, 1.0]]
Length: 1000
Categories (4, interval[float64, right]): [(0.0, 0.1] < (0.1, 0.5] < (0.5, 0.9] < (0.9, 1.0]]

In [118]:
xcuts.value_counts()

(0.0, 0.1]    107
(0.1, 0.5]    424
(0.5, 0.9]    371
(0.9, 1.0]     98
Name: count, dtype: int64

#### 6.检测和过滤异常值

In [184]:
df11 = pd.DataFrame(np.random.randn(1000).reshape(250, 4))
df11

Unnamed: 0,0,1,2,3
0,-1.688311,-1.322154,1.418574,0.264319
1,-0.803931,-0.435370,-1.168180,0.478871
2,0.303118,-0.781728,1.201179,0.031340
3,-1.194779,-0.143848,-0.170433,-0.505695
4,1.787382,-1.404061,0.220867,-0.004166
...,...,...,...,...
245,0.047108,0.051379,-0.053525,0.107720
246,-1.856648,-0.955613,-0.237231,1.719890
247,-0.063821,-0.193127,0.008096,1.267908
248,-0.661492,-1.039166,1.145343,-1.549856


In [190]:
# 这里假设绝对大于2为异常值
# 将异常值替换为2
df11[df11.abs() > 2] = 2
df11.describe()

Unnamed: 0,0,1,2,3
count,250.0,250.0,250.0,250.0
mean,0.012018,-0.035789,0.077054,0.119555
std,0.95133,0.949845,0.942153,0.943892
min,-1.871768,-1.952761,-1.887391,-1.972442
25%,-0.66819,-0.737868,-0.631171,-0.507828
50%,-0.015568,-0.091853,0.061074,0.11265
75%,0.631168,0.47669,0.748817,0.797402
max,2.0,2.0,2.0,2.0


#### 7.排列和随机采样

In [191]:
df12 = pd.DataFrame(np.arange(5 * 4).reshape(5, 4))
df12

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [196]:
# permutation函数用于对指定数据进行打乱顺序
# e.g. 
# >>> np.random.permutation(5)
# >>> [3, 1, 0, 2, 4]
_samp = np.random.permutation(df12.shape[0])
_samp

array([3, 0, 1, 4, 2])

In [197]:
# 随机排列
df12.take(_samp)

Unnamed: 0,0,1,2,3
3,12,13,14,15
0,0,1,2,3
1,4,5,6,7
4,16,17,18,19
2,8,9,10,11


In [204]:
# [smaple函数] 用于随机采样
# 随机采样两条数据
# ⚠️ 这里的n值不能大于DataFrame的总行数
df12.sample(n = 2)

Unnamed: 0,0,1,2,3
0,0,1,2,3
3,12,13,14,15


In [208]:
# 采样数据的个数大于数据集本身的个数
# replace参数设置为True代表允许从数据集中重复采样
df12.sample(n = 10, replace = True)

Unnamed: 0,0,1,2,3
2,8,9,10,11
1,4,5,6,7
4,16,17,18,19
0,0,1,2,3
4,16,17,18,19
1,4,5,6,7
1,4,5,6,7
2,8,9,10,11
2,8,9,10,11
3,12,13,14,15
