In [1]:
import numpy as np

### 什么是NAN?

NaN（Not a Number）表示“不是一个数字”，但它本身属于浮点类型（float）。它有如下特点：

(1) NaN 和 NaN 并不相等，比如：`np.nan != np.nan` 这个条件是成立的（推荐使用小写的 `np.nan`；大写别名 `np.NAN` 在 NumPy 2.x 中已不可用）。

(2) NAN和任何数值做运算结果都是NAN。

In [3]:
# NaN never compares equal to anything, including itself, so this is False.
# `np.nan` is the canonical spelling; the upper-case alias `np.NAN` is not
# available in NumPy 2.x.
np.nan == np.nan

False

In [9]:
# NaN is a floating-point value, so an integer array cannot hold it.
d = np.arange(12).reshape(3, 4)
print("<d>\n", d)
print("<d dtype>", d.dtype)
# The assignment below is expected to fail. Catch the error and print it so the
# demonstration does not abort a "Restart & Run All".
try:
    d[0, 0] = np.nan
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

<d>
 [[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
<d dtype> int64


ValueError: cannot convert float NaN to integer

In [10]:
# A floating-point dtype can store NaN (float16 is used here).
d = np.arange(12, dtype=np.float16).reshape(3, 4)
print("<d>\n", d)
print("<d dtype>", d.dtype)
# `np.nan` is the canonical spelling; `np.NAN` is not available in NumPy 2.x.
d[0, 0] = np.nan
print("<d>\n", d)

<d>
 [[ 0.  1.  2.  3.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]]
<d dtype> float16
<d>
 [[nan  1.  2.  3.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]]


### 缺失值的处理方式

#### 删除缺失值

In [18]:
# Approach 1: drop every missing value (simple but blunt).
# Drawback: data is lost and the result collapses to a 1-D array.

# Build a 4x4 float16 sample and knock out two cells.
m1 = np.random.randint(0, 10, size=(4, 4)).astype(np.float16)
m1[[0, 1], [1, 3]] = np.nan
print("<m1>\n", m1)

# NaN never compares equal to itself, so `m1 == np.nan` cannot locate the
# missing cells; np.isnan builds the mask instead.
m1[np.logical_not(np.isnan(m1))]

<m1>
 [[ 6. nan  6.  3.]
 [ 1.  8.  1. nan]
 [ 6.  9.  3.  5.]
 [ 4.  3.  4.  4.]]


array([6., 6., 3., 1., 8., 1., 6., 9., 3., 5., 4., 3., 4., 4.],
      dtype=float16)

In [35]:
# Approach 2: drop every row that contains a NaN.
m2 = np.random.randint(0, 10, size=(4, 4)).astype(np.float16)
m2[0, 1:3] = np.nan
m2[1, 2] = np.nan
print("<m2>\n", m2)

# np.where with only a condition returns one index array per axis:
# idx[0] holds the row indices and idx[1] the column indices of the NaN cells.
idx = np.where(np.isnan(m2))
print("<find missing value location>", idx)
# A row can contain several NaNs; np.unique collapses the duplicate row indices.
x_idx = np.unique(idx[0])
# np.delete(arr, obj, axis):
#   axis=0 -> obj lists the row indices to remove
#   axis=1 -> obj lists the column indices to remove
# It returns a new array; m2 itself is not modified.
np.delete(m2, x_idx, axis=0)

<m2>
 [[ 2. nan nan  7.]
 [ 9.  1. nan  8.]
 [ 8.  2.  6.  2.]
 [ 1.  8.  5.  9.]]
<find missing value location> (array([0, 0, 1]), array([1, 2, 2]))


array([[8., 2., 6., 2.],
       [1., 8., 5., 9.]], dtype=float16)

In [49]:
# [Extra] np.delete by row, by column, and on the flattened array.
m3 = np.random.randint(0, 10, size=(4, 4))
print("<m3>\n", m3)

# axis=0 removes rows: drop the 2nd and 3rd rows (indices 1 and 2).
ml = np.delete(m3, [1, 2], axis=0)
print("<ml>\n", ml)

# axis=1 removes columns: drop the 1st and 4th columns (indices 0 and 3).
mc = np.delete(m3, [0, 3], axis=1)
print("<mc>\n", mc)

# axis=None works on the flattened array: drop the 1st and 16th elements
# (indices 0 and 15).
me = np.delete(m3, [0, 15], axis=None)
print("<me>\n", me)

<m3>
 [[7 8 3 0]
 [6 0 6 9]
 [5 8 7 7]
 [2 8 3 7]]
<ml>
 [[7 8 3 0]
 [2 8 3 7]]
<mc>
 [[8 3]
 [0 6]
 [8 7]
 [8 3]]
<me>
 [8 3 0 6 0 6 9 5 8 7 7 2 8 3]


#### 替换缺失值

In [56]:
# Mock score table: one inner list per subject; NaN marks a missing score.
# The following cells fill each subject's gaps with that subject's mean.
# `np.nan` is the canonical spelling; `np.NAN` is not available in NumPy 2.x.
scores = [
    [59, 90, 78, 34,     np.nan, 23],  # math
    [89, 32, 45, np.nan, 56,     56],  # english
]
# Transpose so that each column holds one subject.
score = np.array(scores).T
score

array([[59., 89.],
       [90., 32.],
       [78., 45.],
       [34., nan],
       [nan, 56.],
       [23., 56.]])

In [57]:
# Overwrite every NaN in the score table with 0 (in place), then display it.
nan_mask = np.isnan(score)
score[nan_mask] = 0
score

array([[59., 89.],
       [90., 32.],
       [78., 45.],
       [34.,  0.],
       [ 0., 56.],
       [23., 56.]])

In [65]:
# Per-subject (column) mean.
# The divisor counts only the non-zero entries of each column, so cells that
# hold 0 do not drag the mean down.
# NOTE(review): a genuine score of 0 would also be left out of the count.
column_totals = score.sum(axis=0)
column_counts = np.count_nonzero(score, axis=0)
mean = column_totals / column_counts
mean

array([56.8, 55.6])

In [88]:
# Locate the cells that hold 0 (the placeholder used for missing scores).
# idx is a (row_indices, column_indices) tuple.
idx = (score == 0).nonzero()
idx

(array([3, 4]), array([1, 0]))

In [140]:
# Fill each located cell with the mean of its own column.
# idx is (row_indices, column_indices), so mean[idx[1]] picks the matching
# column mean for every cell. One vectorized assignment replaces the Python
# loop and works for any number of columns, not just two.
score[idx] = mean[idx[1]]

score

array([[59. , 89. ],
       [90. , 32. ],
       [78. , 45. ],
       [34. , 55.6],
       [56.8, 56. ],
       [23. , 56. ]])