在处理NumPy数组或pandas DataFrame时，经常会遇到数据含缺失值（NaN）的加权平均数计算问题

In [1]:
import numpy as np
import pandas as pd

### Numpy中的处理

In [2]:
# Sample 3x3 float matrix; the second and third rows each hold one NaN.
arr = np.array(
    [
        [4, 5, 6],
        [3, 7, np.nan],
        [np.nan, 5, 9],
    ]
)

In [12]:
# Echo the array as the cell output.
arr

array([[ 4.,  5.,  6.],
       [ 3.,  7., nan],
       [nan,  5.,  9.]])

In [4]:
# A plain mean propagates NaN: every row that contains a NaN comes out as NaN.
np.mean(arr, axis=1)

array([ 5., nan, nan])

In [5]:
# np.nanmean skips NaN entries, but it offers no way to pass weights.
np.nanmean(arr, axis=1)

array([5., 5., 7.])

In [6]:
# np.average accepts weights, but it does not skip NaN: any row that
# contains a NaN still evaluates to NaN.
weights = [0.4, 0.3, 0.3]
np.average(arr, axis=1, weights=weights)

array([4.9, nan, nan])

#### 方法一：构造加权权重数组

对于含np.nan的加权平均，需要先构造一个和arr同维度的weights数组<br>
注意该方法仅适用于mask array

In [39]:
# Broadcast the per-column weights over every row and blank out (NaN) the
# weight wherever the data itself is missing -- this acts like a mask.
weights_isnan = np.where(np.isnan(arr), np.nan, weights)
weights_isnan

array([[0.4, 0.3, 0.3],
       [0.4, 0.3, nan],
       [nan, 0.3, 0.3]])

In [41]:
# Renormalise each row so that its remaining (non-NaN) weights sum to 1.
# keepdims=True keeps the row totals as a column vector for broadcasting.
row_totals = np.nansum(weights_isnan, axis=1, keepdims=True)
reweighted_weights = weights_isnan / row_totals
reweighted_weights

array([[0.4       , 0.3       , 0.3       ],
       [0.57142857, 0.42857143,        nan],
       [       nan, 0.5       , 0.5       ]])

In [43]:
# Row-wise weighted mean: products that are NaN (missing value or blanked
# weight) are skipped by nansum.
np.nansum(arr * reweighted_weights, axis=1)

array([4.9       , 4.71428571, 7.        ])

#### 方法二：利用np.ma.average函数

该函数专门用于处理这种带缺失值（被mask的元素）的加权平均计算

##### 基本功能介绍

In [44]:
# Masked array whose last two elements are hidden by the mask.
a = np.ma.array(
    [1.0, 2.0, 3.0, 4.0],
    mask=[False, False, True, True],
)

In [45]:
# Only the unmasked entries contribute: (1*3 + 2*1) / (3 + 1) = 1.25.
np.ma.average(a, weights=[3, 1, 0, 0])

1.25

In [46]:
# 3x2 masked array holding 0.0 .. 5.0 with nothing masked.
x = np.ma.arange(6.0).reshape((3, 2))
x

masked_array(
  data=[[0., 1.],
        [2., 3.],
        [4., 5.]],
  mask=False,
  fill_value=1e+20)

In [47]:
# Weighted average down each column (axis=0); returned=True additionally
# hands back the sum of the weights used for each column.
avg, sumweights = np.ma.average(x, axis=0, weights=[1, 2, 3], returned=True)

In [48]:
# Column-wise weighted averages.
avg

masked_array(data=[2.6666666666666665, 3.6666666666666665],
             mask=[False, False],
       fill_value=1e+20)

In [49]:
# Sum of weights per column.
sumweights

array([6., 6.])

##### 示例使用

In [50]:
# Echo the sample array (with NaNs) again before applying np.ma.average.
arr

array([[ 4.,  5.,  6.],
       [ 3.,  7., nan],
       [nan,  5.,  9.]])

In [53]:
# Hide the NaN cells behind a mask, then take the weighted average per row;
# masked cells are left out of the average.
masked_arr = np.ma.array(arr, mask=np.isnan(arr))
np.ma.average(masked_arr, axis=1, weights=weights)

masked_array(data=[4.9, 4.714285714285715, 6.999999999999999],
             mask=[False, False, False],
       fill_value=1e+20)

### Pandas中使用

一种思路是取dataframe中的values（返回值为ndarray），然后使用Numpy接口<br>
另一种思路是采用pd中的相关API构造权重矩阵或mask array

In [85]:
# Wrap the sample array in a DataFrame with columns A, B and C.
df = pd.DataFrame(arr, columns=list("ABC"))
df

Unnamed: 0,A,B,C
0,4.0,5.0,6.0
1,3.0,7.0,
2,,5.0,9.0


#### 方法一：构造权重矩阵

In [56]:
# Option 1: still use the NumPy API. Build the weight indicator matrix:
# NaN where the data is missing, 1 everywhere else (the same layout as the
# pandas-only variant in the next cell). The original call had the last two
# arguments swapped, which marked the missing cells with 1 instead.
np.where(df.isnull(), np.nan, 1)

array([[ 1.,  1.,  1.],
       [ 1.,  1., nan],
       [nan,  1.,  1.]])

In [86]:
# Option 2: pandas only. Replace every non-null cell with 1 and leave the
# missing cells as NaN.
df2 = df.mask(df.notnull(), 1)
df2

Unnamed: 0,A,B,C
0,1.0,1.0,1.0
1,1.0,1.0,
2,,1.0,1.0


In [87]:
# Scale each column by its weight; cells that are NaN stay NaN.
weights_isnan2 = df2.mul(weights, axis="columns")
weights_isnan2

Unnamed: 0,A,B,C
0,0.4,0.3,0.3
1,0.4,0.3,
2,,0.3,0.3


In [88]:
# Renormalise row by row so the available weights sum to 1.
# DataFrame.sum skips NaN by default, so only present weights are counted.
row_totals2 = weights_isnan2.sum(axis=1)
reweighted_weights2 = weights_isnan2.mul(1 / row_totals2, axis="index")
reweighted_weights2

Unnamed: 0,A,B,C
0,0.4,0.3,0.3
1,0.571429,0.428571,
2,,0.5,0.5


In [89]:
# Row-wise weighted mean: multiply element-wise, then sum across columns
# (NaN products are skipped by sum).
df.mul(reweighted_weights2).sum(axis="columns")

0    4.900000
1    4.714286
2    7.000000
dtype: float64

#### 方法二：利用np.ma.average

In [93]:
# Same masked-array approach, applied to the DataFrame's values: mask the
# null cells and take the weighted average of each row.
masked_df = np.ma.array(df.to_numpy(), mask=df.isnull().to_numpy())
np.ma.average(masked_df, axis=1, weights=weights)

masked_array(data=[4.9, 4.714285714285715, 6.999999999999999],
             mask=[False, False, False],
       fill_value=1e+20)