# pandas处理缺失数据

In [1]:
import pandas as pd
import numpy as np
# Index: six consecutive calendar days starting on 2022-06-10.
dates = pd.date_range('20220610', periods=6)

# 6x4 frame holding the integers 0..23, laid out row by row.
df = pd.DataFrame(
    np.arange(24).reshape((6, 4)),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)

# Columns B and C are about to receive NaN. Cast them to float explicitly first:
# assigning NaN into an integer column relies on silent upcasting, which pandas
# has deprecated (FutureWarning since 2.1, planned to become an error).
df = df.astype({'B': 'float64', 'C': 'float64'})

# Punch two holes into the data: row 0 / column B and row 1 / column C.
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan
df

Unnamed: 0,A,B,C,D
2022-06-10,0,,2.0,3
2022-06-11,4,5.0,,7
2022-06-12,8,9.0,10.0,11
2022-06-13,12,13.0,14.0,15
2022-06-14,16,17.0,18.0,19
2022-06-15,20,21.0,22.0,23


## 过滤含有NaN的行

In [2]:
# Return a copy of df without the rows that contain at least one NaN.
df.dropna(axis='index', how='any')

Unnamed: 0,A,B,C,D
2022-06-12,8,9.0,10.0,11
2022-06-13,12,13.0,14.0,15
2022-06-14,16,17.0,18.0,19
2022-06-15,20,21.0,22.0,23


`how` 可取 `'any'` 或 `'all'`：`'any'` 表示这一行只要含有任意一个NaN，就丢掉这一行；`'all'` 表示这一行全部为NaN时才丢掉这一行。当 `axis=1` 时，同样的规则作用于列。

## 过滤掉含有NaN的列

In [3]:
# Return a copy of df without the columns that contain at least one NaN.
df.dropna(axis='columns', how='any')

Unnamed: 0,A,D
2022-06-10,0,3
2022-06-11,4,7
2022-06-12,8,11
2022-06-13,12,15
2022-06-14,16,19
2022-06-15,20,23


## 填充缺失值

In [4]:
# Return a copy of df in which every NaN has been replaced by 0.
df.fillna(0)

Unnamed: 0,A,B,C,D
2022-06-10,0,0.0,2.0,3
2022-06-11,4,5.0,0.0,7
2022-06-12,8,9.0,10.0,11
2022-06-13,12,13.0,14.0,15
2022-06-14,16,17.0,18.0,19
2022-06-15,20,21.0,22.0,23


## 检查DataFrame是否有缺失值

In [5]:
# Boolean mask with the same shape as df: True wherever a value is missing.
df.isna()

Unnamed: 0,A,B,C,D
2022-06-10,False,True,False,False
2022-06-11,False,False,True,False
2022-06-12,False,False,False,False
2022-06-13,False,False,False,False
2022-06-14,False,False,False,False
2022-06-15,False,False,False,False


如果这个DataFrame特别大，肉眼很难观察True和False的时候，可以用下面这种方法

In [6]:
# Collapse the whole null mask into a single flag: True if df has any missing value.
# np.any is given the plain ndarray so the result is always one scalar, and the
# redundant `== True` comparison is dropped because np.any already yields a boolean.
print(np.any(df.isnull().to_numpy()))

True


`numpy.any()`：只要有任意一个元素为True，就返回True；只有全部元素都为False时才返回False。