In [24]:
import pandas as pd
import numpy as np
# Toy ratings table used throughout the notebook: two rating columns that
# contain NaN (missing) entries, indexed by the ids A001..A006.
df = pd.DataFrame(
    data={
        "rating_A": [np.nan, 3, 4, np.nan, 3, np.nan],
        "rating_B": [np.nan, 3, 4, 5, 3, 4],
    },
    index=[f"A{n:03d}" for n in range(1, 7)],
)
df

Unnamed: 0,rating_A,rating_B
A001,,
A002,3.0,3.0
A003,4.0,4.0
A004,,5.0
A005,3.0,3.0
A006,,4.0


NaN 为缺失值

In [5]:
# How to detect missing values.
# df.isna() on its own yields the element-wise boolean mask; reducing it with
# .any() flags every column that contains at least one missing value.
df.isna().any()

rating_A    True
rating_B    True
dtype: bool

In [6]:
# Columns in which every value is missing.
df.isna().all()

rating_A    False
rating_B    False
dtype: bool

In [7]:
# Rows that contain at least one missing value (reduce across the columns).
df.isna().any(axis="columns")

A001     True
A002    False
A003    False
A004     True
A005    False
A006     True
dtype: bool

In [8]:
# Rows in which every value is missing (reduce across the columns).
df.isna().all(axis="columns")

A001     True
A002    False
A003    False
A004    False
A005    False
A006    False
dtype: bool

In [9]:
# Number of missing values in each column.
df.isna().sum()

rating_A    3
rating_B    1
dtype: int64

In [10]:
# Number of missing values in each row.
df.isna().sum(axis="columns")

A001    2
A002    0
A003    0
A004    1
A005    0
A006    1
dtype: int64

In [11]:
# Total number of missing values in the whole frame
# (per-column counts, then summed again).
df.isna().sum().sum()

4

In [12]:
# Does the frame contain any missing value at all?
df.isna().any().any()

True

In [13]:
# Is every single value in the frame missing?
df.isna().all().all()

False

In [14]:
# How many rows contain at least one missing value.
df.isna().any(axis="columns").sum()

3

In [15]:
# How many rows consist entirely of missing values.
df.isna().all(axis="columns").sum()

1

In [16]:
# How many columns contain at least one missing value.
df.isna().any(axis="index").sum()

2

In [17]:
# Drop every row that has at least one missing value
# (these are dropna's defaults, spelled out explicitly).
na_default = df.dropna(axis="index", how="any")
na_default

Unnamed: 0,rating_A,rating_B
A002,3.0,3.0
A003,4.0,4.0
A005,3.0,3.0


In [18]:
# Drop every column that has at least one missing value.
# Both columns contain NaN here, so only the index remains.
na_col = df.dropna(axis="columns")
na_col

A001
A002
A003
A004
A005
A006


In [19]:
# Drop only the rows in which every value is missing.
nal = df.dropna(axis="index", how="all")
nal

Unnamed: 0,rating_A,rating_B
A002,3.0,3.0
A003,4.0,4.0
A004,,5.0
A005,3.0,3.0
A006,,4.0


In [20]:
# `thresh` is the minimum number of NON-missing values a row needs in order
# to be kept (it is not a count of missing values).
# thresh=2 keeps rows with at least 2 non-missing values, so any row with
# fewer than 2 valid entries (A001, A004, A006) is dropped.
nal1 = df.dropna(thresh=2)
nal1

Unnamed: 0,rating_A,rating_B
A002,3.0,3.0
A003,4.0,4.0
A005,3.0,3.0


In [21]:
# `thresh` is the minimum number of NON-missing values a row needs in order
# to be kept (it is not a count of missing values).
# thresh=1 keeps rows with at least 1 non-missing value, so only the
# all-missing row (A001) is dropped.
nal2 = df.dropna(thresh=1)
nal2

Unnamed: 0,rating_A,rating_B
A002,3.0,3.0
A003,4.0,4.0
A004,,5.0
A005,3.0,3.0
A006,,4.0


# 如何填充缺失值

In [25]:
# Backward fill: replace each missing value with the next valid value below
# it in the same column. A trailing NaN (A006 / rating_A) has nothing after
# it and therefore stays NaN.
# df.bfill() replaces fillna(method='backfill'), whose `method` argument is
# deprecated in recent pandas releases.
na_fill = df.bfill()
na_fill

Unnamed: 0,rating_A,rating_B
A001,3.0,3.0
A002,3.0,3.0
A003,4.0,4.0
A004,3.0,5.0
A005,3.0,3.0
A006,,4.0


In [26]:
df
# Compare this original frame with the back-filled result in the previous cell.
# A001's values were filled from A002's values.
# A006's rating_A stays NaN because there is no later row to fill from.

Unnamed: 0,rating_A,rating_B
A001,,
A002,3.0,3.0
A003,4.0,4.0
A004,,5.0
A005,3.0,3.0
A006,,4.0


In [23]:
# Forward fill: replace each missing value with the last valid value above
# it in the same column. The leading NaNs in row A001 have nothing before
# them and therefore stay NaN.
# df.ffill() replaces fillna(method='pad'), whose `method` argument is
# deprecated in recent pandas releases.
na_fill1 = df.ffill()
na_fill1

Unnamed: 0,rating_A,rating_B
A001,,
A002,3.0,3.0
A003,4.0,4.0
A004,4.0,5.0
A005,3.0,3.0
A006,3.0,4.0


In [27]:
df  # original frame again, for comparison with the forward-filled result above

Unnamed: 0,rating_A,rating_B
A001,,
A002,3.0,3.0
A003,4.0,4.0
A004,,5.0
A005,3.0,3.0
A006,,4.0


In [28]:
# Fill each column's missing values with that column's mean.
na_file2 = df.fillna(value=df.mean())
na_file2

Unnamed: 0,rating_A,rating_B
A001,3.333333,3.8
A002,3.0,3.0
A003,4.0,4.0
A004,3.333333,5.0
A005,3.0,3.0
A006,3.333333,4.0


In [29]:
# Fill each column's missing values with that column's median.
na_file3 = df.fillna(value=df.median())
na_file3

Unnamed: 0,rating_A,rating_B
A001,3.0,4.0
A002,3.0,3.0
A003,4.0,4.0
A004,3.0,5.0
A005,3.0,3.0
A006,3.0,4.0


In [30]:
# Fill every missing value with the constant 0.
na_file4 = df.fillna(value=0)
na_file4

Unnamed: 0,rating_A,rating_B
A001,0.0,0.0
A002,3.0,3.0
A003,4.0,4.0
A004,0.0,5.0
A005,3.0,3.0
A006,0.0,4.0


# 使用replace()函数填充缺失值

In [31]:
# Same result as fillna(0), expressed through replace(): swap NaN for 0.
na_repl = df.replace(to_replace=np.nan, value=0)
na_repl

Unnamed: 0,rating_A,rating_B
A001,0.0,0.0
A002,3.0,3.0
A003,4.0,4.0
A004,0.0,5.0
A005,3.0,3.0
A006,0.0,4.0



## sklearn 中的缺失值处理