In [1]:
import numpy as np
import pandas as pd

# Row labels: one name per person.
index = pd.Index(["zhangsan", "lishi", "wangwu", "zhaoliu", "wanger"])
# Column values; the np.nan and None entries are deliberate gaps that the
# following cells use to demonstrate missing-value handling.
data = dict(
    age=[22, 17, np.nan, 16, 25],
    address=["nj", None, "nj", "sh", "bj"],
)
# Build the frame from the column dict and the explicit index, then append
# a fully populated "gender" column as the last column.
base_info = pd.DataFrame(data, index=index).assign(
    gender=['f', 'm', 'f', 'm', 'f']
)
# Show the frame followed by a separator line of 20 asterisks.
print(base_info, '*' * 20, sep="\n")


           age address gender
zhangsan  22.0      nj      f
lishi     17.0    None      m
wangwu     NaN      nj      f
zhaoliu   16.0      sh      m
wanger    25.0      bj      f
********************


In [2]:
# Missing-value detection.
# pandas flags NaN, None and NaT as missing. Note that a zero-length
# string ("") is NOT reported as missing by isna()/isnull().
# isna() is an alias of isnull(): True marks a missing cell.
print(base_info.isna())
# notna() is an alias of notnull(): the element-wise inverse mask.
print(base_info.notna())


            age  address  gender
zhangsan  False    False   False
lishi     False     True   False
wangwu     True    False   False
zhaoliu   False    False   False
wanger    False    False   False
            age  address  gender
zhangsan   True     True    True
lishi      True    False    True
wangwu    False     True    True
zhaoliu    True     True    True
wanger     True     True    True


In [4]:
# Dropping missing data.
# On a single column: remove the entries whose value is null.
print(base_info.loc[:, "age"].dropna())
# axis="index" (the same as axis=0) means the operation works on rows;
# how="any" removes a row as soon as one of its cells is null.
print(base_info.dropna(how="any", axis="index"))


zhangsan    22.0
lishi       17.0
zhaoliu     16.0
wanger      25.0
Name: age, dtype: float64
           age address gender
zhangsan  22.0      nj      f
zhaoliu   16.0      sh      m
wanger    25.0      bj      f


In [5]:
# how="all" removes a row only when every one of its columns is null,
# so no row of this frame is dropped.
print(base_info.dropna(how="all", axis="index"))


           age address gender
zhangsan  22.0      nj      f
lishi     17.0    None      m
wangwu     NaN      nj      f
zhaoliu   16.0      sh      m
wanger    25.0      bj      f


In [6]:
# Fill each gap with a neighbouring value.
# Series.fillna(method=...) is deprecated in recent pandas releases; the
# dedicated ffill()/bfill() methods are the supported equivalents.
# ffill(): propagate the previous valid value forward into the gap.
print(base_info["age"].ffill())
# bfill(): pull the next valid value backward into the gap.
print(base_info["age"].bfill())


zhangsan    22.0
lishi       17.0
wangwu      17.0
zhaoliu     16.0
wanger      25.0
Name: age, dtype: float64
zhangsan    22.0
lishi       17.0
wangwu      16.0
zhaoliu     16.0
wanger      25.0
Name: age, dtype: float64


In [7]:
# Linear interpolation: a gap is filled with a value that lies on the
# straight line between its valid neighbours (17 and 16 give 16.5).
# "linear" is the default method and is spelled out here for clarity.
print(base_info["age"].interpolate(method="linear"))


zhangsan    22.0
lishi       17.0
wangwu      16.5
zhaoliu     16.0
wanger      25.0
Name: age, dtype: float64


In [8]:
# Replacing values: swap an invalid value for a valid one.
# Suppose an age of 25 is implausible; substitute 22 for it.
print(base_info["age"].replace(to_replace=25, value=22))
# The dict form maps old -> new and can carry several replacements at
# once (only one pair is used here).
print(base_info["age"].replace(to_replace={25: 22}))


zhangsan    22.0
lishi       17.0
wangwu       NaN
zhaoliu     16.0
wanger      22.0
Name: age, dtype: float64
zhangsan    22.0
lishi       17.0
wangwu       NaN
zhaoliu     16.0
wanger      22.0
Name: age, dtype: float64


In [9]:
# The same clean-up can be done in two steps.
# Step 1: turn the invalid value (25) into a missing value.
r1 = base_info["age"].replace(to_replace=25, value=np.nan)
# Step 2: fill every missing value by linear interpolation.
r2 = r1.interpolate(method="linear")
print(r1)
print(r2)


zhangsan    22.0
lishi       17.0
wangwu       NaN
zhaoliu     16.0
wanger       NaN
Name: age, dtype: float64
zhangsan    22.0
lishi       17.0
wangwu      16.5
zhaoliu     16.0
wanger      16.0
Name: age, dtype: float64


In [10]:
# Replacing across several columns: the dict maps column name -> value to
# look for in that column, and every match becomes NaN. The resulting
# gaps can afterwards be filled column by column.
print(base_info.replace(to_replace={"age": 25, "address": "sh"}, value=np.nan))


           age address gender
zhangsan  22.0      nj      f
lishi     17.0    None      m
wangwu     NaN      nj      f
zhaoliu   16.0     NaN      m
wanger     NaN      bj      f


In [11]:
# Filling from another object: missing entries are replaced by that
# object's values. Print a separator and the unmodified frame first.
print("*" * 20, base_info, sep="\n")


********************
           age address gender
zhangsan  22.0      nj      f
lishi     17.0    None      m
wangwu     NaN      nj      f
zhaoliu   16.0      sh      m
wanger    25.0      bj      f


In [12]:
# Derive a cleaned age series without touching base_info: missing ages
# become 20 and the value 25 becomes 22. Both calls return new objects,
# so no explicit copy or in-place mutation is needed.
age_new = base_info["age"].fillna(20).replace(25, 22)
print(age_new)
# combine_first keeps the caller's non-null values and takes a value from
# age_new only where the caller is null, so 25 stays while the gap gets 20.
print(base_info["age"].combine_first(age_new))


zhangsan    22.0
lishi       17.0
wangwu      20.0
zhaoliu     16.0
wanger      22.0
Name: age, dtype: float64
zhangsan    22.0
lishi       17.0
wangwu      20.0
zhaoliu     16.0
wanger      25.0
Name: age, dtype: float64
