# 利用 Python 进行数据分析

# 第 7 章 数据清洗和准备

### 处理缺失数据

In [1]:
# For numeric data, pandas uses the floating-point value NaN (Not a Number)
# to represent missing data. Here the same sentinel marks the gap in a
# Series of strings.
import numpy as np
import pandas as pd
from pandas import Series

string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [2]:
# isnull() yields a boolean Series that is True wherever a value is missing.
import numpy as np
import pandas as pd
from pandas import Series

string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [3]:
# Missing values are referred to as NA, which stands for "not available".
# In statistics applications, NA data may be data that does not exist, or
# data that exists but was not observed (for example, because of problems
# during data collection).
# When cleaning data for analysis, it is usually best to analyse the missing
# data itself, to identify collection problems or biases it may introduce.
# Python's built-in None value is also treated as NA in object arrays:
import numpy as np
import pandas as pd
from pandas import Series

string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data.loc[0] = None  # label 0 is the first row of the default index
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

**缺失数据处理的函数**

| 方法  | 说明  |
| ----- | ----- |
| dropna | 根据各标签的值中是否存在缺失数据对轴标签进行过滤，可通过阈值调节对缺失值的容忍度 |
| fillna | 用指定值或插值方法（如 ffill 或 bfill）填充缺失数据 |
| isnull | 返回一个含有布尔值的对象，这些布尔值表示哪些值是缺失值/NA；返回对象的类型（Series 或 DataFrame）与调用对象相同 |
| notnull | isnull 的否定式 |

#### 滤除缺失数据

In [6]:
# There are several ways to filter out missing data:
#   1. pandas.isnull
#   2. boolean indexing
#   3. dropna()
# On a Series, dropna returns a Series holding only the non-null data and
# their index values.

from numpy import nan as NA
import pandas as pd
from pandas import Series

data = Series([1, NA, 3.5, NA, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [7]:
# Option 3: dropna() keeps only the non-null entries and their index labels.
from numpy import nan as NA
import pandas as pd
from pandas import Series

data = Series([1, NA, 3.5, NA, 7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [8]:
# Equivalent to dropna(): boolean indexing with the notnull() mask.
from numpy import nan as NA
import pandas as pd
from pandas import Series

data = Series([1, NA, 3.5, NA, 7])
data[data.notnull() == True]  # noqa: E712 - explicit mask comparison

0    1.0
2    3.5
4    7.0
dtype: float64

In [11]:
# With DataFrame objects things are a bit more complex: you may want to drop
# rows or columns that are entirely NA, or only those containing any NA.
# By default dropna drops every row that contains a missing value:

import pandas as pd
from numpy import nan as NA  # NA is used below; import it so the cell runs standalone
from pandas import DataFrame

data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                     [NA, NA, NA], [NA, 6.5, 3.]])
cleaned = data.dropna()
data  # display the original frame; dropna returned a new object (`cleaned`)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [15]:
# With DataFrame objects things are a bit more complex: you may want to drop
# rows or columns that are entirely NA, or only those containing any NA.
# By default dropna drops every row that contains a missing value:

import pandas as pd
from numpy import nan as NA  # NA is used below; import it so the cell runs standalone
from pandas import DataFrame

data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                     [NA, NA, NA], [NA, 6.5, 3.]])
cleaned = data.dropna()
cleaned  # only row 0 has no missing values

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [14]:
# Passing how='all' drops only the rows whose values are all NA:

import pandas as pd
from numpy import nan as NA  # NA is used below; import it so the cell runs standalone
from pandas import DataFrame

data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                     [NA, NA, NA], [NA, 6.5, 3.]])
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [16]:
# Passing how='all' drops only the rows that are all NA; to drop columns the
# same way, pass axis=1. First add a column (label 4) that is entirely NA:
import pandas as pd
from numpy import nan as NA  # NA is used below; import it so the cell runs standalone
from pandas import DataFrame

data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                     [NA, NA, NA], [NA, 6.5, 3.]])
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [17]:
# Drop the columns whose values are all NA (here: the added column 4).
import pandas as pd
from numpy import nan as NA  # NA is used below; import it so the cell runs standalone
from pandas import DataFrame

data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                     [NA, NA, NA], [NA, 6.5, 3.]])
data[4] = NA
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [18]:
# Another DataFrame row-filtering problem tends to concern time series data.
# Suppose you want to keep only part of the observations; the thresh
# argument can be used for that. Build the sample frame first:

import pandas as pd
from numpy import nan as NA  # NA is used below; import it so the cell runs standalone
from pandas import DataFrame
import numpy as np

df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA  # first four rows of column 1 are missing
df.iloc[:2, 2] = NA  # first two rows of column 2 are missing
df

Unnamed: 0,0,1,2
0,1.137358,,
1,0.177352,,
2,-0.24209,,-0.71217
3,1.641257,,-0.461575
4,1.106498,-0.438548,-0.056705
5,-1.891436,1.56601,-0.439087
6,-0.865879,0.176137,-0.544621


In [19]:
import pandas as pd
from numpy import nan as NA  # NA is used below; import it so the cell runs standalone
from pandas import DataFrame
import numpy as np

df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA  # first four rows of column 1 are missing
df.iloc[:2, 2] = NA  # first two rows of column 2 are missing
df.dropna()  # by default dropna drops every row containing a missing value

Unnamed: 0,0,1,2
4,0.344242,-0.910568,-0.358092
5,-2.170079,0.394477,-1.616863
6,0.593278,-0.065126,0.297342


In [23]:
import pandas as pd
from numpy import nan as NA  # NA is used below; import it so the cell runs standalone
from pandas import DataFrame
import numpy as np

df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA  # first four rows of column 1 are missing
df.iloc[:2, 2] = NA  # first two rows of column 2 are missing
df.dropna(thresh=2)  # keep only rows holding at least 2 non-NA values

Unnamed: 0,0,1,2
2,1.457971,,-0.272514
3,0.215545,,-1.814718
4,1.531975,-0.729414,1.016393
5,0.948217,1.123848,1.126797
6,-0.471294,-0.269885,1.809374


#### 填充缺失数据

In [24]:
# Rather than filtering out missing data (and possibly discarding other data
# along with it), you may want to fill in the "holes" some other way.
# For most purposes the fillna method is the workhorse function.
# Calling fillna with a constant replaces missing values with that value:

import pandas as pd
from numpy import nan as NA  # NA is used below; import it so the cell runs standalone
from pandas import DataFrame
import numpy as np

df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA  # first four rows of column 1 are missing
df.iloc[:2, 2] = NA  # first two rows of column 2 are missing
df.fillna(0)

Unnamed: 0,0,1,2
0,-1.979536,0.0,0.0
1,0.44095,0.0,0.0
2,0.218219,0.0,-0.426306
3,0.593135,0.0,0.630775
4,-1.015067,-1.930206,-0.733413
5,-2.460985,-0.282031,-0.019065
6,-0.451649,0.571238,0.290104


In [25]:
# Calling fillna with a dict lets you use a different fill value per column:

import pandas as pd
from numpy import nan as NA  # NA is used below; import it so the cell runs standalone
from pandas import DataFrame
import numpy as np

df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA  # first four rows of column 1 are missing
df.iloc[:2, 2] = NA  # first two rows of column 2 are missing
df.fillna({1: 0.5, 2: 0})  # column 1 -> 0.5, column 2 -> 0

Unnamed: 0,0,1,2
0,0.694309,0.5,0.0
1,1.529238,0.5,0.0
2,-0.926407,0.5,-0.351316
3,1.136441,0.5,-0.617225
4,0.645827,-0.752895,-0.990073
5,-0.606109,1.014608,-1.570972
6,0.324873,0.313598,-2.084463


In [26]:
# fillna returns a new object by default, but it can also modify the
# existing object in place:

import pandas as pd
from numpy import nan as NA  # NA is used below; import it so the cell runs standalone
from pandas import DataFrame
import numpy as np

df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA  # first four rows of column 1 are missing
df.iloc[:2, 2] = NA  # first two rows of column 2 are missing
# The call is made for its side effect on df; its result is not needed, and
# binding it to `_` would overwrite IPython's last-output variable.
df.fillna(0, inplace=True)
df

Unnamed: 0,0,1,2
0,-2.415293,0.0,0.0
1,-0.524403,0.0,0.0
2,0.922647,0.0,-1.101025
3,-1.066192,0.0,0.211702
4,-1.103998,0.413325,-0.11646
5,0.102482,-0.013281,0.638468
6,0.572332,-1.891186,-0.204214


In [27]:
# The interpolation methods available for reindexing can also be used when
# filling missing data. Build a frame whose gaps sit at the bottom:
import pandas as pd
from numpy import nan as NA  # NA is used below; import it so the cell runs standalone
from pandas import DataFrame
import numpy as np

df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA  # rows 2..5 of column 1 are missing
df.iloc[4:, 2] = NA  # rows 4..5 of column 2 are missing
df

Unnamed: 0,0,1,2
0,0.367644,-0.297945,-0.518253
1,-2.460337,-1.440248,-0.37652
2,1.441508,,-0.426574
3,1.946621,,1.018397
4,1.228389,,
5,-1.583195,,


In [28]:
# The interpolation methods available for reindexing can also be used when
# filling missing data. ffill() forward-fills each gap with the last valid
# value above it; it replaces fillna(method='ffill'), whose `method` keyword
# is deprecated in recent pandas releases.
import pandas as pd
from numpy import nan as NA  # NA is used below; import it so the cell runs standalone
from pandas import DataFrame
import numpy as np

df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA  # rows 2..5 of column 1 are missing
df.iloc[4:, 2] = NA  # rows 4..5 of column 2 are missing
df.ffill()

Unnamed: 0,0,1,2
0,-0.327797,-0.302466,0.325797
1,0.716625,0.140723,-0.705385
2,1.524721,0.140723,0.034024
3,-0.114816,0.140723,0.017904
4,-1.568014,0.140723,0.017904
5,-0.187448,0.140723,0.017904


In [29]:
# The interpolation methods available for reindexing can also be used when
# filling missing data. ffill(limit=2) forward-fills at most two consecutive
# missing values per gap; it replaces fillna(method='ffill', limit=2), whose
# `method` keyword is deprecated in recent pandas releases.
import pandas as pd
from numpy import nan as NA  # NA is used below; import it so the cell runs standalone
from pandas import DataFrame
import numpy as np

df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA  # rows 2..5 of column 1 are missing
df.iloc[4:, 2] = NA  # rows 4..5 of column 2 are missing
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,0.646166,0.024409,-0.64606
1,0.088185,-1.82765,1.791821
2,0.465949,-1.82765,-2.014193
3,1.179209,-1.82765,1.441212
4,0.501259,,1.441212
5,-0.056034,,1.441212


In [30]:
# fillna can do many other things; for example, you can pass the mean or
# median value of a Series:
import pandas as pd
from numpy import nan as NA  # NA is used below; import it so the cell runs standalone
from pandas import Series

data = pd.Series([1., NA, 3.5, NA, 7])
data.fillna(data.mean())  # mean() skips NA, so gaps get the mean of 1, 3.5 and 7

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

>**fillna 参数参考表**

| 参数 | 说明 |
| ---- | ---- |
| value | 用于填充缺失值的标量值或字典对象 |
| method | 插值方式（如 "ffill"、"bfill"）；默认为 None，即不做插值而使用 value 填充。新版 pandas 已弃用该参数，建议直接调用 ffill() / bfill() |
| axis | 待填充的轴，默认 axis=0 |
| inplace | 修改调用者对象而不产生副本 |
| limit | （对于前向和后向填充）可以连续填充的最大数量 |

#### 数据转换