In [1]:
%run py_import.py

In [2]:
# Series of strings with a single missing entry (NaN) at position 2.
string_data = pd.Series(
    ['aardvark', 'artichoke', np.nan, 'avocado'],
)

In [3]:
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [4]:
# Element-wise boolean mask: True where the entry is missing (here only the NaN at index 2).
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

**None值在对象数组中也会被当做NA（not available）处理**

In [5]:
# Overwrite the first entry with Python's None to show that it also counts as missing.
string_data[0] = None

In [6]:
string_data

0         None
1    artichoke
2          NaN
3      avocado
dtype: object

In [7]:
# Both the None at index 0 and the NaN at index 2 are reported as missing.
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

### 1. 过滤缺失值

**Series**

In [8]:
from numpy import nan as NA

In [9]:
# Float Series whose entries at positions 1 and 3 are missing.
data = pd.Series([1.0, NA, 3.5, NA, 7.0])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [10]:
# Return a new Series without the NaN entries; surviving rows keep their original labels (0, 2, 4).
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [11]:
data[data.notnull()]   # boolean-mask form; gives the same result as data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

**DataFrame**

In [12]:
# 4x3 frame: row 0 is complete, rows 1 and 3 are partly missing, row 2 is entirely NA.
data1 = pd.DataFrame(
    [
        [1.0, 6.5, 3.0],
        [1.0, NA, NA],
        [NA, NA, NA],
        [NA, 6.5, 3.0],
    ]
)

In [13]:
cleaned = data1.dropna()   # by default DataFrame.dropna() drops every row that contains any NA value

In [14]:
data1

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [15]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [16]:
data1.dropna(how = 'all')  # how='all' drops only the rows in which every value is NA

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [17]:
# Add a column labelled 4 that holds nothing but NA, then display the frame.
data1[4] = NA
data1

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [18]:
# axis='columns' applies the rule to columns instead of rows: only the all-NA column 4 is dropped.
data1.dropna(how = 'all', axis = 'columns')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [19]:
# NOTE(review): no seed is set in the visible cells, so these values differ between runs
# -- confirm whether py_import.py seeds NumPy.
df = pd.DataFrame(np.random.rand(7,3))  # 7 rows x 3 columns of random floats
df

Unnamed: 0,0,1,2
0,0.382022,0.081657,0.561423
1,0.96824,0.583937,0.511642
2,0.752671,0.232461,0.202345
3,0.483076,0.162428,0.741042
4,0.683289,0.956473,0.892219
5,0.473329,0.725974,0.149376
6,0.419281,0.630058,0.462246


In [20]:
# Blank out part of the frame in place so the dropna/fillna examples have gaps to work on.
df.iloc[:4, 1] = NA  # rows at positions 0-3 of column 1
df.iloc[:2, 2] = NA  # rows at positions 0-1 of column 2

In [21]:
df

Unnamed: 0,0,1,2
0,0.382022,,
1,0.96824,,
2,0.752671,,0.202345
3,0.483076,,0.741042
4,0.683289,0.956473,0.892219
5,0.473329,0.725974,0.149376
6,0.419281,0.630058,0.462246


In [22]:
# Only the rows without any NA remain (labels 4, 5 and 6).
df.dropna()

Unnamed: 0,0,1,2
4,0.683289,0.956473,0.892219
5,0.473329,0.725974,0.149376
6,0.419281,0.630058,0.462246


In [23]:
# thresh=n keeps a row only if it has at least n non-NA values.
# With thresh=1 every row survives here, because column 0 contains no NA.
df.dropna(thresh = 1)

Unnamed: 0,0,1,2
0,0.382022,,
1,0.96824,,
2,0.752671,,0.202345
3,0.483076,,0.741042
4,0.683289,0.956473,0.892219
5,0.473329,0.725974,0.149376
6,0.419281,0.630058,0.462246


### 2. 补全缺失值

In [24]:
df

Unnamed: 0,0,1,2
0,0.382022,,
1,0.96824,,
2,0.752671,,0.202345
3,0.483076,,0.741042
4,0.683289,0.956473,0.892219
5,0.473329,0.725974,0.149376
6,0.419281,0.630058,0.462246


In [25]:
df.fillna(0)   # replace every missing value with the constant 0

Unnamed: 0,0,1,2
0,0.382022,0.0,0.0
1,0.96824,0.0,0.0
2,0.752671,0.0,0.202345
3,0.483076,0.0,0.741042
4,0.683289,0.956473,0.892219
5,0.473329,0.725974,0.149376
6,0.419281,0.630058,0.462246


In [26]:
# A dict maps column label -> fill value: column 1 is filled with 0.5, column 2 with 0.
df.fillna({1:0.5, 2:0})

Unnamed: 0,0,1,2
0,0.382022,0.5,0.0
1,0.96824,0.5,0.0
2,0.752671,0.5,0.202345
3,0.483076,0.5,0.741042
4,0.683289,0.956473,0.892219
5,0.473329,0.725974,0.149376
6,0.419281,0.630058,0.462246


In [None]:
# Both fillna calls above return a new object; the original df is left unchanged.

In [27]:
# inplace=True modifies df itself instead of returning a filled copy.
# The call returns None, so there is nothing worth assigning; in particular, binding the
# result to `_` would override IPython's automatic "last output" variable.
df.fillna(0, inplace = True)

In [28]:
df

Unnamed: 0,0,1,2
0,0.382022,0.0,0.0
1,0.96824,0.0,0.0
2,0.752671,0.0,0.202345
3,0.483076,0.0,0.741042
4,0.683289,0.956473,0.892219
5,0.473329,0.725974,0.149376
6,0.419281,0.630058,0.462246


In [29]:
# Rebind df to a fresh 6 rows x 3 columns frame of normally distributed random floats.
df = pd.DataFrame(np.random.randn(6,3))

In [30]:
df.iloc[2:, 1] = NA  # column 1: blank out every row from position 2 onward

In [32]:
df.iloc[4:, 2] = NA  # column 2: blank out every row from position 4 onward

In [33]:
df

Unnamed: 0,0,1,2
0,0.940042,0.823768,0.863908
1,1.315491,0.709983,-1.105565
2,-1.248435,,0.336584
3,0.264706,,1.267525
4,-1.882724,,
5,-0.206922,,


In [34]:
# Forward fill: each NaN takes the last valid value above it in the same column.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` keyword is
# deprecated since pandas 2.1.
df.ffill()

Unnamed: 0,0,1,2
0,0.940042,0.823768,0.863908
1,1.315491,0.709983,-1.105565
2,-1.248435,0.709983,0.336584
3,0.264706,0.709983,1.267525
4,-1.882724,0.709983,1.267525
5,-0.206922,0.709983,1.267525


In [35]:
# Forward fill, but propagate a value across at most 2 consecutive NaN entries;
# anything further down the gap stays NaN.
# DataFrame.ffill(limit=...) replaces fillna(method='ffill', limit=...), whose `method`
# keyword is deprecated since pandas 2.1.
df.ffill(limit = 2)

Unnamed: 0,0,1,2
0,0.940042,0.823768,0.863908
1,1.315491,0.709983,-1.105565
2,-1.248435,0.709983,0.336584
3,0.264706,0.709983,1.267525
4,-1.882724,,1.267525
5,-0.206922,,1.267525


In [36]:
# Float Series with missing values at positions 1 and 3, used for the mean-fill example.
data2 = pd.Series([1.0, NA, 3.5, NA, 7.0])

In [37]:
# Fill the NaN entries with the mean of the Series; mean() skips NaN, so this is (1 + 3.5 + 7) / 3.
data2.fillna(data2.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64