In [1]:
import pandas as pd
import numpy as np

### handling missing data

In [2]:
string_data = pd.Series(['a', 'b', 'c', np.nan, 'd', 'e'])
string_data

0      a
1      b
2      c
3    NaN
4      d
5      e
dtype: object

In [3]:
# isnull() yields a boolean mask that is True where a value is missing.
string_data.isnull()

0    False
1    False
2    False
3     True
4    False
5    False
dtype: bool

In [4]:
# Python's None is also counted as missing: position 0 shows True in the mask.
string_data[0] = None
string_data.isnull()

0     True
1    False
2    False
3     True
4    False
5    False
dtype: bool

### filtering out missing data

In [5]:
# Numeric Series with missing values (NaN) at positions 1 and 3.
data = pd.Series(
    [1, np.nan, 3.5, np.nan, 7],
)
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [6]:
# dropna() returns a new Series with only the non-missing values; the
# surviving entries keep their original index labels (0, 2, 4).
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [7]:
# The original Series still contains its NaNs: dropna() did not modify it.
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [8]:
# Selecting with a boolean "is present" mask keeps the same rows as dropna().
data.loc[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [9]:
from numpy import nan as NA

In [10]:
# dropna() works row-wise by default (axis=0, how='any'): every row that
# contains at least one NaN is removed, so only row 0 survives here.
data = pd.DataFrame([
    [1., 6.5, 3.],
    [1., NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.],
])
cleaned = data.dropna(axis=0, how='any')
print(data)
print(cleaned)

     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0
     0    1    2
0  1.0  6.5  3.0


In [11]:
# how='all' drops only rows in which every value is NaN (row 2 here).
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [12]:
# Add a column labelled 4 that contains nothing but NaN.
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [13]:
data.dropna(how='all', axis=1) # axis=1 + how='all': drop only the columns whose values are all NaN (column 4)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [14]:
# axis=0 + how='all': drop only the rows that are entirely NaN (row 2);
# the all-NaN column 4 is kept because columns are not examined.
data.dropna(axis=0, how='all')

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
3,,6.5,3.0,


In [15]:
# data itself is unchanged by the dropna() calls: each one returned a new frame.
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [16]:
# 7x3 frame of standard-normal draws, with NaNs planted in the first two rows
# of column 2 and the first four rows of column 1.
# NOTE(review): no random seed is visible in this notebook, so the values
# presumably change on every run - confirm whether one should be set.
df = pd.DataFrame(np.random.standard_normal((7, 3)))
df.iloc[:2, 2] = NA
df.iloc[:4, 1] = NA
df

Unnamed: 0,0,1,2
0,0.383245,,
1,-0.58135,,
2,0.707863,,0.365805
3,0.55312,,0.96312
4,-1.216009,0.798765,-0.121581
5,-0.050854,-0.351301,1.088893
6,0.161894,0.521851,0.330147


In [17]:
# Default dropna(): keep only the rows without any NaN (rows 4-6).
df.dropna()

Unnamed: 0,0,1,2
4,-1.216009,0.798765,-0.121581
5,-0.050854,-0.351301,1.088893
6,0.161894,0.521851,0.330147


In [18]:
# thresh=2 keeps rows that have at least two non-NaN values, so rows 0 and 1
# (one value each) are dropped.
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,0.707863,,0.365805
3,0.55312,,0.96312
4,-1.216009,0.798765,-0.121581
5,-0.050854,-0.351301,1.088893
6,0.161894,0.521851,0.330147


In [19]:
df.dropna(thresh=2, axis=1)  # axis=1 applies the threshold per column; every column has at least 2 non-NaN values, so none is dropped

Unnamed: 0,0,1,2
0,0.383245,,
1,-0.58135,,
2,0.707863,,0.365805
3,0.55312,,0.96312
4,-1.216009,0.798765,-0.121581
5,-0.050854,-0.351301,1.088893
6,0.161894,0.521851,0.330147


### filling in missing data

In [20]:
# Starting point for the fill examples: df still holds its NaNs.
df

Unnamed: 0,0,1,2
0,0.383245,,
1,-0.58135,,
2,0.707863,,0.365805
3,0.55312,,0.96312
4,-1.216009,0.798765,-0.121581
5,-0.050854,-0.351301,1.088893
6,0.161894,0.521851,0.330147


In [21]:
# Replace every NaN with the constant 0; the result is a new frame.
df.fillna(0)

Unnamed: 0,0,1,2
0,0.383245,0.0,0.0
1,-0.58135,0.0,0.0
2,0.707863,0.0,0.365805
3,0.55312,0.0,0.96312
4,-1.216009,0.798765,-0.121581
5,-0.050854,-0.351301,1.088893
6,0.161894,0.521851,0.330147


In [22]:
# A dict maps column label -> fill value: column 1 gets 0.5, column 2 gets 0.
df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,0.383245,0.5,0.0
1,-0.58135,0.5,0.0
2,0.707863,0.5,0.365805
3,0.55312,0.5,0.96312
4,-1.216009,0.798765,-0.121581
5,-0.050854,-0.351301,1.088893
6,0.161894,0.521851,0.330147


In [23]:
# inplace=True fills the NaNs directly in df and returns None, so there is
# nothing to capture. Binding the result to `_` would also shadow IPython's
# `_` (last output) variable, so the call is left as a bare statement.
df.fillna(0, inplace=True)

In [24]:
# df now holds the filled values because the previous fill was done in place.
df

Unnamed: 0,0,1,2
0,0.383245,0.0,0.0
1,-0.58135,0.0,0.0
2,0.707863,0.0,0.365805
3,0.55312,0.0,0.96312
4,-1.216009,0.798765,-0.121581
5,-0.050854,-0.351301,1.088893
6,0.161894,0.521851,0.330147


In [25]:
# Fresh 6x3 frame of standard-normal draws (rebinds df for the ffill examples).
df = pd.DataFrame(np.random.standard_normal((6, 3)))
df

Unnamed: 0,0,1,2
0,1.216449,0.58435,1.306635
1,0.982505,-1.075459,0.240866
2,1.302049,1.209891,0.326789
3,-0.982506,-1.042403,0.740574
4,-1.534312,0.892187,-1.125909
5,0.250594,0.371401,-1.458404


In [26]:
# Set the first two rows of column 1 to NaN.
# NOTE(review): NaNs at the top of a column have no earlier value for a
# forward-fill to copy; if this frame is meant to demonstrate ffill, a
# trailing slice (e.g. `2:`) may have been intended - confirm.
df.iloc[:2, 1] = NA

In [27]:
# Set the first four rows of column 2 to NaN.
# NOTE(review): NaNs at the top of a column have no earlier value for a
# forward-fill to copy; if this frame is meant to demonstrate ffill, a
# trailing slice (e.g. `4:`) may have been intended - confirm.
df.iloc[:4, 2] = NA

In [28]:
# Show the frame with its NaNs: rows 0-1 of column 1 and rows 0-3 of column 2.
df

Unnamed: 0,0,1,2
0,1.216449,,
1,0.982505,,
2,1.302049,1.209891,
3,-0.982506,-1.042403,
4,-1.534312,0.892187,-1.125909
5,0.250594,0.371401,-1.458404


In [32]:
# Forward-fill copies the last valid value down each column. The
# fillna(method=...) spelling is deprecated in recent pandas, so
# DataFrame.ffill() is used instead.
# Neither call is assigned or made in place, and a cell only displays its
# final expression, so the output of this cell is simply the unchanged df.
df.ffill()
df.ffill(limit=2)  # limit=2 fills at most two consecutive NaNs
df

Unnamed: 0,0,1,2
0,1.216449,,
1,0.982505,,
2,1.302049,1.209891,
3,-0.982506,-1.042403,
4,-1.534312,0.892187,-1.125909
5,0.250594,0.371401,-1.458404


In [33]:
# Forward-fill via DataFrame.ffill(), which replaces the deprecated
# fillna(method='ffill'). The NaNs here sit at the top of columns 1 and 2,
# so there is no earlier value to propagate and the result equals df.
df.ffill()

Unnamed: 0,0,1,2
0,1.216449,,
1,0.982505,,
2,1.302049,1.209891,
3,-0.982506,-1.042403,
4,-1.534312,0.892187,-1.125909
5,0.250594,0.371401,-1.458404
