# Mod13 Handling Missing Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Display the NumPy version installed in this kernel.
np.__version__

'1.20.1'

In [3]:
# Display the pandas version installed in this kernel.
pd.__version__

'1.2.4'

### None & NaN

In [4]:
# A None placed among numbers is converted to NaN, so the Series stays float64.
data = pd.Series([1, np.nan, 3, None])
print(data, end="\n\n")

# Once a string is present the Series becomes object dtype, which can hold None as-is.
data = pd.Series([1, np.nan, 'hello', None])
print(data)

0    1.0
1    NaN
2    3.0
3    NaN
dtype: float64

0        1
1      NaN
2    hello
3     None
dtype: object


In [5]:
# Build the 3x3 demo frame and append an all-NaN column 'D' in one expression.
df = pd.DataFrame(
    [
        [1, np.nan, 2],
        [2, 3, 5],
        [np.nan, 4, 6],
    ],
    columns=['A', 'B', 'C'],
).assign(D=np.nan)
df

Unnamed: 0,A,B,C,D
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


### Detecting null values

In [6]:
# Element-wise mask: True wherever the entry is NaN or None.
data_null_mask = data.isnull()
print(data_null_mask, end="\n-------------------\n")

# any() collapses the mask to a single boolean -- useful on large data,
# where reading the whole mask is impractical.
print(data_null_mask.any(), end="\n-------------------\n")

# The same check, run on the underlying NumPy array.
print(data_null_mask.values.any())

0    False
1     True
2    False
3     True
dtype: bool
-------------------
True
-------------------
True


In [7]:
# Cell-by-cell mask of the frame: True wherever a value is missing.
df_null_mask = df.isnull()
print(df_null_mask, end="\n-------------------\n")

# any() on a frame reduces per column: one boolean per column telling whether
# it contains a null -- useful on large data.
print(df_null_mask.any(), end="\n-------------------\n")

# Reducing the raw NumPy array instead gives one boolean for the whole frame.
print(df_null_mask.values.any())

       A      B      C     D
0  False   True  False  True
1  False  False  False  True
2   True  False  False  True
-------------------
A     True
B     True
C    False
D     True
dtype: bool
-------------------
True


### Dropping null values

In [16]:
# Two ways to keep only the non-null entries of the Series.
# 1) Select with a boolean mask of the non-null positions.
non_null_mask = data.notnull()
print(data[non_null_mask], end="\n-------------------\n")

# 2) Let dropna() do the filtering.
print(data.dropna())

0        1
2    hello
dtype: object
-------------------
0        1
2    hello
dtype: object


In [17]:
display(
    df.dropna(),                 # default axis=0: drop every row that contains a NaN
    df.dropna(axis="columns"),   # drop every column that contains a NaN
)

Unnamed: 0,A,B,C,D


Unnamed: 0,C
0,2
1,5
2,6


In [103]:
# how='all': drop a column only when every value in it is NaN.
# how='any': drop a column as soon as it contains a single NaN.
for drop_rule in ('all', 'any'):
    display(df.dropna(axis=1, how=drop_rule))

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


Unnamed: 0,2
0,2
1,5
2,6


In [104]:
# thresh is the minimum number of non-NaN values needed to survive the drop.
display(
    df.dropna(axis="index", thresh=3),     # keep each row that has 3 or more non-NaN values
    df.dropna(axis="columns", thresh=3),   # keep each column that has 3 or more non-NaN values
)

Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


Unnamed: 0,2
0,2
1,5
2,6


### Filling null values

#### For ``Series``

Masking style

In [69]:
# Float Series labelled a-e; the None entry is stored as NaN.
s1 = pd.Series(
    [1, np.nan, 2, None, 3],
    index=['a', 'b', 'c', 'd', 'e'],
)
s1

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [70]:
# Masking style: locate the null positions with a boolean mask,
# then overwrite them with 0 in place.
is_missing = s1.isnull()
s1[is_missing] = 0
s1

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [71]:
# Re-create s1 with its NaN entries, since the masking cell overwrote them in place.
s1 = pd.Series(
    [1, np.nan, 2, None, 3],
    index=['a', 'b', 'c', 'd', 'e'],
)
s1

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [74]:
# Fill the gaps in s1 three ways: with a constant, with s1's mean, and with s1's median.
# The statistics are taken from s1 itself. The previous version used `data`, which is
# the object-dtype Series containing 'hello', so its mean()/median() cannot yield a
# numeric fill value; the displayed result (2.0) is the mean/median of s1.
print(s1.fillna(0), end="\n-----------------------\n")
print(s1.fillna(s1.mean()), end="\n-----------------------\n")      # mean() ignores NaN
print(s1.fillna(s1.median()), end="\n-----------------------\n")    # median() ignores NaN

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64
-----------------------
a    1.0
b    2.0
c    2.0
d    2.0
e    3.0
dtype: float64
-----------------------
a    1.0
b    2.0
c    2.0
d    2.0
e    3.0
dtype: float64
-----------------------


In [75]:
# ffill()/bfill() give the same result as fillna(method='ffill'/'bfill') but avoid the
# `method` keyword, which is deprecated in pandas >= 2.1.
print(s1.ffill(), end="\n----------------\n")         # forward fill: use the previous valid value
print(s1.bfill(), end="\n----------------\n")         # backward fill: use the next valid value

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64
----------------
a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64
----------------


#### For ``DataFrame``

In [76]:
# Show the frame as it stands before any filling.
# NOTE(review): the saved outputs in this section label the columns 0-3, while the
# frame built earlier in this notebook uses columns A-D -- presumably stale output
# from a different session; re-run the notebook top to bottom to refresh it.
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [77]:
# ffill() gives the same result as fillna(method='ffill') but avoids the `method`
# keyword, which is deprecated in pandas >= 2.1.
display(df.ffill())               # down each column: a NaN with no value above it stays NaN
display(df.ffill(axis=1))         # along each row: take the value from the column to the left

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,2.0,4.0,6,


Unnamed: 0,0,1,2,3
0,1.0,1.0,2.0,2.0
1,2.0,3.0,5.0,5.0
2,,4.0,6.0,6.0


## Lab

<b>有一個 DataFrame df，試著計算其中共有幾個空值？</b>

In [27]:
# Lab data: a 6x5 float array with six NaN entries (rows 1, 4 and 5).
arr = np.array(
    [
        [8.0, 2.0, 17.0, 20.0, 10.0],
        [4.0, np.nan, 3.0, np.nan, 2.0],
        [24.0, 26.0, 14.0, 23.0, 21.0],
        [7.0, 29.0, 3.0, 19.0, 25.0],
        [14.0, 24.0, np.nan, 21.0, 10.0],
        [np.nan, np.nan, 20.0, 26.0, np.nan],
    ]
)

# NOTE: this rebinds `df`, replacing the smaller demo frame used in the sections above.
df = pd.DataFrame(data=arr, columns=['one', 'two', 'three', 'four', 'five'])
df

Unnamed: 0,one,two,three,four,five
0,8.0,2.0,17.0,20.0,10.0
1,4.0,,3.0,,2.0
2,24.0,26.0,14.0,23.0,21.0
3,7.0,29.0,3.0,19.0,25.0
4,14.0,24.0,,21.0,10.0
5,,,20.0,26.0,


In [28]:
# Total number of missing cells: take the boolean mask as a NumPy array and sum it
# (each True counts as 1).
missing_flags = df.isnull().values
missing_flags.sum()

6

In [30]:
# Same total via NumPy's module-level sum; axis=None (the default) sums over every cell.
np.sum(df.isnull().values, axis=None)

6

<b>試著將有空值的列拋棄</b>

In [82]:
# Drop every row that contains at least one null (how="any" is the default rule).
df.dropna(axis="index", how="any")

Unnamed: 0,one,two,three,four,five
0,8.0,2.0,17.0,20.0,10.0
2,24.0,26.0,14.0,23.0,21.0
3,7.0,29.0,3.0,19.0,25.0


<b>試著將空值超過一個的列拋棄（也就是只保留空值不多於一個的列）</b>

In [55]:
# Drop rows with more than one null. thresh is the minimum count of non-null values a
# row must have, so "at most one null" means (number of columns - 1). Deriving it from
# df.shape[1] (5 columns here, so thresh=4) keeps the rule right if columns change.
df.dropna(axis=0, thresh=df.shape[1] - 1)

Unnamed: 0,one,two,three,four,five
0,8.0,2.0,17.0,20.0,10.0
2,24.0,26.0,14.0,23.0,21.0
3,7.0,29.0,3.0,19.0,25.0
4,14.0,24.0,,21.0,10.0
