In [52]:
import pandas as pd
import numpy as np

In [3]:
s = pd.Series(['a',3,np.nan,1,np.nan])

In [4]:
print(s.notnull().sum())

3


In [8]:
s = pd.Series([np.nan,1,2,np.nan,3])
s = s.ffill()

In [9]:
print(s)

0    NaN
1    1.0
2    2.0
3    2.0
4    3.0
dtype: float64


In [10]:
df = pd.DataFrame({
    'Population': [35.467, 63.951, 80.94, 60.655, 127.061, 64.511, 318.523],
    'GDP': [
        1785387,
        2833687,
        3874437,
        2167744,
        4602367,
        2950039,
        17348075
    ],
    'Surface Area': [
        9984670,
        640679,
        357114,
        301336,
        377930,
        242495,
        9525067
    ],
    'HDI':[
        0.913,
        0.888,
        0.916,
        0.873,
        0.891,
        0.907,
        0.915
    ],
    'Contient': [
        'America',
        'Europe',
        'Europe',
        'Europe',
        'Asia',
        'Europe',
        'America'
    ]
}, columns=['Population','GDP','Surface Area','HDI','Continent'])

In [11]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
0,35.467,1785387,9984670,0.913,
1,63.951,2833687,640679,0.888,
2,80.94,3874437,357114,0.916,
3,60.655,2167744,301336,0.873,
4,127.061,4602367,377930,0.891,
5,64.511,2950039,242495,0.907,
6,318.523,17348075,9525067,0.915,


In [13]:
df.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6    False
dtype: bool

In [14]:
s.duplicated()

0    False
1    False
2    False
3     True
4    False
dtype: bool

## Missing Data: NaN|inf

In [1]:
falsy_values = (0, False, None, '', [],{})

In [2]:
any(falsy_values)

False

Python will treat any of the above as 'falsy' values, but numpy only cares about NaN values

In [8]:
np.nan

nan

In [9]:
3+np.nan

nan

In [10]:
a=np.array([1,2,3, np.nan, np.nan, 4]) 

In [11]:
a.sum()

nan

NaN is a contagion, everything it touches will become NaN.

In [12]:
a.mean()

nan

In [13]:
3 + None
#Nones will throw an error.

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [16]:
a = np.array([1,2,3,np.nan,None,4], dtype='float')

In [17]:
a

array([ 1.,  2.,  3., nan, nan,  4.])

Nones will automatically become NaN in an array w/ a  type.

In [18]:
a.mean()

nan

In [20]:
np.inf

inf

In [21]:
3 + np.inf

inf

Similarly behaves as a contagion.

In [23]:
1/np.inf

0.0

In [24]:
np.inf/np.inf

nan

In [40]:
b=np.array([1,2,3,np.inf,np.nan,4], dtype="float")

In [42]:
b.sum()

nan

### Checking for NaN|inf

In [30]:
np.isnan(np.nan)

True

In [31]:
np.isinf(np.inf)

True

In [32]:
np.isfinite(np.nan), np.isfinite(np.inf)

(False, False)

A Joint operation can be performed, forming a tuple /w two boolean results

In [33]:
np.isnan(a)

array([False, False, False,  True,  True, False])

In [34]:
np.isnan(b)

array([False, False, False, False,  True, False])

In [37]:
np.isfinite(a)

array([ True,  True,  True, False, False,  True])

In [43]:
np.isfinite(b)

array([ True,  True,  True, False, False,  True])

### Filtering the NaN|inf

In [45]:
a = np.array([1,2,3,np.nan,np.nan,4])

In [46]:
a[~np.isnan(a)]

array([1., 2., 3., 4.])

In [47]:
a[np.isfinite(a)]

array([1., 2., 3., 4.])

Find the values in a that are not NaN, returning those values. IMPACTS ORDER

In [48]:
a[np.isfinite(a)].sum()

10.0

In [49]:
a[np.isfinite(a)].mean()

2.5

## Missing Data w/ Pandas

In [53]:
pd.isnull(np.nan)

True

In [54]:
pd.isnull(None)

True

In [55]:
pd.isna(None)

True

In [56]:
pd.notnull(None)

False

In [57]:
pd.notnull(np.nan)

False

In [58]:
pd.notna(np.nan)

False

In [59]:
pd.notna(3)

True

In [60]:
pd.isnull(pd.Series([1,np.nan,7]))

0    False
1     True
2    False
dtype: bool

In [61]:
pd.notnull(pd.Series([1,np.nan,7]))

0     True
1    False
2     True
dtype: bool

In [62]:
pd.isnull(pd.DataFrame({
    'Column A': [1,np.nan,7],
    'Column B': [np.nan,2,3],
    'Column C': [np.nan,2,np.nan]
}))

Unnamed: 0,Column A,Column B,Column C
0,False,True,True
1,True,False,False
2,False,False,True


In [63]:
pd.Series([1,2,np.nan]).count()

2

Unlike numpy, Pandas is not so thrown by NaN. It will process the finite numbers and skip the NaN.

In [64]:
pd.Series([1,2,np.nan]).sum()

3.0

In [65]:
pd.Series([1,2,np.nan]).mean()

1.5

In [66]:
s=pd.Series([1,2,3,np.nan,np.nan,4])

In [67]:
pd.notnull(s)

0     True
1     True
2     True
3    False
4    False
5     True
dtype: bool

In [68]:
pd.isnull(s)

0    False
1    False
2    False
3     True
4     True
5    False
dtype: bool

In [69]:
pd.notnull(s).sum()

4

pd.isnull(s).sum()

Above sum the number for Trues!

In [71]:
s[pd.notnull(s)]

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

INDEX REMAINS!

In [72]:
s.isnull()

0    False
1    False
2    False
3     True
4     True
5    False
dtype: bool

In [73]:
s.notnull()

0     True
1     True
2     True
3    False
4    False
5     True
dtype: bool

In [75]:
s[s.notnull()]

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

In [76]:
s[s.isna()]

3   NaN
4   NaN
dtype: float64

### Dropping Nulls

In [78]:
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

In [79]:
s.dropna()

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

INDEX REMAINS!

In [82]:
df=pd.DataFrame({
    'Column A': [1,np.nan,30,np.nan],
    'Column B': [2,8,31,np.nan],
    'Column C': [np.nan,9,32,100],
    'Column D': [5,8,34,110]
})

In [83]:
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [84]:
df.shape

(4, 4)

In [85]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Column A  2 non-null      float64
 1   Column B  3 non-null      float64
 2   Column C  3 non-null      float64
 3   Column D  4 non-null      int64  
dtypes: float64(3), int64(1)
memory usage: 260.0 bytes


In [86]:
df.isnull()

Unnamed: 0,Column A,Column B,Column C,Column D
0,False,False,True,False
1,True,False,False,False
2,False,False,False,False
3,True,True,False,False


In [87]:
df.isnull().sum()

Column A    2
Column B    1
Column C    1
Column D    0
dtype: int64

In [90]:
df.dropna()

Unnamed: 0,Column A,Column B,Column C,Column D
2,30.0,31.0,32.0,34


Default Dropna behavior will drop ANY row with 1+ NaN

In [91]:
df.dropna(axis=1)

Unnamed: 0,Column D
0,5
1,8
2,34
3,110


Setting axis n. will drop all nulls in that axis

In [93]:
df2 = pd.DataFrame({
    'Column A': [1,np.nan,30],
    'Column B': [2,np.nan,31],
    'Column C': [np.nan,np.nan,100]
})

In [94]:
df2

Unnamed: 0,Column A,Column B,Column C
0,1.0,2.0,
1,,,
2,30.0,31.0,100.0


In [96]:
df2.dropna(how='all')

Unnamed: 0,Column A,Column B,Column C
0,1.0,2.0,
2,30.0,31.0,100.0


Sets a requirement that the entire row is NaN in order to drop.

In [98]:
df2.dropna(how='any')

Unnamed: 0,Column A,Column B,Column C
2,30.0,31.0,100.0


In [99]:
df2.dropna(thresh=2)

Unnamed: 0,Column A,Column B,Column C
0,1.0,2.0,
2,30.0,31.0,100.0


In [100]:
df.dropna(thresh=3)

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34


In [104]:
df.dropna(thresh=3, axis='columns')

Unnamed: 0,Column B,Column C,Column D
0,2.0,,5
1,8.0,9.0,8
2,31.0,32.0,34
3,,100.0,110


### Finding Nulls

In [106]:
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

In [107]:
s.fillna(0)

0    1.0
1    2.0
2    3.0
3    0.0
4    0.0
5    4.0
dtype: float64

In [108]:
s.fillna(s.mean())

0    1.0
1    2.0
2    3.0
3    2.5
4    2.5
5    4.0
dtype: float64

In [115]:
s.fillna(method='ffill')

  s.fillna(method='ffill')


0    1.0
1    2.0
2    3.0
3    3.0
4    3.0
5    4.0
dtype: float64

In [116]:
s.ffill()

0    1.0
1    2.0
2    3.0
3    3.0
4    3.0
5    4.0
dtype: float64

In [117]:
s.fillna(method='bfill')

  s.fillna(method='bfill')


0    1.0
1    2.0
2    3.0
3    4.0
4    4.0
5    4.0
dtype: float64

In [118]:
s.bfill()

0    1.0
1    2.0
2    3.0
3    4.0
4    4.0
5    4.0
dtype: float64

In [120]:
pd.Series([np.nan,3,np.nan,9]).ffill()

0    NaN
1    3.0
2    3.0
3    9.0
dtype: float64

In [121]:
pd.Series([1,np.nan,3,np.nan,np.nan]).bfill()

0    1.0
1    3.0
2    3.0
3    NaN
4    NaN
dtype: float64

Carefully ensure when using `ffill` and `bfill` that the first|last values are not NaN themselves.

In [123]:
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [125]:
df.fillna({'Column A': 0,'Column B': 99,'Column C': df['Column C'].mean()})

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,47.0,5
1,0.0,8.0,9.0,8
2,30.0,31.0,32.0,34
3,0.0,99.0,100.0,110


Different columns can be filled with differing methods

In [128]:
df.ffill(axis=0)

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,1.0,8.0,9.0,8
2,30.0,31.0,32.0,34
3,30.0,31.0,100.0,110


In [129]:
df.ffill(axis=1)

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,2.0,5.0
1,,8.0,9.0,8.0
2,30.0,31.0,32.0,34.0
3,,,100.0,110.0


### Checking for NA

In [131]:
s.dropna().count()

4

In [135]:
missing_values = len(s.dropna()) !=len(s)
missing_values

True

In [136]:
len(s)

6

In [137]:
s.count()

4

In [139]:
missing_values = s.count() !=len(s)
missing_values

True

Python methods can also be usefully utilized.

In [140]:
pd.Series([True,False,False]).any()

True

In [141]:
pd.Series([True,False,False]).all()

False

In [142]:
pd.Series([True,True,True]).all()

True

In [143]:
s.isnull()

0    False
1    False
2    False
3     True
4     True
5    False
dtype: bool

In [144]:
pd.Series([1,np.nan]).isnull().any()

True

In [145]:
pd.Series([1,2]).isnull().any()

False

In [146]:
s.isnull().any()

True

In [147]:
s.isnull().values

array([False, False, False,  True,  True, False])

In [148]:
s.isnull().values.any()

True