In [1]:
import pandas as pd
import numpy as np


### An empty value, the number 0, or an invalid value (a string, for example) can all be considered "missing data"

In [2]:
# Every element below evaluates to False in a boolean context.
falsy_values = (
    0,
    False,
    None,
    "",
    [],
    {},
)

In [3]:
any(falsy_values)

False

In [4]:
np.nan

nan

In [5]:
3 + np.nan

nan

In [6]:
a = np.array([1, 2, 3, np.nan, np.nan, 4])

In [7]:
a.sum() , a.mean()

(nan, nan)

### If we used None instead of np.nan here, the addition would raise an exception (TypeError); with np.nan it returns nan instead

In [8]:
3 + np.nan

nan

In [9]:
### The np.nan value is kind of a virus: everything it touches becomes np.nan.

### For a numeric (float) array, None is replaced by np.nan

In [10]:
# Requesting a float dtype makes NumPy coerce the None entry to nan.
a = np.array(
    [1, 2, 3, np.nan, None, 4],
    dtype=float,
)
a

array([ 1.,  2.,  3., nan, nan,  4.])

In [11]:
# A single nan in the data is enough to turn both aggregates into nan.
a = np.array([1, 2, 3, np.nan, np.nan, 4])
(a.mean(), a.sum())

(nan, nan)

### NumPy also supports an "infinite" value (np.inf), which also behaves like a virus

In [12]:
np.inf

inf

In [13]:
3 + np.inf


inf

In [14]:
np.inf / 3

inf

In [15]:
np.inf / np.inf

nan

In [16]:
# `np.float` was only an alias for the builtin `float`; it was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, so pass the builtin type directly.
# The resulting array is still float64, and the nan entry still makes the
# sum nan.
b = np.array([1, 2, 3, np.inf, np.nan, 4], dtype=float)
b.sum()


nan

# Checking for nan or inf

### We will do the checking using 'np.isnan' and 'np.isinf'

In [17]:
np.isnan(np.nan), np.isinf(np.inf)

(True, True)

### Both checks can be combined with np.isfinite, which returns False for nan as well as for inf

In [18]:
np.isfinite(np.nan), np.isfinite(np.inf)

(False, False)

### np.isnan and np.isinf also take arrays as inputs, and return boolean arrays as results

In [19]:
np.isnan(np.array([1, 2, 3, np.nan, np.inf, 4]))


array([False, False, False,  True, False, False])

In [20]:
np.isinf(np.array([1, 2, 3, np.nan, np.inf, 4]))


array([False, False, False, False,  True, False])

In [21]:
np.isfinite(np.array([1, 2, 3, np.nan, np.inf, 4]))

array([ True,  True,  True, False, False,  True])

### Note: It's not so common to find infinite values. From now on, we'll keep working with only np.nan

# Filtering them out

### To avoid propagation we use filtering

In [22]:
a = np.array([1, 2, 3, np.nan, np.nan, 4])

In [23]:
a[~np.isnan(a)]

array([1., 2., 3., 4.])

### which, for an array with no infinite values such as this one, is equivalent to (np.isfinite would also drop inf)

In [24]:
a[np.isfinite(a)]

array([1., 2., 3., 4.])

### And with that result, all the operations can now be performed

In [25]:
a[np.isfinite(a)].sum()

10.0

In [26]:
a[np.isfinite(a)].mean()

2.5