Data Cleaning

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Built-in Python objects that evaluate to False in a boolean context.
falsy_values = (
    0,      # integer zero
    False,  # boolean false
    None,   # the null object
    '',     # empty string
    [],     # empty list
    {},     # empty dict
)

In [5]:
# any() is True only if at least one element is truthy; none of these are.
any(falsy_values)

False

In [6]:
# NumPy's floating-point "not a number" value.
np.nan

nan

In [7]:
# Arithmetic involving NaN yields NaN.
3 + np.nan

nan

In [8]:
# Sample float array with two NaN entries, used to show how NaN propagates.
a = np.array([1.0, 2.0, 3.0, np.nan, np.nan, 4.0])

In [9]:
# The NaN entries make the plain NumPy sum NaN.
a.sum()

nan

In [10]:
# The mean is NaN as well: NaN propagates through the aggregation.
a.mean()

nan

In [11]:
# Adding a finite number to infinity leaves infinity.
3 + np.inf

inf

In [12]:
# Dividing infinity by a finite number is still infinity.
np.inf / 3

inf

In [13]:
# inf / inf is undefined, so the result is NaN.
np.inf / np.inf

nan

In [14]:
# Sample float64 array holding both an infinity and a NaN.
b = np.array([1.0, 2.0, 3.0, np.inf, np.nan, 4.0], dtype='float64')

In [15]:
# With both inf and NaN present, the sum comes out NaN.
b.sum()

nan

In [16]:
# np.isnan detects NaN values.
np.isnan(np.nan)

True

Handling missing data with pandas

In [17]:
# pd.isnull flags NaN as missing.
pd.isnull(np.nan)

True

In [18]:
# None counts as missing too.
pd.isnull(None)

True

In [19]:
# pd.isna gives the same answer as pd.isnull for NaN.
pd.isna(np.nan)

True

In [20]:
# pd.isna also treats None as missing.
pd.isna(None)

True

In [21]:
# pd.notnull is the opposite check: None is not a valid (non-null) value.
pd.notnull(None)

False

In [22]:
# NaN is not a valid (non-null) value either.
pd.notnull(np.nan)

False

In [23]:
# pd.notna returns False for NaN, matching pd.notnull.
pd.notna(np.nan)

False

In [24]:
# An ordinary number is non-null.
pd.notnull(3)

True

In [25]:
# On a Series the check is applied element-wise and returns a boolean Series.
pd.isnull(pd.Series([1, np.nan, 7]))

0    False
1     True
2    False
dtype: bool

In [26]:
# Element-wise non-null check: only the NaN at position 1 is False.
pd.notnull(pd.Series([1, np.nan, 7]))

0     True
1    False
2     True
dtype: bool

In [27]:
# On a DataFrame the result is a same-shaped frame of booleans.
pd.isnull(pd.DataFrame({
    'Column A': [1, np.nan, 7],
    'Column B': [np.nan, 2, 3],
    'Column C': [np.nan, 2, np.nan]
}))

Unnamed: 0,Column A,Column B,Column C
0,False,True,True
1,True,False,False
2,False,False,True


In [28]:
# count() tallies only the non-null entries.
pd.Series([1, 2, np.nan]).count()

2

In [29]:
# Unlike the NumPy array sum, the pandas sum skips NaN instead of propagating it.
pd.Series([1, 2, np.nan]).sum()

3.0

In [30]:
# mean() also ignores NaN: (2 + 2) / 2.
pd.Series([2, 2, np.nan]).mean()

2.0

In [31]:
# Example Series with two missing entries (positions 3 and 4).
s = pd.Series(data=[1.0, 2.0, 3.0, np.nan, np.nan, 4.0])

In [32]:
# Boolean mask: True where s holds a value.
pd.notnull(s)

0     True
1     True
2     True
3    False
4    False
5     True
dtype: bool

In [33]:
# Boolean mask: True where s is null.
pd.isnull(s)

0    False
1    False
2    False
3     True
4     True
5    False
dtype: bool

In [34]:
# Summing a boolean mask counts its True entries: the number of non-null values.
pd.notnull(s).sum()

4

In [35]:
# Number of null values in s (each True counts as 1).
pd.isnull(s).sum()

2

In [36]:
# Boolean indexing with the mask keeps only the non-null entries.
s[pd.notnull(s)]

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

In [37]:
# The same null check is available as a Series method.
s.isnull()

0    False
1    False
2    False
3     True
4     True
5    False
dtype: bool

In [38]:
# Method form of the boolean-index filter for non-null entries.
s[s.notnull()]

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

In [39]:
# Display s: the NaNs are still there, filtering did not modify it.
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

In [40]:
# dropna() returns the Series without its null entries.
s.dropna()

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

Dropping null values on DataFrames

In [41]:
# Example frame: columns A-C each contain nulls, Column D is complete.
df = pd.DataFrame(
    data=dict(zip(
        ['Column A', 'Column B', 'Column C', 'Column D'],
        [
            [1, np.nan, 30, np.nan],
            [2, 8, 31, np.nan],
            [np.nan, 9, 32, 100],
            [5, 8, 34, 110],
        ],
    ))
)

In [42]:
# Display the example frame.
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [43]:
# info() reports the non-null count and dtype of each column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Column A  2 non-null      float64
 1   Column B  3 non-null      float64
 2   Column C  3 non-null      float64
 3   Column D  4 non-null      int64  
dtypes: float64(3), int64(1)
memory usage: 256.0 bytes


In [44]:
# Cell-by-cell null mask for the whole frame.
df.isnull()

Unnamed: 0,Column A,Column B,Column C,Column D
0,False,False,True,False
1,True,False,False,False
2,False,False,False,False
3,True,True,False,False


In [45]:
# Summing the mask gives the number of nulls per column.
df.isnull().sum()

Column A    2
Column B    1
Column C    1
Column D    0
dtype: int64

In [46]:
# By default dropna() drops every row that contains at least one null; only row 2 survives.
df.dropna()

Unnamed: 0,Column A,Column B,Column C,Column D
2,30.0,31.0,32.0,34


In [47]:
# Drop columns (instead of rows) that contain any null; only Column D has none.
df.dropna(axis=1)  # axis='columns' also works

Unnamed: 0,Column D
0,5
1,8
2,34
3,110


In [48]:
# Second example frame, written row by row: row 1 is null in every column.
df2 = pd.DataFrame(
    [
        [1, 2, np.nan],
        [np.nan, np.nan, np.nan],
        [30, 31, 100],
    ],
    columns=['Column A', 'Column B', 'Column C'],
)

In [49]:
# Display df2; row 1 is null in every column.
df2

Unnamed: 0,Column A,Column B,Column C
0,1.0,2.0,
1,,,
2,30.0,31.0,100.0


In [50]:
# how='all' drops a row only when every value in it is null. df has no such row,
# so all four rows are kept.
# NOTE(review): df2 (defined just before this cell) has an all-null row 1 and is
# never used; df2.dropna(how='all') was probably the intended demonstration.
df.dropna(how='all')

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [51]:
# how='any' drops a row as soon as it contains a single null.
df.dropna(how='any')  # default behavior

Unnamed: 0,Column A,Column B,Column C,Column D
2,30.0,31.0,32.0,34


In [52]:
# thresh=3 keeps only rows with at least 3 non-null values; row 3 has just 2
# (Column C and Column D), so it is dropped.
df.dropna(thresh=3)

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34


In [53]:
# df itself is unchanged: dropna returned a new frame.
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [54]:
# thresh applied column-wise: keep columns with at least 3 non-null values.
# Column A has only 2, so it is dropped.
df.dropna(thresh=3, axis='columns')

Unnamed: 0,Column B,Column C,Column D
0,2.0,,5
1,8.0,9.0,8
2,31.0,32.0,34
3,,100.0,110


In [55]:
# Display s again; it still has NaN at positions 3 and 4.
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

In [56]:
# Replace every null with a constant.
s.fillna(0)

0    1.0
1    2.0
2    3.0
3    0.0
4    0.0
5    4.0
dtype: float64

In [57]:
# Replace nulls with the mean of the non-null values (2.5).
s.fillna(s.mean())

0    1.0
1    2.0
2    3.0
3    2.5
4    2.5
5    4.0
dtype: float64

In [58]:
# fillna returned new Series each time; s still holds its NaNs.
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

Filling nulls with contiguous (close) values

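Null values can also be filled with nearby values: forward fill (`ffill`) propagates the last valid value forward, while backward fill (`bfill`) uses the next valid value. Older pandas versions select these through the `method` argument of `fillna`; newer versions use the dedicated `ffill()` / `bfill()` methods: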

In [59]:
# Forward fill: each null takes the last valid value before it.
# Series.ffill() replaces fillna(method='ffill'), which newer pandas deprecates.
s.ffill()

0    1.0
1    2.0
2    3.0
3    3.0
4    3.0
5    4.0
dtype: float64

In [60]:
# Backward fill: each null takes the next valid value after it.
# Series.bfill() replaces fillna(method='bfill'), which newer pandas deprecates.
s.bfill()


0    1.0
1    2.0
2    3.0
3    4.0
4    4.0
5    4.0
dtype: float64

This can still leave null values at the extremes of the Series/DataFrame:

In [61]:
# Forward fill has nothing to copy into a leading null, so position 0 stays NaN.
# (.ffill() replaces the deprecated fillna(method='ffill').)
pd.Series([np.nan, 3, np.nan, 9]).ffill()

0    NaN
1    3.0
2    3.0
3    9.0
dtype: float64

In [62]:
# Backward fill has nothing to copy into trailing nulls, so positions 3 and 4 stay NaN.
# (.bfill() replaces the deprecated fillna(method='bfill').)
pd.Series([1, np.nan, 3, np.nan, np.nan]).bfill()

0    1.0
1    3.0
2    3.0
3    NaN
4    NaN
dtype: float64

Filling null values on DataFrames
The fillna method also works on DataFrames, and it works similarly. The main differences are that you can specify the axis (as usual, rows or columns) along which to fill the values (especially for the fill methods) and that you have more control over the values passed:

In [63]:
# Display df before filling.
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [64]:
# A dict maps column name -> fill value. Column C's null gets that column's mean
# (47.0); Column D is not listed and has no nulls to fill.
df.fillna({'Column A': 0, 'Column B': 99, 'Column C': df['Column C'].mean()})

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,47.0,5
1,0.0,8.0,9.0,8
2,30.0,31.0,32.0,34
3,0.0,99.0,100.0,110


In [65]:
# Forward fill down each column (axis=0). Column C's first row has no earlier
# value to copy, so it stays NaN.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
df.ffill(axis=0)

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,1.0,8.0,9.0,8
2,30.0,31.0,32.0,34
3,30.0,31.0,100.0,110


In [66]:
# NOTE(review): this cell repeats the previous forward fill along axis=0.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
df.ffill(axis=0)

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,1.0,8.0,9.0,8
2,30.0,31.0,32.0,34
3,30.0,31.0,100.0,110


In [67]:
# Forward fill across each row (axis=1), left to right. Nulls in Column A have
# no value to their left, so they stay NaN.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
df.ffill(axis=1)

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,2.0,5.0
1,,8.0,9.0,8.0
2,30.0,31.0,32.0,34.0
3,,,100.0,110.0


Checking if there are NAs
The question is: Does this Series or DataFrame contain any missing value? The answer should be yes or no: True or False. How can you verify it?

Example 1: Checking the length

If there are missing values, s.dropna() will have fewer elements than s:

In [68]:
# Number of entries left after dropping nulls.
s.dropna().count()

4

In [69]:
# s has missing values exactly when dropping nulls changes its length.
missing_values = len(s) != len(s.dropna())
missing_values

True

In [71]:
# Display s for reference.
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

In [70]:
# len() counts every entry, nulls included.
len(s)

6

In [72]:
# count() counts only the non-null entries.
s.count()

4

In [73]:
# count() skips nulls, so it differs from len(s) only when nulls are present.
missing_values = len(s) != s.count()
missing_values

True

In [74]:
# any() is True when at least one element is True.
pd.Series([True, False, False]).any()

True

In [75]:
# all() is True only when every element is True.
pd.Series([True, False, False]).all()

False

In [76]:
# Every element is True, so all() is True.
pd.Series([True, True, True]).all()

True

In [77]:
# Null mask of s: the boolean Series that any() is applied to in the checks below.
s.isnull()

0    False
1    False
2    False
3     True
4     True
5    False
dtype: bool

In [78]:
# isnull().any() answers "is there at least one null?" -- yes here.
pd.Series([1, np.nan]).isnull().any()

True

In [79]:
# No nulls in this Series, so the answer is False.
pd.Series([1, 2]).isnull().any()

False

In [80]:
# Applied to s: it does contain nulls.
s.isnull().any()

True

In [81]:
# .values exposes the null mask as a NumPy boolean array.
s.isnull().values

array([False, False, False,  True,  True, False])

In [82]:
# Same "any null?" check, performed on the underlying NumPy array.
s.isnull().values.any()

True