In [2]:
import pandas as pd
import numpy as np

# Create a dictionary with some NaN values
data = {
    'A': ['A', 'B', 'B', 'C'],
    'B': [5, np.nan, np.nan, 8],
    'C': [9, 10, 11, 12]
}

# Create a DataFrame from the dictionary
df = pd.DataFrame(data)


In [3]:
df

Unnamed: 0,A,B,C
0,A,5.0,9
1,B,,10
2,B,,11
3,C,8.0,12


In [4]:
df.isna().sum()

A    0
B    2
C    0
dtype: int64

In [5]:
media = df['B'].mean()

In [6]:
df['B'].fillna(media, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['B'].fillna(media, inplace=True)


In [7]:
df

Unnamed: 0,A,B,C
0,A,5.0,9
1,B,6.5,10
2,B,6.5,11
3,C,8.0,12


In [8]:

pd.isnull(pd.Series([1, np.nan, 7]))

0    False
1     True
2    False
dtype: bool

In [9]:
pd.notnull(pd.Series([1, np.nan, 7]))

0     True
1    False
2     True
dtype: bool

In [10]:
pd.isnull(pd.DataFrame({
    'Column A': [1, np.nan, 7],
    'Column B': [np.nan, 2, 3],
    'Column C': [np.nan, 2, np.nan]
}))

Unnamed: 0,Column A,Column B,Column C
0,False,True,True
1,True,False,False
2,False,False,True


    Pandas Operations with Missing Values
    

In [11]:
pd.Series([1,2,np.nan]).count()

2

In [12]:
pd.Series([1,2,np.nan]).sum()

3.0

In [13]:
pd.Series([2,2,np.nan]).mean()

2.0

    Filtering missing data

In [14]:
s = pd.Series([1,2,3,np.nan, np.nan,4])

In [15]:
pd.notnull(s)

0     True
1     True
2     True
3    False
4    False
5     True
dtype: bool

In [16]:
pd.isnull(s)

0    False
1    False
2    False
3     True
4     True
5    False
dtype: bool

In [17]:
pd.notnull(s).sum()

4

In [18]:

pd.isnull(s).sum()

2

    Dropping null values

In [19]:
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

In [20]:
s.dropna()

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

    Dropping null values on DataFrames

In [21]:
df2 = pd.DataFrame({
    'Column A': [1, np.nan, 30, np.nan],
    'Column B': [2, 8, 31, np.nan],
    'Column C': [np.nan, 9, 32, 100],
    'Column D': [5, 8, 34, 110],
})

In [22]:
df2

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       4 non-null      object 
 1   B       4 non-null      float64
 2   C       4 non-null      int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 228.0+ bytes


In [24]:
df2.isnull()

Unnamed: 0,Column A,Column B,Column C,Column D
0,False,False,True,False
1,True,False,False,False
2,False,False,False,False
3,True,True,False,False


In [25]:
df2.isnull().sum()

Column A    2
Column B    1
Column C    1
Column D    0
dtype: int64

In [26]:
# The default dropna behavior will drop all the rows in which any null value is present:
df2.dropna()

Unnamed: 0,Column A,Column B,Column C,Column D
2,30.0,31.0,32.0,34


In [27]:
df2.dropna(axis=1)

Unnamed: 0,Column D
0,5
1,8
2,34
3,110


In [28]:
df3 = pd.DataFrame({
    'Column A': [1, np.nan, 30],
    'Column B': [2, np.nan, 31],
    'Column C': [np.nan, np.nan, 100]
})

In [29]:
df3

Unnamed: 0,Column A,Column B,Column C
0,1.0,2.0,
1,,,
2,30.0,31.0,100.0


In [30]:
df3.dropna(how='all')

Unnamed: 0,Column A,Column B,Column C
0,1.0,2.0,
2,30.0,31.0,100.0


In [31]:
df3.dropna(how='any')

Unnamed: 0,Column A,Column B,Column C
2,30.0,31.0,100.0


In [32]:
df3.dropna(thresh=2)

Unnamed: 0,Column A,Column B,Column C
0,1.0,2.0,
2,30.0,31.0,100.0


In [33]:
df3.dropna(thresh=2, axis=1)

Unnamed: 0,Column A,Column B
0,1.0,2.0
1,,
2,30.0,31.0


In [34]:
df3

Unnamed: 0,Column A,Column B,Column C
0,1.0,2.0,
1,,,
2,30.0,31.0,100.0


    Filling null values

In [35]:
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

In [36]:
s.fillna(0)

0    1.0
1    2.0
2    3.0
3    0.0
4    0.0
5    4.0
dtype: float64

In [38]:
s.fillna(s.mean())

0    1.0
1    2.0
2    3.0
3    2.5
4    2.5
5    4.0
dtype: float64

In [40]:
s.ffill()

0    1.0
1    2.0
2    3.0
3    3.0
4    3.0
5    4.0
dtype: float64

    Filling null values on DataFrames

In [41]:
df2

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [42]:
df2.fillna({'Column A': 0,'Column B': 99, 'Column C': df2['Column C'].mean()})

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,47.0,5
1,0.0,8.0,9.0,8
2,30.0,31.0,32.0,34
3,0.0,99.0,100.0,110


In [43]:
df2

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [44]:
df2.fillna({'Column A': df2['Column A'].mean()})

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,15.5,8.0,9.0,8
2,30.0,31.0,32.0,34
3,15.5,,100.0,110


In [45]:
df2.ffill()

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,1.0,8.0,9.0,8
2,30.0,31.0,32.0,34
3,30.0,31.0,100.0,110


    Checking if there are NAs

In [47]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Column A  2 non-null      float64
 1   Column B  3 non-null      float64
 2   Column C  3 non-null      float64
 3   Column D  4 non-null      int64  
dtypes: float64(3), int64(1)
memory usage: 260.0 bytes


In [50]:
df.dropna().count()

A    4
B    4
C    4
dtype: int64