# Handling Missing Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Toy data: columns A and B contain missing entries (np.nan), column C is complete.
d = {
    'A': [1, 2, np.nan],
    'B': [5, np.nan, np.nan],
    'C': [8, 9, 10],
}

In [4]:
# Build the demo frame: one column per dict key, one row per list position.
df = pd.DataFrame(data=d)

In [5]:
# Display the freshly built frame; columns A and B hold the missing (NaN) values.
df

Unnamed: 0,A,B,C
0,1.0,5.0,8
1,2.0,,9
2,,,10


## 1. dropna() method
This method removes every `row` or `column` that contains at least one `NaN` value.
* With `axis=0` (the default), it drops the `rows` that contain `NaN`.
* With `axis=1`, it drops the `columns` that contain `NaN`.

In [6]:
# Row-wise drop with the defaults spelled out: discard any row with at least one NaN.
df.dropna(axis=0, how="any")

Unnamed: 0,A,B,C
0,1.0,5.0,8


In [7]:
# Column-wise drop: discard any column with at least one NaN.
df.dropna(axis="columns")

Unnamed: 0,C
0,8
1,9
2,10


<p style="font-size: 18px; color: green;"><code>dropna()</code> returns a new <code>DataFrame</code> and leaves the original untouched. To change the original <code>DataFrame</code>, assign the result back or pass the argument <code>inplace=True</code>.</p>

In [8]:
# Still the original frame: neither dropna() call above was assigned back or run with inplace=True.
df

Unnamed: 0,A,B,C
0,1.0,5.0,8
1,2.0,,9
2,,,10


### We can also set a threshold: `thresh=N` keeps only the rows (or columns) that have at least N non-NaN values.

In [10]:
# thresh=2 keeps only the rows that have at least two non-NaN values;
# row 2 has a single non-NaN value (C=10), so it is dropped.
df.dropna(thresh=2)

Unnamed: 0,A,B,C
0,1.0,5.0,8
1,2.0,,9


## 2. fillna() method
Instead of dropping the NaN values, this method replaces them with a value you supply.

In [12]:
# Starting point for fillna(): the frame still holds its NaN values.
df

Unnamed: 0,A,B,C
0,1.0,5.0,8
1,2.0,,9
2,,,10


In [13]:
# Return a new frame in which every NaN is replaced by the string "FILL".
df.fillna("FILL")

Unnamed: 0,A,B,C
0,1.0,5.0,8
1,2.0,FILL,9
2,FILL,FILL,10


<p style="font-size: 18px; color: green;"><code>fillna()</code> also returns a new <code>DataFrame</code>. To change the original <code>DataFrame</code>, assign the result back or pass the argument <code>inplace=True</code>.</p>

### Filling missing values with the mean of the column

In [14]:
# Column A on its own; its last entry is NaN.
df['A']

0    1.0
1    2.0
2    NaN
Name: A, dtype: float64

In [15]:
# mean() skips NaN by default, so the fill value here is (1.0 + 2.0) / 2 = 1.5.
df["A"].fillna(df["A"].mean())

0    1.0
1    2.0
2    1.5
Name: A, dtype: float64