In [3]:
import pandas as pd
import numpy as np

### Handling the missing values:

* isnull()
* isna()
* notnull()
* dropna()
* fillna()

In [4]:
# Toy customer records with deliberately missing entries (both np.nan and
# None are used) so that each missing-value helper has something to act on.
customer = {
    "EmpID": [1, 3, 5, 6, np.nan, None, 5],
    "Name": ["John", "Paul", "Eric", "Sue", "Farmer", "Mukesh", "Lucas"],
    "City": [np.nan, None, np.nan, "Noida", "Sydney", np.nan, "Delhi"],
    "Marks": [10, np.nan, np.nan, 11, 23, 34, np.nan],
}

In [5]:
# Build the working frame from the dict: keys become columns, lists become rows.
df = pd.DataFrame(customer)

In [7]:
# Preview the top of the frame (head() returns the first 5 rows by default).
df.head()

Unnamed: 0,EmpID,Name,City,Marks
0,1.0,John,,10.0
1,3.0,Paul,,
2,5.0,Eric,,
3,6.0,Sue,Noida,11.0
4,,Farmer,Sydney,23.0


In [8]:
# Per-column dtype and non-null count: a quick first look at where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   EmpID   5 non-null      float64
 1   Name    7 non-null      object 
 2   City    3 non-null      object 
 3   Marks   4 non-null      float64
dtypes: float64(2), object(2)
memory usage: 356.0+ bytes


### How we can the null values in each columns

In [9]:
# Show the full 7-row frame so every missing cell is visible.
df

Unnamed: 0,EmpID,Name,City,Marks
0,1.0,John,,10.0
1,3.0,Paul,,
2,5.0,Eric,,
3,6.0,Sue,Noida,11.0
4,,Farmer,Sydney,23.0
5,,Mukesh,,34.0
6,5.0,Lucas,Delhi,


In [10]:
# Boolean mask with the same shape as df: True wherever the value is missing.
df.isna()

Unnamed: 0,EmpID,Name,City,Marks
0,False,False,True,False
1,False,False,True,True
2,False,False,True,True
3,False,False,False,False
4,True,False,False,False
5,True,False,True,False
6,False,False,False,True


In [11]:
# Summing the mask column-wise (True counts as 1) gives the number of missing values per column.
df.isna().sum()

EmpID    2
Name     0
City     4
Marks    3
dtype: int64

In [12]:
# isnull() is an alias of isna(), so the per-column counts are identical.
df.isnull().sum()

EmpID    2
Name     0
City     4
Marks    3
dtype: int64

In [13]:
# notnull() is the inverse mask, so its column sums count the values that are present.
df.notnull().sum() # True -> value is present (not missing)

EmpID    5
Name     7
City     3
Marks    4
dtype: int64

In [14]:
# The same check on a single column returns a boolean Series.
df["Marks"].isnull()

0    False
1     True
2     True
3    False
4    False
5    False
6     True
Name: Marks, dtype: bool

In [15]:
# Count the missing Marks with the vectorised Series.sum() rather than the
# Python builtin sum(), which iterates the Series element by element.
df["Marks"].isnull().sum()

3

In [16]:
# Re-display the frame as the starting point for the dropna() examples.
df

Unnamed: 0,EmpID,Name,City,Marks
0,1.0,John,,10.0
1,3.0,Paul,,
2,5.0,Eric,,
3,6.0,Sue,Noida,11.0
4,,Farmer,Sydney,23.0
5,,Mukesh,,34.0
6,5.0,Lucas,Delhi,


### Delete every row that contains at least one missing value

In [17]:
# With the defaults (axis=0, how="any"), dropna() removes every row that has at
# least one missing value. It returns a new frame; df itself is not modified.
df.dropna()

Unnamed: 0,EmpID,Name,City,Marks
3,6.0,Sue,Noida,11.0


In [18]:
# df still has all 7 rows: dropna() returned a new frame instead of changing df.
df

Unnamed: 0,EmpID,Name,City,Marks
0,1.0,John,,10.0
1,3.0,Paul,,
2,5.0,Eric,,
3,6.0,Sue,Noida,11.0
4,,Farmer,Sydney,23.0
5,,Mukesh,,34.0
6,5.0,Lucas,Delhi,


In [19]:
# axis=1 drops columns instead of rows: only Name has no missing values, so only it survives.
df.dropna(axis = 1)

Unnamed: 0,Name
0,John
1,Paul
2,Eric
3,Sue
4,Farmer
5,Mukesh
6,Lucas


In [20]:
# df still has all four columns: the column-wise dropna() also returned a new frame.
df

Unnamed: 0,EmpID,Name,City,Marks
0,1.0,John,,10.0
1,3.0,Paul,,
2,5.0,Eric,,
3,6.0,Sue,Noida,11.0
4,,Farmer,Sydney,23.0
5,,Mukesh,,34.0
6,5.0,Lucas,Delhi,


### thresh parameter:

With `thresh=3`, `dropna()` keeps only the rows that contain at least 3 non-missing values.

In [21]:
# Keep only the rows that have at least 3 non-missing values.
df.dropna(thresh=3)

Unnamed: 0,EmpID,Name,City,Marks
0,1.0,John,,10.0
3,6.0,Sue,Noida,11.0
4,,Farmer,Sydney,23.0
6,5.0,Lucas,Delhi,


In [22]:
# The frame has 4 columns, so thresh=4 requires a fully populated row,
# which gives the same result as plain dropna() on this data.
df.dropna(thresh=4)

Unnamed: 0,EmpID,Name,City,Marks
3,6.0,Sue,Noida,11.0


In [23]:
# With axis=1 the threshold applies to columns: keep columns with at least
# 3 non-missing values. All four qualify (City has exactly 3).
df.dropna(thresh=3, axis = 1)

Unnamed: 0,EmpID,Name,City,Marks
0,1.0,John,,10.0
1,3.0,Paul,,
2,5.0,Eric,,
3,6.0,Sue,Noida,11.0
4,,Farmer,Sydney,23.0
5,,Mukesh,,34.0
6,5.0,Lucas,Delhi,


In [24]:
# Columns now need at least 5 non-missing values: City (3) and Marks (4) are dropped.
df.dropna(thresh=5, axis = 1)

Unnamed: 0,EmpID,Name
0,1.0,John
1,3.0,Paul
2,5.0,Eric
3,6.0,Sue
4,,Farmer
5,,Mukesh
6,5.0,Lucas


In [25]:
# df is still unchanged after all the dropna() variants.
df

Unnamed: 0,EmpID,Name,City,Marks
0,1.0,John,,10.0
1,3.0,Paul,,
2,5.0,Eric,,
3,6.0,Sue,Noida,11.0
4,,Farmer,Sydney,23.0
5,,Mukesh,,34.0
6,5.0,Lucas,Delhi,


In [26]:
# Fill one column's missing values with a constant.
# NOTE: filling a float column with a string upcasts the result to object dtype
# (see the output), so this is for illustration only.
df["Marks"].fillna("Modi")

0    10.0
1    Modi
2    Modi
3    11.0
4    23.0
5    34.0
6    Modi
Name: Marks, dtype: object

In [27]:
# Fill every missing cell in the whole frame with the same constant; returns a new frame.
df.fillna("Rahul")

Unnamed: 0,EmpID,Name,City,Marks
0,1.0,John,Rahul,10.0
1,3.0,Paul,Rahul,Rahul
2,5.0,Eric,Rahul,Rahul
3,6.0,Sue,Noida,11.0
4,Rahul,Farmer,Sydney,23.0
5,Rahul,Mukesh,Rahul,34.0
6,5.0,Lucas,Delhi,Rahul


In [28]:
# df still contains its missing values: neither fillna() call was assigned back.
df

Unnamed: 0,EmpID,Name,City,Marks
0,1.0,John,,10.0
1,3.0,Paul,,
2,5.0,Eric,,
3,6.0,Sue,Noida,11.0
4,,Farmer,Sydney,23.0
5,,Mukesh,,34.0
6,5.0,Lucas,Delhi,


In [29]:
# mean() skips missing values by default: (10 + 11 + 23 + 34) / 4 = 19.5.
df["Marks"].mean()

19.5

In [30]:
# np.mean on the Series gives the same NaN-skipping result here (19.5).
np.mean(df["Marks"])

19.5

In [31]:
# Replace the missing Marks with the column mean and store the result in df.
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# df["Marks"]: that is chained assignment on a column selection, which raises a
# FutureWarning on pandas 2.x and leaves df unchanged under Copy-on-Write.
df["Marks"] = df["Marks"].fillna(df["Marks"].mean())

In [32]:
# Marks no longer has missing values: the three NaNs now hold the mean, 19.5.
df

Unnamed: 0,EmpID,Name,City,Marks
0,1.0,John,,10.0
1,3.0,Paul,,19.5
2,5.0,Eric,,19.5
3,6.0,Sue,Noida,11.0
4,,Farmer,Sydney,23.0
5,,Mukesh,,34.0
6,5.0,Lucas,Delhi,19.5


In [34]:
# mode() returns every most-frequent value. Each city occurs exactly once,
# so all three come back as a Series rather than a single value.
df["City"].mode()

0     Delhi
1     Noida
2    Sydney
Name: City, dtype: object

In [36]:
# Build a fresh frame from the original dict so the ffill/bfill examples start
# from the raw data, with the missing Marks still present.
newdf = pd.DataFrame(customer)

In [37]:
# The fresh frame has the original missing values in every column, including Marks.
newdf

Unnamed: 0,EmpID,Name,City,Marks
0,1.0,John,,10.0
1,3.0,Paul,,
2,5.0,Eric,,
3,6.0,Sue,Noida,11.0
4,,Farmer,Sydney,23.0
5,,Mukesh,,34.0
6,5.0,Lucas,Delhi,


### Fillna method

In [38]:
# Forward fill down each column: a missing cell takes the last valid value above it.
# Leading gaps (City, rows 0-2) stay missing because nothing precedes them.
# DataFrame.ffill() replaces fillna(method="ffill"), deprecated since pandas 2.1.
newdf.ffill()

Unnamed: 0,EmpID,Name,City,Marks
0,1.0,John,,10.0
1,3.0,Paul,,10.0
2,5.0,Eric,,10.0
3,6.0,Sue,Noida,11.0
4,6.0,Farmer,Sydney,23.0
5,6.0,Mukesh,Sydney,34.0
6,5.0,Lucas,Delhi,34.0


In [39]:
# newdf is unchanged: the forward fill returned a new frame.
newdf

Unnamed: 0,EmpID,Name,City,Marks
0,1.0,John,,10.0
1,3.0,Paul,,
2,5.0,Eric,,
3,6.0,Sue,Noida,11.0
4,,Farmer,Sydney,23.0
5,,Mukesh,,34.0
6,5.0,Lucas,Delhi,


In [40]:
# Backward fill down each column: a missing cell takes the next valid value below it.
# Trailing gaps (Marks, row 6) stay missing because nothing follows them.
# DataFrame.bfill() replaces fillna(method="bfill"), deprecated since pandas 2.1.
newdf.bfill()

Unnamed: 0,EmpID,Name,City,Marks
0,1.0,John,Noida,10.0
1,3.0,Paul,Noida,11.0
2,5.0,Eric,Noida,11.0
3,6.0,Sue,Noida,11.0
4,5.0,Farmer,Sydney,23.0
5,5.0,Mukesh,Delhi,34.0
6,5.0,Lucas,Delhi,


In [41]:
# newdf is unchanged: the backward fill returned a new frame.
newdf

Unnamed: 0,EmpID,Name,City,Marks
0,1.0,John,,10.0
1,3.0,Paul,,
2,5.0,Eric,,
3,6.0,Sue,Noida,11.0
4,,Farmer,Sydney,23.0
5,,Mukesh,,34.0
6,5.0,Lucas,Delhi,


In [42]:
# Forward fill across each row (left to right). Values leak between unrelated
# columns (e.g. Name into City and Marks), so this rarely makes sense for
# heterogeneous columns; it is shown only to illustrate the axis argument.
# DataFrame.ffill() replaces fillna(method="ffill"), deprecated since pandas 2.1.
newdf.ffill(axis=1)

Unnamed: 0,EmpID,Name,City,Marks
0,1.0,John,John,10.0
1,3.0,Paul,Paul,Paul
2,5.0,Eric,Eric,Eric
3,6.0,Sue,Noida,11.0
4,,Farmer,Sydney,23.0
5,,Mukesh,Mukesh,34.0
6,5.0,Lucas,Delhi,Delhi


In [44]:
# Backward fill across each row (right to left): a missing cell takes the value
# from the next column to its right, e.g. a missing EmpID is filled with the Name.
# DataFrame.bfill() replaces fillna(method="bfill"), deprecated since pandas 2.1.
newdf.bfill(axis=1)

Unnamed: 0,EmpID,Name,City,Marks
0,1.0,John,10.0,10.0
1,3.0,Paul,,
2,5.0,Eric,,
3,6.0,Sue,Noida,11.0
4,Farmer,Farmer,Sydney,23.0
5,Mukesh,Mukesh,34.0,34.0
6,5.0,Lucas,Delhi,


### Summary of Fillna

The two fill methods, `ffill` and `bfill`, can be applied along either axis: down each column (`axis=0`, the default) or across each row (`axis=1`).

In [47]:
# Fill every missing cell with 0. Numeric columns stay float (0.0), while the
# text column City receives the integer 0 alongside its strings.
newdf.fillna(0)

Unnamed: 0,EmpID,Name,City,Marks
0,1.0,John,0,10.0
1,3.0,Paul,0,0.0
2,5.0,Eric,0,0.0
3,6.0,Sue,Noida,11.0
4,0.0,Farmer,Sydney,23.0
5,0.0,Mukesh,0,34.0
6,5.0,Lucas,Delhi,0.0


In [50]:
# Forward fill first, then backward fill whatever is still missing (the leading
# gaps that ffill cannot reach), so no missing values remain in this frame.
# ffill()/bfill() default to axis=0 and replace the deprecated fillna(method=...).
newdf.ffill().bfill()

Unnamed: 0,EmpID,Name,City,Marks
0,1.0,John,Noida,10.0
1,3.0,Paul,Noida,10.0
2,5.0,Eric,Noida,10.0
3,6.0,Sue,Noida,11.0
4,6.0,Farmer,Sydney,23.0
5,6.0,Mukesh,Sydney,34.0
6,5.0,Lucas,Delhi,34.0


In [49]:
# The fill returns a DataFrame, which is what makes chaining .bfill() onto it possible.
# DataFrame.ffill() replaces fillna(method="ffill"), deprecated since pandas 2.1.
type(newdf.ffill())

pandas.core.frame.DataFrame