In [44]:
import numpy as np
import pandas as pd
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, so deprecation notices raised by later cells are hidden too —
# consider narrowing the filter or removing it.
warnings.filterwarnings("ignore")

### Handling the missing values:
* isnull()
* isna()
* notnull()
* dropna()
* fillna()
* notna()

In [4]:
# Toy customer records used throughout the notebook. Both np.nan and None
# appear as missing-value markers in the EmpID and City lists.
customer = dict(
    EmpID=[1, 3, 5, 6, np.nan, None, 5],
    Name=["John", "Paul", "Eric", "Amit", "Mukesh", "Lucas", "Farmer"],
    City=[np.nan, None, np.nan, "Noida", "Sydney", np.nan, "Delhi"],
    Marks=[10, np.nan, np.nan, 11, 23, 34, np.nan],
)

In [5]:
# Build the frame from the dict of lists; each key becomes a column label.
# EmpID and Marks end up as float64 because they contain missing values.
df = pd.DataFrame(data=customer)

In [6]:
df

Unnamed: 0,EmpID,Name,City,Marks
0,1.0,John,,10.0
1,3.0,Paul,,
2,5.0,Eric,,
3,6.0,Amit,Noida,11.0
4,,Mukesh,Sydney,23.0
5,,Lucas,,34.0
6,5.0,Farmer,Delhi,


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   EmpID   5 non-null      float64
 1   Name    7 non-null      object 
 2   City    3 non-null      object 
 3   Marks   4 non-null      float64
dtypes: float64(2), object(2)
memory usage: 356.0+ bytes


In [8]:
df

Unnamed: 0,EmpID,Name,City,Marks
0,1.0,John,,10.0
1,3.0,Paul,,
2,5.0,Eric,,
3,6.0,Amit,Noida,11.0
4,,Mukesh,Sydney,23.0
5,,Lucas,,34.0
6,5.0,Farmer,Delhi,


In [9]:
df.isnull()

Unnamed: 0,EmpID,Name,City,Marks
0,False,False,True,False
1,False,False,True,True
2,False,False,True,True
3,False,False,False,False
4,True,False,False,False
5,True,False,True,False
6,False,False,False,True


In [10]:
df.isnull().sum()

EmpID    2
Name     0
City     4
Marks    3
dtype: int64

In [11]:
# isna() is an alias of isnull(): the per-column missing counts are the same
# as the ones produced by df.isnull().sum().
df.isna().sum()

EmpID    2
Name     0
City     4
Marks    3
dtype: int64

In [13]:
df.notna().sum()

EmpID    5
Name     7
City     3
Marks    4
dtype: int64

In [14]:
# notnull() is an alias of notna(): the per-column non-missing counts are the
# same as the ones produced by df.notna().sum().
df.notnull().sum()

EmpID    5
Name     7
City     3
Marks    4
dtype: int64

In [15]:
df

Unnamed: 0,EmpID,Name,City,Marks
0,1.0,John,,10.0
1,3.0,Paul,,
2,5.0,Eric,,
3,6.0,Amit,Noida,11.0
4,,Mukesh,Sydney,23.0
5,,Lucas,,34.0
6,5.0,Farmer,Delhi,


In [18]:
# Percentage of missing values per column: the mean of the boolean mask is the
# fraction of missing entries, scaled here to 0-100.
result = df.isna().mean().mul(100)

In [20]:
result

EmpID    28.571429
Name      0.000000
City     57.142857
Marks    42.857143
dtype: float64

In [19]:
# Render each percentage as a string with two decimals and a trailing "%".
result.map("{:.2f}%".format)

EmpID    28.57%
Name      0.00%
City     57.14%
Marks    42.86%
dtype: object

In [21]:
df

Unnamed: 0,EmpID,Name,City,Marks
0,1.0,John,,10.0
1,3.0,Paul,,
2,5.0,Eric,,
3,6.0,Amit,Noida,11.0
4,,Mukesh,Sydney,23.0
5,,Lucas,,34.0
6,5.0,Farmer,Delhi,


# dropna 

By default, `dropna()` drops every row that contains at least one missing value.

In [23]:
df.dropna()

Unnamed: 0,EmpID,Name,City,Marks
3,6.0,Amit,Noida,11.0


In [24]:
# "index" is the documented label for axis 0, so rows with any missing value
# are dropped, exactly as with the default call.
df.dropna(axis="index")

Unnamed: 0,EmpID,Name,City,Marks
3,6.0,Amit,Noida,11.0


In [26]:
df.dropna(axis = 0)

Unnamed: 0,EmpID,Name,City,Marks
3,6.0,Amit,Noida,11.0


In [27]:
# Drop every column that contains at least one missing value

In [28]:
df.dropna(axis = 1)

Unnamed: 0,Name
0,John
1,Paul
2,Eric
3,Amit
4,Mukesh
5,Lucas
6,Farmer


In [29]:
df['City'].isnull().sum()

4

### Thresh Parameter

With `thresh=3`, only the rows that contain at least 3 non-missing values are kept.

In [31]:
df

Unnamed: 0,EmpID,Name,City,Marks
0,1.0,John,,10.0
1,3.0,Paul,,
2,5.0,Eric,,
3,6.0,Amit,Noida,11.0
4,,Mukesh,Sydney,23.0
5,,Lucas,,34.0
6,5.0,Farmer,Delhi,


In [30]:
df.dropna(thresh=3)

Unnamed: 0,EmpID,Name,City,Marks
0,1.0,John,,10.0
3,6.0,Amit,Noida,11.0
4,,Mukesh,Sydney,23.0
6,5.0,Farmer,Delhi,


In [32]:
df.dropna(thresh=4)

Unnamed: 0,EmpID,Name,City,Marks
3,6.0,Amit,Noida,11.0


In [33]:
df.dropna(thresh=5, axis = 1)

Unnamed: 0,EmpID,Name
0,1.0,John
1,3.0,Paul
2,5.0,Eric
3,6.0,Amit
4,,Mukesh
5,,Lucas
6,5.0,Farmer


In [34]:
df.dropna(thresh=4, axis = 1)

Unnamed: 0,EmpID,Name,Marks
0,1.0,John,10.0
1,3.0,Paul,
2,5.0,Eric,
3,6.0,Amit,11.0
4,,Mukesh,23.0
5,,Lucas,34.0
6,5.0,Farmer,


### Fillna 

Using this function, we can replace missing values with a value of our choice.

In [35]:
df

Unnamed: 0,EmpID,Name,City,Marks
0,1.0,John,,10.0
1,3.0,Paul,,
2,5.0,Eric,,
3,6.0,Amit,Noida,11.0
4,,Mukesh,Sydney,23.0
5,,Lucas,,34.0
6,5.0,Farmer,Delhi,


In [36]:
df["City"].fillna("Modi")

0      Modi
1      Modi
2      Modi
3     Noida
4    Sydney
5      Modi
6     Delhi
Name: City, dtype: object

In [37]:
df.fillna("Modi")

Unnamed: 0,EmpID,Name,City,Marks
0,1.0,John,Modi,10.0
1,3.0,Paul,Modi,Modi
2,5.0,Eric,Modi,Modi
3,6.0,Amit,Noida,11.0
4,Modi,Mukesh,Sydney,23.0
5,Modi,Lucas,Modi,34.0
6,5.0,Farmer,Delhi,Modi


In [38]:
df.fillna("Rahul")

Unnamed: 0,EmpID,Name,City,Marks
0,1.0,John,Rahul,10.0
1,3.0,Paul,Rahul,Rahul
2,5.0,Eric,Rahul,Rahul
3,6.0,Amit,Noida,11.0
4,Rahul,Mukesh,Sydney,23.0
5,Rahul,Lucas,Rahul,34.0
6,5.0,Farmer,Delhi,Rahul


In [39]:
df

Unnamed: 0,EmpID,Name,City,Marks
0,1.0,John,,10.0
1,3.0,Paul,,
2,5.0,Eric,,
3,6.0,Amit,Noida,11.0
4,,Mukesh,Sydney,23.0
5,,Lucas,,34.0
6,5.0,Farmer,Delhi,


In [40]:
df['Marks'].mean()

19.5

In [41]:
round(df['Marks'].mean(),0)

20.0

In [45]:
# Impute missing marks with the column mean rounded to a whole number.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the df['Marks'] selection: an in-place call on
# a selection is chained assignment, which is not guaranteed to update df
# (and never does under pandas Copy-on-Write).
df['Marks'] = df['Marks'].fillna(round(df['Marks'].mean(), 0))

In [46]:
df

Unnamed: 0,EmpID,Name,City,Marks
0,1.0,John,,10.0
1,3.0,Paul,,20.0
2,5.0,Eric,,20.0
3,6.0,Amit,Noida,11.0
4,,Mukesh,Sydney,23.0
5,,Lucas,,34.0
6,5.0,Farmer,Delhi,20.0


In [47]:
df['City'].mode()

0     Delhi
1     Noida
2    Sydney
Name: City, dtype: object

In [50]:
# Back-fill: each missing city takes the next valid value further down.
# Series.bfill() replaces the deprecated fillna(method='bfill'), and assigning
# the result back avoids an in-place call on a column selection, which is not
# guaranteed to update df.
df["City"] = df["City"].bfill()

In [49]:
# Forward-fill preview (not assigned back): each missing city takes the
# previous valid value, so gaps at the very top have nothing to copy from.
# Series.ffill() replaces the deprecated fillna(method='ffill').
# NOTE: the recorded output was produced before the back-fill cell was run
# (execution count 49 vs 50), which is why it still shows leading NaN values.
df["City"].ffill()

0       NaN
1       NaN
2       NaN
3     Noida
4    Sydney
5    Sydney
6     Delhi
Name: City, dtype: object

In [51]:
df

Unnamed: 0,EmpID,Name,City,Marks
0,1.0,John,Noida,10.0
1,3.0,Paul,Noida,20.0
2,5.0,Eric,Noida,20.0
3,6.0,Amit,Noida,11.0
4,,Mukesh,Sydney,23.0
5,,Lucas,Delhi,34.0
6,5.0,Farmer,Delhi,20.0


In [53]:
# Drop, in place, every row that still has a missing value in any column
# (at this point only EmpID has gaps, in rows 4 and 5).
df.dropna(axis=0, how="any", inplace=True)

In [54]:
df

Unnamed: 0,EmpID,Name,City,Marks
0,1.0,John,Noida,10.0
1,3.0,Paul,Noida,20.0
2,5.0,Eric,Noida,20.0
3,6.0,Amit,Noida,11.0
6,5.0,Farmer,Delhi,20.0
