In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

### np.nan ==> for any data

In [3]:
# creating missing value using np.nan

d1={'names':['Ram','Sur',np.nan,'Mah'],
   'age':[31,32,33,np.nan],
   'city':[np.nan,'Hyd','Mum','Chen']}

pd.DataFrame(d1)

Unnamed: 0,names,age,city
0,Ram,31.0,
1,Sur,32.0,Hyd
2,,33.0,Mum
3,Mah,,Chen


In [4]:
# these NaN are missing values

data1=pd.DataFrame(d1)
data1.isnull()

Unnamed: 0,names,age,city
0,False,False,True
1,False,False,False
2,True,False,False
3,False,True,False


In [4]:
data1.isnull().sum()
# every column has a missing value

names    1
age      1
city     1
dtype: int64

In [5]:
# to get the percentage

data1.isnull().sum()/len(data1)

# 0.25 -> 25%

names    0.25
age      0.25
city     0.25
dtype: float64

### None ==> only for Categorical data

In [6]:
# creating missing values using None

d2={'names':['Ram','Sur',None,'Mah'],
   'age':[31,32,33,None],
   'city':[None,'Hyd','Mum','Chen']}

data2=pd.DataFrame(d2)
data2

Unnamed: 0,names,age,city
0,Ram,31.0,
1,Sur,32.0,Hyd
2,,33.0,Mum
3,Mah,,Chen


In [7]:
data2.isnull()        # even when we provided None for numerical data it got converted into NaN

Unnamed: 0,names,age,city
0,False,False,True
1,False,False,False
2,True,False,False
3,False,True,False


## Methods used to fill the missing values:

#### Method - 1: fillna

In [5]:
# fill all values with random value

data1.fillna(40)

Unnamed: 0,names,age,city
0,Ram,31.0,40
1,Sur,32.0,Hyd
2,40,33.0,Mum
3,Mah,40.0,Chen


In [9]:
data1

Unnamed: 0,names,age,city
0,Ram,31.0,
1,Sur,32.0,Hyd
2,,33.0,Mum
3,Mah,,Chen


In [10]:
data1.fillna(40,inplace=True)
data1

Unnamed: 0,names,age,city
0,Ram,31.0,40
1,Sur,32.0,Hyd
2,40,33.0,Mum
3,Mah,40.0,Chen


#### Method -2:

In [11]:
# fill all values in a specific column with random value

data1['names'].fillna('Sat')

0    Ram
1    Sur
2     40
3    Mah
Name: names, dtype: object

In [12]:
data1['names'].fillna('Sat',inplace=True)
data1

Unnamed: 0,names,age,city
0,Ram,31.0,40
1,Sur,32.0,Hyd
2,40,33.0,Mum
3,Mah,40.0,Chen


#### Method -3: bfill, ffill, pad, backfill

In [13]:
d1={'names':['Ram','Sur',np.nan,'Mah'],
   'age':[31,32,33,np.nan],
   'city':[np.nan,'Hyd','Mum','Chen']}

data1=pd.DataFrame(d1)
data1

Unnamed: 0,names,age,city
0,Ram,31.0,
1,Sur,32.0,Hyd
2,,33.0,Mum
3,Mah,,Chen


In [15]:
# bfill and backfill are same

data1.fillna(method='backfill',inplace=True)
data1

# in names: missing value is at index 2
#           so the value back of the missing value is copied

# in age:   missing value is at index 3
#           there is no value at its back, so NaN is remained

# in city:  missing value is at index 1
#           so the value back of the missing value is copied

Unnamed: 0,names,age,city
0,Ram,31.0,Hyd
1,Sur,32.0,Hyd
2,Mah,33.0,Mum
3,Mah,,Chen


In [17]:
d1={'names':['Ram','Sur',np.nan,'Mah'],
   'age':[31,32,33,np.nan],
   'city':[np.nan,'Hyd','Mum','Chen']}

data1=pd.DataFrame(d1)
data1

Unnamed: 0,names,age,city
0,Ram,31.0,
1,Sur,32.0,Hyd
2,,33.0,Mum
3,Mah,,Chen


In [18]:
# ffill and pad are same

data1.fillna(method='ffill',inplace=True)
data1

Unnamed: 0,names,age,city
0,Ram,31.0,
1,Sur,32.0,Hyd
2,Sur,33.0,Mum
3,Mah,33.0,Chen


#### Method -4: mean, median, mode

In [19]:
d1={'names':['Ram','Sur',np.nan,'Mah'],
   'age':[31,32,33,np.nan],
   'city':[np.nan,'Hyd','Mum','Chen']}

data1=pd.DataFrame(d1)
data1

Unnamed: 0,names,age,city
0,Ram,31.0,
1,Sur,32.0,Hyd
2,,33.0,Mum
3,Mah,,Chen


In [20]:
age_mean=data1['age'].mean()
age_mean

32.0

In [24]:
data1['age'].fillna(age_mean,inplace=True)     # instead of providing random data provide mean value
data1

Unnamed: 0,names,age,city
0,Ram,31.0,
1,Sur,32.0,Hyd
2,,33.0,Mum
3,Mah,32.0,Chen


In [26]:
d1={'names':['Ram','Sur',np.nan,'Mah'],
   'age':[31,32,33,np.nan],
   'city':[np.nan,'Hyd','Mum','Chen']}

data1=pd.DataFrame(d1)
data1

Unnamed: 0,names,age,city
0,Ram,31.0,
1,Sur,32.0,Hyd
2,,33.0,Mum
3,Mah,,Chen


In [27]:
age_median=data1['age'].median()
age_median

32.0

In [28]:
data1['age'].fillna(age_median,inplace=True)
data1

Unnamed: 0,names,age,city
0,Ram,31.0,
1,Sur,32.0,Hyd
2,,33.0,Mum
3,Mah,32.0,Chen


In [33]:
# data1['age'].mode()

#### Method -5: KNN imputer

In [38]:
# instead of taking the mean of all the values Knn will only take the mean of nearest neighbours
# only for numerical data(because mean)

In [6]:
d1={'names':['Ram','Sur',np.nan,'Mah'],
   'age':[31,32,33,np.nan],
   'city':[np.nan,'Hyd','Mum','Chen']}

data1=pd.DataFrame(d1)
data1

Unnamed: 0,names,age,city
0,Ram,31.0,
1,Sur,32.0,Hyd
2,,33.0,Mum
3,Mah,,Chen


In [8]:
from sklearn.impute import KNNImputer
knni=KNNImputer(n_neighbors=2)               # euclidean distance is used to calculate
knni.fit_transform(data1[['age']])           # here the value NaN has 33.0 and 32.0 as 2 nearest neighbours,So the NaN is replaced with mean of 32 and 33 

array([[31.],
       [32.],
       [33.],
       [32.]])

In [53]:
data1['age']=knni.fit_transform(data1[['age']])
data1

Unnamed: 0,names,age,city
0,Ram,31.0,
1,Sur,32.0,Hyd
2,,33.0,Mum
3,Mah,32.0,Chen


#### Method -6: Based on other columns

In [54]:
# sometimes all above methods will not provide good justification
# at that time we need to check other column's dependency on the missing value columns
# most of the time we will pick a column which has highest correlation