# Data Cleaning , Part 2  ( Using interpolation)

<span style="color:blue">
    
- Interpolation is a technique in Python used to estimate unknown data points between two known data points.
    
- Interpolation is mostly used to impute missing values in the dataframe or series while preprocessing data.
    
- Interpolation is also used in Image Processing when expanding an image you can estimate the pixel value with help of neighboring pixels.  
</span>

In [84]:
import pandas as pd
import numpy as np

In [85]:
df = pd.DataFrame({"A":[1,2,3,4,5,np.NaN,7,8,9]})

In [86]:
df

Unnamed: 0,A
0,1.0
1,2.0
2,3.0
3,4.0
4,5.0
5,
6,7.0
7,8.0
8,9.0


In [87]:
df.isnull().sum()

A    1
dtype: int64

In [88]:
df.interpolate() # Linear 

Unnamed: 0,A
0,1.0
1,2.0
2,3.0
3,4.0
4,5.0
5,6.0
6,7.0
7,8.0
8,9.0


In [89]:
df = pd.read_csv('school.csv')

In [90]:
df

Unnamed: 0,Student ID,Math,English,Date
0,1,70.0,60.0,01/01/2022
1,2,9999.0,55.0,03/01/2022
2,3,45.0,,04/01/2022
3,4,75.0,50.0,05/01/2022
4,5,,75.0,08/01/2022
5,6,90.0,,10/01/2012
6,7,95.0,80.0,11/01/2022
7,8,,57.0,13/01/2022
8,9,80.0,,15/01/2022
9,10,,64.0,16/01/2022


In [92]:
df = df.replace(9999.0,np.NAN)

In [93]:
df

Unnamed: 0,Student ID,Math,English,Date
0,1,70.0,60.0,01/01/2022
1,2,,55.0,03/01/2022
2,3,45.0,,04/01/2022
3,4,75.0,50.0,05/01/2022
4,5,,75.0,08/01/2022
5,6,90.0,,10/01/2012
6,7,95.0,80.0,11/01/2022
7,8,,57.0,13/01/2022
8,9,80.0,,15/01/2022
9,10,,64.0,16/01/2022


In [94]:
df.interpolate()  # Linear 

Unnamed: 0,Student ID,Math,English,Date
0,1,70.0,60.0,01/01/2022
1,2,57.5,55.0,03/01/2022
2,3,45.0,52.5,04/01/2022
3,4,75.0,50.0,05/01/2022
4,5,82.5,75.0,08/01/2022
5,6,90.0,77.5,10/01/2012
6,7,95.0,80.0,11/01/2022
7,8,87.5,57.0,13/01/2022
8,9,80.0,60.5,15/01/2022
9,10,80.0,64.0,16/01/2022


In [95]:
df.interpolate(method='time')

ValueError: time-weighted interpolation only works on Series or DataFrames with a DatetimeIndex

In [96]:
df.dtypes

Student ID      int64
Math          float64
English       float64
Date           object
dtype: object

In [97]:
df = pd.read_csv('school.csv', parse_dates=['Date'], index_col='Date')

In [99]:
df = df.replace(9999.0,np.NAN)

In [100]:
df

Unnamed: 0_level_0,Student ID,Math,English
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,1,70.0,60.0
2022-03-01,2,,55.0
2022-04-01,3,45.0,
2022-05-01,4,75.0,50.0
2022-08-01,5,,75.0
2012-10-01,6,90.0,
2022-11-01,7,95.0,80.0
2022-01-13,8,,57.0
2022-01-15,9,80.0,
2022-01-16,10,,64.0


In [101]:
df.interpolate(method='time')

Unnamed: 0_level_0,Student ID,Math,English
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,1,70.0,60.0
2022-03-01,2,59.276316,55.0
2022-04-01,3,45.0,52.459016
2022-05-01,4,75.0,50.0
2022-08-01,5,85.0,75.0
2012-10-01,6,90.0,60.0
2022-11-01,7,95.0,80.0
2022-01-13,8,78.571429,57.0
2022-01-15,9,80.0,61.666667
2022-01-16,10,79.539474,64.0


In [102]:
df.interpolate(method='nearest')

Unnamed: 0_level_0,Student ID,Math,English
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,1,70.0,60.0
2022-03-01,2,45.0,55.0
2022-04-01,3,45.0,50.0
2022-05-01,4,75.0,50.0
2022-08-01,5,75.0,75.0
2012-10-01,6,90.0,
2022-11-01,7,95.0,80.0
2022-01-13,8,80.0,57.0
2022-01-15,9,80.0,64.0
2022-01-16,10,80.0,64.0
