## <font color="maroon"><h4 align="center">Handling Missing Data - fillna, interpolate, dropna</font>

In [1115]:
import pandas as pd
df = pd.read_csv("weather_data.csv",parse_dates=['day'])
type(df.day[0])

# type(df.day[0]) changes 01-01-2017 to 2017-01-01	

df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32,6,Rain
1,2017-01-04,32,9,Sunny
2,2017-01-05,28,9,Snow
3,2017-01-06,32,7,Snow
4,2017-01-07,32,9,Rain
5,2017-01-08,32,9,Sunny
6,2017-03-09,32,9,Cloudy
7,2017-01-10,34,8,Rain
8,2017-01-11,40,7,Rain


In [1116]:
df.set_index('day',inplace=True)
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32,6,Rain
2017-01-04,32,9,Sunny
2017-01-05,28,9,Snow
2017-01-06,32,7,Snow
2017-01-07,32,9,Rain
2017-01-08,32,9,Sunny
2017-03-09,32,9,Cloudy
2017-01-10,34,8,Rain
2017-01-11,40,7,Rain


## <font color="blue">fillna</font>

<font color="purple">**Fill all NaN with one specific value**</font>

In [1117]:
new_df = df.fillna(0)
new_df
# Fills Empty Value With Zero

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32,6,Rain
2017-01-04,32,9,Sunny
2017-01-05,28,9,Snow
2017-01-06,32,7,Snow
2017-01-07,32,9,Rain
2017-01-08,32,9,Sunny
2017-03-09,32,9,Cloudy
2017-01-10,34,8,Rain
2017-01-11,40,7,Rain


<font color="purple">**Fill na using column names and dict**</font>

In [1118]:
new_df = df.fillna({
        'temperature': 0,
        'windspeed': 0,
        'event': 'No Event'
    })
new_df

# Fills empty Values With specific values using Dictionary
# For eg.Empty Event will be replaced with 'No Event' string 

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32,6,Rain
2017-01-04,32,9,Sunny
2017-01-05,28,9,Snow
2017-01-06,32,7,Snow
2017-01-07,32,9,Rain
2017-01-08,32,9,Sunny
2017-03-09,32,9,Cloudy
2017-01-10,34,8,Rain
2017-01-11,40,7,Rain


<font color="purple">**Use method to determine how to fill na values**</font>

In [1119]:
new_df = df.fillna(method="ffill")
new_df
# Fills Empty Value With  Previous Value


  new_df = df.fillna(method="ffill")


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32,6,Rain
2017-01-04,32,9,Sunny
2017-01-05,28,9,Snow
2017-01-06,32,7,Snow
2017-01-07,32,9,Rain
2017-01-08,32,9,Sunny
2017-03-09,32,9,Cloudy
2017-01-10,34,8,Rain
2017-01-11,40,7,Rain


In [1120]:
new_df = df.fillna(method="bfill")
new_df
# Fills  missing values with the value from the next row also  
# for the last row it show NaN as there is no  next row to fill from. 


  new_df = df.fillna(method="bfill")


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32,6,Rain
2017-01-04,32,9,Sunny
2017-01-05,28,9,Snow
2017-01-06,32,7,Snow
2017-01-07,32,9,Rain
2017-01-08,32,9,Sunny
2017-03-09,32,9,Cloudy
2017-01-10,34,8,Rain
2017-01-11,40,7,Rain


<font color="purple">**Use of axis**</font>

In [1121]:
new_df = df.fillna(method="bfill", axis="index") # axis is either "index" or "columns"
new_df
# For index
# Fills  missing values with the value from the next row also  
# for the last row it show NaN as there is no  next row to fill from.
# ---------------------------------------------------------------------------------------
# For columns
# Fills  missing values with the value from the next columns also  
# for the last columns it show NaN as there is no  next columns to fill from. 

  new_df = df.fillna(method="bfill", axis="index") # axis is either "index" or "columns"


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32,6,Rain
2017-01-04,32,9,Sunny
2017-01-05,28,9,Snow
2017-01-06,32,7,Snow
2017-01-07,32,9,Rain
2017-01-08,32,9,Sunny
2017-03-09,32,9,Cloudy
2017-01-10,34,8,Rain
2017-01-11,40,7,Rain


<font color="purple">**limit parameter**</font>

In [1122]:
new_df = df.fillna(method="ffill",limit=1)
new_df
# Replaces NaN value with previous  value in the same column


  new_df = df.fillna(method="ffill",limit=1)


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32,6,Rain
2017-01-04,32,9,Sunny
2017-01-05,28,9,Snow
2017-01-06,32,7,Snow
2017-01-07,32,9,Rain
2017-01-08,32,9,Sunny
2017-03-09,32,9,Cloudy
2017-01-10,34,8,Rain
2017-01-11,40,7,Rain


### <font color="blue">interpolate</font>

In [1123]:
new_df = df.interpolate()
new_df
# It takes the value ahead of the NaN value value behind of the NaN values  and averages them to fill the NaN value. 


  new_df = df.interpolate()


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32,6,Rain
2017-01-04,32,9,Sunny
2017-01-05,28,9,Snow
2017-01-06,32,7,Snow
2017-01-07,32,9,Rain
2017-01-08,32,9,Sunny
2017-03-09,32,9,Cloudy
2017-01-10,34,8,Rain
2017-01-11,40,7,Rain


In [1124]:
new_df = df.interpolate(method="time") 
new_df      
# Looks at Time: It checks the time stamps of your data.
# Fills in Gaps: If there are missing values, it estimates what those values should 
# be based on the values before and after them, considering how much time has passed.
# Smooths Data: This helps create a smooth line of data over time, making it more complete and useful.
# So, it’s a way to guess missing data points while respecting the order of time!

  new_df = df.interpolate(method="time")


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32,6,Rain
2017-01-04,32,9,Sunny
2017-01-05,28,9,Snow
2017-01-06,32,7,Snow
2017-01-07,32,9,Rain
2017-01-08,32,9,Sunny
2017-03-09,32,9,Cloudy
2017-01-10,34,8,Rain
2017-01-11,40,7,Rain


**Notice that in above temperature on 2017-01-04 is 29 instead of 30 (in plain linear interpolate)**

**There are many other methods for interpolation such as quadratic, piecewise_polynomial, cubic etc. 
Just google "dataframe interpolate" to see complete documentation**

### <font color="blue">dropna</font>

In [1125]:
new_df = df.dropna()
new_df
# In simple words, 
# df.dropna() removes any rows from the DataFrame (df) that have missing values (like blanks or NaNs).
# So, if a row has even one empty spot,
#  that whole row gets taken out. The result is a cleaner table with only the complete rows left.

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32,6,Rain
2017-01-04,32,9,Sunny
2017-01-05,28,9,Snow
2017-01-06,32,7,Snow
2017-01-07,32,9,Rain
2017-01-08,32,9,Sunny
2017-03-09,32,9,Cloudy
2017-01-10,34,8,Rain
2017-01-11,40,7,Rain


In [1126]:
new_df = df.dropna(how='all')
new_df

# If a row has at least one value that isn’t missing, it will stay in the new DataFrame (new_df).
# If more than one value is missing  in a row, it will be dropped. If all values are missing, it will be dropped.

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32,6,Rain
2017-01-04,32,9,Sunny
2017-01-05,28,9,Snow
2017-01-06,32,7,Snow
2017-01-07,32,9,Rain
2017-01-08,32,9,Sunny
2017-03-09,32,9,Cloudy
2017-01-10,34,8,Rain
2017-01-11,40,7,Rain


In [1127]:
new_df = df.dropna(thresh=1)
new_df  
# If a row has 1 values, it will be dropped as thresh=1
# If a row has 2 values, it will be dropped as thresh=2 
# If we keep thresh=4 , then rows with 4 values will be dropped


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32,6,Rain
2017-01-04,32,9,Sunny
2017-01-05,28,9,Snow
2017-01-06,32,7,Snow
2017-01-07,32,9,Rain
2017-01-08,32,9,Sunny
2017-03-09,32,9,Cloudy
2017-01-10,34,8,Rain
2017-01-11,40,7,Rain


### <font color="blue">Inserting Missing Dates</font>

In [1128]:
dt = pd.date_range("01-01-2017","01-11-2017")
idx = pd.DatetimeIndex(dt)
df.reindex(idx)

Unnamed: 0,temperature,windspeed,event
2017-01-01,32.0,6.0,Rain
2017-01-02,,,
2017-01-03,,,
2017-01-04,32.0,9.0,Sunny
2017-01-05,28.0,9.0,Snow
2017-01-06,32.0,7.0,Snow
2017-01-07,32.0,9.0,Rain
2017-01-08,32.0,9.0,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Rain
