# fillna: fill missing values in different ways.
# interpolate: estimate missing values using interpolation.
# dropna: drop rows with missing values.

In [3]:
import pandas as pd
# Load the sample weather data and display it. Empty CSV cells show up as
# missing values (NaN) in the resulting DataFrame.
df = pd.read_csv(filepath_or_buffer="05_handledata.csv")
df

Unnamed: 0,Day,Temp,Windspeed,Event
0,01/01/2024,3.0,9.0,Snow
1,01/02/2024,,,
2,01/03/2024,19.0,8.0,Rain
3,01/04/2024,,,
4,01/05/2024,32.0,4.0,Sunny
5,01/06/2024,,,
6,01/07/2024,40.0,2.0,Hot
7,01/08/2024,42.0,1.0,Hot
8,01/09/2024,,,
9,02/09/2024,,,


In [5]:
# Reload the data and look at the first entry of the Day column.
df = pd.read_csv(filepath_or_buffer="05_handledata.csv")
df["Day"][0]

'01/01/2024'

In [6]:
# Without date parsing, Day values are read as plain strings; to work with
# them as dates they have to be parsed when the file is loaded.
df = pd.read_csv(filepath_or_buffer="05_handledata.csv")
type(df["Day"][0])

str

In [7]:
# Ask read_csv to parse the Day column as dates, then check the type of
# the first value again.
df = pd.read_csv("05_handledata.csv", parse_dates=["Day"])
type(df["Day"][0])

pandas._libs.tslibs.timestamps.Timestamp

In [8]:
 df = pd.read_csv("05_handledata.csv",parse_dates=['Day'])
df # see now it is converted

Unnamed: 0,Day,Temp,Windspeed,Event
0,2024-01-01,3.0,9.0,Snow
1,2024-01-02,,,
2,2024-01-03,19.0,8.0,Rain
3,2024-01-04,,,
4,2024-01-05,32.0,4.0,Sunny
5,2024-01-06,,,
6,2024-01-07,40.0,2.0,Hot
7,2024-01-08,42.0,1.0,Hot
8,2024-01-09,,,
9,2024-01-10,20.0,6.0,Mild


In [5]:
# Use the parsed Day column as the row index instead of the default
# integer index, then display the result.
df = pd.read_csv("05_handledata.csv", parse_dates=["Day"]).set_index("Day")
df

Unnamed: 0_level_0,Temp,Windspeed,Event
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-01-01,3.0,9.0,Snow
2024-01-02,,,
2024-01-03,19.0,8.0,Rain
2024-01-04,,,
2024-01-05,32.0,4.0,Sunny
2024-01-06,,,
2024-01-07,40.0,2.0,Hot
2024-01-08,42.0,1.0,Hot
2024-01-09,,,
2024-02-09,,,


In [11]:
# Replace every missing value with 0. This works for the numeric columns,
# but it is not meaningful for Event, which should hold labels such as
# "Snow" or "Rain" -- so a per-column fill value is needed instead.
new_df = df.fillna(value=0)
new_df

Unnamed: 0_level_0,Temp,Windspeed,Event
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-01-01,3.0,9.0,Snow
2024-01-02,0.0,0.0,0
2024-01-03,19.0,8.0,Rain
2024-01-04,0.0,0.0,0
2024-01-05,32.0,4.0,Sunny
2024-01-06,0.0,0.0,0
2024-01-07,40.0,2.0,Hot
2024-01-08,42.0,1.0,Hot
2024-01-09,0.0,0.0,0
2024-01-10,20.0,6.0,Mild


In [7]:
# Give each column its own replacement for missing values: 0 for the
# numeric columns and a descriptive label for Event.
new_df = df.fillna(
    value={'Temp': 0, 'Windspeed': 0, 'Event': "No event"},
)
new_df

Unnamed: 0_level_0,Temp,Windspeed,Event
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-01-01,3.0,9.0,Snow
2024-01-02,0.0,0.0,No event
2024-01-03,19.0,8.0,Rain
2024-01-04,0.0,0.0,No event
2024-01-05,32.0,4.0,Sunny
2024-01-06,0.0,0.0,No event
2024-01-07,40.0,2.0,Hot
2024-01-08,42.0,1.0,Hot
2024-01-09,0.0,0.0,No event
2024-02-09,0.0,0.0,No event


In [15]:
# In the previous example we filled the missing (NA) values with 0. That removes the gaps, but it is not sound mathematically:
# if we later compute a statistic such as the median temperature, a jump from 32C down to 0C distorts the whole dataset.
# So instead we use the forward-fill method: each missing value is filled with the value from the previous row.

In [8]:
# Forward fill: each missing value takes the last valid value above it,
# e.g. the first row's Temp of 3 is carried down into the second row.
new_df = df.ffill(axis=0)
new_df

Unnamed: 0_level_0,Temp,Windspeed,Event
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-01-01,3.0,9.0,Snow
2024-01-02,3.0,9.0,Snow
2024-01-03,19.0,8.0,Rain
2024-01-04,19.0,8.0,Rain
2024-01-05,32.0,4.0,Sunny
2024-01-06,32.0,4.0,Sunny
2024-01-07,40.0,2.0,Hot
2024-01-08,42.0,1.0,Hot
2024-01-09,42.0,1.0,Hot
2024-02-09,42.0,1.0,Hot


In [20]:
# Backward fill is the mirror image of forward fill: each missing value
# takes the next valid value found in the rows below it.
new_df = df.bfill(axis=0)
new_df

Unnamed: 0_level_0,Temp,Windspeed,Event
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-01-01,3.0,9.0,Snow
2024-01-02,19.0,8.0,Rain
2024-01-03,19.0,8.0,Rain
2024-01-04,32.0,4.0,Sunny
2024-01-05,32.0,4.0,Sunny
2024-01-06,40.0,2.0,Hot
2024-01-07,40.0,2.0,Hot
2024-01-08,42.0,1.0,Hot
2024-01-09,20.0,6.0,Mild
2024-01-10,20.0,6.0,Mild


In [41]:
# Load the second sample file and forward-fill with a cap of one row.
# limit=1 lets a valid value be carried into at most one consecutive missing
# row: in the output, row 8 receives 42 from row 7, but row 9 (the second
# missing row in that run) stays empty.
# (A bare `df1` statement in the middle of the cell was removed: only a
# cell's last expression is displayed, so it had no effect.)
df1 = pd.read_csv("05_handledata1.csv")
new_df = df1.ffill(limit=1)
new_df

Unnamed: 0,Day,Temp,Windspeed,Event
0,01/01/2024,3.0,9.0,Snow
1,01/02/2024,3.0,9.0,Snow
2,01/03/2024,19.0,8.0,Rain
3,01/04/2024,19.0,8.0,Rain
4,01/05/2024,32.0,4.0,Sunny
5,01/06/2024,32.0,4.0,Sunny
6,01/07/2024,40.0,2.0,Hot
7,01/08/2024,42.0,1.0,Hot
8,01/09/2024,42.0,1.0,Hot
9,02/09/2024,,,


In [46]:
# NOTE(review): fillna() is called here without any fill value. pandas
# requires one (or, in older versions, a method), so this call is expected
# to raise instead of returning the unchanged table recorded below --
# confirm which argument was intended (e.g. a scalar or a per-column dict).
new_df=df.fillna()
new_df

Unnamed: 0,Day,Temp,Windspeed,Event
0,01/01/2024,3.0,9.0,Snow
1,01/02/2024,,,
2,01/03/2024,19.0,8.0,Rain
3,01/04/2024,,,
4,01/05/2024,32.0,4.0,Sunny
5,01/06/2024,,,
6,01/07/2024,40.0,2.0,Hot
7,01/08/2024,42.0,1.0,Hot
8,01/09/2024,,,
9,01/10/2024,20.0,6.0,Mild


In [47]:
# Fill missing values column by column: zeros for the numeric readings and
# a placeholder label for the Event column.
new_df = df.fillna(
    value={'Temp': 0, 'Windspeed': 0, 'Event': "No event"},
)
new_df

Unnamed: 0,Day,Temp,Windspeed,Event
0,01/01/2024,3.0,9.0,Snow
1,01/02/2024,0.0,0.0,No event
2,01/03/2024,19.0,8.0,Rain
3,01/04/2024,0.0,0.0,No event
4,01/05/2024,32.0,4.0,Sunny
5,01/06/2024,0.0,0.0,No event
6,01/07/2024,40.0,2.0,Hot
7,01/08/2024,42.0,1.0,Hot
8,01/09/2024,0.0,0.0,No event
9,01/10/2024,20.0,6.0,Mild


In [48]:
# Forward fill: carry the last valid observation down into missing rows.
new_df = df.ffill(axis=0)
new_df

Unnamed: 0,Day,Temp,Windspeed,Event
0,01/01/2024,3.0,9.0,Snow
1,01/02/2024,3.0,9.0,Snow
2,01/03/2024,19.0,8.0,Rain
3,01/04/2024,19.0,8.0,Rain
4,01/05/2024,32.0,4.0,Sunny
5,01/06/2024,32.0,4.0,Sunny
6,01/07/2024,40.0,2.0,Hot
7,01/08/2024,42.0,1.0,Hot
8,01/09/2024,42.0,1.0,Hot
9,01/10/2024,20.0,6.0,Mild


In [17]:
# Backward fill across columns instead of down rows: a missing cell takes
# the value from the next column to its right in the same row. Rows that
# are entirely missing have nothing to copy from, so they stay unchanged.
new_df = df.bfill(axis=1)
new_df

  new_df=df.bfill(axis="columns")


Unnamed: 0_level_0,Temp,Windspeed,Event
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-01-01,3.0,9.0,Snow
2024-01-02,,,
2024-01-03,19.0,8.0,Rain
2024-01-04,,,
2024-01-05,32.0,4.0,Sunny
2024-01-06,,,
2024-01-07,40.0,2.0,Hot
2024-01-08,42.0,1.0,Hot
2024-01-09,,,
2024-02-09,,,


In [18]:
# Backward fill with a cap: each valid value is copied upward into at most
# one consecutive missing row, so longer gaps are only partly filled.
new_df = df.bfill(axis=0, limit=1)
new_df

Unnamed: 0_level_0,Temp,Windspeed,Event
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-01-01,3.0,9.0,Snow
2024-01-02,19.0,8.0,Rain
2024-01-03,19.0,8.0,Rain
2024-01-04,32.0,4.0,Sunny
2024-01-05,32.0,4.0,Sunny
2024-01-06,40.0,2.0,Hot
2024-01-07,40.0,2.0,Hot
2024-01-08,42.0,1.0,Hot
2024-01-09,,,
2024-02-09,20.0,6.0,Mild


In [28]:
# Forward fill whatever is still missing in df.
# NOTE(review): the recorded output has in-between numeric values (11.0,
# 25.5, ...) that a plain forward fill would not produce, which suggests df
# had already been modified by a cell run out of order -- confirm.
new_df = df.ffill(axis=0)
new_df

Unnamed: 0_level_0,Temp,Windspeed,Event
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-01-01,3.0,9.0,Snow
2024-01-02,11.0,8.5,Snow
2024-01-03,19.0,8.0,Rain
2024-01-04,25.5,6.0,Rain
2024-01-05,32.0,4.0,Sunny
2024-01-06,36.0,3.0,Sunny
2024-01-07,40.0,2.0,Hot
2024-01-08,42.0,1.0,Hot
2024-01-09,34.666667,2.666667,Hot
2024-02-09,27.333333,4.333333,Hot


In [29]:
# Time-based interpolation: method='time' estimates each missing value from
# its neighbours in proportion to the actual time gap between index entries,
# so unevenly spaced dates are weighted correctly. It relies on the
# date-based index (Day) set earlier.
#
# Only the numeric columns are interpolated. Event holds text labels that
# cannot be interpolated, and passing such a column to interpolate() is
# deprecated in recent pandas versions; it is left untouched here.
new_df = df.copy()
numeric_cols = new_df.select_dtypes(include="number").columns
new_df[numeric_cols] = new_df[numeric_cols].interpolate(method="time")
new_df

  new_df =df.interpolate(method='time')


Unnamed: 0_level_0,Temp,Windspeed,Event
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-01-01,3.0,9.0,Snow
2024-01-02,11.0,8.5,
2024-01-03,19.0,8.0,Rain
2024-01-04,25.5,6.0,
2024-01-05,32.0,4.0,Sunny
2024-01-06,36.0,3.0,
2024-01-07,40.0,2.0,Hot
2024-01-08,42.0,1.0,Hot
2024-01-09,34.666667,2.666667,
2024-02-09,27.333333,4.333333,


Dropna

In [30]:
# Drop every row that contains at least one missing value (the default,
# how="any"), keeping only fully populated rows.
new_df = df.dropna(axis=0)
new_df

Unnamed: 0_level_0,Temp,Windspeed,Event
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-01-01,3.0,9.0,Snow
2024-01-03,19.0,8.0,Rain
2024-01-05,32.0,4.0,Sunny
2024-01-07,40.0,2.0,Hot
2024-01-08,42.0,1.0,Hot
2024-01-10,20.0,6.0,Mild
2024-01-11,9.0,10.0,Snow


In [31]:
# how="all" is stricter about what gets removed: a row is dropped only when
# every one of its values is missing; partially filled rows are kept.
new_df = df.dropna(axis=0, how="all")
new_df

Unnamed: 0_level_0,Temp,Windspeed,Event
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-01-01,3.0,9.0,Snow
2024-01-02,11.0,8.5,
2024-01-03,19.0,8.0,Rain
2024-01-04,25.5,6.0,
2024-01-05,32.0,4.0,Sunny
2024-01-06,36.0,3.0,
2024-01-07,40.0,2.0,Hot
2024-01-08,42.0,1.0,Hot
2024-01-09,34.666667,2.666667,
2024-02-09,27.333333,4.333333,


In [35]:
# thresh is the minimum number of non-missing values a row must have to be
# kept. With thresh=3 and three data columns, only rows where Temp,
# Windspeed and Event are all present survive.
new_df = df.dropna(axis=0, thresh=3)
new_df

Unnamed: 0_level_0,Temp,Windspeed,Event
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-01-01,3.0,9.0,Snow
2024-01-03,19.0,8.0,Rain
2024-01-05,32.0,4.0,Sunny
2024-01-07,40.0,2.0,Hot
2024-01-08,42.0,1.0,Hot
2024-01-10,20.0,6.0,Mild
2024-01-11,9.0,10.0,Snow


In [36]:
# Build a continuous daily range of dates and reindex the frame onto it.
# Dates that exist in the range but not in df appear as new rows filled
# with missing values.
# (date_range already yields a DatetimeIndex; the explicit wrap is kept so
# both names remain available to later cells.)
dt = pd.date_range(start='01/01/2024', end='01/12/2024')
idx = pd.DatetimeIndex(dt)
df = df.reindex(index=idx)
df

Unnamed: 0,Temp,Windspeed,Event
2024-01-01,3.0,9.0,Snow
2024-01-02,11.0,8.5,
2024-01-03,19.0,8.0,Rain
2024-01-04,25.5,6.0,
2024-01-05,32.0,4.0,Sunny
2024-01-06,36.0,3.0,
2024-01-07,40.0,2.0,Hot
2024-01-08,42.0,1.0,Hot
2024-01-09,34.666667,2.666667,
2024-01-10,20.0,6.0,Mild
