#### HANDLING MISSING VALUES

#### To Check the missing data

In [38]:
# Sample dataset reused by the missing-value examples below.
import pandas as pd

# "Name", "Salary" and "City" each contain one missing entry (None); "Age" is complete.
data = {
    "Name": ["Ram", None, "Ghanshyam", "Sita", "Geeta", "Mohan", "Chirag"],
    "Age": [25, 26, 25, 20, 27, 29, 23],
    "Salary": [20000, 30000, 35000, 25000, 22000, None, 40000],
    "City": ["Delhi", "Meerut", "Pune", None, "Gurugram", "Ghaziabad", "Meerut"],
}

df = pd.DataFrame(data)
print(df)

        Name  Age   Salary       City
0        Ram   25  20000.0      Delhi
1       None   26  30000.0     Meerut
2  Ghanshyam   25  35000.0       Pune
3       Sita   20  25000.0       None
4      Geeta   27  22000.0   Gurugram
5      Mohan   29      NaN  Ghaziabad
6     Chirag   23  40000.0     Meerut


In [39]:
# Find the missing entries: print a True/False mask per cell, then a count per column.

missing_mask = df.isna()  # isna() is an alias of isnull()
print(missing_mask)
print("Missing values count")
print(missing_mask.sum())

    Name    Age  Salary   City
0  False  False   False  False
1   True  False   False  False
2  False  False   False  False
3  False  False   False   True
4  False  False   False  False
5  False  False    True  False
6  False  False   False  False
Missing values count
Name      1
Age       0
Salary    1
City      1
dtype: int64


#### To handle the missing data

In [40]:
# Option 1: drop the rows that contain a missing value.

# dropna() defaults to axis=0 (rows) and inplace=False, so `df` itself is left
# untouched and the filtered copy is bound to `new`.
new = df.dropna()
print(new, "\nTHE SHAPE OF NEWLY CREATED DATASET : ", new.shape, sep="\n")

        Name  Age   Salary      City
0        Ram   25  20000.0     Delhi
2  Ghanshyam   25  35000.0      Pune
4      Geeta   27  22000.0  Gurugram
6     Chirag   23  40000.0    Meerut

THE SHAPE OF NEWLY CREATED DATASET : 
(4, 4)


In [None]:
# Option 2: keep every row and fill the gaps with a placeholder value.

# A single scalar is written into every missing cell, whatever the column;
# `df` is not modified because the result is returned as a new frame.
new2 = df.fillna(value=10)
new2

Unnamed: 0,Name,Age,Salary,City
0,Ram,25.0,20000.0,Delhi
1,10,26.0,30000.0,Meerut
2,Ghanshyam,25.0,35000.0,Pune
3,Sita,10.0,25000.0,10
4,Geeta,27.0,22000.0,Gurugram
5,Mohan,29.0,10.0,Ghaziabad
6,Chirag,23.0,40000.0,Meerut


In [None]:
# Fill each column with a value that suits it.
#
# Assign the filled column back instead of calling
# df["col"].fillna(..., inplace=True): that form mutates a temporary selection,
# raises a FutureWarning on pandas 2.x, and leaves `df` unchanged once
# Copy-on-Write is enabled.

df["Name"] = df["Name"].fillna("Shyam")                    # missing name -> placeholder name
df["Salary"] = df["Salary"].fillna(df["Salary"].mean())    # missing salary -> mean of the known salaries
df


Unnamed: 0,Name,Age,Salary,City
0,Ram,25,20000.0,Delhi
1,Shyam,26,30000.0,Meerut
2,Ghanshyam,25,35000.0,Pune
3,Sita,20,25000.0,
4,Geeta,27,22000.0,Gurugram
5,Mohan,29,28666.666667,Ghaziabad
6,Chirag,23,40000.0,Meerut


#### INTERPOLATION
Interpolation fills missing values by estimating them from the known values around them.

In [16]:
# Sample data for the interpolation examples: two gaps between two known endpoints.
import numpy as np
import pandas as pd

# Note: this rebinds `df` to a Series (it is not a DataFrame from here on).
df = pd.Series([1, np.nan, np.nan, 10])
print(df)

0     1.0
1     NaN
2     NaN
3    10.0
dtype: float64


In [22]:
# Pad / forward fill --> each gap takes the previous known value.
# interpolate(method='pad') is deprecated in pandas and emits a warning;
# ffill() is the supported call and returns a new Series without touching `df`.
new1 = df.ffill()
new1

  new1 = df.interpolate(method = 'pad',inplace = False)


0     1.0
1     1.0
2     1.0
3    10.0
dtype: float64

In [23]:
# Nearest method --> each gap takes the value of the closest known point.
# (pandas delegates this method to SciPy, so SciPy must be installed.)
new2 = df.interpolate(method="nearest")
new2

0     1.0
1     1.0
2    10.0
3    10.0
dtype: float64

In [24]:
# Linear method --> gaps are filled at equal steps between the known neighbours.
# 'linear' is interpolate()'s default method and inplace defaults to False,
# so the bare call is equivalent to spelling both out.
new3 = df.interpolate()
new3

0     1.0
1     4.0
2     7.0
3    10.0
dtype: float64

In [18]:
# Polynomial method, shown on a fresh series with two separate gaps.
s = pd.Series([0, 2, np.nan, 8, np.nan, 18, 20])

# order=2 requests a second-degree polynomial for the estimate.
quadratic_fill = s.interpolate(method="polynomial", order=2)
print(quadratic_fill)

0     0.000000
1     2.000000
2     4.457143
3     8.000000
4    13.257143
5    18.000000
6    20.000000
dtype: float64


In [19]:
# Time method --> for a datetime-indexed series; the estimate is weighted by the
# time elapsed between timestamps rather than by row position.
timestamps = pd.to_datetime(["2023-01-01", "2023-01-02", "2023-01-04"])
s = pd.Series([10, None, 40], index=timestamps)
print(s.interpolate(method="time"))

2023-01-01    10.0
2023-01-02    20.0
2023-01-04    40.0
dtype: float64
