10. Handling Missing Values 

In [25]:
import pandas as pd
import numpy as np

In [26]:
import pandas as pd
import numpy as np

df = pd.DataFrame({
    "Name": ["Aman", "Riya", "Kunal", "Sneha", "Vikram"],
    "Age": [22, np.nan, 24, 28, np.nan],
    "Salary": [50000, 48000, np.nan, 70000, 45000],
    "JoinDate": ["2021-01-12", None, "2022-07-10", "2019-11-05", None]
})

df


Unnamed: 0,Name,Age,Salary,JoinDate
0,Aman,22.0,50000.0,2021-01-12
1,Riya,,48000.0,
2,Kunal,24.0,,2022-07-10
3,Sneha,28.0,70000.0,2019-11-05
4,Vikram,,45000.0,


In [27]:
# Detect Missing Values (VERY IMPORTANT)
# Check NaN
df.isnull()  
# True = missing
# False = value present


Unnamed: 0,Name,Age,Salary,JoinDate
0,False,False,False,False
1,False,True,False,True
2,False,False,True,False
3,False,False,False,False
4,False,True,False,True


In [28]:
# Count missing values
df.isnull().sum()

Name        0
Age         2
Salary      1
JoinDate    2
dtype: int64

In [29]:
# Opposite (non-missing)
df.notnull()

Unnamed: 0,Name,Age,Salary,JoinDate
0,True,True,True,True
1,True,False,True,False
2,True,True,False,True
3,True,True,True,True
4,True,False,True,False


In [30]:
# Remove Missing Values (dropna)
df.dropna() 

Unnamed: 0,Name,Age,Salary,JoinDate
0,Aman,22.0,50000.0,2021-01-12
3,Sneha,28.0,70000.0,2019-11-05


In [31]:
# Drop rows with ALL missing values
df.dropna(how="all")

Unnamed: 0,Name,Age,Salary,JoinDate
0,Aman,22.0,50000.0,2021-01-12
1,Riya,,48000.0,
2,Kunal,24.0,,2022-07-10
3,Sneha,28.0,70000.0,2019-11-05
4,Vikram,,45000.0,


In [32]:
# Drop rows based on specific column
df.dropna(subset=["Salary"])

Unnamed: 0,Name,Age,Salary,JoinDate
0,Aman,22.0,50000.0,2021-01-12
1,Riya,,48000.0,
3,Sneha,28.0,70000.0,2019-11-05
4,Vikram,,45000.0,


In [33]:
# Drop columns with missing values
df.dropna(axis=1)

Unnamed: 0,Name
0,Aman
1,Riya
2,Kunal
3,Sneha
4,Vikram


In [34]:
# Fill Missing Values (fillna) ðŸ”¥ MOST IMPORTANT
# Fill with fixed value
df["Age"].fillna(0)
df

Unnamed: 0,Name,Age,Salary,JoinDate
0,Aman,22.0,50000.0,2021-01-12
1,Riya,,48000.0,
2,Kunal,24.0,,2022-07-10
3,Sneha,28.0,70000.0,2019-11-05
4,Vikram,,45000.0,


In [35]:
# Fill with mean (BEST PRACTICE for numeric)
df["Age"].fillna(df["Age"].mean(),inplace=True)
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Age"].fillna(df["Age"].mean(),inplace=True)


Unnamed: 0,Name,Age,Salary,JoinDate
0,Aman,22.0,50000.0,2021-01-12
1,Riya,24.666667,48000.0,
2,Kunal,24.0,,2022-07-10
3,Sneha,28.0,70000.0,2019-11-05
4,Vikram,24.666667,45000.0,


In [36]:
# Fill Salary with median
df["Salary"].fillna(df["Salary"].median(),inplace=True)
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Salary"].fillna(df["Salary"].median(),inplace=True)


Unnamed: 0,Name,Age,Salary,JoinDate
0,Aman,22.0,50000.0,2021-01-12
1,Riya,24.666667,48000.0,
2,Kunal,24.0,49000.0,2022-07-10
3,Sneha,28.0,70000.0,2019-11-05
4,Vikram,24.666667,45000.0,


In [37]:
# Handle Missing Dates (NaT)
df["JoinDate"] = pd.to_datetime(df["JoinDate"],errors="coerce")

In [38]:
# Fill missing dates
df["JoinDate"].fillna(pd.Timestamp("2020-02-11"),inplace=True)
df

Unnamed: 0,Name,Age,Salary,JoinDate
0,Aman,22.0,50000.0,2021-01-12
1,Riya,24.666667,48000.0,2020-02-11
2,Kunal,24.0,49000.0,2022-07-10
3,Sneha,28.0,70000.0,2019-11-05
4,Vikram,24.666667,45000.0,2020-02-11


In [None]:
# # Replace values (replace vs fillna)
# df.replace(np.nan,0,inplace=True)
df

Unnamed: 0,Name,Age,Salary,JoinDate
0,Aman,22.0,50000.0,2021-01-12
1,Riya,24.666667,48000.0,2020-02-11
2,Kunal,24.0,49000.0,2022-07-10
3,Sneha,28.0,70000.0,2019-11-05
4,Vikram,24.666667,45000.0,2020-02-11
