In [1]:
import numpy as np
import pandas as pd

## finding missing data

In [8]:
# Sample employee records; np.nan marks a missing (null) value in a column.
data = dict(
    employee_id=[101, 102, 103, 104, 105, 106, 107],
    name=["Alice", "Bob", np.nan, "David", "Emma", "Frank", "Grace"],
    age=[25, np.nan, 29, 28, np.nan, 35, 30],
    gender=["F", "M", "F", np.nan, "F", "M", np.nan],
    department=["HR", np.nan, "IT", "Finance", "IT", np.nan, "HR"],
    salary=[75000, 68000, np.nan, 90000, 81000, 77000, np.nan],
    join_date=[
        "2021-01-15", "2020-03-22", "2022-07-19", np.nan,
        "2021-06-10", "2022-01-01", np.nan,
    ],
    city=[
        "New York", np.nan, "Chicago", "Houston",
        "San Francisco", "Seattle", np.nan,
    ],
)

# One DataFrame column per key, in the order the keys are listed above.
alex = pd.DataFrame(data)

In [9]:
# Show the full frame (7 rows) so the missing entries are visible
alex

Unnamed: 0,employee_id,name,age,gender,department,salary,join_date,city
0,101,Alice,25.0,F,HR,75000.0,2021-01-15,New York
1,102,Bob,,M,,68000.0,2020-03-22,
2,103,,29.0,F,IT,,2022-07-19,Chicago
3,104,David,28.0,,Finance,90000.0,,Houston
4,105,Emma,,F,IT,81000.0,2021-06-10,San Francisco
5,106,Frank,35.0,M,,77000.0,2022-01-01,Seattle
6,107,Grace,30.0,,HR,,,


In [12]:
# Boolean mask with the same shape as alex: True wherever a value is missing.
# alex.isna() is an alias and gives the same result.
alex.isnull()

Unnamed: 0,employee_id,name,age,gender,department,salary,join_date,city
0,False,False,False,False,False,False,False,False
1,False,False,True,False,True,False,False,True
2,False,True,False,False,False,True,False,False
3,False,False,False,True,False,False,True,False
4,False,False,True,False,False,False,False,False
5,False,False,False,False,True,False,False,False
6,False,False,False,True,False,True,True,True


In [15]:
# Column-wise count of nulls: each True in the mask counts as 1,
# so the sum is the number of missing values in that column.
alex.isnull().sum()

employee_id    0
name           1
age            2
gender         2
department     2
salary         2
join_date      2
city           2
dtype: int64

In [17]:
# .any() reduces column-wise by default: True means the column contains at least one null.
# To ask the same question per row, use alex.isnull().any(axis=1).
alex.isnull().any()

employee_id    False
name            True
age             True
gender          True
department      True
salary          True
join_date       True
city            True
dtype: bool

## removing null data

In [18]:
# Re-display the frame before dropping rows; the inspection cells above did not modify it
alex

Unnamed: 0,employee_id,name,age,gender,department,salary,join_date,city
0,101,Alice,25.0,F,HR,75000.0,2021-01-15,New York
1,102,Bob,,M,,68000.0,2020-03-22,
2,103,,29.0,F,IT,,2022-07-19,Chicago
3,104,David,28.0,,Finance,90000.0,,Houston
4,105,Emma,,F,IT,81000.0,2021-06-10,San Francisco
5,106,Frank,35.0,M,,77000.0,2022-01-01,Seattle
6,107,Grace,30.0,,HR,,,


In [19]:
# By default, dropna() removes null data row-wise (axis=0)

In [22]:
# Drops every row that contains at least one null value.
# A new frame is returned; alex itself is left unchanged.
alex.dropna()

Unnamed: 0,employee_id,name,age,gender,department,salary,join_date,city
0,101,Alice,25.0,F,HR,75000.0,2021-01-15,New York


In [30]:
# thresh=7 keeps only the rows that have at least 7 non-null values;
# rows with fewer than 7 non-null values are removed.
alex.dropna(thresh=7)

Unnamed: 0,employee_id,name,age,gender,department,salary,join_date,city
0,101,Alice,25.0,F,HR,75000.0,2021-01-15,New York
4,105,Emma,,F,IT,81000.0,2021-06-10,San Francisco
5,106,Frank,35.0,M,,77000.0,2022-01-01,Seattle


## filling the missing value

In [47]:
# Replaces every null with 0 in a new frame; alex itself is not modified.
# To keep the result, assign it (e.g. filled = alex.fillna(0)) rather than relying on
# inplace=True, which only some pandas methods accept.
# Note: text columns (name, gender, department, join_date, city) are filled with 0 as well.
alex.fillna(0)

Unnamed: 0,employee_id,name,age,gender,department,salary,join_date,city
0,101,Alice,25.0,F,HR,75000.0,2021-01-15,New York
1,102,Bob,0.0,M,0,68000.0,2020-03-22,0
2,103,0,29.0,F,IT,0.0,2022-07-19,Chicago
3,104,David,28.0,0,Finance,90000.0,0,Houston
4,105,Emma,0.0,F,IT,81000.0,2021-06-10,San Francisco
5,106,Frank,35.0,M,0,77000.0,2022-01-01,Seattle
6,107,Grace,30.0,0,HR,0.0,0,0


In [49]:
# Per-column replacement values: a null is filled with the entry listed for its own column.
miss_data = {
    'employee_id': 111,
    'name': 'Ali',
    'age': 24,
    'gender': 'M',
    'department': 'Finance',
    'salary': 43543,
    'join_date': '2022-02-23',
    'city': 'lucknow',
}

# Returns a new frame; alex is left untouched.
alex.fillna(miss_data)

Unnamed: 0,employee_id,name,age,gender,department,salary,join_date,city
0,101,Alice,25.0,F,HR,75000.0,2021-01-15,New York
1,102,Bob,24.0,M,Finance,68000.0,2020-03-22,lucknow
2,103,Ali,29.0,F,IT,43543.0,2022-07-19,Chicago
3,104,David,28.0,M,Finance,90000.0,2022-02-23,Houston
4,105,Emma,24.0,F,IT,81000.0,2021-06-10,San Francisco
5,106,Frank,35.0,M,Finance,77000.0,2022-01-01,Seattle
6,107,Grace,30.0,M,HR,43543.0,2022-02-23,lucknow


In [None]:
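# alex.fillna(alex.mean(numeric_only=True)) fills the nulls in each numeric column with that column's
# own mean (not the mean of the whole table). Plain alex.mean() is only safe when every column is numeric;
# with text columns such as name or city it can fail (pandas 2.x raises TypeError), so pass
# numeric_only=True to skip them. The text columns are then left unfilled.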