## Managing missing data

In [5]:
import pandas as pd
import numpy as np

## Small sample frame used by every cell below.
## Each column except 'ID' contains at least one missing entry (np.nan).
data = pd.DataFrame(
    dict(
        Name=['Alice', 'Bob', 'Charlie', 'David', np.nan],
        Age=[25, np.nan, 30, 35, 40],
        City=['New York', 'Los Angeles', np.nan, 'Chicago', 'Houston'],
        Score=[85, 90, np.nan, 95, np.nan],
        ID=[111, 112, 113, 114, 115],
    )
)
data

Unnamed: 0,Name,Age,City,Score,ID
0,Alice,25.0,New York,85.0,111
1,Bob,,Los Angeles,90.0,112
2,Charlie,30.0,,,113
3,David,35.0,Chicago,95.0,114
4,,40.0,Houston,,115


## Identifying null values

In [6]:
data.isna()           ## Boolean mask: True where a value is missing (isna() is an alias of isnull())

Unnamed: 0,Name,Age,City,Score,ID
0,False,False,False,False,False
1,False,True,False,False,False
2,False,False,True,True,False
3,False,False,False,False,False
4,True,False,False,True,False


In [7]:
data.notna()          ## True where a value is present; element-wise opposite of isnull() (notna() is an alias of notnull())

Unnamed: 0,Name,Age,City,Score,ID
0,True,True,True,True,True
1,True,False,True,True,True
2,True,True,False,False,True
3,True,True,True,True,True
4,False,True,True,False,True


## Removing null values from a DataFrame

In [8]:
## Drops every row that has at least one null value.
## axis=0 and how='any' are the defaults, written out here for clarity; `data` itself is not modified.
data.dropna(axis=0, how='any')

Unnamed: 0,Name,Age,City,Score,ID
0,Alice,25.0,New York,85.0,111
3,David,35.0,Chicago,95.0,114


In [9]:
## Drops every column that has at least one null value; here only 'ID' survives.
data.dropna(axis='columns')

Unnamed: 0,ID
0,111
1,112
2,113
3,114
4,115


## Replacing missing data

In [10]:
## Replaces every null with the same placeholder.
## Note that the string is also written into the numeric Age and Score columns.
data.fillna(value='Unknown')

Unnamed: 0,Name,Age,City,Score,ID
0,Alice,25.0,New York,85.0,111
1,Bob,Unknown,Los Angeles,90.0,112
2,Charlie,30.0,Unknown,Unknown,113
3,David,35.0,Chicago,95.0,114
4,Unknown,40.0,Houston,Unknown,115


In [11]:
## Forward fill: each null takes the last known non-null value above it in the same column.
## DataFrame.ffill() is used instead of fillna(method='ffill'), which is deprecated
## and raises a FutureWarning in recent pandas releases.
data.ffill()

  data.fillna(method='ffill')


Unnamed: 0,Name,Age,City,Score,ID
0,Alice,25.0,New York,85.0,111
1,Bob,25.0,Los Angeles,90.0,112
2,Charlie,30.0,Los Angeles,90.0,113
3,David,35.0,Chicago,95.0,114
4,David,40.0,Houston,95.0,115


In [12]:
## Backward fill: each null takes the next known non-null value below it in the same column.
## Nulls with no later value (the last row here) stay null.
## DataFrame.bfill() is used instead of fillna(method='bfill'), which is deprecated
## and raises a FutureWarning in recent pandas releases.
data.bfill()

  data.fillna(method='bfill')  ## Backward fill


Unnamed: 0,Name,Age,City,Score,ID
0,Alice,25.0,New York,85.0,111
1,Bob,30.0,Los Angeles,90.0,112
2,Charlie,30.0,Chicago,95.0,113
3,David,35.0,Chicago,95.0,114
4,,40.0,Houston,,115


Filling null values in numeric columns with the column average

In [23]:
## Replace missing scores with the column mean (mean() ignores nulls).
## fillna returns a new Series, so `data` is left unchanged here; to make the
## change permanent, assign the result back: data['Score'] = data['Score'].fillna(mean)
mean = data['Score'].mean()
data['Score'].fillna(value=mean)

0    85.0
1    90.0
2    90.0
3    95.0
4    90.0
Name: Score, dtype: float64

In [27]:
## Per-column replacements: a text placeholder for the string columns,
## the mean for Age and the median for Score. Columns not listed ('ID') are untouched.
fill_values = dict(
    Name='Unknown',
    Age=data['Age'].mean(),
    City='Unknown',
    Score=data['Score'].median(),
)
data.fillna(value=fill_values)

Unnamed: 0,Name,Age,City,Score,ID
0,Alice,25.0,New York,85.0,111
1,Bob,32.5,Los Angeles,90.0,112
2,Charlie,30.0,Unknown,90.0,113
3,David,35.0,Chicago,95.0,114
4,Unknown,40.0,Houston,90.0,115
