![Cover](cover/05.%20Missing%20Values.png)

#### Outline
* na_values
* keep_default_na
* na_filter
* isnull()
* notnull()
* dropna

### Handle Missing Values
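- Handling missing values means detecting missing data and deciding how to treat it (for example, marking, counting, or dropping it).
- Missing data occurs due to data-entry errors or incomplete information.
- Handling it properly is important because missing values affect analysis results and model accuracy.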

#### Import Library

In [1]:
import pandas as pd

#### na_values

In [3]:
# Read the CSV with pandas' default settings; the placeholder strings
# 'nooo' and 'not availableo' stay in the data as ordinary text.
dataset = pd.read_csv('dataset.csv')
dataset

Unnamed: 0,House_ID,Location,Area_sqft,Bedrooms,Price_USD
0,H1,City Center,1200,3,250000.0
1,H2,Suburb,1800,4,320000.0
2,H3,Rural,950,,180000.0
3,H4,City Center,2000,5,
4,H5,Suburb,,3,270000.0
5,H6,Rural,1100,2,200000.0
6,H7,Suburb,1750,nooo,310000.0
7,H8,City Center,1600,4,295000.0
8,H9,Rural,not availableo,3,190000.0
9,H10,Suburb,1400,2,


In [5]:
# A list passed to na_values marks these strings as missing (NaN) in every column.
dataset1 = pd.read_csv('dataset.csv', na_values=['not availableo', 'nooo'])
dataset1

Unnamed: 0,House_ID,Location,Area_sqft,Bedrooms,Price_USD
0,H1,City Center,1200.0,3.0,250000.0
1,H2,Suburb,1800.0,4.0,320000.0
2,H3,Rural,950.0,,180000.0
3,H4,City Center,2000.0,5.0,
4,H5,Suburb,,3.0,270000.0
5,H6,Rural,1100.0,2.0,200000.0
6,H7,Suburb,1750.0,,310000.0
7,H8,City Center,1600.0,4.0,295000.0
8,H9,Rural,,3.0,190000.0
9,H10,Suburb,1400.0,2.0,


In [6]:
# A dict makes na_values column-specific: each string is treated as missing
# only in the column it is keyed under. 'nooo' is registered for Price_USD,
# so the 'nooo' in the Bedrooms column (row 6) is NOT converted to NaN.
# NOTE(review): if the goal was to clean Bedrooms, the key should probably be
# 'Bedrooms' instead of 'Price_USD' - confirm the intent.
dataset2 = pd.read_csv('dataset.csv', na_values={'Area_sqft': 'not availableo', 'Price_USD': 'nooo'})
dataset2

Unnamed: 0,House_ID,Location,Area_sqft,Bedrooms,Price_USD
0,H1,City Center,1200.0,3,250000.0
1,H2,Suburb,1800.0,4,320000.0
2,H3,Rural,950.0,,180000.0
3,H4,City Center,2000.0,5,
4,H5,Suburb,,3,270000.0
5,H6,Rural,1100.0,2,200000.0
6,H7,Suburb,1750.0,nooo,310000.0
7,H8,City Center,1600.0,4,295000.0
8,H9,Rural,,3,190000.0
9,H10,Suburb,1400.0,2,


#### keep_default_na
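- Controls whether pandas applies its built-in list of missing-value strings while reading a CSV file.
- By default (`keep_default_na=True`), pandas automatically interprets certain strings such as "NA", "N/A", "NaN", "n/a", "null", and the empty string as missing values (NaN). With `keep_default_na=False`, only the strings passed via `na_values` are treated as missing.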

In [7]:
dataset1

Unnamed: 0,House_ID,Location,Area_sqft,Bedrooms,Price_USD
0,H1,City Center,1200.0,3.0,250000.0
1,H2,Suburb,1800.0,4.0,320000.0
2,H3,Rural,950.0,,180000.0
3,H4,City Center,2000.0,5.0,
4,H5,Suburb,,3.0,270000.0
5,H6,Rural,1100.0,2.0,200000.0
6,H7,Suburb,1750.0,,310000.0
7,H8,City Center,1600.0,4.0,295000.0
8,H9,Rural,,3.0,190000.0
9,H10,Suburb,1400.0,2.0,


In [9]:
# keep_default_na=False switches off pandas' built-in list of NA strings;
# since na_values is not given either, no strings are parsed as NaN.
# NOTE(review): this reads 'Data.csv', while the cells above read
# 'dataset.csv' - confirm the file name is intended.
data = pd.read_csv('Data.csv', keep_default_na=False)
data

Unnamed: 0,House_ID,Location,Area_sqft,Bedrooms,Price_USD
0,H1,City Center,1200.0,3.0,250000.0
1,H2,Suburb,1800.0,4.0,320000.0
2,H3,Rural,950.0,,180000.0
3,H4,City Center,2000.0,5.0,
4,H5,Suburb,,3.0,270000.0
5,H6,Rural,1100.0,2.0,200000.0
6,H7,Suburb,1750.0,,310000.0
7,H8,City Center,1600.0,4.0,295000.0
8,H9,Rural,,3.0,190000.0
9,H10,Suburb,1400.0,2.0,


In [10]:
# keep_default_na=True is the default behaviour: with no na_values given,
# only pandas' built-in NA strings are parsed as NaN.
data = pd.read_csv('Data.csv', keep_default_na=True)
data

Unnamed: 0,House_ID,Location,Area_sqft,Bedrooms,Price_USD
0,H1,City Center,1200.0,3.0,250000.0
1,H2,Suburb,1800.0,4.0,320000.0
2,H3,Rural,950.0,,180000.0
3,H4,City Center,2000.0,5.0,
4,H5,Suburb,,3.0,270000.0
5,H6,Rural,1100.0,2.0,200000.0
6,H7,Suburb,1750.0,,310000.0
7,H8,City Center,1600.0,4.0,295000.0
8,H9,Rural,,3.0,190000.0
9,H10,Suburb,1400.0,2.0,


#### isnull()

In [15]:
# Element-wise mask: True where a value is missing (NaN), False otherwise.
data.isnull()

Unnamed: 0,House_ID,Location,Area_sqft,Bedrooms,Price_USD
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,True,False
3,False,False,False,False,True
4,False,False,True,False,False
5,False,False,False,False,False
6,False,False,False,True,False
7,False,False,False,False,False
8,False,False,True,False,False
9,False,False,False,False,True


In [16]:
# Summing the boolean mask per column gives the number of missing values in each column.
data.isnull().sum()

House_ID     0
Location     0
Area_sqft    2
Bedrooms     2
Price_USD    2
dtype: int64

In [17]:
# A second sum adds the per-column counts into one total of missing values.
data.isnull().sum().sum()

np.int64(6)

#### notnull()

In [18]:
data

Unnamed: 0,House_ID,Location,Area_sqft,Bedrooms,Price_USD
0,H1,City Center,1200.0,3.0,250000.0
1,H2,Suburb,1800.0,4.0,320000.0
2,H3,Rural,950.0,,180000.0
3,H4,City Center,2000.0,5.0,
4,H5,Suburb,,3.0,270000.0
5,H6,Rural,1100.0,2.0,200000.0
6,H7,Suburb,1750.0,,310000.0
7,H8,City Center,1600.0,4.0,295000.0
8,H9,Rural,,3.0,190000.0
9,H10,Suburb,1400.0,2.0,


In [19]:
# Inverse of isnull(): True where a value is present, False where it is missing.
data.notnull()

Unnamed: 0,House_ID,Location,Area_sqft,Bedrooms,Price_USD
0,True,True,True,True,True
1,True,True,True,True,True
2,True,True,True,False,True
3,True,True,True,True,False
4,True,True,False,True,True
5,True,True,True,True,True
6,True,True,True,False,True
7,True,True,True,True,True
8,True,True,False,True,True
9,True,True,True,True,False


In [20]:
# Summing the boolean mask per column gives the number of non-missing values in each column.
data.notnull().sum()

House_ID     10
Location     10
Area_sqft     8
Bedrooms      8
Price_USD     8
dtype: int64

In [21]:
# A second sum adds the per-column counts into one total of non-missing values.
data.notnull().sum().sum()

np.int64(44)

#### dropna

In [32]:
data

Unnamed: 0,House_ID,Location,Area_sqft,Bedrooms,Price_USD
0,H1,City Center,1200.0,3.0,250000.0
1,H2,Suburb,1800.0,4.0,320000.0
2,H3,Rural,950.0,,180000.0
3,H4,City Center,2000.0,5.0,
4,H5,Suburb,,3.0,270000.0
5,H6,Rural,1100.0,2.0,200000.0
6,H7,Suburb,1750.0,,310000.0
7,H8,City Center,1600.0,4.0,295000.0
8,H9,Rural,,3.0,190000.0
9,H10,Suburb,1400.0,2.0,


In [33]:
# Returns a new DataFrame without any row that contains a missing value;
# data itself is left unchanged by this call.
data.dropna()

Unnamed: 0,House_ID,Location,Area_sqft,Bedrooms,Price_USD
0,H1,City Center,1200.0,3.0,250000.0
1,H2,Suburb,1800.0,4.0,320000.0
5,H6,Rural,1100.0,2.0,200000.0
7,H8,City Center,1600.0,4.0,295000.0


In [34]:
# inplace=True removes the rows from data itself and returns None,
# which is why this cell shows no output.
data.dropna(inplace=True)

In [35]:
data

Unnamed: 0,House_ID,Location,Area_sqft,Bedrooms,Price_USD
0,H1,City Center,1200.0,3.0,250000.0
1,H2,Suburb,1800.0,4.0,320000.0
5,H6,Rural,1100.0,2.0,200000.0
7,H8,City Center,1600.0,4.0,295000.0


### For Source code:
https://sites.google.com/view/aorbtech/programming/

### @Aorb Tech
.