In [5]:
#importing the required libraries
import pandas as pd
import numpy as np

In [6]:
# Load the raw CSV into a DataFrame; no custom missing-value markers are supplied here.
df = pd.read_csv(filepath_or_buffer="Missing Values.csv")

In [7]:
# Preview the first five rows of the dataset.
print(df.head(n=5))

           PID  ST_NUM    ST_NAME OWN_OCCUPIED NUM_BEDROOMS NUM_BATH SQ_FT
0  100001000.0   104.0     PUTNAM            Y            3        1  1000
1  100002000.0   197.0  LEXINGTON            N            3      1.5    --
2  100003000.0     NaN  LEXINGTON            N          NaN        1   850
3  100004000.0   201.0   BERKELEY           12            1      NaN   700
4          NaN   203.0   BERKELEY            Y            3        2  1600


In [8]:
# Check the dtype pandas assigned to each column of the DataFrame.
df.dtypes

PID             float64
ST_NUM          float64
ST_NAME          object
OWN_OCCUPIED     object
NUM_BEDROOMS     object
NUM_BATH         object
SQ_FT            object
dtype: object

In [9]:
# Changing the data type of one column from 'object' to pandas' nullable 'string' dtype.
# astype(str) would turn every NaN into the literal text 'nan', so the entry would no
# longer be reported by isnull(); the "string" dtype keeps missing entries as <NA>.
df['OWN_OCCUPIED'] = df['OWN_OCCUPIED'].astype("string")

In [10]:
# Show the street-number column, then flag which of its entries pandas treats as missing.
print(df.loc[:, 'ST_NUM'])
df['ST_NUM'].isna()

0    104.0
1    197.0
2      NaN
3    201.0
4    203.0
5    207.0
6      NaN
7    213.0
8    215.0
Name: ST_NUM, dtype: float64


0    False
1    False
2     True
3    False
4    False
5    False
6     True
7    False
8    False
Name: ST_NUM, dtype: bool

In [11]:
# Show the bedroom column next to the mask of entries pandas recognizes as missing.
print(df.loc[:, 'NUM_BEDROOMS'])
df['NUM_BEDROOMS'].isna()

0      3
1      3
2    NaN
3      1
4      3
5    NaN
6      2
7      1
8     na
Name: NUM_BEDROOMS, dtype: object


0    False
1    False
2     True
3    False
4    False
5     True
6    False
7    False
8    False
Name: NUM_BEDROOMS, dtype: bool

In [12]:
# Re-read the CSV, passing the extra placeholder strings that pandas should parse as NaN.

missing_values = ["n/a", "na", "--"]
df = pd.read_csv(
    "Missing Values.csv",
    na_values=missing_values,
)

In [13]:
# Inspect the bedroom column again, together with its missing-value mask.

print(df.loc[:, 'NUM_BEDROOMS'])
df['NUM_BEDROOMS'].isna()

0    3.0
1    3.0
2    NaN
3    1.0
4    3.0
5    NaN
6    2.0
7    1.0
8    NaN
Name: NUM_BEDROOMS, dtype: float64


0    False
1    False
2     True
3    False
4    False
5     True
6    False
7    False
8     True
Name: NUM_BEDROOMS, dtype: bool

In [14]:
# Show the owner-occupied column and the mask of entries pandas treats as missing.
print(df.loc[:, 'OWN_OCCUPIED'])
df['OWN_OCCUPIED'].isna()

0      Y
1      N
2      N
3     12
4      Y
5      Y
6    NaN
7      Y
8      Y
Name: OWN_OCCUPIED, dtype: object


0    False
1    False
2    False
3    False
4    False
5    False
6     True
7    False
8    False
Name: OWN_OCCUPIED, dtype: bool

In [16]:
# Detecting numbers in a categorical variable and marking them as missing.
# Any 'OWN_OCCUPIED' entry that parses as a number (e.g. '12') is replaced with NaN.
# A boolean mask is used instead of a manual row counter so the assignment is aligned
# on the DataFrame's own index labels and does not assume a default 0..n-1 index.
# Note: to_numeric also matches decimal text such as '1.5', not only whole numbers.

numeric_entries = pd.to_numeric(df['OWN_OCCUPIED'], errors='coerce').notna()
df.loc[numeric_entries, 'OWN_OCCUPIED'] = np.nan

In [17]:
# Count the missing values in each column.
df.isna().sum()

PID             1
ST_NUM          2
ST_NAME         0
OWN_OCCUPIED    2
NUM_BEDROOMS    3
NUM_BATH        1
SQ_FT           2
dtype: int64

In [19]:
# Grand total of missing cells across all columns of the DataFrame.

df.isna().sum().sum()

11

In [26]:
# Replacing the missing values with a number.
# The filled Series is assigned back to the column: calling fillna(..., inplace=True)
# on df['ST_NUM'] operates on a column selection, which is deprecated in pandas 2.x
# and leaves df unchanged once Copy-on-Write is enabled.

df['ST_NUM'] = df['ST_NUM'].fillna(125)

print(df['ST_NUM'])

# Replacement based on location: set one cell by row label and column name.
# Row 2 was already filled by the fillna above, so this only demonstrates the syntax.

df.loc[2, 'ST_NUM'] = 125

print(df['ST_NUM'])

0    104.0
1    197.0
2    125.0
3    201.0
4    203.0
5    207.0
6    125.0
7    213.0
8    215.0
Name: ST_NUM, dtype: float64
0    104.0
1    197.0
2    125.0
3    201.0
4    203.0
5    207.0
6    125.0
7    213.0
8    215.0
Name: ST_NUM, dtype: float64


In [29]:
# Replacing missing values with the column median.
# The median is computed from the non-missing entries, printed alongside the column
# before and after filling. The filled Series is assigned back to the column because
# fillna(..., inplace=True) on df['NUM_BEDROOMS'] is deprecated in pandas 2.x and
# leaves df unchanged once Copy-on-Write is enabled.

median = df['NUM_BEDROOMS'].median()
print(median)
print(df['NUM_BEDROOMS'])
df['NUM_BEDROOMS'] = df['NUM_BEDROOMS'].fillna(median)
print(df['NUM_BEDROOMS'])

2.5
0    3.0
1    3.0
2    NaN
3    1.0
4    3.0
5    NaN
6    2.0
7    1.0
8    NaN
Name: NUM_BEDROOMS, dtype: float64
0    3.0
1    3.0
2    2.5
3    1.0
4    3.0
5    2.5
6    2.0
7    1.0
8    2.5
Name: NUM_BEDROOMS, dtype: float64
