In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Load the raw housing data from 'housing.csv' (path is relative to the
# notebook's working directory) and preview the first five rows.

df = pd.read_csv('housing.csv')
df.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [21]:
# List every column with its dtype and non-null count; this is where
# columns with missing values show up.

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20637 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20637 non-null  float64
 1   latitude            20637 non-null  float64
 2   housing_median_age  20637 non-null  float64
 3   total_rooms         20637 non-null  float64
 4   total_bedrooms      20430 non-null  float64
 5   population          20637 non-null  float64
 6   households          20637 non-null  float64
 7   median_income       20637 non-null  float64
 8   median_house_value  20637 non-null  float64
 9   ocean_proximity     20637 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.7+ MB


In [43]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns, to get a feel for scale and spread.
# NOTE(review): the stored output shows the same count for every column,
# which looks like this cell was last executed after the rows with missing
# total_bedrooms were dropped further down -- re-run top to bottom to
# refresh it.

df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20430.0,20430.0,20430.0,20430.0,20430.0,20430.0,20430.0,20430.0,20430.0
mean,-119.570604,35.632999,28.635291,2636.812188,537.928879,1425.145032,499.494714,3.871544,206864.417719
std,2.003659,2.136235,12.591329,2185.275419,421.386931,1133.173287,382.292417,1.899135,115436.466637
min,-124.35,32.54,1.0,2.0,1.0,5.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1450.25,296.0,788.0,280.0,2.563925,119500.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.53715,179700.0
75%,-118.01,37.72,37.0,3143.0,647.0,1722.75,604.0,4.744,264700.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [7]:
# Sanity-check the physical constraints between related columns and report
# how many rows violate each one:
#   * total_bedrooms must not exceed total_rooms
#   * households must not exceed population

bedrooms_exceed_rooms = df['total_bedrooms'] > df['total_rooms']
households_exceed_population = df['households'] > df['population']

print(int(bedrooms_exceed_rooms.sum()))
print(int(households_exceed_population.sum()))

# Keep only the rows that satisfy households <= population. The bedrooms
# constraint is reported above but not used for filtering here.
df = df.loc[df['households'] <= df['population']]

0
3


In [8]:
# Normalise the ocean_proximity labels to lower case so that differently
# cased spellings of the same category collapse into one value.
#
# `df` is the result of a boolean filter at this point, so assigning a column
# on it directly can trigger pandas' SettingWithCopyWarning. `assign` builds a
# new frame instead, and the vectorised `.str.lower()` leaves missing values
# as NaN rather than raising the way a Python-level `x.lower()` would.

df = df.assign(ocean_proximity=df['ocean_proximity'].str.lower())

# Frequency of each category after normalisation.
df['ocean_proximity'].value_counts()

<1h ocean     9135
inland        6549
near ocean    2658
near bay      2290
island           5
Name: ocean_proximity, dtype: int64

In [9]:
# Count fully duplicated rows (every column equal to an earlier row).
# A `True` entry in the result would mean there are repeats to remove.

df.duplicated(keep='first').value_counts()

False    20637
dtype: int64

In [38]:
# Summary statistics for the rows where total_bedrooms is present, for
# comparison against the rows where it is missing.
df.dropna(subset=['total_bedrooms']).describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20430.0,20430.0,20430.0,20430.0,20430.0,20430.0,20430.0,20430.0,20430.0
mean,-119.570604,35.632999,28.635291,2636.812188,537.928879,1425.145032,499.494714,3.871544,206864.417719
std,2.003659,2.136235,12.591329,2185.275419,421.386931,1133.173287,382.292417,1.899135,115436.466637
min,-124.35,32.54,1.0,2.0,1.0,5.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1450.25,296.0,788.0,280.0,2.563925,119500.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.53715,179700.0
75%,-118.01,37.72,37.0,3143.0,647.0,1722.75,604.0,4.744,264700.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [40]:
# Summary statistics for the rows where total_bedrooms is missing, for
# comparison against the rows where it is present.
df.loc[df['total_bedrooms'].isna()].describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,207.0,207.0,207.0,207.0,0.0,207.0,207.0,207.0,207.0
mean,-119.47256,35.497633,29.270531,2562.603865,,1477.772947,510.024155,3.822244,206007.280193
std,2.001424,2.097298,11.964927,1787.269789,,1057.448212,386.120704,1.955595,111638.214545
min,-124.13,32.66,4.0,154.0,,37.0,16.0,0.8527,45800.0
25%,-121.81,33.97,19.0,1307.5,,781.0,258.0,2.56415,128750.0
50%,-118.49,34.2,30.0,2155.0,,1217.0,427.0,3.4115,175000.0
75%,-117.985,37.495,38.0,3465.0,,1889.5,628.0,4.61575,267700.0
max,-114.59,40.92,52.0,11709.0,,7604.0,3589.0,15.0001,500001.0


In [41]:
# The rows with and without total_bedrooms have similar summary statistics,
# so the data seems to be missing completely at random (MCAR); dropping the
# incomplete rows is therefore acceptable.

df = df.dropna(subset=['total_bedrooms'])

In [44]:
# Write the cleaned frame to 'housing_cleaned.csv' in the working directory.
# NOTE(review): the row index is written too (pandas' default), so the file
# gets an unnamed leading column; pass index=False instead if nothing
# downstream relies on the original row labels.

df.to_csv('housing_cleaned.csv', index=True)