# Chapter 2: End-to-end Machine Learning Project

Task: Predict median house price values in California block groups. 

Also known as districts, block groups are the smallest geographical unit that the US Census publishes data on. Given all other metrics recorded in the Census data, the model should be able to predict the median housing price in any district.

System:
*   Supervised since the data is labeled and each instance has an expected output
*   Univariate since the model predicts only a single value (the median house value) for each district
*   Regression since the target is a continuous value, trained with batch learning since the dataset is not very large and is not constantly changing while the model is in use
*   Root Mean Square Error (RMSE) as the performance metric, since it gives a higher weight to large errors



In [1]:
# Import common modules
# Python ≥3.5 is required (not a problem on colab)
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare (major, minor) numerically: comparing the raw version strings is
# lexicographic, so e.g. "0.9" >= "0.20" would wrongly pass.
_sklearnVersion = tuple(int(part) for part in re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups())
assert _sklearnVersion >= (0, 20)

import numpy as np

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

## Data

In [6]:
# Pipeline to load data

import os
import tarfile
import urllib
import urllib.request

rootDirectory = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
dataPath = os.path.join("datasets", "housing")
dataURL = rootDirectory + "datasets/housing/housing.tgz"
folderName = "housing.tgz"  # file name of the downloaded archive (used as getData's default)

def getData(URL = dataURL, dataPath = dataPath, fileName = folderName):
  """Download a gzipped tar archive and extract it into `dataPath`.

  Parameters
  ----------
  URL : str
      Location of the archive to download.
  dataPath : str
      Directory that receives both the downloaded archive and its
      extracted contents; created (including parents) if missing.
  fileName : str
      Name under which the archive is saved inside `dataPath`.

  Returns
  -------
  None
  """
  # exist_ok avoids the check-then-create race of a separate isdir() test.
  os.makedirs(dataPath, exist_ok=True)
  # Honour the caller-supplied file name rather than the module-level default.
  tgzPath = os.path.join(dataPath, fileName)
  urllib.request.urlretrieve(URL, tgzPath)
  # The context manager closes the archive even if extraction fails.
  # NOTE(review): extractall() is applied to whatever URL serves; only point
  # this at archives you trust.
  with tarfile.open(tgzPath) as housing_tgz:
    housing_tgz.extractall(path=dataPath)

In [7]:
# Download the housing archive with the default URL/path and extract it (needs network access).
getData()

In [10]:
import pandas as pd
filename = "housing.csv"

def loadData(dataPath = dataPath):
  """Read the housing CSV located in `dataPath` into a pandas DataFrame.

  The file name comes from the module-level `filename`.
  """
  return pd.read_csv(os.path.join(dataPath, filename))

In [11]:
# Read housing.csv into a DataFrame and preview its first rows.
housing = loadData()
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [12]:
# Per-column dtypes and non-null counts; a count below the row total signals missing values.
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [14]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
housing.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0
