In [1]:
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix

In [2]:
# Import the California housing data used in the HOML book, Chapter 2

import os
import tarfile
from six.moves import urllib

# Where the archive is downloaded from and where it is unpacked locally.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the gzipped tarball to download.
    housing_path : str
        Directory that receives both the downloaded ``housing.tgz`` and the
        extracted files. It is created if it does not exist.

    Returns
    -------
    None

    Raises
    ------
    Whatever ``urllib.request.urlretrieve`` or ``tarfile`` raise on a failed
    download or an unreadable archive.
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: refuse members that would
            # escape housing_path (absolute paths, "..", unsafe links).
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            # Older interpreters without extraction filters.
            housing_tgz.extractall(path=housing_path)

In [3]:
# Download and unpack the archive using the module-level URL and target directory.
fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH)

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [4]:
# Load the extracted CSV from the default dataset directory.
housing = load_housing_data(housing_path=HOUSING_PATH)

In [5]:
# Preview the first five rows.
housing.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [6]:
# Target vector: the `median_house_value` column.
y = housing.loc[:, 'median_house_value']
print(y)

0        452600.0
1        358500.0
2        352100.0
3        341300.0
4        342200.0
           ...   
20635     78100.0
20636     77100.0
20637     92300.0
20638     84700.0
20639     89400.0
Name: median_house_value, Length: 20640, dtype: float64


In [7]:
# Feature matrix: everything except the target, the text-valued
# `ocean_proximity` column and `total_bedrooms`.
X = housing.drop(
    columns=['median_house_value', 'ocean_proximity', 'total_bedrooms'],
)
print(X)

       longitude  latitude  housing_median_age  total_rooms  population  \
0        -122.23     37.88                41.0        880.0       322.0   
1        -122.22     37.86                21.0       7099.0      2401.0   
2        -122.24     37.85                52.0       1467.0       496.0   
3        -122.25     37.85                52.0       1274.0       558.0   
4        -122.25     37.85                52.0       1627.0       565.0   
...          ...       ...                 ...          ...         ...   
20635    -121.09     39.48                25.0       1665.0       845.0   
20636    -121.21     39.49                18.0        697.0       356.0   
20637    -121.22     39.43                17.0       2254.0      1007.0   
20638    -121.32     39.43                18.0       1860.0       741.0   
20639    -121.24     39.37                16.0       2785.0      1387.0   

       households  median_income  
0           126.0         8.3252  
1          1138.0         8.3

In [27]:
from sklearn.model_selection import train_test_split

# Split features and labels into a training and a test set.
# No stratification is requested (the default); random_state=0 makes the
# shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

# A class-balanced split (the same share of every class in both parts) can be
# obtained with StratifiedShuffleSplit. Note, however, that the housing dataset
# is not a good candidate for this approach; the sketch is kept for reference
# only. X and y are pandas objects, so positional indices need `.iloc`.
# from sklearn.model_selection import StratifiedShuffleSplit
# split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# for train_index, test_index in split.split(X, y):
#   X_train = X.iloc[train_index]
#   X_test = X.iloc[test_index]
#   y_train = y.iloc[train_index]
#   y_test = y.iloc[test_index]

In [16]:
# A stratified split does work well, however, for the Iris dataset

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.datasets import load_iris

# Load the Iris dataset (features in iris.data, labels in iris.target)
iris = load_iris()

# A single stratified shuffle with 20% of the samples held out for testing
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) index pair
train_index, test_index = next(split.split(iris.data, iris.target))
X_iris_train, X_iris_test = iris.data[train_index], iris.data[test_index]
y_iris_train, y_iris_test = iris.target[train_index], iris.target[test_index]

In [22]:
from sklearn.linear_model import LinearRegression

# Create the linear model, then fit it to the training split.
linreg = LinearRegression()
linreg.fit(X_train, y_train);  # trailing ';' keeps the cell output empty

In [23]:
# Score the fitted model on the held-out test split.
test_score = linreg.score(X_test, y_test)
print("Test set score: {:.2f}".format(test_score))

Test set score: 0.62


# Dealing with missing values

In [28]:
from sklearn.impute import SimpleImputer

# Replace NaN entries with the column mean. The means are learned from the
# training split only and then applied to both splits, so no information
# from the test set leaks into the imputation.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(X_train)
X_train_new = imp_mean.transform(X_train)
X_test_new = imp_mean.transform(X_test)

# NOTE(review): X was built without `total_bedrooms`, the only column this
# notebook shows to contain missing values, so there is presumably nothing
# for the imputer to fill here -- consistent with the test score staying at
# 0.62 after imputation. Keep `total_bedrooms` in X to see an actual effect.

In [40]:
# Create a fresh linear model and fit it to the imputed training data.
linreg = LinearRegression()
linreg.fit(X_train_new, y_train);  # trailing ';' keeps the cell output empty

In [41]:
# Score the refitted model on the imputed test split.
test_score_imputed = linreg.score(X_test_new, y_test)
print("Test set score: {:.2f}".format(test_score_imputed))

Test set score: 0.62


# If you want to see which features have missing elements

In [42]:
# Each column can be checked individually; True means at least one value is missing.
housing['longitude'].isna().to_numpy().any()

False

In [43]:
# Same check for `latitude`.
housing['latitude'].isna().to_numpy().any()

False

In [44]:
# Same check for `housing_median_age`.
housing['housing_median_age'].isna().to_numpy().any()

False

In [45]:
# Same check for `total_rooms`.
housing['total_rooms'].isna().to_numpy().any()

False

In [46]:
# Same check for `total_bedrooms`.
housing['total_bedrooms'].isna().to_numpy().any()

True

In [47]:
# The imputer does not need this, but it can be useful to know exactly where
# data is missing. Summing the boolean mask counts the empty entries.
housing['total_bedrooms'].isna().to_numpy().sum()

207

In [48]:
# A frame-wide check: True if any cell anywhere in the DataFrame is empty
# (and thus whether an imputer is needed at all).
housing.isna().to_numpy().any()

True