# Downloading and fetching the housing data

In [None]:
                import os
import tarfile
from six.moves import urllib

PROJECT_ROOT_DIR = "."
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

In [None]:
fetch_housing_data()

# Now load the data using Pandas. 

In [None]:
import pandas as pd
def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

# Take a Look at the Data Structure

In [None]:
housing = load_housing_data(housing_path=HOUSING_PATH)
housing.head(30)


Each row represents one district. There are 10 attributes: longitude, latitude, housing_median_age, total_rooms, total_bed, rooms, population, households, median_income, median_house_value, and ocean_proximity.

The info() method is useful to get a quick description of the data the total number of rows, and each attribute’s type and number of non-null values

In [None]:
housing.info()

All attributes are numerical, except the ocean_proximity field. 
* It is an object, so it could hold any kind of Python object.
* It is a categorical attribute. 
* You can find out what categories exist and how many districts belong to each category by using the value_counts() method

In [None]:
housing["ocean_proximity"].value_counts()

In [None]:
# The describe() method shows a summary of the numerical attributes
housing["ocean_proximity"].describe()

In [None]:
# he describe() method shows a summary of the numerical attributes
housing.describe()

Another way to know the data is to plot a histogram for each numerical attribute.  A histogram shows the number of instances (on the vertical axis) that have a given value range (on the horizontal axis). 
- You can either plot this one attribute at a time, or 
- you can call the hist() method on the whole dataset, and it will plot a histogram for each numerical attribute 

In [None]:
%matplotlib inline 
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20,15))
plt.show()

# Creating a test set

In [None]:
import numpy as np
def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [None]:
train_set, test_set = split_train_test(housing, 0.2)

or instead you can use the Scikit-Learn  provides few functions to split datasetsinto into multiple subsets.

The simplest function is train_test_split.

In [None]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

In [None]:
len(train_set)

stratified sampling is better than random sampling
* Do stratified sampling based on the income category. 
* use Scikit-Learn’s StratifiedShuffleSplit class to do that 

In [None]:
housing["income_cat"] = pd.cut(housing["median_income"], bins=[0., 1.5, 3.0, 4.5, 6., np.inf], labels=[1, 2, 3, 4, 5])
#housing["income_cat"].describe()
housing["income_cat"].hist()
housing["income_cat"].value_counts()

In [None]:
housing


In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

look at the income category proportions in the test set

In [None]:
strat_test_set["income_cat"].value_counts() / len(strat_test_set)

In [None]:
housing["income_cat"].value_counts() / len(housing["income_cat"])

In [None]:
strat_train_set["income_cat"].value_counts() / len(strat_train_set)

Remove the income_cat attribute so the data is back to its original state

In [None]:
def income_cat_proportions(data):
    return data["income_cat"].value_counts() / len(data)

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

compare_props = pd.DataFrame({
    "Overall": income_cat_proportions(housing),
    "Stratified": income_cat_proportions(strat_test_set),
    "Random": income_cat_proportions(test_set),
}).sort_index()
compare_props["Rand. %error"] = 100 * compare_props["Random"] / compare_props["Overall"] - 100
compare_props["Strat. %error"] = 100 * compare_props["Stratified"] / compare_props["Overall"] - 100

In [None]:
compare_props

In [None]:
for set_ in (strat_train_set, strat_test_set): 
    set_.drop("income_cat", axis=1, inplace=True)

# Visualizing Geographical Data
Since there is geographical information (latitude and longitude), it is a good idea to
create a scatterplot of all districts to visualize the data 

* use alpha = 0.1 to better visulize where is a population high density
* make a copy of the training data set the actual training doesn't get changed 

In [None]:
housing = strat_train_set.copy()

In [None]:
housing.columns


In [None]:
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=.1)
#strat_train_set.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)

### Look at the housing prices. 
* The radius of each circle represents the district’s population (option s)
* The color represents the price (option c). 
* We will use a predefined color map (option cmap) called jet, which ranges from blue (low values) to red (high prices)

In [None]:
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4, s=housing["population"]/100, label="population", figsize=(10,7),
c=housing['total_rooms']/100, cmap=plt.get_cmap("jet"), colorbar=True, )
plt.legend()

## Load california maps 

In [None]:
# Download the California image
images_path = os.path.join(PROJECT_ROOT_DIR, "images", "end_to_end_project")
os.makedirs(images_path, exist_ok=True)
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
filename = "california.png"
print("Downloading", filename)
url = DOWNLOAD_ROOT + "images/end_to_end_project/" + filename
urllib.request.urlretrieve(url, os.path.join(images_path, filename))

In [None]:
import matplotlib.image as mpimg
california_img=mpimg.imread(os.path.join(images_path, filename))
ax = housing.plot(kind="scatter", x="longitude", y="latitude", figsize=(10,7),
                       s=housing['population']/100, label="Population",
                       c="median_house_value", cmap=plt.get_cmap("jet"),
                       colorbar=False, alpha=0.4,
                      )
plt.imshow(california_img, extent=[-124.55, -113.80, 32.45, 42.05], alpha=0.5,
           cmap=plt.get_cmap("jet"))
plt.ylabel("Latitude", fontsize=14)
plt.xlabel("Longitude", fontsize=14)

prices = housing["median_house_value"]
tick_values = np.linspace(prices.min(), prices.max(), 11)
cbar = plt.colorbar()
cbar.ax.set_yticklabels(["$%dk"%(round(v/1000)) for v in tick_values], fontsize=14)
cbar.set_label('Median House Value', fontsize=16)

plt.legend(fontsize=16)
plt.show()

## Looking for Correlations
You can easily compute the standard correlation coefficient (also called Pearson’s r) between every pair of attributes using the corr() method

In [None]:
corr_matrix = housing.corr()
corr_matrix

In [None]:
corr_matrix["median_house_value"].sort_values(ascending=False)

Another way to check for correlation between attributes is to use Pandas’ scatter_matrix function, which plots every numerical attribute against every other numerical attribute.

* so let’s just focus on a few promising attributes that seem most correlated with the median housing value

* The main diagonal (top left to bottom right) would be full of straight lines if Pandas
plotted each variable against itself, which would not be very useful. 

* Pandas displays a histogram of each attribute in the main diagonal instead of the corr of the attribute with itself which is one.

In [None]:
from pandas.plotting import scatter_matrix
attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))

The median income is the most correlated attribute with the median house value.

* let’s zoom in on their correlation scatterplot

* the plot reveals horizontal lines around $450,000, $350,000, $280,000, and a few more below that.
* You may want to try removing the corresponding districts to prevent the algorithms from learning to reproduce these data quirks.

In [None]:
housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)

## Experimenting with Attribute Combinations

In [None]:
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"]=housing["population"]/housing["households"]

Now Look at the correlation matrix

In [None]:
corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

- The bedrooms_per_room is more correlated with the median house value than the total number of rooms or bedrooms. 
- houses with a lower bedroom/room ratio tend to be more expensive. 
- The number of rooms per household is more informative than the total number of rooms in a district—obviously --> the larger the houses, the more expensive they are.

- Repeat this process

In [None]:
housing.plot(kind="scatter", x="rooms_per_household", y="median_house_value",
             alpha=0.2)
plt.axis([0, 5, 0, 520000])
plt.show()

In [None]:
housing.describe()

# Prepare the Data for Machine Learning Algorithms

In [None]:
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()
len(housing)

In [None]:
housing.info()

## Data Cleaning
- Most Machine Learning algorithms cannot work with missing features
- The total_bedrooms attribute has some missing values. We have three options:
<ol>
<li>  Get rid of the corresponding districts.</li>
<li>  Get rid of the whole attribute.</li>
<li>  Set the values to some value (zero, the mean, the median, etc.).</li>
</ol>

Can do this using pandas DataFrame’s dropna(), drop(), and fillna()

- dropna() : drop all rows with NA (not available - NaN, pandas.NaT, None)
- drop() : Remove rows or columns by specifying label names and corresponding axis.
- fillna() : Fill NA/NaN values using the specified method.

In [None]:
housing.dropna(subset=["total_bedrooms"]) # option 1
housing.drop("total_bedrooms", axis=1) # option 2
median = housing["total_bedrooms"].median() # option 3
housing["total_bedrooms"].fillna(median, inplace=True)

Scikit-Learn also provides a class to take care of missing values: SimpleImputer.
- First, you need to create a SimpleImputer instance, specifying
that you want to replace each attribute’s missing values with the median of that attribute:

In [None]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
housing_num = housing.drop("ocean_proximity", axis=1) # a numeric copy
imputer.fit(housing_num)

The imputer has computed the median of each attribute and stored the result in its statistics_ instance variable. 

In [None]:
imputer.statistics_

In [None]:
housing_num.median().values

Now you can use this “trained” imputer to transform the training set by replacing missing values by the learned medians:

In [None]:
X = imputer.transform(housing_num)
type(X)

The result is a plain NumPy array containing the transformed features. Put it back into a Pandas DataFrame, it’s simple:

In [None]:
housing_tr = pd.DataFrame(X, columns=housing_num.columns)
housing_tr.info()

## Handling Text and Categorical Attributes

ocean_proximity is a categorial (nominal, text) attribute that we cannot compute its median.

Many machine learning algorithms require all input and output variables to be numeric (scikit-learn require this)

Two approaches to convert these categories (nominal) from text to number:
1) ordinal-encoding 
2) one-hot-encoding (dummy attributes)


#### Ordinal-encoding
Ordinal variables: consists of a finite set of discrete values with an ordering between values.

Ordinal-encoding: each category value is assigned an integer value. The values have a natural ordered relationship between them.

Use Scikit-Learn’s OrdinalEncoder class

In [None]:
housing_cat = housing[["ocean_proximity"]]
housing_cat.head

In [None]:
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]

In [None]:
#You can get the list of categories using the categories_ instance variable.
ordinal_encoder.categories_

Issue: ML algorithms will assume that two nearby values are more similar than two distant values. It imposes an ordinal relationship where no such relationship may exist.

* In some cases this works: (e.g., for ordered categories such as “bad”, “average”, “good”, “excellent”)
* but it doesn't make sense in other cases: e.g. ocean_proximity column. 

Solution: one-hot-encoding 


#### One-hot-encoding  (dummy attributes)

- One binary attribute per category: one attribute equal to 1 when the category is “<1H OCEAN” (and 0 otherwise)
- Another attribute equal to 1 when the category is “INLAND” (and 0 otherwise), and so on

Scikit-Learn provides a OneHotEncoder class to convert categorical values into one-hot vectors.

In [None]:
housing_cat = housing[["ocean_proximity"]]

from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot.toarray()   # convert from scipy sparse matrix object to numpy

Get the catagories 

In [None]:
cat_encoder.categories_

# Custom Transformers

* To write your own transformers for cleanup operations or combining specific attributes. 
* You need to create a class and implement three methods: 
**  fit() (returning self)
**  transform()
** fit_transform(): obtained by adding TransformerMixin as a base class 
* add BaseEstimator as a base class to get the methods get_params() and set_params()

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [None]:
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=True)
housing_extra_attribs = attr_adder.transform(housing.values)

In [None]:
housing_extra_attribs.shape

In [None]:
housing.columns


In [None]:
housing.min()
%%latex 


# Feature Scaling
* Important transformations to apply to data
* Many machine Learning algorithms don’t perform well when the input attributes have very different scales. 

##### For housing data: 
* The total number of rooms ranges from 6 to 39,320
* The median incomes range from 0 to 15 
* Scaling the target is generally not required.

##### Two ways to get all attributes to have the same scale
* min-max scaling (normalization): values are shifted and rescaled so that they end up ranging from 0 to 1. Subtract the min value and divide by the max minus the min. $ x = \frac {x-min} {max - min} $

    * Scikit-Learn provides a transformer MinMaxScaler for this.  

* Standardization:  subtracts the mean value, then divides by the standard deviation  resulting distribution has unit variance.
$$ y = \frac {(x – mean)} / stddev $$
    * It does not bound values to a specific range. 
    * may be a problem for some algorithms
    * Less affected by outliers
    * Scikit-Learn provides a transformer called StandardScaler for standardization.

##### With all the transformations, it is important to fit the scalers to the training data only, not to the full dataset. Only then can you use them to transform the training set and the test set (and new data).



In [None]:
# example of a normalization
import numpy as np
from sklearn.preprocessing import MinMaxScaler

np.set_printoptions(suppress=True)

data = np.asarray([[100, 0.001],
                [8, 0.05],
                [50, 0.005],
                [88, 0.07],
                [4, 0.1]])
print(data)
# define min max scaler
scaler = MinMaxScaler()
# transform data
scaled = scaler.fit_transform(data)
print(scaled)


In [None]:
from sklearn.preprocessing import MinMaxScaler
housing.columns

In [None]:
housing.describe()

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
housing[['total_rooms']] =hscaler.fit_transform(housing[['total_rooms']])
housing.describe()

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
housing[['total_rooms']] =scaler.fit_transform(housing[['total_rooms']])
housing.describe()

# Transformation Pipelines
* Many data transformations need to be executed in the right order. Sequences of different transforms can be chained together using the Pipeline, such as imputing missing values, then scaling numerical values, categorial encoding.
* The Pipeline constructor takes a list of name/estimator pairs defining a sequence of steps. 
* All but the last estimator must be transformers (i.e., they must have a fit_transform() method). 
* The names can be anything as they are unique and don’t contain double underscores “__”: they are convenient for hyperparameter tuning.


In [106]:
list(housing_num)

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income']

In [93]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
num_pipeline = Pipeline([ ('imputer', SimpleImputer(strategy="median")), 
                         ('attribs_adder', CombinedAttributesAdder()), 
                         ('std_scaler', StandardScaler()),])
housing_num_tr = num_pipeline.fit_transform(housing_num)

In [105]:
housing_num_tr = pd.DataFrame(housing_num_tr, columns=list(housing_num.columns)+list("abc"))
housing_num_tr.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,a,b,c
count,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0
mean,-4.353107e-15,2.284564e-15,-4.701235000000001e-17,7.587062000000001e-17,1.360615e-16,-3.700743e-17,2.0789790000000002e-17,-2.076289e-16,8.014691000000001e-17,-1.6217650000000002e-17,-4.878742e-17
std,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003
min,-2.384937,-1.44976,-2.199168,-1.223689,-1.294944,-1.269921,-1.317668,-1.772116,-1.650273,-0.2075303,-2.704542
25%,-1.111083,-0.7949406,-0.8472092,-0.551689,-0.5793145,-0.5698825,-0.5803963,-0.6870806,-0.3822514,-0.05741738,-0.5914834
50%,0.5324379,-0.6452675,0.02758786,-0.2353301,-0.2458409,-0.2292746,-0.2370459,-0.1756999,-0.07966522,-0.02406537,-0.1628895
75%,0.7822131,0.9730728,0.6638039,0.242365,0.2604547,0.2684162,0.2793106,0.4561338,0.2358755,0.01596812,0.4044268
max,2.63055,2.951564,1.856709,17.16114,13.81603,30.71047,12.93803,5.839969,52.25419,107.0603,39.75916


#### Different data preparation techniques on different columns using ColumnTransformer
Sometimes we want to perform different data preparation techniques on different columns. For example, you may want to impute missing values with a median value, then scale the values and impute missing categorical values using the most frequent value and one hot encode the categories.

The ColumnTransformer allows to selectively apply data preparation transforms on different columns. You can apply a specific sequence of transforms to just the numerical columns, and a nother sequence of transforms to the categorical columns.

Each transformer is a three-element tuple that defines: 
1. transformer name
2. the transform to apply
3. the column indices to apply it to.

For example: (Name, Object, Columns)
set remainder=’passthrough’  to keep unspecified columns

In [108]:
housing.describe()


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
count,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0
mean,-119.575834,35.639577,28.653101,-5.647936999999999e-19,533.998123,1419.790819,497.06038,3.875589
std,2.00186,2.138058,12.574726,1.00003,410.839621,1115.686241,375.720845,1.90495
min,-124.35,32.54,1.0,-1.223689,2.0,3.0,2.0,0.4999
25%,-121.8,33.94,18.0,-0.551689,296.0,784.0,279.0,2.566775
50%,-118.51,34.26,29.0,-0.2353301,433.0,1164.0,408.0,3.5409
75%,-118.01,37.72,37.0,0.242365,641.0,1719.25,602.0,4.744475
max,-114.31,41.95,52.0,17.16114,6210.0,35682.0,5358.0,15.0001


In [115]:
# apply simple imputer to total_rooms and pupulation columns 
from sklearn.compose import ColumnTransformer
t = [('roomspop', SimpleImputer(strategy='median'), [3, 5])]
transformer = ColumnTransformer(transformers=t, remainder='passthrough' )
tt = transformer.fit_transform(housing)
tt.shape

(16512, 9)

In [116]:
from sklearn.compose import ColumnTransformer
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
full_pipeline = ColumnTransformer([("num", num_pipeline, num_attribs), ("cat", OneHotEncoder(), cat_attribs),])
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared.shape

(16512, 16)

# Training and Evaluating on the Training Set

In [121]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
reg = lin_reg.fit(housing_prepared, housing_labels)

[-5.30776372e+04 -5.39096285e+04  1.37166194e+04 -9.61749828e+03
  2.96598024e+04 -4.44381176e+04  2.95164511e+04  7.36319754e+04
 -6.79249357e+02  8.63962387e+02 -8.88446355e+02 -7.68781994e+16
 -7.68781994e+16 -7.68781994e+16 -7.68781994e+16 -7.68781994e+16]


In [124]:
housing_prepared

array([[-1.15604281,  0.77194962,  0.74333089, ...,  0.        ,
         0.        ,  0.        ],
       [-1.17602483,  0.6596948 , -1.1653172 , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.18684903, -1.34218285,  0.18664186, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 1.58648943, -0.72478134, -1.56295222, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.78221312, -0.85106801,  0.18664186, ...,  0.        ,
         0.        ,  0.        ],
       [-1.43579109,  0.99645926,  1.85670895, ...,  0.        ,
         1.        ,  0.        ]])

In [125]:
# let's try the full preprocessing pipeline on a few training instances
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)

print("Predictions:", lin_reg.predict(some_data_prepared))

Predictions: [211760. 321552. 210864.  61600. 192336.]


compare againest actual values 

In [126]:
print("Labels:", list(some_labels))

Labels: [286600.0, 340600.0, 196900.0, 46300.0, 254500.0]


In [127]:
some_data_prepared

array([[-1.15604281,  0.77194962,  0.74333089, -0.49323393, -0.44543821,
        -0.63621141, -0.42069842, -0.61493744,  0.05964071, -0.08649871,
        -0.01080657,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [-1.17602483,  0.6596948 , -1.1653172 , -0.90896655, -1.0369278 ,
        -0.99833135, -1.02222705,  1.33645936, -0.44706653, -0.03353391,
        -0.00223757,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 1.18684903, -1.34218285,  0.18664186, -0.31365989, -0.15334458,
        -0.43363936, -0.0933178 , -0.5320456 ,  0.11932063, -0.09240499,
        -0.02222581,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [-0.01706767,  0.31357576, -0.29052016, -0.36276217, -0.39675594,
         0.03604096, -0.38343559, -1.04556555,  0.09250197,  0.08973561,
        -0.01530315,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.49247384, -0.65929936, -0

In [128]:
from sklearn.metrics import mean_squared_error

housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

69035.8591177743

In [129]:
from sklearn.metrics import mean_absolute_error

lin_mae = mean_absolute_error(housing_labels, housing_predictions)
lin_mae

49894.03167393411

In [130]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)

DecisionTreeRegressor(random_state=42)

In [131]:
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

# Better Evaluation Using Cross-Validation

### Tree regression 

In [133]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)
tree_rmse_scores

array([69347.70144216, 69853.09606991, 70085.01188048, 69488.44267631,
       72049.19849013, 74715.70639183, 71650.88681675, 69074.07635218,
       74325.648709  , 71999.08930151])

In [134]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

display_scores(tree_rmse_scores)

Scores: [69347.70144216 69853.09606991 70085.01188048 69488.44267631
 72049.19849013 74715.70639183 71650.88681675 69074.07635218
 74325.648709   71999.08930151]
Mean: 71258.88581302542
Standard deviation: 1938.9562935871093


### Linear regression 

In [135]:
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                             scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

Scores: [67442.07735663 67471.71986433 68389.37607147 74858.10236572
 68267.60509178 71591.67036063 65397.8798693  68667.99360518
 73018.62932732 68057.75776951]
Mean: 69316.28116818739
Standard deviation: 2753.697652337107


### Random forest regression 

In [136]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
forest_reg.fit(housing_prepared, housing_labels)

RandomForestRegressor(random_state=42)

In [137]:
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

18889.426957929023

In [138]:
from sklearn.model_selection import cross_val_score

forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

Scores: [50342.32597027 48158.37919093 50431.76893217 53059.2744423
 50824.93619062 54470.64683468 49401.68899025 48587.02754089
 54059.32301416 51396.65758741]
Mean: 51073.2028693681
Standard deviation: 2073.3335629742364


In [139]:
from sklearn.svm import SVR

svm_reg = SVR(kernel="linear")
svm_reg.fit(housing_prepared, housing_labels)
housing_predictions = svm_reg.predict(housing_prepared)
svm_mse = mean_squared_error(housing_labels, housing_predictions)
svm_rmse = np.sqrt(svm_mse)
svm_rmse

111768.39440197597

### Fine-Tune Your Model
* manual tuning 
* Use GridSearchCV
* 3 * 4 + 2 * 3 = 18 different combination of parameters 
* a total of 18 * 5 (CV) = 90 training 

In [140]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    # then try 6 (2×3) combinations with bootstrap set as False
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds, that's a total of (12+6)*5=90 rounds of training 
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [148]:
grid_search.best_estimator_

RandomForestRegressor(max_features=8, n_estimators=30, random_state=42)

In [149]:
grid_search.best_params_

{'max_features': 8, 'n_estimators': 30}

In [146]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

67578.6539718191 {'max_features': 2, 'n_estimators': 3}
57483.586136217105 {'max_features': 2, 'n_estimators': 10}
54531.493402760345 {'max_features': 2, 'n_estimators': 30}
60977.17003204092 {'max_features': 4, 'n_estimators': 3}
53417.32754699844 {'max_features': 4, 'n_estimators': 10}
50879.37230398769 {'max_features': 4, 'n_estimators': 30}
58292.68942265689 {'max_features': 6, 'n_estimators': 3}
52376.156974235695 {'max_features': 6, 'n_estimators': 10}
50265.881429303976 {'max_features': 6, 'n_estimators': 30}
59158.62976047877 {'max_features': 8, 'n_estimators': 3}
52658.00187724696 {'max_features': 8, 'n_estimators': 10}
50169.10645977438 {'max_features': 8, 'n_estimators': 30}
65142.84908114044 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
56295.55453646887 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
60058.60136087842 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
53781.14851277873 {'bootstrap': False, 'max_features': 3, 'n_estimators':

# Randomized Search

In [150]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
        'n_estimators': randint(low=1, high=200),
        'max_features': randint(low=1, high=8),
    }

forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
rnd_search.fit(housing_prepared, housing_labels)

RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
                   param_distributions={'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002A205452A60>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002A205477EE0>},
                   random_state=42, scoring='neg_mean_squared_error')

In [151]:
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

49252.818341365186 {'max_features': 7, 'n_estimators': 180}
51715.6236357881 {'max_features': 5, 'n_estimators': 15}
51545.03346553677 {'max_features': 3, 'n_estimators': 72}
50971.19618041091 {'max_features': 5, 'n_estimators': 21}
49404.21625325934 {'max_features': 7, 'n_estimators': 122}
51513.16815578037 {'max_features': 3, 'n_estimators': 75}
51408.48915527614 {'max_features': 3, 'n_estimators': 88}
49848.692582994976 {'max_features': 5, 'n_estimators': 100}
51276.40554204698 {'max_features': 3, 'n_estimators': 150}
64913.737262479306 {'max_features': 5, 'n_estimators': 2}


In [152]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([0.07755933, 0.07068123, 0.04592016, 0.01630995, 0.01579577,
       0.0166108 , 0.01548961, 0.41719945, 0.01654275, 0.11381314,
       0.01670432, 0.01307243, 0.15953212, 0.00004678, 0.00165716,
       0.003065  ])

# Ensemble Methods

# Evaluate Your System on the Test Set

# Analyze the Best Models and Their Errorr

In [153]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([0.07755933, 0.07068123, 0.04592016, 0.01630995, 0.01579577,
       0.0166108 , 0.01548961, 0.41719945, 0.01654275, 0.11381314,
       0.01670432, 0.01307243, 0.15953212, 0.00004678, 0.00165716,
       0.003065  ])

In [154]:
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
#cat_encoder = cat_pipeline.named_steps["cat_encoder"] # old solution
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)

[(0.4171994545836687, 'median_income'),
 (0.1595321228648014, 'INLAND'),
 (0.11381314300339597, 'pop_per_hhold'),
 (0.07755932810965623, 'longitude'),
 (0.07068122855538019, 'latitude'),
 (0.04592016433402573, 'housing_median_age'),
 (0.016704317132672387, 'bedrooms_per_room'),
 (0.01661079654930104, 'population'),
 (0.01654275380085179, 'rooms_per_hhold'),
 (0.016309948421199275, 'total_rooms'),
 (0.015795765212541498, 'total_bedrooms'),
 (0.015489607854418391, 'households'),
 (0.01307243345462553, '<1H OCEAN'),
 (0.0030649953554062427, 'NEAR OCEAN'),
 (0.0016571599815796592, 'NEAR BAY'),
 (4.678078647600746e-05, 'ISLAND')]