# Cap2

In [1]:
import os
import tarfile
import urllib
import urllib.request


In [2]:
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"


In [3]:
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the ``housing.tgz`` archive to download.
        housing_path: Directory that receives both the downloaded archive
            and its extracted contents; created if it does not exist.
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    # `urllib.request` must be imported explicitly; a bare `import urllib`
    # does not guarantee the submodule is loaded.
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction fails.
    # NOTE: extractall() trusts the member paths stored in the archive, so
    # only point this at a source you trust.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)
    

In [4]:
# Download and unpack the dataset using the default URL and target directory.
fetch_housing_data()

In [5]:
import pandas as pd


In [15]:
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))


In [20]:
# Load the extracted CSV from the default housing directory.
dataset = load_housing_data()


In [21]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
dataset.describe()


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [22]:
# Preview the first five rows.
dataset.head(5)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [23]:
# Column dtypes and non-null counts; useful for spotting missing values.
dataset.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [25]:
# Count how many rows fall into each ocean_proximity category.
dataset["ocean_proximity"].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

In [31]:
# %matplotlib inline # only in a Jupyter notebook
%matplotlib
import matplotlib.pyplot as plot

Using matplotlib backend: Qt5Agg


In [32]:
# One histogram per column, 50 bins each, drawn on a single large figure.
dataset.hist(bins=50, figsize=(20,15))
plot.show()


In [33]:
import numpy as np


In [40]:
# def split_train_test(data, test_ratio):
#     shuffled_indices = np.random.permutation(len(data))
#     test_set_size = int(len(data) * test_ratio)
#     test_indices = shuffled_indices[:test_set_size]
#     train_indices = shuffled_indices[test_set_size:]
#     return data.iloc[train_indices], data.iloc[test_indices]

# train_set, test_set = split_train_test(dataset, 0.2)
# print(f"Train set > {len(train_set)}\nTest set > {len(test_set)}")


In [44]:
from sklearn.model_selection import train_test_split


In [49]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
train_set, test_set = train_test_split(
    dataset,
    test_size=0.2,
    random_state=11,
)

# Report the shape of each partition.
print(f"Train >\t{train_set.shape}\nTest >\t{test_set.shape}")


Train >	(16512, 10)
Test >	(4128, 10)


In [53]:
# Bucket median_income into five labelled categories (1-5) using fixed bin edges.
# NOTE(review): train_set/test_set were created before this column is added,
# so they presumably do not carry income_cat — confirm whether a split
# stratified on this column was intended.
dataset["income_cat"] = pd.cut(
    dataset["median_income"],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
    labels=[1, 2, 3, 4, 5])

# Plot how many rows fall into each income category.
dataset["income_cat"].hist()

<matplotlib.axes._subplots.AxesSubplot at 0x2833d865fc8>

In [62]:
# Continue exploring on a copy of the training set only. This rebinds `dataset`,
# so the full frame loaded earlier is no longer reachable under that name.
dataset = train_set.copy()
# alpha=0.1 draws points mostly transparent, so overlapping points appear darker.
dataset.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)
# dataset.plot(kind="scatter", x="longitude", y="latitude")

<matplotlib.axes._subplots.AxesSubplot at 0x2833f0ad2c8>

In [64]:
# Geographic scatter: marker size follows population, colour follows median_house_value.
# pyplot is imported as `plot` in this notebook; the previous `plt` references
# raised NameError on a fresh kernel.
dataset.plot(
    kind="scatter", x="longitude", y="latitude",
    alpha=0.4, s=dataset["population"]/100,
    label="population", figsize=(10,7),
    c="median_house_value", cmap=plot.get_cmap("jet"),
    colorbar=True,)

plot.legend()

<matplotlib.legend.Legend at 0x2833d34e988>

In [67]:
# Correlate only the numeric columns: recent pandas versions raise when
# DataFrame.corr() meets a string column such as ocean_proximity, while
# older versions silently dropped it. select_dtypes works on both.
corr_matrix = dataset.select_dtypes(include="number").corr()
# Rank every attribute by its linear correlation with the target.
corr_matrix["median_house_value"].sort_values(ascending=False)

median_house_value    1.000000
median_income         0.689227
total_rooms           0.135040
housing_median_age    0.098904
households            0.066778
total_bedrooms        0.050834
population           -0.022360
longitude            -0.045353
latitude             -0.143261
Name: median_house_value, dtype: float64

In [69]:
from pandas.plotting import scatter_matrix


In [71]:
# Pairwise scatter plots for the target and a few selected attributes.
attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
scatter_matrix(dataset[attributes], figsize=(12, 8))

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000002833D8DDDC8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002833F5A0C08>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002833F69D688>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002833F6DBA48>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000002833F71AB08>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002833F75D388>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002833F79D348>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002833F7DDDC8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000002833F7E4848>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002833F827108>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002833F8C9EC8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000002833FA33F48>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x00

In [73]:
# Plot median_income against median_house_value; low alpha reveals dense regions.
dataset.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)

<matplotlib.axes._subplots.AxesSubplot at 0x2834f0debc8>

In [75]:
# Derive ratio features from the raw totals: rooms and population per
# household, and the share of rooms that are bedrooms.
dataset["rooms_per_household"] = dataset["total_rooms"]/dataset["households"]
dataset["bedrooms_per_room"] = dataset["total_bedrooms"]/dataset["total_rooms"]
dataset["population_per_household"] = dataset["population"]/dataset["households"]


In [78]:
# Recompute the correlations now that the ratio columns exist. Restrict to
# numeric columns because recent pandas versions raise when DataFrame.corr()
# meets a string column such as ocean_proximity.
corr_matrix = dataset.select_dtypes(include="number").corr()

corr_matrix["median_house_value"].sort_values(ascending=False)

median_house_value          1.000000
median_income               0.689227
rooms_per_household         0.146773
total_rooms                 0.135040
housing_median_age          0.098904
households                  0.066778
total_bedrooms              0.050834
population                 -0.022360
population_per_household   -0.022813
longitude                  -0.045353
latitude                   -0.143261
bedrooms_per_room          -0.255391
Name: median_house_value, dtype: float64

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin


In [None]:
# Column positions of the source attributes in the array passed to transform().
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio columns computed from existing columns of a 2-D array.

    Args:
        add_bedrooms_per_room: When true, ``transform`` also appends a
            bedrooms-per-room column.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to learn."""
        return self

    def transform(self, X):
        """Return ``X`` with the derived ratio columns appended on the right.

        Args:
            X: Array indexed as ``X[:, column]``; the columns at
                ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and
                ``households_ix`` are read.

        Returns:
            ``X`` followed by rooms-per-household and
            population-per-household, plus bedrooms-per-room when
            ``add_bedrooms_per_room`` is set.
        """
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        return np.c_[X, rooms_per_household, population_per_household]
            
# `housing` was never defined in this notebook, so the call below raised
# NameError. Build it from the training split without the target column;
# the column positions 3-6 used by the transformer are unaffected.
housing = train_set.drop("median_house_value", axis=1)
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)