In [2]:
import numpy as np
import pandas as pd
import matplotlib as plt

In [3]:
import os, urllib
import tarfile
download_root = 'https://raw.githubusercontent.com/ageron/handson-ml/master/'
housing_path = os.path.join('zestwy danych', 'mieszkania')
housing_url = download_root + 'datasets/housing/housing.tgz'

In [4]:
def fetch_housing_data(url = housing_url, path = housing_path):
    if not os.path.isdir(path):
        os.makedirs(path)
    tgzpath = os.path.join(path, 'housing.tgz')
    urllib.request.urlretrieve(url, tgzpath)
    housing_tgz = tarfile.open(tgzpath)
    housing_tgz.extractall(path = path)
    housing_tgz.close()

fetch_housing_data()

In [5]:
#Loading data
def load_housing_data(path = housing_path):
    csv_path = os.path.join(path, 'housing.csv')
    return pd.read_csv(csv_path)

houses = load_housing_data()
print(len(houses))

20640


In [6]:
#Split median income into 5 categories.
houses['income_cat'] = np.ceil(houses['median_income'] / 1.5)
houses['income_cat'].where(houses['income_cat'] < 5, 5.0, inplace = True)

In [7]:
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

#Spliting test and train data based on added column income_cat
for train_index, test_index in split.split(houses, houses['income_cat']):
    strat_train_set = houses.loc[train_index]
    strat_test_set = houses.loc[test_index]
    
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis = 1, inplace = True)

In [8]:
housing = strat_train_set.drop('median_house_value', axis = 1)
housing_labels = strat_train_set['median_house_value'].copy()

In [9]:
from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer()
houses_cat = housing['ocean_proximity']
ocean_encoded = encoder.fit_transform(houses_cat)

In [10]:
#Creating cumstom transforming functin
from sklearn.base import BaseEstimator, TransformerMixin
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room    
        
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        rooms_per_family = X[:, 3] / X[:, 6]
        population_per_family = X[:, 5] / X[:, 6]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, 4] / X[:, 3]
            return np.c_[X, rooms_per_family, population_per_family, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_family, population_per_family]
        
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y = None):
        return self
    def transform(self, X, y= None):
        return X[self.attribute_names].values
    
class CategoricalEncoder(BaseEstimator, TransformerMixin):        
    def fit(self, X, y =None):
        return self
    
    def transform(self, X, y = None):
        encoder = LabelBinarizer()
        encoded = encoder.fit_transform(X)
        if len(encoded[0]) != 5:
            zeros = [[0]]*len(encoded)
            for i in range(5 - len(encoded[0])):
                encoded = np.append(encoded, zeros, axis = 1)
        return encoded

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.pipeline import FeatureUnion

houses_num = housing.drop('ocean_proximity', axis = 1)
num_attibs = list(houses_num)
cat_attibs = ['ocean_proximity']

num_Pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attibs)),
    ('imputer', Imputer(strategy = 'median')),
    ('attrribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler())
])

cat_Pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attibs)),
    ('cat_pipeline', CategoricalEncoder()),
])

full_pipeline = FeatureUnion(transformer_list = [
    ('num_pipeline', num_Pipeline),
    ('cat_pipeline', cat_Pipeline),
])

In [12]:
#Learning Model using Linear Regression

from sklearn.linear_model import LinearRegression
housing_prepered = full_pipeline.fit_transform(housing)
lin_reg = LinearRegression()
lin_reg.fit(housing_prepered, housing_labels)


some_data = housing[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepered = full_pipeline.fit_transform(some_data)

print(f'Predicted data: {lin_reg.predict(some_data_prepered)}')
print(f'Actual data: {list(some_labels)}')

#Checking root mean squered error 

from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepered)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
print(f'Root squered mean error: {lin_rmse}')

Predicted data: [ 215615.06923226  346732.1858831   353727.53040633   62616.76604913
  206972.49365904]
Actual data: [286600.0, 340600.0, 196900.0, 46300.0, 254500.0]
Root squered mean error: 68628.19819848922


In [13]:
#Learning Model using DecissionTreeRegression

from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepered, housing_labels)

housing_predictions = tree_reg.predict(housing_prepered)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
print(f'Root squered mean error: {tree_rmse} (Overfitting!!)')

Root squered mean error: 0.0 (Overfitting!!)


In [16]:
# Cross Validation tests for DecissionTreeRegression

from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, housing_prepered, housing_labels, scoring = 'neg_mean_squared_error', cv = 10)
tree_rmse_scores = np.sqrt(-scores)

def display_scores(scores):
    print('Wyniki: ', scores)
    print('Średnia: ', scores.mean())
    print('Odchylenie standardowe: ', scores.std())
    
display_scores(tree_rmse_scores)

Wyniki:  [ 70981.31040062  68208.31470103  71737.82020923  69075.25666533
  71580.32965997  75340.79652727  69516.25682158  71480.72185148
  77286.75549282  70990.98924519]
Średnia:  71619.8551575
Odchylenie standardowe:  2633.1716172


In [17]:
# Cross Validation tests for linear regression

from sklearn.model_selection import cross_val_score
scores = cross_val_score(lin_reg, housing_prepered, housing_labels, scoring = 'neg_mean_squared_error', cv = 10)
tree_rmse_scores = np.sqrt(-scores)

def display_scores(scores):
    print('Wyniki: ', scores)
    print('Średnia: ', scores.mean())
    print('Odchylenie standardowe: ', scores.std())
    
display_scores(tree_rmse_scores)


Wyniki:  [ 66782.73843989  66960.118071    70347.95244419  74739.57052552
  68031.13388938  71193.84183426  64969.63056405  68281.61137997
  71552.91566558  67665.10082067]
Średnia:  69052.4613635
Odchylenie standardowe:  2731.6740018
