In [1]:
import os 
import pandas as pd
import numpy as np
def load_housing_data(housing_path = "housing"):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

In [2]:
housing = load_housing_data()

In [3]:
# housing.describe()

In [4]:
# %matplotlib inline
# import matplotlib.pyplot as plt
# housing.hist(bins=50, figsize=(20,15))
# plt.show()

In [5]:
# import numpy as np
# def split_train_test(data, test_ratio):
#     shuffled_indices = np.random.permutation(len(data))
#     test_set_size = int(len(data)*test_ratio)
#     test_indices = shuffled_indices[:test_set_size]
#     train_indices = shuffled_indices[test_set_size:]
#     return data.iloc[train_indices], data.iloc[test_indices]
    

In [6]:
# разбиваем наброр данных на тренировочный и испытательтный наборы
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split (housing, test_size = 0.2, random_state=42)

In [7]:
#разбил наборы на категории в зависимости от медианного дохода
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace = True)

In [8]:
# разбил на тренировочный и испытательный набор стартифицированно
from sklearn.model_selection import StratifiedShuffleSplit 
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

In [9]:
#удаляю лишний столбец категории в каждом наборе
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)

In [10]:
housing = strat_train_set.copy()

In [11]:
# housing.plot(kind="scatter", x="longitude", y="latitude")

In [12]:
# housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)

In [13]:
# housing.plot(kind="scatter", x="longitude",y="latitude", 
#              alpha=0.3, s=housing["population"]/100, 
#              label="population", 
#              figsize=[10,7], c="median_house_value",
#              cmap=plt.get_cmap("jet"), 
#              colorbar=True, sharex=False)
# plt.legend()

In [14]:
from pandas.plotting import scatter_matrix
attribures = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
scatter_matrix(housing[attribures], figsize=[12,10])

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000000001298C630>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000149E47B8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000014A0DB38>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000014A3D0F0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000000014A64668>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000014A8DBE0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000014ABE198>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000014AE2748>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000000014AE2780>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000014B3D240>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000014B627B8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000014B8DD30>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x00

In [15]:
# housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.2)

In [16]:
housing["rooms_per_household"]=housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"]= housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"]=housing["population"]/housing["households"]

In [17]:
corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending = False)

median_house_value          1.000000
median_income               0.687160
rooms_per_household         0.146285
total_rooms                 0.135097
housing_median_age          0.114110
households                  0.064506
total_bedrooms              0.047689
population_per_household   -0.021985
population                 -0.026920
longitude                  -0.047432
latitude                   -0.142724
bedrooms_per_room          -0.259984
Name: median_house_value, dtype: float64

In [18]:
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

In [19]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer (strategy="median")

In [20]:
housing_num = housing.drop("ocean_proximity", axis = 1)

In [21]:
# вычисляем медианные значения аатрибутов, чтобы впоследствии заполнить ими пустые значения таких атрибутов
imputer.fit(housing_num)
print (imputer.statistics_)
housing_num.median().values

[-118.51     34.26     29.     2119.5     433.     1164.      408.
    3.5409]


array([-118.51  ,   34.26  ,   29.    , 2119.5   ,  433.    , 1164.    ,
        408.    ,    3.5409])

In [22]:
X = imputer.transform(housing_num)

In [23]:
# Для замены текстового атрибута на чисолвые
housing_cat = housing["ocean_proximity"]
housing_cat_encoded, housing_categories = housing_cat.factorize()

In [24]:
from sklearn.preprocessing import OneHotEncoder 

cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
housing_cat_1hot

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


<16512x5 sparse matrix of type '<class 'numpy.float64'>'
	with 16512 stored elements in Compressed Sparse Row format>

In [25]:
housing_cat_1hot.toarray()

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       ...,
       [0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])

In [26]:
# cat_encoder.categories_

In [27]:
from sklearn.base import BaseEstimator, TransformerMixin
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

In [28]:
class CombinedAttributesAdder (BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit (self, X, y = None):
        return self
    def transform (self, X, y = None):
        rooms_per_household = X[:, rooms_ix] / X[:,household_ix]
        population_per_household = X[:, population_ix] / X[:,household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:,household_ix]
            return np.c_[X,rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X,rooms_per_household, population_per_household]

In [29]:
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room = False)

In [30]:
housing_extra_attribs = attr_adder.transform(housing.values)

In [31]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),# для заполнения пустых атрибутов медианным значением
        ('attribs_adder', CombinedAttributesAdder()), # для добавления дополнительных атрибутов, которые могут помочь в обучении модели
        ('std_scaler', StandardScaler()),# масштабирует атрибуты
    ])

housing_num_tr = num_pipeline.fit_transform(housing_num)

In [32]:
class DataFrameSelector (BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit (self, X, y=None):
        return self
    def transform (self, X):
        return X[self.attribute_names].values

In [33]:
# создаем два  конвейера с трансформаторами, для автоматизации подгтотовки данных для обучения
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs)),
        ('cat_encoder', OneHotEncoder(sparse=False)),
    ])

In [34]:
# объединяем два конвейера в один
from sklearn.pipeline import FeatureUnion

full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

In [35]:
housing_prepared = full_pipeline.fit_transform(housing)
# housing_prepared

# Обучение моделью линейной регрессии

In [30]:
from sklearn.linear_model import LinearRegression

In [31]:
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [32]:
some_data = housing.iloc[:5]
some_labels = housing_labels[:5]
some_data_prepared = full_pipeline.transform(some_data)

In [33]:
print('Прогнозы: ', lin_reg.predict(some_data_prepared))

Прогнозы:  [211881.21811279 321219.24211009 210877.63065012  62198.25451316
 194847.8414579 ]


In [34]:
print('Метки: ', list(some_labels))

Метки:  [286600.0, 340600.0, 196900.0, 46300.0, 254500.0]


In [35]:
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

68911.49637588045

# Обучение Дерева принятий решений

In [36]:
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [37]:
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

In [38]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                        scoring = "neg_mean_squared_error", cv = 10)
tree_rmse_scores = np.sqrt(-scores)

In [39]:
def display_scores (scores):
    print ("Суммы оценок: ", scores)
    print("Среднее: ", scores.mean())
    print("Стандартное отклонение: ", scores.std())

In [40]:
display_scores(tree_rmse_scores)

Суммы оценок:  [70347.03410968 67326.7054967  70491.68504414 73316.52211103
 69689.10658621 75069.37031403 73934.2617154  69515.97125759
 77632.20708746 68779.14285934]
Среднее:  71610.20065815756
Стандартное отклонение:  3063.16003536938


In [41]:
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                            scoring = "neg_mean_squared_error", cv = 10)
lin_rmse_scores = np.sqrt(-lin_scores)

In [42]:
display_scores(lin_rmse_scores)

Суммы оценок:  [67474.11780426 67233.22466524 69301.86479972 74716.01783105
 68426.80214612 71609.98356263 65200.14338307 68687.78826919
 72262.43484426 68111.81213342]
Среднее:  69302.41894389638
Стандартное отклонение:  2653.460699447049


# Обучение RandomForrestRegressor "случайные леса"

In [43]:
from sklearn.ensemble import RandomForestRegressor

In [44]:
forest_reg =  RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [45]:
# осуществляется оценка на обучающем наборе с метками
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

22291.365604221395

In [46]:
forest_scores = cross_val_score (forest_reg, housing_prepared, housing_labels,
                        scoring = "neg_mean_squared_error", cv = 10)
forest_scores_rmse = np.sqrt(-forest_scores)
forest_scores_rmse

array([52415.60504968, 49436.8069034 , 52473.81680546, 55820.30166245,
       52251.17602412, 55968.56017022, 51429.34158739, 51353.33201005,
       56429.39333604, 52404.84140493])

In [47]:
# осуществляется оценка на проверочном наборе с оценками
display_scores(forest_scores_rmse)

Суммы оценок:  [52415.60504968 49436.8069034  52473.81680546 55820.30166245
 52251.17602412 55968.56017022 51429.34158739 51353.33201005
 56429.39333604 52404.84140493]
Среднее:  52998.317495373994
Стандартное отклонение:  2189.4137793189566


# Решетчатый поиск значений гиперпараметров

In [48]:
from sklearn.model_selection import GridSearchCV
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 
     'max_features': [2, 3, 4]}    
]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring = 'neg_mean_squared_error')
grid_search.fit (housing_prepared, housing_labels)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}, {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [49]:
cvres= grid_search.cv_results_

In [50]:
for mean_score, params in zip(cvres["mean_test_score"],
                             cvres["params"]):
    print(np.sqrt(-mean_score), params)

65826.86039538817 {'max_features': 2, 'n_estimators': 3}
55942.767653187635 {'max_features': 2, 'n_estimators': 10}
52872.4263750197 {'max_features': 2, 'n_estimators': 30}
60129.68027742184 {'max_features': 4, 'n_estimators': 3}
53161.46525893699 {'max_features': 4, 'n_estimators': 10}
50629.944962064175 {'max_features': 4, 'n_estimators': 30}
58770.49963396639 {'max_features': 6, 'n_estimators': 3}
52150.00714231254 {'max_features': 6, 'n_estimators': 10}
49511.92137323985 {'max_features': 6, 'n_estimators': 30}
59089.02527234803 {'max_features': 8, 'n_estimators': 3}
51761.17771487678 {'max_features': 8, 'n_estimators': 10}
50104.622469176145 {'max_features': 8, 'n_estimators': 30}
61740.10884120749 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
54677.55907302626 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
59540.09561085927 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
52067.26959801878 {'bootstrap': False, 'max_features': 3, 'n_estimators': 

In [51]:
final_model = grid_search.best_estimator_
X_test = strat_test_set.drop("median_house_value", axis = 1)
y_test = strat_test_set["median_house_value"].copy()
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print('Среднее квадратическое отклонение:  ', final_mse, 'Стандартное отклонение:  ', final_rmse)

Среднее квадратическое отклонение:   2291045032.3632436 Стандартное отклонение:   47864.862188908926


# Метод опорных векторов

In [36]:
from sklearn.svm import SVC

In [37]:
vector_reg = SVC(kernel="linear")

In [38]:
vector_reg.fit(housing_prepared, housing_labels)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [None]:
housing_predictions = vector_reg.predict(housing_prepared)

In [None]:
vector_reg_mse = mean_squared_error(housing_labels, housing_predictions)

In [None]:
vector_reg_rmse = np.sqrt(vector_reg_mse)

In [None]:
print ('Среднеквадратическое отклонение: ', vector_reg_mse,'Стандартное отклонение: ', vector_reg_rmse)

In [54]:
X_test = strat_test_set.drop("median_house_value", axis = 1) # обучающая выборка без метки "средняя стоимость дома"
y_test = strat_test_set["median_house_value"].copy() # метки в обучающем наборе

In [55]:
vector_reg = SVC(kernel="linear")

In [64]:
X_test_prepared = full_pipeline.transform(X_test) # с помощью трансформаторов подготавливаю данные для обучения
vector_reg.fit(X_test_prepared, y_test)# методом .fit осуществляется обучение
vector_reg_predictions = vector_reg.predict(X_test_prepared)
vector_reg_mse = mean_squared_error(y_test, vector_reg_predictions)
vector_reg_rmse = np.sqrt(vector_reg_mse)