In [1]:
import pandas as pd
import numpy as np
import sklearn

###California Inc. The company aimed to simplify the method of valuing homes for sale in the area using machine learning. dataset consists of more than 20,000 settlements
###Калифорния Инк. Компания стремилась упростить метод оценки домов для продажи в этом районе с помощью машинного обучения. набор данных состоит из более чем 20 000 населенных пунктов

In [2]:
URL = "https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true"
df = pd.read_csv(URL)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


###we can divide the dataset into two parts in the form of 70 to 30.
###разделить набор данных на две части в виде 70 на 30.

In [3]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(df, test_size=0.3, random_state=42)

X_train = train_set.drop("median_house_value", axis=1)
y = train_set["median_house_value"].copy()

X_num = X_train.drop("ocean_proximity", axis=1)

###We write the functions required for the model and pass the allocated dataset through the container.
###Мы пишем необходимые для модели функции и пропускаем выделенный набор данных через контейнер.

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self 
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room: 
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

num_pipeline = Pipeline([
          ('imputer', SimpleImputer(strategy='median')),
          ('attribs_adder', CombinedAttributesAdder(add_bedrooms_per_room = True)),
          ('std_scaler', StandardScaler())             
])

In [6]:
from sklearn.compose import ColumnTransformer

num_attribs = list(X_num)
cat_attribs = ['ocean_proximity']

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs)
])

###sorted and cleared of unnecessary values, the dataset is ready for machine learning
###отсортирован и очищен от ненужных значений, набор данных готов к машинному обучению

In [7]:
X_prepared = full_pipeline.fit_transform(X_train)
X_prepared[0:5,:]

array([[ 0.78093406, -0.80568191,  0.50935748, -0.11324158, -0.33786962,
        -0.18411678, -0.24350772,  0.13350629,  0.18106017, -0.01082519,
        -0.80919934,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 1.24526986, -1.33947268, -0.67987313, -0.21356615, -0.01388439,
        -0.37619075, -0.01326659, -0.53221805, -0.42262953, -0.08931585,
         0.5409245 ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [-0.27755183, -0.49664515, -0.36274497, -0.48263943, -0.61420997,
        -0.61124018, -0.56532203,  0.1709897 ,  0.07312833, -0.04480037,
        -0.63257554,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [-0.70693761,  1.69002403, -1.15556537, -0.84833868, -0.92628398,
        -0.98749467, -0.94992937, -0.40291602,  0.17584811, -0.07522954,
        -0.45640997,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ],
       [-1.43090202,  0.99235014,  1

###We create the model and give it a dataset.
###Мы создаем модель и даем ей набор данных.

In [8]:
from sklearn.linear_model import LinearRegression

LR_model = LinearRegression()

In [9]:
LR_model.fit(X_prepared, y)

LinearRegression()

###We first test the finished material with random values
###Сначала мы тестируем готовый материал со случайными значениями

In [10]:
test_data = X_train.sample(10)
test_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
14506,-117.22,32.87,5.0,3511.0,1008.0,1599.0,918.0,3.8542,NEAR OCEAN
18267,-122.07,37.36,21.0,3244.0,426.0,1158.0,415.0,7.5,<1H OCEAN
7106,-118.01,33.9,26.0,2968.0,674.0,1655.0,628.0,4.6094,<1H OCEAN
8975,-118.41,34.01,43.0,2000.0,529.0,1290.0,514.0,4.7031,<1H OCEAN
4188,-118.22,34.12,28.0,3306.0,1025.0,2670.0,942.0,3.0919,<1H OCEAN
18773,-122.29,40.47,20.0,2858.0,612.0,1422.0,589.0,1.9657,INLAND
9420,-122.62,37.85,30.0,833.0,164.0,358.0,143.0,6.8198,NEAR OCEAN
1214,-120.67,37.97,9.0,7450.0,1475.0,2233.0,930.0,2.6528,INLAND
14792,-117.11,32.57,32.0,2723.0,586.0,1702.0,562.0,3.3371,NEAR OCEAN
5205,-118.29,33.94,47.0,1782.0,338.0,1003.0,329.0,2.5398,<1H OCEAN


In [11]:
test_label = y.loc[test_data.index]
test_label

14506    176600.0
18267    500001.0
7106     201000.0
8975     302500.0
4188     185400.0
18773     63000.0
9420     493800.0
1214     133000.0
14792    140500.0
5205     105700.0
Name: median_house_value, dtype: float64

In [12]:
test_data_prepared = full_pipeline.transform(test_data)
test_data_prepared[0]

array([ 1.1803627 , -1.30201368, -1.86910373,  0.40040173,  1.11529944,
        0.15003931,  1.09084428, -0.01191256, -0.65763854, -0.1113217 ,
        1.26890736,  0.        ,  0.        ,  0.        ,  0.        ,
        1.        ])

In [13]:
predicted_data = LR_model.predict(test_data_prepared)
predicted_data

array([261515.72688296, 370319.17181485, 256241.00887358, 293797.19152798,
       213439.68302925,  53044.95964969, 364039.89741226, 123078.51889207,
       209321.94223901, 180124.38961324])

###Our results from the first test
###Наши результаты первого теста

In [14]:
pd.DataFrame({'prediction прогноз |':predicted_data, 'real price реальная цена': test_label})

Unnamed: 0,prediction прогноз |,real price реальная цена
14506,261515.726883,176600.0
18267,370319.171815,500001.0
7106,256241.008874,201000.0
8975,293797.191528,302500.0
4188,213439.683029,185400.0
18773,53044.95965,63000.0
9420,364039.897412,493800.0
1214,123078.518892,133000.0
14792,209321.942239,140500.0
5205,180124.389613,105700.0


###Let's take the second test
###Пройдем второй тест

In [15]:
test_set.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,47700.0,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,45800.0,INLAND
15663,-122.44,37.8,52.0,3830.0,,1310.0,963.0,3.4801,500001.0,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,218600.0,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.725,278000.0,NEAR OCEAN


In [16]:
X_test = test_set.drop('median_house_value', axis=1)
X_test.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,INLAND
15663,-122.44,37.8,52.0,3830.0,,1310.0,963.0,3.4801,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.725,NEAR OCEAN


In [17]:
y_test = test_set['median_house_value'].copy()
y_test.head()

20046     47700.0
3024      45800.0
15663    500001.0
20484    218600.0
9814     278000.0
Name: median_house_value, dtype: float64

#LinearRegression

In [18]:
X_test_prepared = full_pipeline.transform(X_test)
y_predicted = LR_model.predict(X_test_prepared)

from sklearn.metrics import mean_squared_error
lin_mse = mean_squared_error(y_test, y_predicted)
lin_rmse = np.sqrt(lin_mse)
print(f"The mean square error of the LinearRegression model: {lin_rmse}\nСреднеквадратическая ошибка модели LinearRegression: {lin_rmse}")
pd.DataFrame({'prediction прогноз |':y_predicted, 'real price реальная цена': y_test})

The mean square error of the LinearRegression model: 70459.9779093803
Среднеквадратическая ошибка модели LinearRegression: 70459.9779093803


Unnamed: 0,prediction прогноз |,real price реальная цена
20046,62428.763431,47700.0
3024,120663.677238,45800.0
15663,264859.040996,500001.0
20484,264124.945642,218600.0
9814,258419.497771,278000.0
...,...,...
17505,224483.400523,237500.0
13512,68524.677879,67300.0
10842,270462.788753,218400.0
16559,112748.416545,119400.0


#DecisionTreeRegressor

In [19]:
from sklearn.tree import DecisionTreeRegressor

Tree_model = DecisionTreeRegressor()
Tree_model.fit(X_prepared, y)
y_predicted = Tree_model.predict(X_test_prepared)

lin_mse = mean_squared_error(y_test, y_predicted)
lin_rmse = np.sqrt(lin_mse)
print(f"The mean square error of the DecisionTreeRegressor model: {lin_rmse}\nСреднеквадратическая ошибка модели DecisionTreeRegressor: {lin_rmse}")
pd.DataFrame({'prediction прогноз |':y_predicted, 'real price реальная цена': y_test})

The mean square error of the DecisionTreeRegressor model: 70831.22897724107
Среднеквадратическая ошибка модели DecisionTreeRegressor: 70831.22897724107


Unnamed: 0,prediction прогноз |,real price реальная цена
20046,49100.0,47700.0
3024,132800.0,45800.0
15663,500001.0,500001.0
20484,269100.0,218600.0
9814,346000.0,278000.0
...,...,...
17505,279200.0,237500.0
13512,67800.0,67300.0
10842,188100.0,218400.0
16559,129300.0,119400.0


#RandomForestRegressor

In [20]:
from sklearn.ensemble import RandomForestRegressor

RF_model = RandomForestRegressor()
RF_model.fit(X_prepared, y)
y_predicted = RF_model.predict(X_test_prepared)

lin_mse = mean_squared_error(y_test, y_predicted)
lin_rmse = np.sqrt(lin_mse)
print(f"The mean square error of the RandomForestRegressor model: {lin_rmse}\nСреднеквадратическая ошибка модели RandomForestRegressor: {lin_rmse}")
pd.DataFrame({'prediction прогноз':y_predicted, '| real price реальная цена': y_test})

The mean square error of the RandomForestRegressor model: 49880.63011180657
Среднеквадратическая ошибка модели RandomForestRegressor: 49880.63011180657


Unnamed: 0,prediction прогноз,| real price реальная цена
20046,49524.00,47700.0
3024,114435.00,45800.0
15663,451875.35,500001.0
20484,274100.01,218600.0
9814,242227.00,278000.0
...,...,...
17505,223843.01,237500.0
13512,65750.00,67300.0
10842,241023.00,218400.0
16559,144944.00,119400.0


###The third final test
###Третий финальный тест

In [21]:
from sklearn.model_selection import cross_val_score

X = df.drop("median_house_value", axis=1)
y = df["median_house_value"].copy()

X_prepared = full_pipeline.transform(X)

def display_scores(scores):
    # print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Std.dev:", scores.std())

### LinearRegression

In [22]:
scores = cross_val_score(LR_model, X_prepared, y, scoring="neg_mean_squared_error", cv=10)
LR_rmse_scores = np.sqrt(-scores)
display_scores(LR_rmse_scores)

Mean: 71891.71307941685
Std.dev: 13249.525989445005


#### Decision Tree

In [23]:
scores = cross_val_score(Tree_model, X_prepared, y, scoring="neg_mean_squared_error", cv=10)
LR_rmse_scores = np.sqrt(-scores)
display_scores(LR_rmse_scores)

Mean: 85751.99415972605
Std.dev: 15191.43691746529


#### Random Forest

In [24]:
scores = cross_val_score(RF_model, X_prepared, y, scoring="neg_mean_squared_error", cv=10)
LR_rmse_scores = np.sqrt(-scores)
display_scores(LR_rmse_scores)

Mean: 63961.49672609223
Std.dev: 15317.158261585339


In [106]:
import joblib

filename = 'LR_model.jbl'
joblib.dump(LR_model, filename)

['LR_model.jbl']