In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Load the California housing dataset; the returned object exposes the
# data / target / feature_names / DESCR fields used in the cells below.
data = fetch_california_housing()

In [177]:
# Print the dataset's bundled description (attributes, target units, provenance).
print(data.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived from the 1990 U.S. census, using one row per ce

In [180]:
# One table holding the eight features plus the target column,
# for a quick visual check of the raw data.
df = pd.DataFrame(
    data=data.data,
    columns=data.feature_names,
).assign(MedianHouseValue=data.target)
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue
0,8.3252,41.0000,6.9841,1.0238,322.0000,2.5556,37.8800,-122.2300,4.5260
1,8.3014,21.0000,6.2381,0.9719,2401.0000,2.1098,37.8600,-122.2200,3.5850
2,7.2574,52.0000,8.2881,1.0734,496.0000,2.8023,37.8500,-122.2400,3.5210
3,5.6431,52.0000,5.8174,1.0731,558.0000,2.5479,37.8500,-122.2500,3.4130
4,3.8462,52.0000,6.2819,1.0811,565.0000,2.1815,37.8500,-122.2500,3.4220
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0000,5.0455,1.1333,845.0000,2.5606,39.4800,-121.0900,0.7810
20636,2.5568,18.0000,6.1140,1.3158,356.0000,3.1228,39.4900,-121.2100,0.7710
20637,1.7000,17.0000,5.2055,1.1201,1007.0000,2.3256,39.4300,-121.2200,0.9230
20638,1.8672,18.0000,5.3295,1.1719,741.0000,2.1232,39.4300,-121.3200,0.8470


In [13]:
# Features and target as separate DataFrames for modelling.
# The target is kept as a one-column frame named 'target' (not a Series).
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.DataFrame({'target': data.target})

In [18]:
# Hold out 20% of the rows for testing. The fixed seed makes the split, and
# every metric computed from it, reproducible on Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [25]:
# Sanity check: one (features shape, target shape) pair per split, train first.
display(*(
    (features.shape, target.shape)
    for features, target in ((X_train, y_train), (X_test, y_test))
))

((16512, 8), (16512, 1))

((4128, 8), (4128, 1))

# Модели без масштабирования

## Линейная регрессия

In [32]:
# Baseline linear model fitted on the raw (unscaled) training features.
lin_reg = LinearRegression().fit(X_train, y_train)
lin_reg

In [44]:
# Predictions on the training split, used for the train-side RMSE / R² below.
lin_reg_train = lin_reg.predict(X_train)

In [35]:
# Predictions on the held-out test split, used for the test-side RMSE / R² below.
lin_reg_pred = lin_reg.predict(X_test)

## KNN

In [36]:
# k-nearest-neighbours regressor with default settings, fitted on the raw features.
knn = KNeighborsRegressor().fit(X_train, y_train)
knn

In [45]:
# KNN predictions on the training split (for train-side metrics).
knn_train = knn.predict(X_train)

In [37]:
# KNN predictions on the held-out test split (for test-side metrics).
knn_pred = knn.predict(X_test)

## Дерево решений

In [38]:
# Decision tree with default hyper-parameters on the raw features.
# random_state pins the otherwise random tie-breaking between equally good
# splits, so the fitted tree and its scores are identical across runs.
tree = DecisionTreeRegressor(random_state=42)
tree.fit(X_train, y_train)

In [46]:
# Tree predictions on the training split (for train-side metrics).
tree_train = tree.predict(X_train)

In [39]:
# Tree predictions on the held-out test split (for test-side metrics).
tree_pred = tree.predict(X_test)

## Метрики

In [40]:
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score

### Для регрессии

In [59]:
# Linear regression on unscaled features: RMSE and R² on both splits.
# print(..., sep='\n') matches the comparison cells further down and avoids
# the quoted string reprs that display() produces for plain str arguments.
print(
    f'rmse(train): {root_mean_squared_error(y_train, lin_reg_train):.2f}',
    f'rmse(test): {root_mean_squared_error(y_test, lin_reg_pred):.2f}',
    '',
    f'r2(train): {r2_score(y_train, lin_reg_train):.2f}',
    f'r2(test): {r2_score(y_test, lin_reg_pred):.2f}',
    sep='\n'
)

'rmse(train): 0.72'

'rmse(test): 0.73'

''

'r2(train): 0.60'

'r2(test): 0.61'

### Для KNN

In [60]:
# KNN on unscaled features: RMSE and R² on both splits.
# print(..., sep='\n') matches the comparison cells further down and avoids
# the quoted string reprs that display() produces for plain str arguments.
print(
    f'rmse(train): {root_mean_squared_error(y_train, knn_train):.2f}',
    f'rmse(test): {root_mean_squared_error(y_test, knn_pred):.2f}',
    '',
    f'r2(train): {r2_score(y_train, knn_train):.2f}',
    f'r2(test): {r2_score(y_test, knn_pred):.2f}',
    sep='\n'
)

'rmse(train): 0.86'

'rmse(test): 1.05'

''

'r2(train): 0.44'

'r2(test): 0.17'

### Для дерева

In [61]:
# Decision tree on unscaled features: RMSE and R² on both splits.
# print(..., sep='\n') matches the comparison cells further down and avoids
# the quoted string reprs that display() produces for plain str arguments.
print(
    f'rmse(train): {root_mean_squared_error(y_train, tree_train):.2f}',
    f'rmse(test): {root_mean_squared_error(y_test, tree_pred):.2f}',
    '',
    f'r2(train): {r2_score(y_train, tree_train):.2f}',
    f'r2(test): {r2_score(y_test, tree_pred):.2f}',
    sep='\n'
)

'rmse(train): 0.00'

'rmse(test): 0.73'

''

'r2(train): 1.00'

'r2(test): 0.61'

# Масштабирование признаков

In [62]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

## Нормализация ( MinMaxScaler )

Приводим значения каждого признака к диапазону от **нуля** до **единицы**:
$$x' = \frac{x - x_{min}}{x_{max} - x_{min}}$$

In [63]:
# Fit the min-max scaler on the training features only; the test split is
# transformed later with this same fitted scaler.
mms = MinMaxScaler().fit(X_train)
mms

In [65]:
# Min-max-scaled training features, wrapped back into a DataFrame with the
# original column names. The frame gets a fresh 0..n-1 index.
X_train_norm = pd.DataFrame(
    data=mms.transform(X_train),
    columns=X_train.columns,
)
X_train_norm

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,0.437291,0.470588,0.100498,0.043708,0.020937,0.001847,0.136170,0.644422
1,0.156329,0.431373,0.059473,0.048723,0.064856,0.001439,0.613830,0.205179
2,0.242948,0.490196,0.080405,0.046578,0.025449,0.001971,0.656383,0.286853
3,0.153819,0.411765,0.088609,0.054430,0.028588,0.002224,0.687234,0.277888
4,0.317382,0.274510,0.091017,0.052795,0.032540,0.001557,0.031915,0.713147
...,...,...,...,...,...,...,...,...
16507,0.201597,0.098039,0.061031,0.048130,0.020432,0.001126,0.579787,0.298805
16508,0.144170,0.431373,0.060179,0.044330,0.052748,0.001444,0.455319,0.463147
16509,0.312361,0.333333,0.089446,0.045351,0.049048,0.001994,0.502128,0.247012
16510,0.142895,0.607843,0.055420,0.046619,0.074946,0.002852,0.143617,0.613546


In [66]:
# Check the scaling: every training column should now have min 0 and max 1.
X_train_norm.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0
mean,0.232891,0.540864,0.072718,0.050731,0.039713,0.001938,0.329075,0.475663
std,0.130726,0.246993,0.034241,0.024799,0.031838,0.009337,0.227329,0.199298
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.14286,0.333333,0.056799,0.044873,0.021974,0.001401,0.147872,0.25498
50%,0.210149,0.54902,0.069626,0.047763,0.032456,0.00171,0.181915,0.581673
75%,0.293106,0.705882,0.083175,0.0512,0.048124,0.00208,0.55,0.631474
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [69]:
# Test features scaled with the scaler fitted on the training split,
# wrapped into a DataFrame with the original column names and a fresh 0..n-1 index.
X_test_norm = pd.DataFrame(
    data=mms.transform(X_test),
    columns=X_test.columns,
)
X_test_norm

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,0.332278,0.509804,0.087508,0.039008,0.017573,0.001431,0.580851,0.300797
1,0.182115,0.607843,0.056045,0.045329,0.066285,0.002053,0.154255,0.617530
2,0.475021,0.274510,0.109969,0.054520,0.026879,0.001862,0.105319,0.661355
3,0.137343,0.568627,0.043953,0.044350,0.051963,0.002934,0.162766,0.627490
4,0.117040,0.568627,0.067502,0.045949,0.053000,0.001795,0.452128,0.458167
...,...,...,...,...,...,...,...,...
4123,0.118971,0.784314,0.068372,0.048020,0.001822,0.002323,0.541489,0.192231
4124,0.287644,0.313725,0.083399,0.044278,0.028112,0.002206,0.156383,0.700199
4125,0.173080,0.803922,0.046219,0.038784,0.016957,0.001190,0.507447,0.243028
4126,0.052806,0.352941,0.045910,0.054925,0.006867,0.000596,0.430851,0.245020


## Стандартизация ( StandardScaler )

Приводим каждый признак к **среднему значению** = 0 и **стандартному отклонению** = 1:
$$x' = \frac{x - \mathrm{mean}}{\mathrm{std}}$$

In [72]:
# Fit the standard scaler on the training features only; the test split is
# transformed later with this same fitted scaler.
ss = StandardScaler().fit(X_train)
ss

In [73]:
# Standardised training features, wrapped back into a DataFrame with the
# original column names. The frame gets a fresh 0..n-1 index.
X_train_stand = pd.DataFrame(
    data=ss.transform(X_train),
    columns=X_train.columns,
)
X_train_stand

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,1.563623,-0.284534,0.811335,-0.283200,-0.589770,-0.009735,-0.848598,0.846798
1,-0.585688,-0.443311,-0.386846,-0.080968,0.789734,-0.053461,1.252652,-1.357223
2,0.076937,-0.205145,0.224504,-0.167477,-0.448034,0.003480,1.439846,-0.947405
3,-0.604891,-0.522700,0.464086,0.149149,-0.349435,0.030604,1.575561,-0.992385
4,0.646341,-1.078420,0.534433,0.083206,-0.225306,-0.040808,-1.307222,1.191645
...,...,...,...,...,...,...,...,...
16507,-0.239393,-1.792918,-0.341352,-0.104883,-0.605616,-0.087015,1.102897,-0.887431
16508,-0.678698,-0.443311,-0.366229,-0.258127,0.409424,-0.052937,0.555355,-0.062798
16509,0.607934,-0.840254,0.488536,-0.216947,0.293218,0.005946,0.761268,-1.147316
16510,-0.688458,0.271187,-0.505206,-0.165832,1.106659,0.097805,-0.815839,0.691867


In [76]:
# Render floats in DataFrame output with exactly four decimals, so near-zero
# means in the describe() table below print as 0.0000 rather than in exponent form.
pd.set_option('display.float_format', '{:.4f}'.format)

In [78]:
# Check the standardisation: every training column should show mean 0 and std 1.
X_train_stand.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0
mean,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,0.0
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.7816,-2.1899,-2.1238,-2.0457,-1.2474,-0.2076,-1.4476,-2.3868
25%,-0.6887,-0.8403,-0.4649,-0.2362,-0.5572,-0.0576,-0.7971,-1.1073
50%,-0.174,0.033,-0.0903,-0.1197,-0.2279,-0.0245,-0.6474,0.5319
75%,0.4606,0.6681,0.3054,0.0189,0.2642,0.0152,0.9719,0.7818
max,5.8683,1.859,27.0821,38.2792,30.1625,106.8956,2.9514,2.631


In [74]:
# Test features standardised with the scaler fitted on the training split,
# wrapped into a DataFrame with the original column names and a fresh 0..n-1 index.
X_test_stand = pd.DataFrame(
    data=ss.transform(X_test),
    columns=X_test.columns,
)
X_test_stand

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,0.760295,-0.125756,0.431935,-0.472724,-0.695411,-0.054335,1.107577,-0.877436
1,-0.388430,0.271187,-0.486972,-0.217834,0.834632,0.012311,-0.769040,0.711858
2,1.852255,-1.078420,1.087939,0.152787,-0.403136,-0.008167,-0.984313,0.931760
3,-0.730927,0.112410,-0.840119,-0.257314,0.384774,0.106685,-0.731602,0.761836
4,-0.886243,0.112410,-0.152365,-0.192862,0.417347,-0.015356,0.541316,-0.087787
...,...,...,...,...,...,...,...,...
4123,-0.871471,0.985685,-0.126932,-0.109336,-1.190167,0.041194,0.934423,-1.422194
4124,0.418854,-0.919643,0.311920,-0.260212,-0.364401,0.028654,-0.759681,1.126674
4125,-0.457542,1.065073,-0.773931,-0.481773,-0.714779,-0.080120,0.784668,-1.167307
4126,-1.377620,-0.760866,-0.782963,0.169126,-1.031704,-0.143720,0.447719,-1.157312


# Модели на масштабированных признаках

## Регрессия

In [79]:
# Two independent linear models: one for min-max-scaled ("norm") features,
# one for standardised ("stand") features.
lin_reg_norm, lin_reg_stand = LinearRegression(), LinearRegression()

In [80]:
# Fit each linear model on its own scaled version of the training features.
lin_reg_norm.fit(X_train_norm, y_train)
lin_reg_stand.fit(X_train_stand, y_train)

In [82]:
# Training-split predictions of both scaled linear models.
lin_reg_norm_train = lin_reg_norm.predict(X_train_norm)
lin_reg_stand_train = lin_reg_stand.predict(X_train_stand)

In [83]:
# Test-split predictions of both scaled linear models.
lin_reg_norm_test = lin_reg_norm.predict(X_test_norm)
lin_reg_stand_test = lin_reg_stand.predict(X_test_stand)

## KNN

In [84]:
# Two independent KNN regressors: one for min-max-scaled ("norm") features,
# one for standardised ("stand") features.
knn_norm, knn_stand = KNeighborsRegressor(), KNeighborsRegressor()

In [85]:
# Fit each KNN model on its own scaled version of the training features.
knn_norm.fit(X_train_norm, y_train)
knn_stand.fit(X_train_stand, y_train)

In [86]:
# Training-split predictions of both scaled KNN models.
knn_norm_train = knn_norm.predict(X_train_norm)
knn_stand_train = knn_stand.predict(X_train_stand)

In [87]:
# Test-split predictions of both scaled KNN models.
knn_norm_test = knn_norm.predict(X_test_norm)
knn_stand_test = knn_stand.predict(X_test_stand)

## Дерево

In [88]:
# Two independent decision trees: one for min-max-scaled ("norm") features,
# one for standardised ("stand") features.
# Both get the same fixed random_state so that any score difference between
# them comes from the feature scaling, not from random tie-breaking in split
# selection, and so that the results repeat across runs.
tree_norm = DecisionTreeRegressor(random_state=42)
tree_stand = DecisionTreeRegressor(random_state=42)

In [89]:
# Fit each tree on its own scaled version of the training features.
tree_norm.fit(X_train_norm, y_train)
tree_stand.fit(X_train_stand, y_train)

In [90]:
# Training-split predictions of both scaled trees.
tree_norm_train = tree_norm.predict(X_train_norm)
tree_stand_train = tree_stand.predict(X_train_stand)

In [91]:
# Test-split predictions of both scaled trees.
tree_norm_test = tree_norm.predict(X_test_norm)
tree_stand_test = tree_stand.predict(X_test_stand)

## Метрики

### Для регрессии

In [103]:
# Linear regression: RMSE / R² before scaling versus after min-max ("norm")
# and standard ("stand") scaling, printed one score per line.
lin_reg_report = ['До масштабирования: ']
for tag, metric in (('rmse', root_mean_squared_error), ('r2', r2_score)):
    lin_reg_report += [
        f'{tag}(train): {metric(y_train, lin_reg_train):.2f}',
        f'{tag}(test): {metric(y_test, lin_reg_pred):.2f}',
        '',
    ]
# The trailing blank entry of the first half is replaced by the divider line.
lin_reg_report[-1] = '-----------------------------------------------------------------------'
lin_reg_report.append('После масштабирования: ')
for tag, metric in (('rmse', root_mean_squared_error), ('r2', r2_score)):
    for split, y_true, pred_norm, pred_stand in (
        ('train', y_train, lin_reg_norm_train, lin_reg_stand_train),
        ('test', y_test, lin_reg_norm_test, lin_reg_stand_test),
    ):
        lin_reg_report += [
            f'{tag}(norm {split}): {metric(y_true, pred_norm):.2f}',
            f'{tag}(stand {split}): {metric(y_true, pred_stand):.2f}',
            '',
        ]
lin_reg_report.pop()  # no blank line after the final score
print(*lin_reg_report, sep='\n')

До масштабирования: 
rmse(train): 0.72
rmse(test): 0.73

r2(train): 0.60
r2(test): 0.61
-----------------------------------------------------------------------
После масштабирования: 
rmse(norm train): 0.72
rmse(stand train): 0.72

rmse(norm test): 0.73
rmse(stand test): 0.73

r2(norm train): 0.60
r2(stand train): 0.60

r2(norm test): 0.61
r2(stand test): 0.61


### Для KNN

In [104]:
# KNN: RMSE / R² before scaling versus after min-max ("norm") and standard
# ("stand") scaling. The two halves are built separately, then printed with a
# divider between them, one entry per line.
knn_before = [
    'До масштабирования: ',
    f'rmse(train): {root_mean_squared_error(y_train, knn_train):.2f}',
    f'rmse(test): {root_mean_squared_error(y_test, knn_pred):.2f}',
    '',
    f'r2(train): {r2_score(y_train, knn_train):.2f}',
    f'r2(test): {r2_score(y_test, knn_pred):.2f}',
]
knn_after = [
    'После масштабирования: ',
    f'rmse(norm train): {root_mean_squared_error(y_train, knn_norm_train):.2f}',
    f'rmse(stand train): {root_mean_squared_error(y_train, knn_stand_train):.2f}',
    '',
    f'rmse(norm test): {root_mean_squared_error(y_test, knn_norm_test):.2f}',
    f'rmse(stand test): {root_mean_squared_error(y_test, knn_stand_test):.2f}',
    '',
    f'r2(norm train): {r2_score(y_train, knn_norm_train):.2f}',
    f'r2(stand train): {r2_score(y_train, knn_stand_train):.2f}',
    '',
    f'r2(norm test): {r2_score(y_test, knn_norm_test):.2f}',
    f'r2(stand test): {r2_score(y_test, knn_stand_test):.2f}',
]
print(
    *knn_before,
    '-----------------------------------------------------------------------',
    *knn_after,
    sep='\n'
)

До масштабирования: 
rmse(train): 0.86
rmse(test): 1.05

r2(train): 0.44
r2(test): 0.17
-----------------------------------------------------------------------
После масштабирования: 
rmse(norm train): 0.51
rmse(stand train): 0.53

rmse(norm test): 0.61
rmse(stand test): 0.64

r2(norm train): 0.81
r2(stand train): 0.79

r2(norm test): 0.72
r2(stand test): 0.69


### Для дерева

In [105]:
# Decision tree: RMSE / R² before scaling versus after min-max ("norm") and
# standard ("stand") scaling, joined into one newline-separated text block.
print('\n'.join([
    'До масштабирования: ',
    f'rmse(train): {root_mean_squared_error(y_train, tree_train):.2f}',
    f'rmse(test): {root_mean_squared_error(y_test, tree_pred):.2f}',
    '',
    f'r2(train): {r2_score(y_train, tree_train):.2f}',
    f'r2(test): {r2_score(y_test, tree_pred):.2f}',
    '-----------------------------------------------------------------------',
    'После масштабирования: ',
    f'rmse(norm train): {root_mean_squared_error(y_train, tree_norm_train):.2f}',
    f'rmse(stand train): {root_mean_squared_error(y_train, tree_stand_train):.2f}',
    '',
    f'rmse(norm test): {root_mean_squared_error(y_test, tree_norm_test):.2f}',
    f'rmse(stand test): {root_mean_squared_error(y_test, tree_stand_test):.2f}',
    '',
    f'r2(norm train): {r2_score(y_train, tree_norm_train):.2f}',
    f'r2(stand train): {r2_score(y_train, tree_stand_train):.2f}',
    '',
    f'r2(norm test): {r2_score(y_test, tree_norm_test):.2f}',
    f'r2(stand test): {r2_score(y_test, tree_stand_test):.2f}',
]))

До масштабирования: 
rmse(train): 0.00
rmse(test): 0.73

r2(train): 1.00
r2(test): 0.61
-----------------------------------------------------------------------
После масштабирования: 
rmse(norm train): 0.00
rmse(stand train): 0.00

rmse(norm test): 0.73
rmse(stand test): 0.73

r2(norm train): 1.00
r2(stand train): 1.00

r2(norm test): 0.61
r2(stand test): 0.61


# Предсказание

In [106]:
# Show the model used for the spot-check predictions below: KNN fitted on
# min-max-scaled features (best test scores in the comparison above).
knn_norm

In [158]:
# First rows of the scaled test features that the spot-check predictions use.
X_test_norm.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,0.3323,0.5098,0.0875,0.039,0.0176,0.0014,0.5809,0.3008
1,0.1821,0.6078,0.056,0.0453,0.0663,0.0021,0.1543,0.6175
2,0.475,0.2745,0.11,0.0545,0.0269,0.0019,0.1053,0.6614
3,0.1373,0.5686,0.044,0.0444,0.052,0.0029,0.1628,0.6275
4,0.117,0.5686,0.0675,0.0459,0.053,0.0018,0.4521,0.4582


In [145]:
# Replace the shuffled row labels left by the split with 0..n-1 so that y_test
# lines up with X_test_norm by label. drop=True discards the old index directly
# instead of materialising it as a column that must then be removed by name.
y_test = y_test.reset_index(drop=True)

In [159]:
# True targets for the first test rows, to compare with the predictions below.
y_test.head()

Unnamed: 0,target
0,1.551
1,1.753
2,3.183
3,1.592
4,0.745


In [179]:
# Predict for test row 1; .loc[[1]] (list selector) keeps a one-row DataFrame.
# Compare with the true target for row 1 in y_test above.
knn_norm.predict(X_test_norm.loc[[1]])

array([[1.7246]])

In [165]:
# Predict for test row 2; compare with the true target for row 2 in y_test above.
knn_norm.predict(X_test_norm.loc[[2]])

array([[3.2838]])

In [166]:
# Predict for test row 3; compare with the true target for row 3 in y_test above.
knn_norm.predict(X_test_norm.loc[[3]])

array([[1.6414]])