# `Масштабирование данных`

## Нормализация

Для начала импортируем датасет:

In [1]:
import pandas as pd
from sklearn.datasets import fetch_california_housing

# Load the California housing dataset bundled with scikit-learn's dataset helpers.
data = fetch_california_housing()

# Keep the regression target separately from the features.
y = data['target']

# Wrap the raw feature matrix in a DataFrame so every column carries its feature name.
X = pd.DataFrame(
    data=data['data'],
    columns=data['feature_names'],
)

# Preview the first rows of the feature table.
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


## Разбиение выборки:

In [2]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test split; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)


##  Тренируем модель:

In [3]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score

# Baseline: KNN regressor with default hyper-parameters, trained on the raw
# (unscaled) features. fit() returns the estimator itself, so it can be chained.
knn = KNeighborsRegressor().fit(X_train, y_train)
y_pred = knn.predict(X_test)

# The metric is terrible on unscaled features.
print(f'R2 score: {r2_score(y_test, y_pred)}')

R2 score: 0.17733954829415377


## Причина тому — отсутствие нормализации. Произведём нормализацию, идентичную `MinMaxScaler`:

$$
x_s = \frac{x - min}{max - min}
$$

In [4]:
# Per-feature maximum and minimum, computed on the training split only; the
# same statistics are reused below to scale the test split.
max_val = X_train.max()
min_val = X_train.min()
X_train_norm = X_train.copy()

# Column-by-column form of x_s = (x - min) / (max - min) for the training split.
for col in X_train.columns:
    X_train_norm[col] = (X_train[col] - min_val[col]) / (max_val[col] - min_val[col])

# Equivalent vectorised form, (X - min_val) / (max_val - min_val), applied to the
# test split: pandas aligns the min/max Series with the frame's columns by name.
# Because the training min/max are used, test values may fall outside [0, 1].
X_test_norm = (X_test - min_val) / (max_val - min_val)

# Only the last expression of a cell is rendered automatically, so the training
# preview has to be displayed explicitly; as a bare expression it was discarded.
display(X_train_norm.head())
X_test_norm.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0
mean,0.234496,0.544198,0.032414,0.02244,0.039816,0.004262,0.325244,0.478847
std,0.13333,0.243659,0.013532,0.010172,0.031321,0.032298,0.225655,0.198421
min,0.0,0.0,0.002015,0.003294,5.6e-05,0.000465,0.002125,0.00498
25%,0.143938,0.333333,0.025715,0.01991,0.022142,0.002904,0.148778,0.258715
50%,0.21039,0.54902,0.03124,0.021195,0.03268,0.003552,0.180659,0.584661
75%,0.294279,0.705882,0.036861,0.022743,0.047619,0.004325,0.548353,0.631474
max,1.0,1.0,0.436515,0.408432,0.451778,2.07445,0.98831,0.982072


## Заново обучим модель:

In [5]:
# Same default KNN regressor, now trained on the min-max-normalised features.
knn_norm = KNeighborsRegressor().fit(X_train_norm, y_train)
y_pred_norm = knn_norm.predict(X_test_norm)

# Much better than the unscaled baseline.
print(f'R2 score: {r2_score(y_test, y_pred_norm)}')

R2 score: 0.6999378722006064


## Для линейных моделей особой разницы не наблюдается:

In [6]:
from sklearn.linear_model import LinearRegression

# Fit two independent linear regressions: one on the original features and one
# on the min-max-normalised features.
linreg = LinearRegression().fit(X_train, y_train)
linreg_norm = LinearRegression().fit(X_train_norm, y_train)

# Each model predicts on the test split prepared the same way as its training data.
y_pred_lin = linreg.predict(X_test)
y_pred_lin_norm = linreg_norm.predict(X_test_norm)

# Compare the test-set R2 of both variants.
print(f'R2 without normalization: {r2_score(y_test, y_pred_lin)}')
print(f'R2 with normalization: {r2_score(y_test, y_pred_lin_norm)}')

R2 without normalization: 0.6009790143129103
R2 with normalization: 0.6009790143129106


# Стандартизация

## Делаем подобие `StandardScaler`:

$$
x_s = \frac{x - mean}{std}
$$


In [7]:
# Per-feature mean and standard deviation, computed on the training split only.
# NOTE: DataFrame.std() defaults to the sample estimate (ddof=1), while
# sklearn's StandardScaler uses the population estimate (ddof=0), so the scaled
# values differ from StandardScaler's by a constant factor shared by all columns.
mean_val = X_train.mean()
std_var = X_train.std()  # despite the name, these are standard deviations, not variances

# x_s = (x - mean) / std for all columns at once: pandas aligns the statistics
# with the frame's columns by name, so no per-column loop or prior copy is needed.
X_train_std = (X_train - mean_val) / std_var

# The test split is standardised with the training statistics, not its own.
X_test_std = (X_test - mean_val) / std_var

## Обучение моделей: 

In [9]:
# Default KNN regressor trained on the standardised features.
knn2 = KNeighborsRegressor().fit(X_train_std, y_train)
y_pred_std_knn = knn2.predict(X_test_std)

# y_pred holds the predictions of the earlier KNN model fitted on unscaled
# features, so the first line reproduces the baseline score for comparison.
print(f'R2 score without standartization: {r2_score(y_test, y_pred)}')
print(f'R2 score with it: {r2_score(y_test, y_pred_std_knn)}')

R2 score without standartization: 0.17733954829415377
R2 score with it: 0.6995524587495723


In [10]:
# Linear regression trained on the standardised features.
lin2 = LinearRegression().fit(X_train_std, y_train)
y_pred_std_lin = lin2.predict(X_test_std)

# y_pred_lin holds the predictions of the earlier linear model fitted on
# unscaled features, giving the reference score for comparison.
print(f'R2 score without standartization: {r2_score(y_test, y_pred_lin)}')
print(f'R2 score with it: {r2_score(y_test, y_pred_std_lin)}')

R2 score without standartization: 0.6009790143129103
R2 score with it: 0.6009790143129106
