### Обучающая и тестовая выборки

- В качестве обучающей и тестовой выборок будем использовать наборы, полученные при помощи train_test_split из scikit-learn.
- Конечно, алгоритм может работать с многомерным пространством, но для удобства демонстрации алгоритма мы используем двумерный набор данных.
- Если разные признаки имеют сильно отличающиеся диапазоны значений, то применяют масштабирование исходных данных.

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the housing table from a comma-separated file.
data_full = pd.read_csv(
    filepath_or_buffer='data/housing.csv',
    sep=",",
)
# Bare expression so the notebook renders the frame.
data_full

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


Проверим датасет на пустые значения. 

In [21]:
# Count the missing values in every column.
data_full.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [22]:
from sklearn.impute import SimpleImputer
from sklearn.impute import MissingIndicator

# Mask for checking how the missing values get filled in; the indicator is fitted
# on the whole data_full frame.
# NOTE(review): presumably the resulting mask has one column per feature that
# contains missing values (MissingIndicator defaults) — confirm against the
# scikit-learn documentation.
indicator = MissingIndicator()
mask_missing_values_only = indicator.fit_transform(data_full)

In [23]:
# Names of the imputation strategies; entries are later passed as the
# `strategy` argument of SimpleImputer.
strategies = [
    'mean',
    'median',
    'most_frequent',
]

# A more flexible helper: the caller picks both the column and the imputation strategy
def test_num_impute_col(dataset: pd.DataFrame, column: str, strategy_param: str):
    """Impute the missing values of one column of ``dataset``.

    Args:
        dataset: Frame that contains ``column``.
        column: Name of the column to impute.
        strategy_param: Value forwarded as ``strategy`` to ``SimpleImputer``
            (for example ``'mean'``, ``'median'`` or ``'most_frequent'``).

    Returns:
        Whatever ``SimpleImputer.fit_transform`` returns for the single-column
        selection; ``dataset`` itself is not modified by this function.
    """
    # Double brackets keep the selection two-dimensional (a one-column frame)
    # rather than a Series.
    temp_data = dataset[[column]]

    # The previous version also fitted a MissingIndicator here, but its mask was
    # never used, so that dead work has been dropped.
    imp_num = SimpleImputer(strategy=strategy_param)
    return imp_num.fit_transform(temp_data)

In [24]:
# Fill the NaN entries of total_bedrooms with the column mean and write the
# imputed values back into the frame.
imp2 = SimpleImputer(strategy='mean', missing_values=np.nan)
data_imp2 = imp2.fit_transform(data_full.loc[:, ['total_bedrooms']])
data_full[['total_bedrooms']] = data_imp2
# Display the sorted distinct values of the column after imputation.
np.unique(data_full['total_bedrooms'])

array([1.000e+00, 2.000e+00, 3.000e+00, ..., 5.471e+03, 6.210e+03,
       6.445e+03])

In [25]:
# Impute total_bedrooms with the first strategy from `strategies`.
# The helper works on a two-dimensional single-column selection, so its result
# is flattened here to hand pandas a plain 1-D column instead of relying on an
# implicit squeeze of an (n, 1) array.
data_full['total_bedrooms'] = np.ravel(
    test_num_impute_col(data_full, 'total_bedrooms', strategies[0])
)

In [26]:
# Count the missing values per column again, now that imputation has run.
data_full.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

Заметим, что ocean_proximity является категориальным признаком, рассмотрим уникальные значения для него:

In [27]:
# Sorted distinct labels of the categorical ocean_proximity column.
np.unique(data_full['ocean_proximity'])

array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
      dtype=object)

Для простоты объединим значения в три группы: '<1H OCEAN' и 'INLAND' (0); 'NEAR BAY' и 'NEAR OCEAN' (1); 'ISLAND' (2).

In [28]:
def class_to_regr(y: str) -> int:
    """Map an ``ocean_proximity`` label to a coarse integer group code.

    Args:
        y: Category label, e.g. ``'INLAND'`` or ``'NEAR BAY'``.

    Returns:
        ``0`` for ``'<1H OCEAN'`` and ``'INLAND'``, ``1`` for ``'NEAR BAY'``
        and ``'NEAR OCEAN'``, and ``2`` for any other value.
    """
    if y in ('<1H OCEAN', 'INLAND'):
        return 0
    if y in ('NEAR BAY', 'NEAR OCEAN'):
        return 1
    # Fallback group: every label not matched above.
    return 2

In [29]:
# Encode the categorical column as integer group codes. The mapping depends on
# ocean_proximity only, so it is applied to that column directly instead of
# materialising every row of the frame just to read a single cell.
data_full['ocean_proximity_regr'] = data_full['ocean_proximity'].apply(class_to_regr)

In [30]:
# The target is median_house_value; the features are all remaining columns.
y = data_full['median_house_value']
X = data_full.drop('median_house_value', axis=1)

# Sample only small fractions of the data (1.5% for training, 0.16% for testing)
# with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.015,
    test_size=0.0016,
    random_state=42,
)

In [31]:
# Build the training table for export on a copy: a plain assignment would make
# data_train_to_csv an alias of X_train, so attaching the target column would
# also add it to the feature frame.
data_train_to_csv = X_train.copy()
# y_train is aligned to the rows by index.
data_train_to_csv['median_house_value'] = y_train
# Bare expression so the notebook renders the frame.
data_train_to_csv

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,ocean_proximity_regr,median_house_value
16105,-122.50,37.75,44.0,1819.0,537.870553,1137.0,354.0,3.4919,NEAR OCEAN,1,271800.0
9149,-118.50,34.46,17.0,10267.0,537.870553,4956.0,1483.0,5.5061,<1H OCEAN,0,239400.0
16879,-122.39,37.60,34.0,707.0,537.870553,381.0,156.0,4.3750,NEAR OCEAN,1,340900.0
6253,-117.96,34.03,35.0,2093.0,537.870553,1755.0,403.0,3.4115,<1H OCEAN,0,150400.0
11741,-121.13,38.87,48.0,1127.0,537.870553,530.0,186.0,3.0917,INLAND,0,128100.0
...,...,...,...,...,...,...,...,...,...,...,...
19553,-121.00,37.62,28.0,1153.0,420.000000,1043.0,357.0,1.0801,INLAND,0,75000.0
3481,-118.52,34.30,17.0,4542.0,621.000000,2144.0,597.0,8.8467,<1H OCEAN,0,450700.0
1457,-121.97,37.97,26.0,1977.0,264.000000,817.0,273.0,5.7512,INLAND,0,240200.0
2698,-115.38,32.82,38.0,1892.0,394.000000,1175.0,374.0,1.9939,INLAND,0,65800.0


In [32]:
# Plain rebinding: data_test_to_csv refers to the same object as X_test (no copy
# is made). Only the feature frame is kept here; y_test is not attached.
data_test_to_csv=X_test
# Bare expression so the notebook renders the frame.
data_test_to_csv

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,ocean_proximity_regr
20046,-119.01,36.06,25.0,1505.0,537.870553,1392.0,359.0,1.6812,INLAND,0
3024,-119.46,35.14,30.0,2943.0,537.870553,1565.0,584.0,2.5313,INLAND,0
15663,-122.44,37.8,52.0,3830.0,537.870553,1310.0,963.0,3.4801,NEAR BAY,1
20484,-118.72,34.28,17.0,3051.0,537.870553,1705.0,495.0,5.7376,<1H OCEAN,0
9814,-121.93,36.62,34.0,2351.0,537.870553,1063.0,428.0,3.725,NEAR OCEAN,1
13311,-117.61,34.08,12.0,4427.0,537.870553,2400.0,843.0,4.7147,INLAND,0
7113,-118.02,33.89,36.0,1375.0,537.870553,670.0,221.0,5.0839,<1H OCEAN,0
7668,-118.08,33.92,38.0,1335.0,537.870553,1011.0,269.0,3.6908,<1H OCEAN,0
18246,-122.08,37.39,4.0,2292.0,537.870553,1050.0,584.0,4.8036,NEAR BAY,1
5723,-118.23,34.18,45.0,2332.0,537.870553,943.0,339.0,8.1132,<1H OCEAN,0


Запишем полученные датасеты в .csv и запишем в новые переменные (для чистоты эксперимента).

In [33]:
# Persist both splits as comma-separated files; the row index is not written.
data_test_to_csv.to_csv('./data/housing_test.csv', sep=",", index=False)
data_train_to_csv.to_csv('./data/housing_train.csv', sep=",", index=False)