In [1]:
import pandas as pd
import tarfile
import requests
from io import BytesIO
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import numpy as np
import dill

# Download the housing dataset archive
url = "https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.tgz"
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=60)
# Fail right here with a clear HTTP error instead of a confusing tarfile
# error further down when the server answers with an error page.
response.raise_for_status()

# Unpack the archive into ./housing_data
with tarfile.open(fileobj=BytesIO(response.content), mode='r:gz') as tar:
    if hasattr(tarfile, 'data_filter'):
        # The archive comes from the network: use the 'data' extraction filter
        # (when this Python supports it) so members cannot be written outside
        # the target directory.
        tar.extractall(path='./housing_data', filter='data')
    else:
        # Older interpreters without extraction filters: original behavior.
        tar.extractall(path='./housing_data')

# Load the extracted CSV into a DataFrame
df = pd.read_csv('./housing_data/housing.csv')

# Target is the median house value; every other column is a feature
y = df["median_house_value"]
X = df.drop(columns="median_house_value")

# Group feature columns by dtype: float64/int64 columns are treated as
# numeric, object-dtype columns as categorical
numeric_features = X.select_dtypes(include=["float64", "int64"]).columns
categorical_features = X.select_dtypes(include=["object"]).columns

# Numeric branch: fill missing values with the column median, then standardize
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the constant 'missing', then
# one-hot encode (handle_unknown='ignore' for categories not seen during fit)
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full pipeline: preprocessing followed by a seeded random forest regressor
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42)),
])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training split, then predict the held-out rows
# (fit() returns the pipeline itself, so the two calls can be chained)
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Score the predictions on the held-out split
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

# "Relative RMSE" as defined in this notebook: RMSE divided by the Euclidean
# norm of the prediction vector, i.e. rmse / sqrt(sum(y_pred ** 2)).
# np.linalg.norm computes that denominator vectorized instead of looping over
# the array element by element in Python.
# NOTE(review): the denominator grows with the number of test rows, so this
# value is not comparable across test sets of different sizes — confirm this
# is the intended definition (a common alternative is rmse / y_test.mean()).
rrmse = rmse / np.linalg.norm(y_pred)

# Report the metrics
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R²: {r2}")
print(f"Relative RMSE: {rrmse}")

RMSE: 48941.70034309343
MAE: 31628.40731104651
R²: 0.8172104989933294
Relative RMSE: 0.0033053495170107957


In [2]:
# Display the pipeline's list of (name, estimator) steps as the cell output
pipeline.steps

[('preprocessor',
  ColumnTransformer(transformers=[('num',
                                   Pipeline(steps=[('imputer',
                                                    SimpleImputer(strategy='median')),
                                                   ('scaler', StandardScaler())]),
                                   Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
         'total_bedrooms', 'population', 'households', 'median_income'],
        dtype='object')),
                                  ('cat',
                                   Pipeline(steps=[('imputer',
                                                    SimpleImputer(fill_value='missing',
                                                                  strategy='constant')),
                                                   ('onehot',
                                                    OneHotEncoder(handle_unknown='ignore'))]),
                                   Index(['ocean_proximity'], dty

In [3]:
# Serialize the pipeline object to regression.dill in the working directory
with open("regression.dill", "wb") as model_file:
    dill.dump(pipeline, model_file)