## Загрузка данных и первичный анализ

In [None]:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the train/test CSV exports from the working directory.
train = pd.read_csv('train (1).csv')
test = pd.read_csv('test (1).csv')

# First look at the training frame: sample rows, per-column missing-value
# counts, and describe() statistics. Each call prints the caption and the
# table on separate lines.
print("Первые строки обучающего датасета:", train.head(), sep="\n")
print("Количество пропусков:", train.isnull().sum(), sep="\n")
print("Статистика по числовым признакам:", train.describe(), sep="\n")
    

## Очистка данных и заполнение пропусков

In [None]:

# Build the working frame from the training data, minus the 'Id' column.
train_clean = train.drop(columns=['Id'])

# Column names grouped by dtype: object columns vs. int64/float64 columns.
categorical_cols = train_clean.select_dtypes(include=['object']).columns
numerical_cols = train_clean.select_dtypes(include=['int64', 'float64']).columns

# Missing values in object columns become the literal string 'None'.
categorical_part = train_clean[categorical_cols]
train_clean[categorical_cols] = categorical_part.fillna('None')

# Missing values in numeric columns are replaced by that column's median.
numeric_part = train_clean[numerical_cols]
column_medians = numeric_part.median()
train_clean[numerical_cols] = numeric_part.fillna(column_medians)
    

## Визуализация

In [None]:

# Distribution of the target, with a kernel density estimate overlaid.
sns.histplot(train_clean['SalePrice'], kde=True)
plt.title('Распределение SalePrice')
plt.show()

# Living area against sale price.
sns.scatterplot(x=train_clean['GrLivArea'], y=train_clean['SalePrice'])
plt.title('GrLivArea vs SalePrice')
plt.show()

# The frame still contains string (object) columns at this point; they are
# only label-encoded in a later cell. On pandas >= 2.0 DataFrame.corr() raises
# on non-numeric data instead of silently skipping it, so restrict the frame
# to numeric columns explicitly before computing correlations.
corr_matrix = train_clean.select_dtypes(include='number').corr()
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Корреляционная матрица')
plt.show()
    

## Удаление выбросов

In [None]:

# 1.5 * IQR rule on GrLivArea: rows strictly below Q1 - 1.5*IQR or strictly
# above Q3 + 1.5*IQR are treated as outliers.
living_area = train_clean['GrLivArea']
Q1, Q3 = living_area.quantile(0.25), living_area.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

is_outlier = (living_area < lower_fence) | (living_area > upper_fence)
outliers = train_clean[is_outlier]

print(f"Количество аномалий GrLivArea: {len(outliers)}")

# Drop every row whose index label belongs to an outlier.
train_clean = train_clean.drop(index=outliers.index)
    

## Кодирование категориальных признаков

In [None]:

from sklearn.preprocessing import LabelEncoder

# Keep one fitted encoder per column. Refitting a single shared LabelEncoder
# on every column discards each earlier mapping, which makes it impossible to
# decode the integer codes or to encode other data (e.g. `test`) with the
# same codes afterwards.
label_encoders = {}
encoder = LabelEncoder()  # keeps `encoder` bound even if no object columns exist
for col in train_clean.select_dtypes(include=['object']).columns:
    encoder = LabelEncoder()
    # Cast to str first so every value in the column is a comparable string.
    train_clean[col] = encoder.fit_transform(train_clean[col].astype(str))
    label_encoders[col] = encoder
    

## Разделение данных и стандартизация

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# SalePrice is the target; every other remaining column is a feature.
y = train_clean['SalePrice']
X = train_clean.drop(columns=['SalePrice'])

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training part only, then apply the same transform
# to both parts.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
    

## Линейная регрессия

In [None]:

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Fit a linear regression on the standardized training features and predict
# on the held-out part.
model_lr = LinearRegression()
model_lr.fit(X_train_scaled, y_train)
y_pred = model_lr.predict(X_test_scaled)

# Hold-out metrics.
lr_mse = mean_squared_error(y_test, y_pred)
lr_r2 = r2_score(y_test, y_pred)
print(f"Линейная регрессия MSE: {lr_mse:.2f}")
print(f"Линейная регрессия R2: {lr_r2:.2f}")
    

## Случайный лес + важность признаков

In [None]:

from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees and a fixed seed, trained on the scaled
# training features.
forest = RandomForestRegressor(n_estimators=100, random_state=42)
forest.fit(X_train_scaled, y_train)

# Pair each feature name with its importance and rank from most to least
# important. Names come from X because the scaled matrix is built from it.
importances = forest.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Признак': X.columns, 'Важность': importances}
)
feature_importance_df = feature_importance_df.sort_values(
    by='Важность',
    ascending=False,
)

top_features = feature_importance_df.head(15)
print(top_features)

import seaborn as sns
import matplotlib.pyplot as plt

# Horizontal bar chart of the 15 highest-ranked features.
plt.figure(figsize=(10, 6))
sns.barplot(data=top_features, x='Важность', y='Признак')
plt.title('Топ-15 важных признаков')
plt.show()
    

## Кросс-валидация

In [None]:

from sklearn.model_selection import cross_val_score
import numpy as np

# 5-fold cross-validated R2 of the random forest on the training part.
scores = cross_val_score(
    forest,
    X_train_scaled,
    y_train,
    cv=5,
    scoring='r2',
)

mean_r2 = np.mean(scores)
print(f"R2 по фолдам: {scores}")
print(f"Среднее R2: {mean_r2:.2f}")
    

## XGBoost и нейросеть

In [None]:

from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor

# Gradient-boosted trees: fit on the scaled training part, score on hold-out.
xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb.fit(X_train_scaled, y_train)
xgb_pred = xgb.predict(X_test_scaled)
xgb_mse = mean_squared_error(y_test, xgb_pred)
xgb_r2 = r2_score(y_test, xgb_pred)
print(f"XGBoost MSE: {xgb_mse:.2f}")
print(f"XGBoost R2: {xgb_r2:.2f}")

# Multi-layer perceptron with hidden layers of 100 and 50 units, evaluated
# the same way.
mlp = MLPRegressor(
    hidden_layer_sizes=(100, 50),
    max_iter=2000,
    random_state=42,
)
mlp.fit(X_train_scaled, y_train)
mlp_pred = mlp.predict(X_test_scaled)
mlp_mse = mean_squared_error(y_test, mlp_pred)
mlp_r2 = r2_score(y_test, mlp_pred)
print(f"Нейросеть MSE: {mlp_mse:.2f}")
print(f"Нейросеть R2: {mlp_r2:.2f}")
    