In [44]:
import pandas as pd
import numpy as np
import catboost as ct
import matplotlib as plt
from  sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


In [49]:
df = pd.read_csv('train.csv').drop('Id', axis=1)
# Разделение данных на числовые и категориальные
numeric_df = df.select_dtypes(include=[np.number])
categorical_df = df.select_dtypes(include=['object'])

# Интерполяция числовых данных
numeric_df = numeric_df.interpolate()

# Заполнение пропусков в категориальных данных
categorical_df = categorical_df.fillna('Unknown')

# Объединение обработанных данных
df= pd.concat([numeric_df, categorical_df], axis=1)
df.isna().sum()

MSSubClass       0
LotFrontage      0
LotArea          0
OverallQual      0
OverallCond      0
                ..
PoolQC           0
Fence            0
MiscFeature      0
SaleType         0
SaleCondition    0
Length: 80, dtype: int64

In [51]:
train_df, test_df = train_test_split(df, train_size=.7, random_state=42)

X_train, y_train = train_df.drop('SalePrice', axis=1), train_df['SalePrice']

X_test, y_test = test_df.drop('SalePrice', axis=1), test_df['SalePrice']

In [52]:
# Список категориальных признаков
categorical_features = df.select_dtypes(include=['object']).columns.values
categorical_features

array(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
       'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
       'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish',
       'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence',
       'MiscFeature', 'SaleType', 'SaleCondition'], dtype=object)

In [56]:
model = ct.CatBoostRegressor(learning_rate=0.01, verbose=False, iterations=1500)

model.fit(X_train, y_train, cat_features=categorical_features,  plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRegressor at 0x1fade281cd0>

In [69]:
predict_train = model.predict(X_train)
mse_train = mean_squared_error(y_train, predict_train)
r_2_train = r2_score(y_train, predict_train)

In [58]:
mse_train**0.5, r_2_train

(14586.767494588921, 0.9646814361011586)

In [19]:
mse_train**0.5, r_2_train

(6074.435014671552, 0.9938751325334425)

In [59]:
predict_test = model.predict(X_test)
mse_test = mean_squared_error(y_test, predict_test)
r_2_test = r2_score(y_test, predict_test)
mse_test**0.5, r_2_test

(24083.833925845945, 0.9166895001832144)

# make submission 4

In [80]:
test = pd.read_csv('test.csv')

submission4 = pd.DataFrame()
submission4['Id'] = test['Id']

test.drop('Id', axis=1, inplace=True)


In [81]:
# Разделение данных на числовые и категориальные
numeric_df = test.select_dtypes(include=[np.number])
categorical_df = test.select_dtypes(include=['object'])

# Интерполяция числовых данных
numeric_df = numeric_df.interpolate()

# Заполнение пропусков в категориальных данных
categorical_df = categorical_df.fillna('Unknown')

# Объединение обработанных данных
test= pd.concat([numeric_df, categorical_df], axis=1)

In [83]:
submission4['SalePrice'] = model.predict(test)
submission4.to_csv('submission5.csv', index=False)