In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [3]:
data = pd.read_csv('train.csv')
data.head(10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [5]:
#удалим столбцы с минимиальным количеством данных
exclude_columns = ['PoolQC', 'MiscFeature','FireplaceQu', 'Alley', 'Fence', 'Id']
data = data.drop(columns=exclude_columns)

In [6]:
#для столбцов с численными признаками заполним пропуски данных медианами значений

numeric_columns = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

for col in numeric_columns:
    med = data[col].median()
    data[col] = data[col].fillna(med)

In [7]:
#для столбцов с категориальными данными заполним пропуски наиболее встречающимися значениями

categorical_columns = ['MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond' ]

for col in categorical_columns:
    top_ = data[col].describe()['top']
    data[col] = data[col].fillna(top_)



In [8]:
#для столбцов с категориальными признаками выполним перевод в числовые значения

cat_cols = data.select_dtypes(include = "object").columns

le = LabelEncoder()

for col in cat_cols:
    data[col] = le.fit_transform(data[col].astype(str))


In [15]:
#выбираем Х и У

X = data.iloc[:, :73]
y = data.SalePrice 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)


In [17]:
#строим случайный лес

from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
rf.fit(X_train, y_train)   

RandomForestRegressor()

In [18]:
#попытаемся оценить точность модели
rf.score(X_train, y_train)

0.9760065983388003

In [19]:
#посмотрим значимость признаков
imp = pd.Series(rf.feature_importances_)
imp.sort_values(ascending=False)

15    5.061977e-01
44    1.220790e-01
58    4.373641e-02
36    3.814917e-02
41    2.740234e-02
          ...     
37    7.419824e-05
61    4.232153e-05
69    2.978029e-05
4     2.779875e-07
7     2.077615e-08
Length: 73, dtype: float64

In [26]:
#построим модель, используя 5 топ-признаков

X1 = data.iloc[ :, [15, 44, 58, 36, 41]]
y = data.SalePrice 
X1_train, X1_test, y_train, y_test = train_test_split(X1, y, test_size = 0.3, random_state = 1)
rf_1 = RandomForestRegressor()
rf_1.fit(X1_train, y_train)   

RandomForestRegressor()

In [27]:
#оценим точность новой модели
rf_1.score(X1_test, y_test)

0.8660612490218458

In [23]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.svm import LinearSVR

In [24]:
#зададим модели для стекинга

estimators = [('lr', RidgeCV()), ('svr', LinearSVR(random_state=42)), ('lasso',  LassoCV())]
reg = StackingRegressor(estimators = estimators, final_estimator = RandomForestRegressor())

In [30]:
#оценим точность модели 

reg.fit(X_train, y_train).score(X_test, y_test)



0.8707244053627406