In [40]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
 

In [83]:
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

сделаем предобработку данных

In [5]:
exclude_columns = ['PoolQC', 'MiscFeature','FireplaceQu', 'Alley', 'Fence', 'Id']
data = data.drop(columns=exclude_columns)

In [6]:
numeric_columns = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

for col in numeric_columns:
    med = data[col].median()
    data[col] = data[col].fillna(med)
    
    
categorical_columns = ['MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond' ]

for col in categorical_columns:
    top_ = data[col].describe()['top']
    data[col] = data[col].fillna(top_)

cat_cols = data.select_dtypes(include = "object").columns

le = LabelEncoder()

for col in cat_cols:
    data[col] = le.fit_transform(data[col].astype(str))


In [7]:
X = data.iloc[:, :73]
y = data.SalePrice 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)

In [81]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
rf = RandomForestRegressor()
rf.fit(X_train, y_train)  

RandomForestRegressor()

In [14]:
scores = cross_val_score(RandomForestRegressor(),
                X_train, y_train, 
                cv = 10)
print("mean accuracy: {:.2f}".format(scores.mean()))


mean accuracy: 0.82


In [16]:
imp = pd.Series(rf.feature_importances_)

imp.sort_values(ascending=False)

15    5.123721e-01
44    1.133273e-01
58    4.473317e-02
36    3.630550e-02
32    2.926246e-02
          ...     
61    6.796339e-05
69    6.357730e-05
12    3.443400e-05
4     6.166479e-08
7     0.000000e+00
Length: 73, dtype: float64

In [79]:
#отберем признаки
X1 = data.iloc[ :, [15, 44, 58, 36, 32]]
y1 = data.SalePrice 
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size = 0.3, random_state = 1)
scores = cross_val_score(RandomForestRegressor(),
                X1_train, y1_train, 
                cv = 10)
print("mean accuracy: {:.2f}".format(scores.mean()))

mean accuracy: 0.80


In [35]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV, LogisticRegression, LassoCV
from sklearn.svm import LinearSVR

In [60]:
#обучим стекинг
estimators = [('rf', RandomForestRegressor()), ('svr', LinearSVR(max_iter = 1000, random_state=42)), ('lasso',  LassoCV(max_iter=1000))]
reg = StackingRegressor(estimators = estimators, final_estimator = RidgeCV(), cv = 10)
reg.fit(X1_train, y1_train)



StackingRegressor(cv=10,
                  estimators=[('rf', RandomForestRegressor()),
                              ('svr', LinearSVR(random_state=42)),
                              ('lasso', LassoCV())],
                  final_estimator=RidgeCV(alphas=array([ 0.1,  1. , 10. ])))

In [61]:
reg.fit(X1_train, y1_train).score(X1_test, y1_test)



0.8912585314524084

проводим аналогичную предобработку для тестового датасета и обучаем стекинг

In [78]:
df = pd.read_csv('train.csv')
df.head(10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


In [84]:
df = df.drop(columns = exclude_columns)

for col in numeric_columns:
    med = df[col].median()
    df[col] = df[col].fillna(med)
    
for col in categorical_columns:
    top_ = df[col].describe()['top']
    df[col] = df[col].fillna(top_)

cat_cols = df.select_dtypes(include = "object").columns

le = LabelEncoder()

for col in cat_cols:
    df[col] = le.fit_transform(df[col].astype(str))


In [57]:
df.head(10)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,3,65.0,8450,1,3,3,0,4,0,...,0,0,0,0,0,2,2008,8,4,208500
1,20,3,80.0,9600,1,3,3,0,2,0,...,0,0,0,0,0,5,2007,8,4,181500
2,60,3,68.0,11250,1,0,3,0,4,0,...,0,0,0,0,0,9,2008,8,4,223500
3,70,3,60.0,9550,1,0,3,0,0,0,...,272,0,0,0,0,2,2006,8,0,140000
4,60,3,84.0,14260,1,0,3,0,2,0,...,0,0,0,0,0,12,2008,8,4,250000
5,50,3,85.0,14115,1,0,3,0,4,0,...,0,320,0,0,700,10,2009,8,4,143000
6,20,3,75.0,10084,1,3,3,0,4,0,...,0,0,0,0,0,8,2007,8,4,307000
7,60,3,69.0,10382,1,0,3,0,0,0,...,228,0,0,0,350,11,2009,8,4,200000
8,50,4,51.0,6120,1,3,3,0,4,0,...,205,0,0,0,0,4,2008,8,0,129900
9,190,3,50.0,7420,1,3,3,0,0,0,...,0,0,0,0,0,1,2008,8,4,118000


In [67]:
Xdf = df.iloc[ :, [15, 44, 58, 36, 32]]
ydf = df.SalePrice 

In [68]:
reg.fit(Xdf, ydf).score(Xdf, ydf)



0.9702943589557439

построим несколько простых моделей для сравнения

In [71]:
svr = LinearSVR()
svr.fit(Xdf, ydf).score(Xdf, ydf)



0.6210890742132908

In [72]:
lasso = LassoCV()
lasso.fit(Xdf, ydf).score(Xdf, ydf)

0.6250252611019174

In [73]:
rg = RidgeCV()
rg.fit(Xdf, ydf).score(Xdf, ydf)

0.7724208807047744