# Загрузка датасета и выявление численных и категориальных признаков

In [1]:
import numpy as np
import pandas as pd
import catboost
from sklearn.model_selection import GridSearchCV

In [2]:
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')

In [3]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
num_features = []
for col in train.columns:
    if train[col].dtype != 'object':
        num_features.append(col)
num_features

['Id',
 'MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold',
 'SalePrice']

In [5]:
cat_features = []
for col in train.columns:
    if train[col].dtype == 'object':
        cat_features.append(col)
cat_features

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

# Поиск Nan значений

In [7]:
num_na = []
for _ in num_features:
    if sum(train[_].isna()) > 0:
        num_na.append(_)
num_na

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [8]:
cat_na = []
for _ in cat_features:
    if sum(train[_].isna()) > 0:
        cat_na.append(_)
cat_na

['Alley',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [9]:
for _ in cat_na:
    test[_] = train[_].fillna('Not Stated')

In [10]:
for _ in num_na:
    train[_] = train[_].fillna(np.mean(test[_]))

In [101]:
num_features = []
for col in test.columns:
    if test[col].dtype != 'object':
        num_features.append(col)
num_features

['Id',
 'MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

In [103]:
num_na = []
for _ in num_features:
    if sum(test[_].isna()) > 0:
        num_na.append(_)
num_na

['LotFrontage',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'BsmtFullBath',
 'BsmtHalfBath',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea']

In [106]:
for _ in num_na:
    test[_] = test[_].fillna(np.mean(test[_]))

In [None]:
for _ in cat_na:
    test[_] = test[_].fillna('Not Stated')

# One-hot-encoding для некоторых категориальных признаков

In [114]:
new_train = train
new_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Alley,Neighborhood,Condition1,Condition2,HouseStyle,...,Fa,Gd,TA,CBlock,PConc,Slab,Stone,Wood,P,Y
0,1,60,RL,65.0,8450,Not Stated,CollgCr,Norm,Norm,2Story,...,0,1,0,0,1,0,0,0,0,1
1,2,20,RL,80.0,9600,Not Stated,Veenker,Feedr,Norm,1Story,...,0,0,1,1,0,0,0,0,0,1
2,3,60,RL,68.0,11250,Not Stated,CollgCr,Norm,Norm,2Story,...,0,1,0,0,1,0,0,0,0,1
3,4,70,RL,60.0,9550,Not Stated,Crawfor,Norm,Norm,2Story,...,0,0,1,0,0,0,0,0,0,1
4,5,60,RL,84.0,14260,Not Stated,NoRidge,Norm,Norm,2Story,...,0,1,0,0,1,0,0,0,0,1


In [None]:
OH_features = ['LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'BldgType', 'ExterQual',
              'Foundation','PavedDrive']

In [None]:
new_train = new_train.join(pd.get_dummies(train['Street']))
new_train = new_train.drop('Street', axis = 1)

In [74]:
for _ in OH_features:
    one_hot = pd.get_dummies(train[_], drop_first=True )
    new_train = new_train.join(one_hot)
    new_test = new_test.drop(_, axis = 1)

In [107]:
new_test = test

In [113]:
new_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Alley,Neighborhood,Condition1,Condition2,HouseStyle,...,Fa,Gd,TA,CBlock,PConc,Slab,Stone,Wood,P,Y
0,1461,20,RH,80.0,11622,Not Stated,NAmes,Feedr,Norm,1Story,...,0,0,1,1,0,0,0,0,0,1
1,1462,20,RL,81.0,14267,Not Stated,NAmes,Norm,Norm,1Story,...,0,0,1,1,0,0,0,0,0,1
2,1463,60,RL,74.0,13830,Not Stated,Gilbert,Norm,Norm,2Story,...,0,0,1,0,1,0,0,0,0,1
3,1464,60,RL,78.0,9978,Not Stated,Gilbert,Norm,Norm,2Story,...,0,0,1,0,1,0,0,0,0,1
4,1465,120,RL,43.0,5005,Not Stated,StoneBr,Norm,Norm,1Story,...,0,1,0,0,1,0,0,0,0,1


In [111]:
OH_features = ['LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'BldgType', 'ExterQual',
              'Foundation','PavedDrive']

In [109]:
new_test = new_test.join(pd.get_dummies(train['Street']))

In [110]:
new_test = new_test.drop('Street', axis = 1)

In [112]:
for _ in OH_features:
    one_hot = pd.get_dummies(test[_], drop_first=True )
    new_test = new_test.join(one_hot)
    new_test = new_test.drop(_, axis = 1)

# Модель catboost

In [122]:
X_train = new_train.drop(['SalePrice', 'Id'], axis = 1)
Y_train = new_train['SalePrice']
X_test = new_test.drop('Id', axis = 1)

In [123]:
X_train

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Alley,Neighborhood,Condition1,Condition2,HouseStyle,OverallQual,...,Fa,Gd,TA,CBlock,PConc,Slab,Stone,Wood,P,Y
0,60,RL,65.0,8450,Not Stated,CollgCr,Norm,Norm,2Story,7,...,0,1,0,0,1,0,0,0,0,1
1,20,RL,80.0,9600,Not Stated,Veenker,Feedr,Norm,1Story,6,...,0,0,1,1,0,0,0,0,0,1
2,60,RL,68.0,11250,Not Stated,CollgCr,Norm,Norm,2Story,7,...,0,1,0,0,1,0,0,0,0,1
3,70,RL,60.0,9550,Not Stated,Crawfor,Norm,Norm,2Story,7,...,0,0,1,0,0,0,0,0,0,1
4,60,RL,84.0,14260,Not Stated,NoRidge,Norm,Norm,2Story,8,...,0,1,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Not Stated,Gilbert,Norm,Norm,2Story,6,...,0,0,1,0,1,0,0,0,0,1
1456,20,RL,85.0,13175,Not Stated,NWAmes,Norm,Norm,1Story,6,...,0,0,1,1,0,0,0,0,0,1
1457,70,RL,66.0,9042,Not Stated,Crawfor,Norm,Norm,2Story,7,...,0,0,0,0,0,0,1,0,0,1
1458,20,RL,68.0,9717,Not Stated,NAmes,Norm,Norm,1Story,5,...,0,0,1,1,0,0,0,0,0,1


In [124]:
X_test

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Alley,Neighborhood,Condition1,Condition2,HouseStyle,OverallQual,...,Fa,Gd,TA,CBlock,PConc,Slab,Stone,Wood,P,Y
0,20,RH,80.0,11622,Not Stated,NAmes,Feedr,Norm,1Story,5,...,0,0,1,1,0,0,0,0,0,1
1,20,RL,81.0,14267,Not Stated,NAmes,Norm,Norm,1Story,6,...,0,0,1,1,0,0,0,0,0,1
2,60,RL,74.0,13830,Not Stated,Gilbert,Norm,Norm,2Story,5,...,0,0,1,0,1,0,0,0,0,1
3,60,RL,78.0,9978,Not Stated,Gilbert,Norm,Norm,2Story,6,...,0,0,1,0,1,0,0,0,0,1
4,120,RL,43.0,5005,Not Stated,StoneBr,Norm,Norm,1Story,8,...,0,1,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,160,RM,21.0,1936,Not Stated,MeadowV,Norm,Norm,2Story,4,...,0,0,1,1,0,0,0,0,0,1
1455,160,RM,21.0,1894,Not Stated,MeadowV,Norm,Norm,2Story,4,...,0,0,1,1,0,0,0,0,0,1
1456,20,RL,160.0,20000,Not Stated,Mitchel,Norm,Norm,1Story,5,...,0,0,1,1,0,0,0,0,0,1
1457,85,RL,62.0,10441,Not Stated,Mitchel,Norm,Norm,SFoyer,5,...,0,0,1,0,1,0,0,0,0,1


## Получение индексов категориальных признаков

In [126]:
categ_features = []
for col in X_train.columns:
    if X_train[col].dtype == 'object':
        categ_features.append(X_train.columns.get_loc(col))

In [127]:
categ_features

[1,
 4,
 5,
 6,
 7,
 8,
 13,
 14,
 15,
 16,
 17,
 19,
 20,
 21,
 22,
 23,
 25,
 29,
 30,
 31,
 32,
 43,
 45,
 47,
 48,
 50,
 53,
 54,
 61,
 62,
 63,
 67,
 68]

In [139]:
Cat_model = catboost.CatBoostRegressor(n_estimators=200,
                                       cat_features=categ_features)

In [140]:
Cat_model.fit(X_train, Y_train)

y_train_predicted = Cat_model.predict(X_train)
y_test_predicted = Cat_model.predict(X_test)

Learning rate set to 0.160846
0:	learn: 71636.7933787	total: 108ms	remaining: 21.5s
1:	learn: 64769.6581728	total: 143ms	remaining: 14.1s
2:	learn: 58699.0960359	total: 188ms	remaining: 12.3s
3:	learn: 53642.6654200	total: 230ms	remaining: 11.2s
4:	learn: 50062.1556610	total: 274ms	remaining: 10.7s
5:	learn: 46384.6684129	total: 309ms	remaining: 9.99s
6:	learn: 43086.2740125	total: 351ms	remaining: 9.68s
7:	learn: 40137.5135525	total: 387ms	remaining: 9.28s
8:	learn: 37833.0467124	total: 433ms	remaining: 9.18s
9:	learn: 35752.7566820	total: 468ms	remaining: 8.89s
10:	learn: 34286.8793604	total: 510ms	remaining: 8.76s
11:	learn: 32692.9404639	total: 545ms	remaining: 8.54s
12:	learn: 31308.2188428	total: 586ms	remaining: 8.42s
13:	learn: 30091.3495571	total: 625ms	remaining: 8.31s
14:	learn: 29063.1824266	total: 661ms	remaining: 8.15s
15:	learn: 28098.8622788	total: 709ms	remaining: 8.15s
16:	learn: 27089.9615038	total: 749ms	remaining: 8.06s
17:	learn: 26181.4621357	total: 804ms	remaini

152:	learn: 12458.7895122	total: 6.35s	remaining: 1.95s
153:	learn: 12447.6637261	total: 6.39s	remaining: 1.91s
154:	learn: 12363.7646840	total: 6.42s	remaining: 1.86s
155:	learn: 12362.1352670	total: 6.47s	remaining: 1.82s
156:	learn: 12336.7342061	total: 6.5s	remaining: 1.78s
157:	learn: 12279.8779827	total: 6.55s	remaining: 1.74s
158:	learn: 12277.4422669	total: 6.58s	remaining: 1.7s
159:	learn: 12274.2920690	total: 6.63s	remaining: 1.66s
160:	learn: 12272.6147713	total: 6.67s	remaining: 1.61s
161:	learn: 12223.5331600	total: 6.71s	remaining: 1.57s
162:	learn: 12201.5504484	total: 6.74s	remaining: 1.53s
163:	learn: 12199.7361588	total: 6.79s	remaining: 1.49s
164:	learn: 12150.1519396	total: 6.83s	remaining: 1.45s
165:	learn: 12059.2279392	total: 6.87s	remaining: 1.41s
166:	learn: 12051.1840986	total: 6.91s	remaining: 1.36s
167:	learn: 11965.9890733	total: 6.95s	remaining: 1.32s
168:	learn: 11963.2809223	total: 6.99s	remaining: 1.28s
169:	learn: 11883.1758913	total: 7.03s	remaining: 

# Создание и сохранение финального датасета с прдесказанными ценами

In [141]:
new_test['SalePrice'] = pd.Series(y_test_predicted)

In [142]:
cat_result = new_test[['Id' ,'SalePrice']]

In [144]:
cat_result.to_csv('CatBoost.csv')