# Предсказание цены на недвижимость 

Соревнование Kaggle: https://www.kaggle.com/c/house-prices-advanced-regression-techniques

### Данные

In [157]:
#Загружаем pandas и numpy
import pandas as pd
import numpy as np

In [158]:
#Загружаем данные из учебной выборке и те, которые нужно предсказать в соревновании
X = pd.read_csv('train.csv')
X_pred = pd.read_csv('test.csv')

In [159]:
X.head(10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


In [160]:
#Отделяем целевую переменную и удаляем ее из выборки
y = X.SalePrice
X.drop('SalePrice', axis=1, inplace=True)

In [161]:
#Добавляем к выборкам столбец is_test, по которому будем отличать тренеровочные данных от данных, 
#для которых нужно предсказать цену

train=X
test=X_pred

train['is_test'] = 0
test['is_test'] = 1

In [162]:
#Соединяем две выборки
df = pd.concat([train, test])

In [163]:
#Удаляем из выборки параметры, с которымим не будем работать. Оставляем описание, чтобы можно было к ним вернуться
df.drop(["LotFrontage", #Linear feet of street connected to property
         "Street", #Type of road access to property
         "Alley", #Type of alley access to property
         "LotShape", #General shape of property
         "LandContour", #Flatness of the property
         "Utilities", #Type of utilities available
         "LotConfig", #Lot configuration
         "LandSlope", #Slope of property
         "OverallQual", #Rates the overall material and finish of the house
         "OverallCond", #Rates the overall condition of the house
         "YearBuilt", #Original construction date
         "RoofStyle", #Type of roof
         "RoofMatl", #Roof material
         "Exterior2nd", #Exterior covering on house (if more than one material)
         "MasVnrType", #Masonry veneer type
         "MasVnrArea", #Masonry veneer area in square feet
         "ExterQual", #Evaluates the quality of the material on the exterior
         "ExterCond", #Evaluates the present condition of the material on the exterior
         "Foundation", #Type of foundation
         "BsmtCond", #Evaluates the general condition of the basement
         "BsmtExposure", #Refers to walkout or garden level walls
         "BsmtFinType1", #Rating of basement finished area
         "BsmtFinSF1", #Type 1 finished square feet
         "BsmtFinType2", #Rating of basement finished area (if multiple types)
         "BsmtFinSF2", #Type 2 finished square feet
         "BsmtUnfSF", #Unfinished square feet of basement area
         "TotalBsmtSF", #Total square feet of basement area
         "Heating", #Type of heating
         "Electrical", #Electrical system
         "1stFlrSF", #First Floor square feet
         "2ndFlrSF", #Second floor square feet
         "FireplaceQu", #Fireplace quality
         "GarageType", #Garage location
         "GarageYrBlt", #Year garage was built
         "GarageFinish", #Interior finish of the garage
         "GarageCond", #Garage condition
         "PavedDrive", #Paved driveway
         "WoodDeckSF", #Wood deck area in square feet
         "OpenPorchSF", #Open porch area in square feet
         "EnclosedPorch", #Enclosed porch area in square feet
         "3SsnPorch", #Three season porch area in square feet
         "ScreenPorch", #Screen porch area in square feet
         "PoolQC", #Pool quality
         "Fence", #Fence quality
         "MiscFeature", #Miscellaneous feature not covered in other categories
         "MiscVal", #$Value of miscellaneous feature
        ], 
        axis=1, inplace=True)

In [164]:
#Признаки, значения которых составляют небольшой перечислимый набор, закодируем в отдельные столбцы
df_dummies = pd.get_dummies(df, columns=['MSSubClass', 'MSZoning', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
                                        'HouseStyle', 'YearRemodAdd', 'Exterior1st', 'BsmtQual', 'HeatingQC', 'CentralAir',
                                        'KitchenQual', 'Functional', 'GarageQual', 'SaleType', 'SaleCondition'])

In [165]:
df_dummies.head()

Unnamed: 0,Id,LotArea,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,8450,0,1710,1.0,0.0,2,1,3,1,...,0,0,0,1,0,0,0,0,1,0
1,2,9600,0,1262,0.0,1.0,2,0,3,1,...,0,0,0,1,0,0,0,0,1,0
2,3,11250,0,1786,1.0,0.0,2,1,3,1,...,0,0,0,1,0,0,0,0,1,0
3,4,9550,0,1717,1.0,0.0,1,0,3,1,...,0,0,0,1,1,0,0,0,0,0
4,5,14260,0,2198,1.0,0.0,2,1,4,1,...,0,0,0,1,0,0,0,0,1,0


In [166]:
#Смотрим параметры в которых есть неопределенные значения
d=df_dummies.isnull().sum()

In [167]:
d

Id                       0
LotArea                  0
LowQualFinSF             0
GrLivArea                0
BsmtFullBath             2
BsmtHalfBath             2
FullBath                 0
HalfBath                 0
BedroomAbvGr             0
KitchenAbvGr             0
TotRmsAbvGrd             0
Fireplaces               0
GarageCars               1
GarageArea               1
PoolArea                 0
MoSold                   0
YrSold                   0
is_test                  0
MSSubClass_20            0
MSSubClass_30            0
MSSubClass_40            0
MSSubClass_45            0
MSSubClass_50            0
MSSubClass_60            0
MSSubClass_70            0
MSSubClass_75            0
MSSubClass_80            0
MSSubClass_85            0
MSSubClass_90            0
MSSubClass_120           0
                        ..
KitchenQual_Fa           0
KitchenQual_Gd           0
KitchenQual_TA           0
Functional_Maj1          0
Functional_Maj2          0
Functional_Min1          0
F

In [168]:
#Разделяем выборки на ту, по которой будем учиться и ту, на которой будем предсказывать на основе ранее созданного столбца
X_tr = df_dummies[df_dummies.is_test==0].drop('is_test', axis=1)
X_pred = df_dummies[df_dummies.is_test==1].drop('is_test', axis=1)

In [169]:
train_share = 0.8

In [170]:
#Разделяем обучающую выборку на тренировочную и тестовую
train_size = int(train_share * X_tr.shape[0])
X_train = X_tr[:train_size]
X_test = X_tr[train_size:]
y_train = y[:train_size]
y_test = y[train_size:]

In [171]:
columns = X_train.columns

In [172]:
X_train.head()

Unnamed: 0,Id,LotArea,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,8450,0,1710,1.0,0.0,2,1,3,1,...,0,0,0,1,0,0,0,0,1,0
1,2,9600,0,1262,0.0,1.0,2,0,3,1,...,0,0,0,1,0,0,0,0,1,0
2,3,11250,0,1786,1.0,0.0,2,1,3,1,...,0,0,0,1,0,0,0,0,1,0
3,4,9550,0,1717,1.0,0.0,1,0,3,1,...,0,0,0,1,1,0,0,0,0,0
4,5,14260,0,2198,1.0,0.0,2,1,4,1,...,0,0,0,1,0,0,0,0,1,0


### Заполнение пустых значений

In [173]:
from sklearn.preprocessing import Imputer

In [174]:
#Заполняем пустые значения средними
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=0, copy=True)

In [175]:
imputer.fit(X_train)

Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)

In [176]:
X_train_imputed = imputer.transform(X_train)
X_train_imputed = pd.DataFrame(X_train_imputed, columns=columns)

### Нормировка значений

In [177]:
from sklearn.preprocessing import StandardScaler

In [178]:
scaler = StandardScaler()

In [179]:
scaler.fit(X_train_imputed)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [180]:
#Производим нормирование значений из тестовой выборки
X_train_scaled = scaler.transform(X_train_imputed)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=columns)

In [181]:
X_train_scaled.head(10)

Unnamed: 0,Id,LotArea,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,-1.730569,-0.194076,-0.124196,0.410598,1.109467,-0.250969,0.804696,1.241387,0.181361,-0.206141,...,-0.058621,-0.304319,-0.041416,0.399851,-0.276582,-0.058621,-0.097506,-0.121531,0.477683,-0.30933
1,-1.727603,-0.086338,-0.124196,-0.474095,-0.810322,3.764532,0.804696,-0.761293,0.181361,-0.206141,...,-0.058621,-0.304319,-0.041416,0.399851,-0.276582,-0.058621,-0.097506,-0.121531,0.477683,-0.30933
2,-1.724637,0.068243,-0.124196,0.56068,1.109467,-0.250969,0.804696,1.241387,0.181361,-0.206141,...,-0.058621,-0.304319,-0.041416,0.399851,-0.276582,-0.058621,-0.097506,-0.121531,0.477683,-0.30933
3,-1.721671,-0.091022,-0.124196,0.424421,1.109467,-0.250969,-1.016786,-0.761293,0.181361,-0.206141,...,-0.058621,-0.304319,-0.041416,0.399851,3.615562,-0.058621,-0.097506,-0.121531,-2.093439,-0.30933
4,-1.718705,0.350235,-0.124196,1.374281,1.109467,-0.250969,0.804696,1.241387,1.39877,-0.206141,...,-0.058621,-0.304319,-0.041416,0.399851,-0.276582,-0.058621,-0.097506,-0.121531,0.477683,-0.30933
5,-1.715739,0.336651,-0.124196,-0.276619,1.109467,-0.250969,-1.016786,1.241387,-2.253458,-0.206141,...,-0.058621,-0.304319,-0.041416,0.399851,-0.276582,-0.058621,-0.097506,-0.121531,0.477683,-0.30933
6,-1.712773,-0.040995,-0.124196,0.379002,1.109467,-0.250969,0.804696,-0.761293,0.181361,-0.206141,...,-0.058621,-0.304319,-0.041416,0.399851,-0.276582,-0.058621,-0.097506,-0.121531,0.477683,-0.30933
7,-1.709808,-0.013076,-0.124196,1.161007,1.109467,-0.250969,0.804696,1.241387,0.181361,-0.206141,...,-0.058621,-0.304319,-0.041416,0.399851,-0.276582,-0.058621,-0.097506,-0.121531,0.477683,-0.30933
8,-1.706842,-0.412363,-0.124196,0.536983,-0.810322,-0.250969,0.804696,-0.761293,-1.036049,4.336749,...,-0.058621,-0.304319,-0.041416,0.399851,3.615562,-0.058621,-0.097506,-0.121531,-2.093439,-0.30933
9,-1.703876,-0.290572,-0.124196,-0.839425,1.109467,-0.250969,-1.016786,-0.761293,-1.036049,4.336749,...,-0.058621,-0.304319,-0.041416,0.399851,-0.276582,-0.058621,-0.097506,-0.121531,0.477683,-0.30933


In [182]:
#Производим нормирование значений из тестовой выборки
X_test_imputed_scaled = scaler.transform(imputer.transform(X_test))

In [183]:
#Производим нормирование значений из выборки для которой будем делать предсказания
X_pred_scaled = scaler.transform(imputer.transform(X_pred))

### Разделение на обучающую и тестирующую выборки

In [184]:
from sklearn.model_selection import train_test_split

In [185]:
X_train_fin, X_val, y_train_fin, y_val = train_test_split(X_train_scaled, y_train, test_size=0.2)

In [186]:
X_train_fin.shape

(934, 211)

### Обучение с кросс-валидацией

In [187]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [188]:
cs = 10**np.linspace(-3,1,5)
cs

array([  1.00000000e-03,   1.00000000e-02,   1.00000000e-01,
         1.00000000e+00,   1.00000000e+01])

In [189]:
grid = {'C': cs}
gridsearch = GridSearchCV(LogisticRegression(), grid, scoring='accuracy', cv=5)

In [190]:
%%time
gridsearch.fit(X_train_fin, y_train_fin)



Wall time: 7min 56s


GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': array([  1.00000e-03,   1.00000e-02,   1.00000e-01,   1.00000e+00,
         1.00000e+01])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [191]:
#Подбираем лучший параметр C
sorted(gridsearch.grid_scores_, key = lambda x: -x.mean_validation_score)



[mean: 0.02034, std: 0.01390, params: {'C': 1.0},
 mean: 0.01927, std: 0.00724, params: {'C': 10.0},
 mean: 0.01820, std: 0.00593, params: {'C': 0.10000000000000001},
 mean: 0.01713, std: 0.00532, params: {'C': 0.001},
 mean: 0.01606, std: 0.00983, params: {'C': 0.01}]

In [192]:
gridsearch.best_params_

{'C': 1.0}

In [193]:
best_C = gridsearch.best_params_["C"]

### Оценка точности

In [194]:
from sklearn.metrics import accuracy_score

In [195]:
clf = LogisticRegression(C=best_C)

In [196]:
clf.fit(X_train_fin, y_train_fin)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [197]:
y_val_pred = clf.predict(X_val)

In [198]:
accuracy_score(y_val, y_val_pred)

0.0042735042735042739

### Предсказание на тесте

In [199]:
clf.fit(X_train_scaled, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [200]:
clf.predict_proba(X_test_imputed_scaled)[:10]

array([[ 0.00042727,  0.00059632,  0.00035187, ...,  0.00184151,
         0.00143003,  0.00231405],
       [ 0.00131742,  0.00122703,  0.00080594, ...,  0.00206398,
         0.00156859,  0.01745682],
       [ 0.00110536,  0.00072811,  0.00065559, ...,  0.00081471,
         0.00089602,  0.00028126],
       ..., 
       [ 0.00085725,  0.00051943,  0.00186126, ...,  0.00149228,
         0.00173264,  0.00756853],
       [ 0.00145213,  0.00215138,  0.00118782, ...,  0.00091697,
         0.00123599,  0.00087353],
       [ 0.00106883,  0.0012837 ,  0.00221289, ...,  0.00060035,
         0.00076998,  0.0008831 ]])

In [201]:
#Получаем предсказания для тестовой выборки
predictions = clf.predict(X_test_imputed_scaled)
predictions

array([168000, 268000, 135000, 136000, 146000, 315000, 165000, 268000,
       178000, 143000, 160000, 165000, 187500, 182000, 181000, 144000,
       132250, 110000, 118000, 290000, 287000, 199900, 170000, 165000,
       135000, 143000, 135000, 174000, 240000, 207500, 231500,  99500,
       150000, 200500, 149000, 189000, 170000, 238000, 144000, 180000,
       130250, 320000, 175900, 174000,  80000, 130000, 160000, 127000,
       130500, 370878, 102000,  88000, 160000, 142000, 133000, 102000,
       162000, 136000, 245000, 297000, 446261, 155000, 135000, 160000,
       160000, 128950, 145000, 165000, 154000, 216837, 158000, 315000,
       237000, 245000, 147000, 318000, 215000, 180000, 197000, 146800,
       163000, 128000, 142000, 180000, 142000, 218000, 190000, 110000,
       341000,  85000, 184100, 138800, 222500, 135000, 160000, 159500,
       133000, 176000, 133900, 116000, 240000, 135000,  99500, 155000,
       129500, 345000, 135000, 110000, 135000, 190000, 255000, 108000,
      

In [202]:
#Формируем строку в специальном формате
submussion = 'Id,SalePrice\n'
submussion += "\n".join(["{},{}".format(pid, prediction) for pid, prediction in zip(test.Id, predictions)])

In [203]:
#Создаем файл с предсказаниями
with open('submission.txt', 'w') as file:
    file.write(submussion)

In [204]:
#Записываем предсказания в словарь, чтобы была возможность удобного вывода результатов
slovar=dict()
for col, val in zip(X_train.columns, clf.coef_[0]):
    slovar[val]=col

In [205]:
#Выводим параметры и их вклад в предсказание
for k in sorted(slovar.keys()):
    print (slovar[k], ':', k)

GarageQual_TA : -0.205958305592
Exterior1st_AsbShng : -0.13592138703
SaleType_ConLD : -0.123606715899
GarageQual_Fa : -0.106007825912
HouseStyle_2.5Unf : -0.102796411105
KitchenQual_Fa : -0.102546153611
GarageArea : -0.080525059538
YearRemodAdd_1952 : -0.0786726791874
TotRmsAbvGrd : -0.0733458032909
Exterior1st_BrkFace : -0.0668201286011
SaleType_COD : -0.0551626316643
SaleCondition_Alloca : -0.0548715397188
SaleCondition_Normal : -0.0546328659366
MSSubClass_70 : -0.0511452659649
CentralAir_Y : -0.0498077411003
Neighborhood_Edwards : -0.0491339345545
GarageQual_Gd : -0.0483323887043
Condition1_Feedr : -0.0480856510238
Exterior1st_MetalSd : -0.0449208015367
Neighborhood_BrkSide : -0.0445816036537
Exterior1st_WdShing : -0.0429368309255
MSSubClass_50 : -0.0413002909071
HouseStyle_2.5Fin : -0.039066507755
LowQualFinSF : -0.0386089201575
YearRemodAdd_1982 : -0.0381616614868
SaleType_Oth : -0.0371363418239
GarageQual_Ex : -0.0357369752149
Functional_Maj2 : -0.0330264676542
HeatingQC_Gd : -0.

### Предсказания для Kaggle

In [206]:
#Проделываем те же манипуляции, что и для тестовой выборки для данных по которым необходимо сделать прогноз
clf.predict_proba(X_pred_scaled)[:10]

array([[ 0.00080047,  0.00105408,  0.00183942, ...,  0.0015126 ,
         0.00214294,  0.00101578],
       [ 0.00220581,  0.00153444,  0.00154546, ...,  0.00135865,
         0.0016833 ,  0.00096242],
       [ 0.00101935,  0.00096216,  0.00139914, ...,  0.00189461,
         0.00121455,  0.00097242],
       ..., 
       [ 0.00105887,  0.00103198,  0.00145828, ...,  0.00171924,
         0.00137314,  0.00069447],
       [ 0.00086198,  0.00161631,  0.00196243, ...,  0.00085808,
         0.00208931,  0.0008718 ],
       [ 0.00122304,  0.00141059,  0.00124491, ...,  0.00122672,
         0.00095843,  0.00088394]])

In [208]:
#Делаем предсказания
predictions2 = clf.predict(X_pred_scaled)
predictions2

array([139950, 110000, 181000, ..., 135000, 135000, 135000], dtype=int64)

In [209]:
#Формируем строку
submussion2 = 'Id,SalePrice\n'
submussion2 += "\n".join(["{},{}".format(pid, prediction2) for pid, prediction2 in zip(test.Id, predictions2)])

In [211]:
#Записываем данные в файл
with open('submission2.csv', 'w') as file:
    file.write(submussion2)