# Предсказание цены на недвижимость 

Соревнование Kaggle: https://www.kaggle.com/c/house-prices-advanced-regression-techniques

### Данные

In [1]:
#Загружаем pandas и numpy
import pandas as pd
import numpy as np

In [2]:
#Загружаем данные из учебной выборке и те, которые нужно предсказать в соревновании
X = pd.read_csv('train.csv')
X_pred = pd.read_csv('test.csv')

In [3]:
X.head(10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


In [4]:
#Отделяем целевую переменную и удаляем ее из выборки
y = X.SalePrice
X.drop('SalePrice', axis=1, inplace=True)

In [5]:
#Добавляем к выборкам столбец is_test, по которому будем отличать тренеровочные данных от данных, 
#для которых нужно предсказать цену

train=X
test=X_pred

train['is_test'] = 0
test['is_test'] = 1

In [6]:
#Соединяем две выборки
df = pd.concat([train, test])

In [7]:
#Удаляем из выборки параметры, с которымим не будем работать. Оставляем описание, чтобы можно было к ним вернуться
df.drop(["LotFrontage", #Linear feet of street connected to property
         "Street", #Type of road access to property
         "Alley", #Type of alley access to property
         "LotShape", #General shape of property
         "LandContour", #Flatness of the property
         "Utilities", #Type of utilities available
         "LotConfig", #Lot configuration
         "LandSlope", #Slope of property
         "OverallQual", #Rates the overall material and finish of the house
         "OverallCond", #Rates the overall condition of the house
         "YearBuilt", #Original construction date
         "RoofStyle", #Type of roof
         "RoofMatl", #Roof material
         "Exterior2nd", #Exterior covering on house (if more than one material)
         "MasVnrType", #Masonry veneer type
         "MasVnrArea", #Masonry veneer area in square feet
         "ExterQual", #Evaluates the quality of the material on the exterior
         "ExterCond", #Evaluates the present condition of the material on the exterior
         "Foundation", #Type of foundation
         "BsmtCond", #Evaluates the general condition of the basement
         "BsmtExposure", #Refers to walkout or garden level walls
         "BsmtFinType1", #Rating of basement finished area
         "BsmtFinSF1", #Type 1 finished square feet
         "BsmtFinType2", #Rating of basement finished area (if multiple types)
         "BsmtFinSF2", #Type 2 finished square feet
         "BsmtUnfSF", #Unfinished square feet of basement area
         "TotalBsmtSF", #Total square feet of basement area
         "Heating", #Type of heating
         "Electrical", #Electrical system
         "1stFlrSF", #First Floor square feet
         "2ndFlrSF", #Second floor square feet
         "FireplaceQu", #Fireplace quality
         "GarageType", #Garage location
         "GarageYrBlt", #Year garage was built
         "GarageFinish", #Interior finish of the garage
         "GarageCond", #Garage condition
         "PavedDrive", #Paved driveway
         "WoodDeckSF", #Wood deck area in square feet
         "OpenPorchSF", #Open porch area in square feet
         "EnclosedPorch", #Enclosed porch area in square feet
         "3SsnPorch", #Three season porch area in square feet
         "ScreenPorch", #Screen porch area in square feet
         "PoolQC", #Pool quality
         "Fence", #Fence quality
         "MiscFeature", #Miscellaneous feature not covered in other categories
         "MiscVal", #$Value of miscellaneous feature
        ], 
        axis=1, inplace=True)

In [8]:
#Признаки, значения которых составляют небольшой перечислимый набор, закодируем в отдельные столбцы
df_dummies = pd.get_dummies(df, columns=['MSSubClass', 'MSZoning', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
                                        'HouseStyle', 'YearRemodAdd', 'Exterior1st', 'BsmtQual', 'HeatingQC', 'CentralAir',
                                        'KitchenQual', 'Functional', 'GarageQual', 'SaleType', 'SaleCondition'])

In [9]:
df_dummies.head()

Unnamed: 0,Id,LotArea,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,8450,0,1710,1.0,0.0,2,1,3,1,...,0,0,0,1,0,0,0,0,1,0
1,2,9600,0,1262,0.0,1.0,2,0,3,1,...,0,0,0,1,0,0,0,0,1,0
2,3,11250,0,1786,1.0,0.0,2,1,3,1,...,0,0,0,1,0,0,0,0,1,0
3,4,9550,0,1717,1.0,0.0,1,0,3,1,...,0,0,0,1,1,0,0,0,0,0
4,5,14260,0,2198,1.0,0.0,2,1,4,1,...,0,0,0,1,0,0,0,0,1,0


In [10]:
#Смотрим параметры в которых есть неопределенные значения
d=df_dummies.isnull().sum()

In [11]:
d

Id                       0
LotArea                  0
LowQualFinSF             0
GrLivArea                0
BsmtFullBath             2
BsmtHalfBath             2
FullBath                 0
HalfBath                 0
BedroomAbvGr             0
KitchenAbvGr             0
TotRmsAbvGrd             0
Fireplaces               0
GarageCars               1
GarageArea               1
PoolArea                 0
MoSold                   0
YrSold                   0
is_test                  0
MSSubClass_20            0
MSSubClass_30            0
MSSubClass_40            0
MSSubClass_45            0
MSSubClass_50            0
MSSubClass_60            0
MSSubClass_70            0
MSSubClass_75            0
MSSubClass_80            0
MSSubClass_85            0
MSSubClass_90            0
MSSubClass_120           0
                        ..
KitchenQual_Fa           0
KitchenQual_Gd           0
KitchenQual_TA           0
Functional_Maj1          0
Functional_Maj2          0
Functional_Min1          0
F

In [12]:
#Разделяем выборки на ту, по которой будем учиться и ту, на которой будем предсказывать на основе ранее созданного столбца
X_tr = df_dummies[df_dummies.is_test==0].drop('is_test', axis=1)
X_pred = df_dummies[df_dummies.is_test==1].drop('is_test', axis=1)

In [20]:
columns = X_tr.columns

In [21]:
X_tr.head()

Unnamed: 0,Id,LotArea,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,8450,0,1710,1.0,0.0,2,1,3,1,...,0,0,0,1,0,0,0,0,1,0
1,2,9600,0,1262,0.0,1.0,2,0,3,1,...,0,0,0,1,0,0,0,0,1,0
2,3,11250,0,1786,1.0,0.0,2,1,3,1,...,0,0,0,1,0,0,0,0,1,0
3,4,9550,0,1717,1.0,0.0,1,0,3,1,...,0,0,0,1,1,0,0,0,0,0
4,5,14260,0,2198,1.0,0.0,2,1,4,1,...,0,0,0,1,0,0,0,0,1,0


### Заполнение пустых значений

In [22]:
from sklearn.preprocessing import Imputer

In [23]:
#Заполняем пустые значения средними
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=0, copy=True)

In [24]:
imputer.fit(X_tr)

Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)

In [25]:
X_train_imputed = imputer.transform(X_tr)
X_train_imputed = pd.DataFrame(X_train_imputed, columns=columns)

### Нормировка значений

In [26]:
from sklearn.preprocessing import StandardScaler

In [27]:
scaler = StandardScaler()

In [28]:
scaler.fit(X_train_imputed)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [29]:
#Производим нормирование значений из тестовой выборки
X_train_scaled = scaler.transform(X_train_imputed)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=columns)

In [30]:
X_train_scaled.head(10)

Unnamed: 0,Id,LotArea,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,-1.730865,-0.207142,-0.120242,0.370333,1.10781,-0.241061,0.789741,1.227585,0.163779,-0.211454,...,-0.058621,-0.301962,-0.045376,0.390293,-0.272616,-0.052414,-0.091035,-0.117851,0.467651,-0.305995
1,-1.728492,-0.091886,-0.120242,-0.482512,-0.819964,3.948809,0.789741,-0.761621,0.163779,-0.211454,...,-0.058621,-0.301962,-0.045376,0.390293,-0.272616,-0.052414,-0.091035,-0.117851,0.467651,-0.305995
2,-1.72612,0.07348,-0.120242,0.515013,1.10781,-0.241061,0.789741,1.227585,0.163779,-0.211454,...,-0.058621,-0.301962,-0.045376,0.390293,-0.272616,-0.052414,-0.091035,-0.117851,0.467651,-0.305995
3,-1.723747,-0.096897,-0.120242,0.383659,1.10781,-0.241061,-1.026041,-0.761621,0.163779,-0.211454,...,-0.058621,-0.301962,-0.045376,0.390293,3.668167,-0.052414,-0.091035,-0.117851,-2.138345,-0.305995
4,-1.721374,0.375148,-0.120242,1.299326,1.10781,-0.241061,0.789741,1.227585,1.390023,-0.211454,...,-0.058621,-0.301962,-0.045376,0.390293,-0.272616,-0.052414,-0.091035,-0.117851,0.467651,-0.305995
5,-1.719002,0.360616,-0.120242,-0.292145,1.10781,-0.241061,-1.026041,1.227585,-2.288708,-0.211454,...,-0.058621,-0.301962,-0.045376,0.390293,-0.272616,-0.052414,-0.091035,-0.117851,0.467651,-0.305995
6,-1.716629,-0.043379,-0.120242,0.339875,1.10781,-0.241061,0.789741,-0.761621,0.163779,-0.211454,...,-0.058621,-0.301962,-0.045376,0.390293,-0.272616,-0.052414,-0.091035,-0.117851,0.467651,-0.305995
7,-1.714256,-0.013513,-0.120242,1.093729,1.10781,-0.241061,0.789741,1.227585,0.163779,-0.211454,...,-0.058621,-0.301962,-0.045376,0.390293,-0.272616,-0.052414,-0.091035,-0.117851,0.467651,-0.305995
8,-1.711883,-0.440659,-0.120242,0.492168,-0.819964,-0.241061,0.789741,-0.761621,-1.062465,4.328579,...,-0.058621,-0.301962,-0.045376,0.390293,3.668167,-0.052414,-0.091035,-0.117851,-2.138345,-0.305995
9,-1.709511,-0.31037,-0.120242,-0.834691,1.10781,-0.241061,-1.026041,-0.761621,-1.062465,4.328579,...,-0.058621,-0.301962,-0.045376,0.390293,-0.272616,-0.052414,-0.091035,-0.117851,0.467651,-0.305995


In [31]:
#Производим нормирование значений из выборки для которой будем делать предсказания
X_pred_scaled = scaler.transform(imputer.transform(X_pred))

### Разделение на обучающую и тестирующую выборки

In [33]:
from sklearn.model_selection import train_test_split

In [34]:
X_train_fin, X_val, y_train_fin, y_val = train_test_split(X_train_scaled, y, test_size=0.2)

In [35]:
X_train_fin.shape

(1168, 211)

### Обучение с кросс-валидацией

In [36]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [37]:
cs = 10**np.linspace(-3,1,5)
cs

array([  1.00000000e-03,   1.00000000e-02,   1.00000000e-01,
         1.00000000e+00,   1.00000000e+01])

In [38]:
grid = {'C': cs}
gridsearch = GridSearchCV(LogisticRegression(), grid, scoring='accuracy', cv=5)

In [39]:
%%time
gridsearch.fit(X_train_fin, y_train_fin)



Wall time: 7min 10s


GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': array([  1.00000e-03,   1.00000e-02,   1.00000e-01,   1.00000e+00,
         1.00000e+01])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [40]:
#Подбираем лучший параметр C
sorted(gridsearch.grid_scores_, key = lambda x: -x.mean_validation_score)



[mean: 0.01370, std: 0.00678, params: {'C': 1.0},
 mean: 0.01370, std: 0.00400, params: {'C': 10.0},
 mean: 0.01027, std: 0.00591, params: {'C': 0.001},
 mean: 0.00942, std: 0.00647, params: {'C': 0.01},
 mean: 0.00942, std: 0.00427, params: {'C': 0.10000000000000001}]

In [41]:
gridsearch.best_params_

{'C': 1.0}

In [42]:
best_C = gridsearch.best_params_["C"]

### Оценка точности

In [43]:
from sklearn.metrics import accuracy_score

In [44]:
clf = LogisticRegression(C=best_C)

In [45]:
clf.fit(X_train_fin, y_train_fin)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [46]:
y_val_pred = clf.predict(X_val)

In [47]:
accuracy_score(y_val, y_val_pred)

0.010273972602739725

### Предсказание для Kaggle

In [49]:
clf.fit(X_train_scaled, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [50]:
clf.predict_proba(X_pred_scaled)[:10]

array([[ 0.00063888,  0.00084542,  0.00124627, ...,  0.00061557,
         0.00063251,  0.00081088],
       [ 0.00210353,  0.00128729,  0.00101882, ...,  0.00090034,
         0.00106129,  0.00074996],
       [ 0.00090458,  0.00078255,  0.0011234 , ...,  0.00037411,
         0.0009873 ,  0.00088788],
       ..., 
       [ 0.00092224,  0.00082451,  0.00116696, ...,  0.00031844,
         0.00094677,  0.00064439],
       [ 0.00084179,  0.00124661,  0.0014203 , ...,  0.00050451,
         0.00056322,  0.00076069],
       [ 0.00103018,  0.00118021,  0.00103169, ...,  0.00094273,
         0.0008731 ,  0.00076013]])

In [51]:
#Делаем предсказания
predictions = clf.predict(X_pred_scaled)
predictions

array([139950, 136000, 181000, ..., 240000, 135000, 135000], dtype=int64)

In [52]:
#Формируем строку
submussion = 'Id,SalePrice\n'
submussion += "\n".join(["{},{}".format(pid, prediction) for pid, prediction in zip(test.Id, predictions)])

In [54]:
#Записываем данные в файл
with open('submission3.csv', 'w') as file:
    file.write(submussion)

In [56]:
#Записываем предсказания в словарь, чтобы была возможность удобного вывода результатов
slovar=dict()
for col, val in zip(X_tr.columns, clf.coef_[0]):
    slovar[val]=col

In [205]:
#Выводим параметры и их вклад в предсказание
for k in sorted(slovar.keys()):
    print (slovar[k], ':', k)

GarageQual_TA : -0.205958305592
Exterior1st_AsbShng : -0.13592138703
SaleType_ConLD : -0.123606715899
GarageQual_Fa : -0.106007825912
HouseStyle_2.5Unf : -0.102796411105
KitchenQual_Fa : -0.102546153611
GarageArea : -0.080525059538
YearRemodAdd_1952 : -0.0786726791874
TotRmsAbvGrd : -0.0733458032909
Exterior1st_BrkFace : -0.0668201286011
SaleType_COD : -0.0551626316643
SaleCondition_Alloca : -0.0548715397188
SaleCondition_Normal : -0.0546328659366
MSSubClass_70 : -0.0511452659649
CentralAir_Y : -0.0498077411003
Neighborhood_Edwards : -0.0491339345545
GarageQual_Gd : -0.0483323887043
Condition1_Feedr : -0.0480856510238
Exterior1st_MetalSd : -0.0449208015367
Neighborhood_BrkSide : -0.0445816036537
Exterior1st_WdShing : -0.0429368309255
MSSubClass_50 : -0.0413002909071
HouseStyle_2.5Fin : -0.039066507755
LowQualFinSF : -0.0386089201575
YearRemodAdd_1982 : -0.0381616614868
SaleType_Oth : -0.0371363418239
GarageQual_Ex : -0.0357369752149
Functional_Maj2 : -0.0330264676542
HeatingQC_Gd : -0.