In [1]:
# Теперь решаем задачу регрессии - предскажем цены на недвижимость. 
# Использовать датасет https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data (train.csv)

import pandas as pd

In [2]:
data_or = pd.read_csv('train.csv')  
# len(data) = 1460

target = data_or['SalePrice']
data_or = data_or.drop('SalePrice', axis = 1)
data = data_or
data.shape

(1460, 80)

In [3]:
int64_feat = list(data_or.dtypes[data_or.dtypes=='int64'].index)
# None

cat_feat = list(data_or.dtypes[data_or.dtypes==object].index)
# Alley            91   non-null object      # MasVnrType       1452 non-null object
# BsmtQual         1423 non-null object      # BsmtCond         1423 non-null object
# BsmtExposure     1422 non-null object      # BsmtFinType1     1423 non-null object
# BsmtFinType2     1422 non-null object      # Electrical       1459 non-null object
# FireplaceQu      770  non-null object      # GarageType       1379 non-null object
# GarageFinish     1379 non-null object      # GarageQual       1379 non-null object
# GarageCond       1379 non-null object      # PoolQC           7    non-null object
# Fence            281  non-null object      # MiscFeature      54   non-null object

float64_feat = list(data_or.dtypes[data_or.dtypes=='float64'].index)
# LotFrontage    1201 non-null float64
# MasVnrArea     1452 non-null float64
# GarageYrBlt    1379 non-null float64

In [4]:
data[cat_feat] = data_or[cat_feat].fillna('nan')
# data[cat_feat].info()
data[float64_feat] = data_or[float64_feat].fillna(-999)
# data[float64_feat].info()

In [5]:
dummy_data_cat = pd.get_dummies(data[cat_feat], columns=cat_feat)
cat_nunique = data[cat_feat].nunique()
# len(cat_nunique) = 43
cat_feat_reduced = list(cat_nunique[cat_nunique<10].index)
# len(cat_feat_reduced) = 40

In [6]:
processed_data = pd.concat([data[int64_feat], data[float64_feat], dummy_data_cat],axis=1)

Linear Regression

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
train_X, test_X, train_y, test_y = train_test_split(processed_data, target, test_size = 0.3, random_state = 12)

In [8]:
print('train_X.shape = ', train_X.shape)
print('train_y.shape = ', train_y.shape)
print('test_X.shape = ', test_X.shape)
print('test_y.shape = ', test_y.shape)

train_X.shape =  (1022, 305)
train_y.shape =  (1022,)
test_X.shape =  (438, 305)
test_y.shape =  (438,)


In [9]:
model = LinearRegression()
model.fit(train_X,train_y)
pred_y = model.predict(test_X)
print('     AVG_cost = ',test_y.mean())
print('pred_AVG_cost = ',pred_y.mean())

     AVG_cost =  182380.0
pred_AVG_cost =  179754.5249522874


In [10]:
df = pd.concat([processed_data,target], axis=1)

In [11]:
from sklearn.cross_validation import *
from sklearn.metrics import mean_squared_error
from pandas import *
from sklearn.model_selection import cross_val_score  



# Данных немного, поэтому необходимо использовать 10-fold кросс-валидацию для оценки качества моделей

kf = StratifiedKFold(df['SalePrice'],n_folds=10) # 

mse = []

fold_count = 0

for train, test in kf:
    print("Processing fold %s" % fold_count)
    train_fold = df.ix[train]
    test_fold = df.ix[test]
 
    # find best features
    corr = train_fold.corr()['SalePrice'][train_fold.corr()['SalePrice'] < 1].abs()
    corr.sort_values(ascending=False)
    features = corr.index[[0,1]].values
 
    # Get training examples
    train_fold_input = train_fold[features].values
    train_fold_output = train_fold['SalePrice']
 
    # Fit logistic regression
    logreg = LogisticRegression()
    logreg.fit(train_fold_input, train_fold_output)
 
    # Check MSE on test set
    pred = logreg.predict(test_fold[features])
    mse.append(mean_squared_error(test_fold['SalePrice'], pred))
 
    # Done with the fold
    fold_count += 1

print(DataFrame(mse).mean())


In [12]:
regressor = LinearRegression()

In [13]:
all_accuracies = cross_val_score(estimator=regressor, X=processed_data, y=target, cv=10)  

In [16]:
all_accuracies

array([ 0.87702367,  0.80943146,  0.91181343,  0.74960212,  0.9005552 ,
        0.66918168,  0.88461507,  0.89585085,  0.44714696, -1.1593656 ])

In [122]:
# Построить случайный лес, вывести важность признаков
    
from sklearn.ensemble import RandomForestClassifier
clf_rf = RandomForestClassifier(n_estimators=10, max_depth=5, min_samples_leaf=20, max_features=0.5, n_jobs=-1)
clf_rf.fit(train_X, train_y)   

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features=0.5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=20, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [119]:
imp = pd.Series(clf_rf.feature_importances_)
imp = imp.sort_values(ascending=False)
imp = imp[imp.values>0.01]
print(train_X.columns[imp.index])

Index(['GrLivArea', '1stFlrSF', 'OpenPorchSF', 'GarageArea', 'LotArea',
       '2ndFlrSF', 'BsmtUnfSF', 'OverallQual', 'YearRemodAdd', 'TotalBsmtSF',
       'BsmtFinSF1', 'Id', 'WoodDeckSF', 'YearBuilt', 'EnclosedPorch',
       'MasVnrArea', 'TotRmsAbvGrd', 'FullBath', 'GarageType_Attchd',
       'ScreenPorch', 'GarageFinish_Fin', 'GarageYrBlt', 'LotFrontage',
       'Neighborhood_NWAmes', 'MasVnrType_Stone', 'KitchenQual_TA'],
      dtype='object')


In [None]:
# Обучить стекинг как минимум 3х моделей, использовать хотя бы 1 линейную модель и 1 нелинейную



In [None]:
# Для валидации модели 2-го уровня использовать отдельный hold-out датасет, как на занятии



In [None]:
# Показать, что использование ансамблей моделей действительно улучшает качество 
# (стекинг vs другие модели сравнивать на hold-out)