In [None]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import sklearn
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from mlxtend.regressor import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.linear_model import Lasso



## Feature Engineering


In [None]:
# Load the raw competition tables.
train_set = pd.read_csv("../input/train.csv")
test_set = pd.read_csv("../input/test.csv")

# Remove training outliers: rows with a very large area but a low sale price.
# The three rules are row-wise, so one combined filter keeps the same rows as
# applying them one after another.
train_set = train_set[
    ~(
        ((train_set["GrLivArea"] > 4000) & (train_set["SalePrice"] < 300000))
        | ((train_set["GarageArea"] > 1200) & (train_set["SalePrice"] < 100000))
        | ((train_set["TotalBsmtSF"] > 6000) & (train_set["SalePrice"] < 200000))
    )
]

# Separate the target from the features, then index both feature tables by Id.
y_train = train_set["SalePrice"]
train_set = train_set.drop(columns="SalePrice").set_index("Id")
test_set = test_set.set_index("Id")


In [None]:
# Columns removed from both feature tables before encoding.
weak = [
    'BedroomAbvGr', 'ScreenPorch', 'PoolArea', 'MoSold', '3SsnPorch',
    'BsmtFinSF2', 'BsmtHalfBath', 'MiscVal', 'LowQualFinSF', 'YrSold',
    'OverallCond', 'MSSubClass', 'EnclosedPorch', 'KitchenAbvGr',
]
train_set = train_set.drop(columns=weak)
test_set = test_set.drop(columns=weak)

In [None]:
# NOTE: this cell reassigns train_set, test_set and y_train, so it must be run
# exactly once per kernel session (re-running would log-transform y_train twice).

# Remember how many training rows there are so the combined frame can be split
# back positionally, instead of relying on hard-coded row labels.
n_train = len(train_set)

# One-hot encode train and test together so both get the same dummy columns.
firstmerge = pd.concat([train_set, test_set], axis=0)
data_dummies = pd.get_dummies(firstmerge)
data_dummies = data_dummies.reset_index(drop=True)

# The first n_train rows are the training rows, the rest are the test rows.
train_set = data_dummies.iloc[:n_train]
test_set = data_dummies.iloc[n_train:]

# Fit the imputer on the training data only and reuse its statistics for the
# test data; refitting on the test set would impute it with different values
# (and could drop a different set of all-NaN columns).
my_imputer = SimpleImputer()
train_set = my_imputer.fit_transform(train_set)
test_set = my_imputer.transform(test_set)

# Scale features with min/max learned from the training data only.
scaler = MinMaxScaler()
scaler.fit(train_set)
train_scaled = scaler.transform(train_set)
test_scaled = scaler.transform(test_set)

# Model the log of the sale price; predictions must be inverted with np.exp.
y_train = np.log(y_train)

## Basic Stacked Regression with GridSearch

In [None]:
# lr = LinearRegression()
# svr_lin = SVR(kernel='linear')
# ridge = Ridge(random_state=1)
# lasso = Lasso(random_state=1)
# svr_rbf = SVR(kernel='rbf')


# params = {'lasso__alpha': [0.1, 1.0],
#           'ridge__alpha': [0.1, 1.0],
#           'svr__C': [0.1, 1.0],
#           'meta-svr__C': [0.1, 1.0],
#           'meta-svr__gamma': [0.1, 1.0]}


# stregr = StackingRegressor(regressors=[svr_lin, lr, ridge, lasso], 
#                            meta_regressor=svr_rbf, verbose=1)
# grid = GridSearchCV(estimator=stregr, 
#                     param_grid=params, 
#                     cv=5,
#                     refit=True)
# grid.fit(train_scaled, y_train)

## Stacked Regression w/o GridSearch

In [None]:

# GBoost = GradientBoostingRegressor(n_estimators=5000, learning_rate=0.005,
#                                    max_depth=4, max_features='sqrt',
#                                    min_samples_leaf=15, min_samples_split=10, 
#                                    loss='huber', random_state =5)
# lr = LinearRegression()
# svr_lin = SVR(kernel='linear')
# ridge = Ridge(random_state=1)
# svr_rbf = SVR(kernel='rbf')


In [None]:
# stregr = StackingRegressor(regressors=[svr_lin, lr, ridge, GBoost], 
#                            meta_regressor=svr_rbf)

# # Training the stacking regressor

# stregr.fit(train_scaled, y_train)


## Sklearn Bagging Regressor

In [None]:
# bag_reg = BaggingRegressor(GradientBoostingRegressor(learning_rate=0.005,
#                                 max_depth=4, max_features='sqrt',
#                                 min_samples_leaf=15, min_samples_split=10, 
#                                 loss='huber', random_state =5, n_estimators=400), n_estimators=500, max_samples=100, bootstrap=True)

In [None]:
# bag_reg.fit(train_scaled, y_train)

## Random Forest

In [27]:
# from sklearn.ensemble import RandomForestRegressor
# rnd_clf = RandomForestRegressor(n_estimators=2500, max_depth=3, max_features='sqrt', warm_start=True, verbose=1)
# rnd_clf.fit(train_scaled, y_train)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2500 out of 2500 | elapsed:    1.7s finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=3,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=2500, n_jobs=None,
           oob_score=False, random_state=None, verbose=1, warm_start=True)

## Gradient Boosted Regressor

In [None]:
# Gradient-boosting regressor that produces the submission predictions.
GBoost = GradientBoostingRegressor(
    loss='huber',
    learning_rate=0.001,
    n_estimators=5000,
    max_depth=4,
    max_features='sqrt',
    min_samples_split=10,
    min_samples_leaf=15,
    random_state=5,  # fixed seed so repeated fits give the same model
)

In [None]:
# Train on the scaled training features. y_train holds log(SalePrice) at this
# point, so the fitted model predicts on the log scale.
GBoost.fit(train_scaled, y_train)

In [None]:

# Predict on the scaled test features; the model outputs log(SalePrice).
test_result = GBoost.predict(test_scaled)

# Invert the target transform. The target was transformed with np.log (not
# np.log1p), so the exact inverse is np.exp; subtracting 1 would bias every
# predicted price downwards by one.
test_result = np.exp(test_result)

# Build the submission table.
# NOTE(review): assumes the test Ids are contiguous and start at 1461 — confirm
# against test.csv.
df = pd.DataFrame({'SalePrice': test_result})
df.index.name = 'Id'
df.index += 1461
df.to_csv('gridsearch.csv')