In [1]:
import pandas as pd

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import ElasticNet, Lasso, LassoLarsIC, LinearRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.svm import SVR
#import lightgbm as lgb

from xgboost import XGBRegressor

%matplotlib inline
import matplotlib.pyplot as plt

import importlib

import numpy as np

from DataFrameImputer import DataFrameImputer
from StackedModels import StackingModels

#### 1. Load data

In [2]:
# Load the training set (which carries the SalePrice target) and the test set
# from CSV files in the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
# Target preparation and separation of predictors from the target.
# NOTE: this cell modifies `train` in place, so running it a second time
# applies log1p to an already-transformed target; reload the data first.

# Rows without a SalePrice value cannot be used for training.
train.dropna(subset=['SalePrice'], axis=0, inplace=True)

# Replace the target with log(1 + SalePrice).
log_sale_price = np.log1p(train["SalePrice"])
train["SalePrice"] = log_sale_price

# Candidate predictors: everything except the row identifier and the target.
candidate_train_predictors = train.drop(labels=['Id', 'SalePrice'], axis='columns')
candidate_test_predictors = test.drop(labels=['Id'], axis='columns')

#### 2. NaN handling - Imputing

In [4]:
# Imputing
# Learn the fill values from the training predictors only and reuse them for
# the test predictors, so both frames are imputed with the same statistics
# (previously a second imputer was refit on the test set).
# NOTE(review): assumes DataFrameImputer follows the scikit-learn transformer
# protocol (separate fit / transform); its definition is not visible here — confirm.
imputer = DataFrameImputer()
train_imputed = imputer.fit_transform(candidate_train_predictors)
test_imputed = imputer.transform(candidate_test_predictors)

#### 3. One-hot encoding

In [5]:
# One-hot encode each imputed frame independently. The two results may end up
# with different dummy columns; they are reconciled in the following cells.
one_hot_encoded_training_predictors = pd.get_dummies(data=train_imputed)
one_hot_encoded_test_predictors = pd.get_dummies(data=test_imputed)

In [6]:
# Dummy columns that the training frame has but the test frame lacks.
col_diff = list(
    set(one_hot_encoded_training_predictors.columns)
    - set(one_hot_encoded_test_predictors.columns)
)

In [7]:
# Reindex the test frame onto the training frame's columns. With join='left'
# the training columns win: test-only columns are dropped and columns missing
# from the test frame are created (as missing values).
final_train_cat, final_test_cat = one_hot_encoded_training_predictors.align(
    one_hot_encoded_test_predictors,
    axis=1,
    join='left',
)

In [8]:
# The aligned training frame is used as-is.
train_new = final_train_cat

# Columns listed in col_diff exist only in the training data, so the test
# frame gets an explicit 0 for each of them. test_new is the same object as
# final_test_cat, so the assignment below changes both names.
test_new = final_test_cat
test_new[col_diff] = 0

#### 4. Split into training and validation data

In [9]:
# Target vector: the SalePrice column of the training frame
# (already log1p-transformed in the preprocessing cell).
y = train["SalePrice"]

# Feature matrix: the one-hot encoded, column-aligned training predictors.
X = train_new

In [10]:
# Hold out part of the labelled data for validation. No split size is given,
# so train_test_split's default applies; the seed is fixed for repeatability.
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    random_state=0,
)

#### 5. Model

In [11]:
# Plain linear regression with default settings; it is handed to the stacked
# ensemble further down as its meta-model.
lr = LinearRegression()

In [12]:
# Lasso regression preceded by a RobustScaler, bundled into a single
# pipeline estimator. The seed is fixed.
lasso = make_pipeline(
    RobustScaler(),
    Lasso(random_state=5, alpha=0.0005),
)

In [13]:
# Gradient-boosted trees: many (3000) shallow trees with a small learning
# rate, trained with the Huber loss. The seed is fixed.
GBoost = GradientBoostingRegressor(
    loss='huber',
    learning_rate=0.01,
    n_estimators=3000,
    max_depth=4,
    max_features='sqrt',
    min_samples_split=10,
    min_samples_leaf=15,
    random_state=5,
)

In [14]:
# Elastic Net (mostly L1: l1_ratio of 0.9) preceded by a RobustScaler,
# bundled into a single pipeline estimator. The seed is fixed.
ENet = make_pipeline(
    RobustScaler(),
    ElasticNet(l1_ratio=0.9, alpha=0.0005, random_state=5),
)

In [15]:
# Stacked ensemble built from the three regressors above, with the linear
# regression as meta-model.
# NOTE(review): StackingModels is project-local; how it combines the base
# models is not visible in this notebook.
stack = StackingModels(
    meta_model=lr,
    base_models=[GBoost, ENet, lasso],
)

In [16]:
# Train the stacked ensemble on the training split, then predict the held-out
# validation rows. The target is log1p(SalePrice), so the predictions are
# expected to be on that same log scale.
stack.fit(train_X, train_y)
predictions_stack = stack.predict(val_X)

In [17]:
# val_y holds log1p-transformed prices, so this error is measured on the log
# scale rather than in currency units. Arguments follow scikit-learn's
# (y_true, y_pred) order; the absolute error is the same either way.
validation_mae = mean_absolute_error(val_y, predictions_stack)
print("Mean Absolute Error : " + str(validation_mae))

Mean Absolute Error : 0.07783567517905879


#### 6. Submission

In [18]:
# Treat the test data in the same way as the training data: use the one-hot
# encoded test frame that was aligned to the training columns.
test_X = test_new

# The model was trained on log1p(SalePrice), so predictions are mapped back
# to prices with the exact inverse, expm1. (np.exp would leave every
# predicted price too high by 1, since exp(x) == expm1(x) + 1.)
predicted_prices = np.expm1(stack.predict(test_X))

In [19]:
# Pair each test-set Id with its predicted price.
my_submission = pd.DataFrame(
    data={'Id': test['Id'], 'SalePrice': predicted_prices},
)

# Write the two columns to disk, leaving out the DataFrame's row index.
my_submission.to_csv('submission.csv', index=False)