In [8]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

#Fixed seed so the train/test split (and every score computed from it) is the same on each run
SEED = 42

#GET THE TRAINING DATA AND PREPROCESS IT
train = pd.read_csv("train.csv")

#Drop the most useless features, due to low correlation with the output, high percentage of nans
#or low categorical variability
train = train.drop(columns=[
    "Id","3SsnPorch","BsmtFinSF2","BsmtHalfBath","MSZoning","RoofStyle",
    "LowQualFinSF","MiscVal","KitchenAbvGr","SaleType",
    "MiscFeature","Fence","PoolQC","PoolArea","EnclosedPorch",
    "FireplaceQu","Alley","ScreenPorch","GarageQual",
    "MoSold","YrSold","Street","BedroomAbvGr",
    "Utilities","LandSlope","Condition2","RoofMatl",
    "BsmtCond","BsmtFinType2","Heating","CentralAir",
    "Electrical","Functional","GarageCond","PavedDrive",
    "BldgType","SaleCondition","HouseStyle","Exterior1st","Exterior2nd",
])

#Get data and labels from the dataframe
X = train.drop('SalePrice', axis=1)
y = train['SalePrice']

#Process the numerical features through a pipeline:
#1. Replace every nan with the mean of each feature
#2. Scale the values of the features to make it easier for the models to predict
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', RobustScaler())])

#Process the categorical features through a pipeline:
#1. Replace every nan with the most frequent value of each feature
#2. One hot encode all the categorical values to turn them into numbers
#   (categories not seen while fitting are ignored instead of raising an error)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

#Get the name of all the numerical features (the target is already excluded from X)
numeric_features = list(X.select_dtypes(exclude=['object']).columns)
#Get the name of all the categorical features of the dataframe
categorical_features = list(X.select_dtypes(include=['object']).columns)

#Concatenate both numerical and categorical pipelines into one
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

#Split the data into train 80% and test 20%; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

#Process train and test data
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)#The preprocess pipeline is not fitted on the test data


In [9]:
#The preprocessed training matrix has 121 columns in this run (see the shape below).
#The one-hot encoder is fitted on the training split only, so this count can change with the split.
X_train.shape

(1168, 121)

In [10]:
from keras.models import Sequential
from keras.layers import Dense

# ensemble setup: the list that will hold the trained networks,
# and how many networks are going to be trained
models = []
n_models = 7

#Define the standard network architecture, fit the network and return the model
def create_model(epochs=150, batch_size=32, verbose=0):
    """Build, train and evaluate one fully connected regression network.

    The network stacks three hidden ReLU layers (128, 256 and 256 units) and a
    single linear output unit. It is compiled with mean absolute error as the
    loss and the Adam optimizer, then fitted on the notebook-level
    ``X_train`` / ``y_train``. After training, the root mean squared log error
    on the notebook-level ``X_test`` / ``y_test`` is printed.

    Parameters
    ----------
    epochs : int, default 150
        Number of training epochs passed to ``model.fit``.
    batch_size : int, default 32
        Mini-batch size passed to ``model.fit``.
    verbose : int, default 0
        Verbosity level passed to ``model.fit`` (0 keeps training silent).

    Returns
    -------
    The fitted ``Sequential`` model.
    """
    model = Sequential()
    #The input width is taken from the preprocessed training matrix
    model.add(Dense(128, kernel_initializer='normal',input_dim = X_train.shape[1], activation='relu'))
    model.add(Dense(256, kernel_initializer='normal',activation='relu'))
    model.add(Dense(256, kernel_initializer='normal',activation='relu'))
    model.add(Dense(1, kernel_initializer='normal',activation='linear'))
    model.compile(loss='mean_absolute_error', optimizer="adam", metrics=['mean_absolute_error'])
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=verbose)

    #Report the hold-out RMSLE.
    #NOTE: mean_squared_log_error raises an error if any prediction is negative.
    predictions = model.predict(X_test)
    print("NN:", np.sqrt(mean_squared_log_error(y_test, predictions)))

    return model

#Train the networks one after another: print the index of each network
#before it is created, then store the trained network in the list
for model_index in range(n_models):
    print(model_index)
    trained_network = create_model()
    models.append(trained_network)



0
NN: 0.12694775800417862
1
NN: 0.12688778538990841
2
NN: 0.12828317012498164
3
NN: 0.12711055421328973
4
NN: 0.1260764632692008
5
NN: 0.12620179276868007
6
NN: 0.1264933031888481


In [12]:
#Show the layer-by-layer summary of the first trained network in the list
models[0].summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_13 (Dense)             (None, 128)               15616     
_________________________________________________________________
dense_14 (Dense)             (None, 256)               33024     
_________________________________________________________________
dense_15 (Dense)             (None, 256)               65792     
_________________________________________________________________
dense_16 (Dense)             (None, 1)                 257       
Total params: 114,689
Trainable params: 114,689
Non-trainable params: 0
_________________________________________________________________


In [13]:
#Build the scikit-learn regressors (Ridge, Lasso and k-nearest neighbours)
clf = [
    Ridge(alpha=10),
    Lasso(alpha=10),
    KNeighborsRegressor(n_neighbors=10),
]

#Fit each regressor on the training split and print its RMSLE on the held-out split
for regressor in clf:
    regressor.fit(X_train, y_train)
    holdout_pred = regressor.predict(X_test)
    print(np.sqrt(mean_squared_log_error(y_test, holdout_pred)))

0.18897195597084424
0.22444168497066694
0.1542449951229291


In [14]:
#Get the test data and preprocess it
test = pd.read_csv("test.csv")

#Keep the Ids before the column is dropped, so the submission is indexed by the
#Ids actually present in the file instead of a hard-coded starting number
test_ids = test["Id"]

#Drop the same columns that were dropped from the training data
test = test.drop(columns=[
    "Id","3SsnPorch","BsmtFinSF2","BsmtHalfBath","MSZoning","RoofStyle",
    "LowQualFinSF","MiscVal","KitchenAbvGr","SaleType",
    "MiscFeature","Fence","PoolQC","PoolArea","EnclosedPorch",
    "FireplaceQu","Alley","ScreenPorch","GarageQual",
    "MoSold","YrSold","Street","BedroomAbvGr",
    "Utilities","LandSlope","Condition2","RoofMatl",
    "BsmtCond","BsmtFinType2","Heating","CentralAir",
    "Electrical","Functional","GarageCond","PavedDrive",
    "BldgType","SaleCondition","HouseStyle","Exterior1st","Exterior2nd",
])

#Reuse the preprocessor fitted on the training split (no re-fitting here)
X_real = preprocessor.transform(test)

#Collect the predictions of every network and every sklearn regressor.
#The sklearn predictions are reshaped into a single column so that all
#entries of the list have the same shape and can be averaged together.
predictions = [network.predict(X_real) for network in models]
predictions += [regressor.predict(X_real).reshape(-1, 1) for regressor in clf]

#Get the mean of the predictions (average over the estimators, one value per row)
result = np.mean(predictions, axis=0)

#Save the results into a csv file with correct kaggle format
submit = pd.DataFrame({"SalePrice": result.ravel()},
                      index=pd.Index(test_ids, name="Id"))
submit.to_csv("submission_best.csv")
submit.head()

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,116455.268403
1462,153625.481807
1463,184499.928988
1464,193037.248677
1465,199698.810334
