In [1]:
import numpy as np 
import pandas as pd
import sklearn

import os

In [2]:
import seaborn as sns
import matplotlib as plt
from plotnine import *
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

from scipy.stats import norm    # used in plotting

In [3]:
trainData = pd.read_csv("./data/train.csv")
testData = pd.read_csv("./data/test.csv")

In [4]:
outliers = ((trainData.GrLivArea > 4000) & (trainData.SalePrice < 5E5))
trainData = trainData[~(outliers)]

In [6]:
zoning = trainData.groupby("Neighborhood").MSZoning.apply(lambda x: x.value_counts().sort_values().index[0]).to_dict()
utilities = trainData.groupby("Neighborhood").Utilities.apply(lambda x: x.value_counts().sort_values().index[0]).to_dict()
frontage = trainData.groupby("Neighborhood").LotFrontage.apply(lambda x: x.value_counts().sort_values().index[0]).to_dict()

In [9]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [11]:
# Don't know what to do with these yet
# Maybe harmonic transform on month. 
# Maybe diff between year sold and built
timeVars = ["MoSold","YrSold","YearBuilt"]

# CentralAir needs to be examined for correlation
toInclude = ["LotFrontage","CentralAir","Electrical","Functional","MSZoning","Utilities",
            "KitchenQual","SaleType","Exterior1st","Exterior2nd","CentralAir"]

examine =  ["MSSubClass", "LotArea","Street","Alley","LotShape","LandContour",
          "LotConfig","LandSlope","Neighborhood","Condition1","Condition2","BldgType","HouseStyle",
          "OverallQual","OverallCond","RoofStyle","RoofMatl","ExterQual",
          "ExterCond","Foundation","Heating","HeatingQC","TotsRmsAbvGr","PavedDrive","EnclosedPorch",
          "3SsnPorch","ScreenPorch","MiscVal","SaleCondition"]

In [80]:
## These variables differ for nonlinear models (linearly dependent features)
# values that null is filled with "None"
fillNone = ["Alley","BsmtQual","BsmtCond","MasVnrType","BsmtExposure","BsmtFinType1","BsmtFinType2",
            "FireplaceQu","GarageType","GarageFinish","GarageQual","GarageCond",
            "PoolQC","Fence","MiscFeature","MasVnrType"]

# ordinal categorical variables
fillZeroCat = ["BsmtFullBath","BsmtHalfBath","FullBath","HalfBath","BedroomAbvGr","GarageCars"]

# continuous variables with missing values that are zero
fillZeroCont = ["MasVnrArea","GarageArea","GrLivArea","1stFlrSF","2ndFlrSF",
                "BsmtFinSF1","BsmtFinSF2","BsmtUnfSF", "TotalBsmtSF"]

# variables that need differences between reference engineered
imputeDiff = [("GarageYrBlt","YearBuilt"),("YearRemodAdd","YearBuilt")]

# categories that we need to know if they were imputed
imputeUnknown = []

# to be dropped
dropList = ["Id"]



In [59]:
## These are for the linear model
selected=[]
# values that null is filled with "None"
fillNone = ["Alley","BsmtQual","BsmtCond","MasVnrType","BsmtExposure","BsmtFinType1","BsmtFinType2",
            "FireplaceQu","GarageType","GarageFinish","GarageQual","GarageCond",
            "PoolQC","Fence","MiscFeature","MasVnrType","MSZoning","Utilities"]

# ordinal categorical variables
fillZeroCat = ["BsmtFullBath","BsmtHalfBath","FullBath","HalfBath","BedroomAbvGr","GarageCars","Fireplaces"]

# continuous variables with missing values that are zero
fillZeroCont = ["MasVnrArea","GarageArea","GrLivArea","1stFlrSF","2ndFlrSF","LotFrontage",I used all the categorical variables
                "BsmtFinSF1","BsmtFinSF2","BsmtUnfSF","WoodDeckSF","OpenPorchSF","PoolArea"]

# variables that need differences between reference engineered
imputeDiff = [("GarageYrBlt","YearBuilt"),("YearRemodAdd","YearBuilt")]

# categories that we need to know if they were imputed
imputeUnknown = []

# to be dropped
dropList = ["TotalBsmtSF","Id","GarageYrBlt","YearRemodAdd"]

imputeDict = {"Electrical": "SBrkr", 
              "Functional": "Typ", 
              "CentralAir":"Y",
              "KitchenQual":"Po",
              "SaleType":"Oth",
              "Exterior1st":"Other",
              "Exterior2nd":"Other"}

In [14]:
trainData["garageDiff"] = trainData.GarageYrBlt-trainData.YearBuilt

In [16]:
trainData["remodDiff"] = trainData.YearRemodAdd - trainData.YearBuilt

In [19]:
def imputeVals2(in_df):
    df = in_df.copy()
    df.LotFrontage = df.LotFrontage.fillna(df.Neighborhood.map(frontage))  
    df.MSZoning = df.MSZoning.fillna(df.Neighborhood.map(zoning))
    df.Utilities = df.Utilities.fillna(df.Neighborhood.map(utilities))
    return df

In [23]:
train_X = trainData.drop(columns=["SalePrice"])
train_y = trainData.SalePrice

In [24]:
pipe_X = imputeVals2(train_X)

In [46]:
from sklearn.base import BaseEstimator, TransformerMixin

class dictImputer(BaseEstimator,TransformerMixin):
    def __init__(self,dict_: dict):
        self.dict_ = dict_
         
    def fit(self,X,y=None):
        return self
    
    def transform(self,X):
        for k,v in self.dict_.items():
            X[k] = X[k].fillna(v)
        return X[self.dict_.keys()]  
    
## Example:
# dictImputer(imputeDict).fit_transform(pipe_X)

In [102]:
nonePipeline = make_pipeline(SimpleImputer(strategy="constant",fill_value="None"),OneHotEncoder(drop="first",handle_unknown="ignore"))
zeroPipeline = make_pipeline(SimpleImputer(strategy="constant",fill_value=0),OneHotEncoder(drop="first",handle_unknown="ignore"))
scalePipeline = make_pipeline(SimpleImputer(strategy="constant",fill_value=0),StandardScaler())

regressionPipeline = ColumnTransformer([
    ("setNone", nonePipeline,fillNone),
    ("setZero", zeroPipeline,fillZeroCat),
    ("transformed", scalePipeline, fillZeroCont),
    ("dictImputed", make_pipeline(dictImputer(imputeDict),OneHotEncoder(drop="first",handle_unknown="ignore")),list(imputeDict.keys())),
    #("selected", "passthrough", selected),
    ("dropped", "drop", dropList)
],remainder="drop")



TypeError: __init__() got an unexpected keyword argument 'drop'

In [95]:
piped_X = regressionPipeline.fit_transform(pipe_X)

In [76]:
targetScaler =  StandardScaler()
pipe_y = targetScaler.fit_transform(np.log(train_y.values.reshape(-1,1)))

In [71]:
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.model_selection import GridSearchCV

In [78]:
lm = ElasticNet()

lm_params = {"alpha": np.logspace(-5,2,50), "l1_ratio": np.linspace(0,1,50)}

lm_grid = GridSearchCV(lm, lm_params, cv = 5, scoring = "neg_mean_squared_error", n_jobs = -1, verbose=50)
lm_grid.fit(piped_X,pipe_y)

lm_grid.best_params_

lm_grid.best_score_

In [91]:
lm_params = {'alpha': 0.013894954943731374, 'l1_ratio': 0.02040816326530612}

test_net = ElasticNet(**lm_params)

test_net.fit(piped_X,pipe_y)
test_net.score(piped_X,pipe_y)

0.8869333586280218

In [96]:
piped_test_X = regressionPipeline.transform(imputeVals2(testData))

In [98]:
raw_preds = test_net.predict(piped_test_X)

In [99]:
outvals = np.exp(targetScaler.inverse_transform(raw_preds))

In [100]:
submit_frame = pd.DataFrame()
submit_frame['Id'] = testData.Id
submit_frame['SalePrice'] = outvals
submit_frame.to_csv('submission.csv',index=False)

{'alpha': 0.013894954943731374, 'l1_ratio': 0.02040816326530612}