In [1]:
import numpy as np 
import pandas as pd
import sklearn

import os

In [2]:
import seaborn as sns
import matplotlib as plt
from plotnine import *
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

from scipy.stats import norm    # used in plotting

In [3]:
# Read the training and test tables from the local ./data directory.
trainData, testData = (
    pd.read_csv(csvPath) for csvPath in ("./data/train.csv", "./data/test.csv")
)

In [5]:
# Flag rows whose GrLivArea exceeds 4000 while SalePrice is below 5E5,
# then keep only the rows that are not flagged.
hugeArea = trainData["GrLivArea"] > 4000
lowPrice = trainData["SalePrice"] < 5E5
outliers = hugeArea & lowPrice
trainData = trainData.loc[~outliers]

In [None]:
# Hand-picked subset of feature columns (presumably the ones judged most important).
impCols = [
    "OverallQual",
    "TotalBsmtSF",
    "1stFlrSF",
    "GrLivArea",
    "GarageCars",
    "GarageArea",
    "MSSubClass",
    "KitchenAbvGr",
    "OverallCond",
]

In [None]:
def _mostFrequent(values):
    """Return the most frequent non-null value of a Series, or NaN if it has none.

    `value_counts()` already orders its result by descending count, so the first
    index entry is the mode. (Re-sorting with `sort_values()` would order the
    counts ascending and select the *least* frequent value instead.)
    """
    counts = values.value_counts()
    return counts.index[0] if len(counts) else np.nan


# Per-neighbourhood lookup tables {Neighborhood: most common value}, used to
# fill missing MSZoning / Utilities / LotFrontage from a house's neighbourhood.
_byNeighborhood = trainData.groupby("Neighborhood")
zoning = _byNeighborhood["MSZoning"].apply(_mostFrequent).to_dict()
utilities = _byNeighborhood["Utilities"].apply(_mostFrequent).to_dict()
frontage = _byNeighborhood["LotFrontage"].apply(_mostFrequent).to_dict()

In [19]:
# Columns whose missing values are filled with the string "None".
# ("MasVnrType" used to be listed twice; each column is now listed once.)
fillNone = ["Alley","BsmtQual","BsmtCond","MasVnrType","BsmtExposure","BsmtFinType1","BsmtFinType2","FireplaceQu","GarageType","GarageFinish","GarageQual",
           "GarageCond","PoolQC","Fence","MiscFeature"]

# Columns whose missing values are filled with 0; the imputation functions also
# add a "null_<column>" flag recording which entries were filled.
fillZero = ["BsmtFinSF1","BsmtFinSF2","BsmtUnfSF","TotalBsmtSF","BsmtFullBath","BsmtHalfBath","FullBath","HalfBath","BedroomAbvGr","2ndFlrSF","GrLivArea",
           "MasVnrArea","GarageArea","GarageCars", "GarageYrBlt"]

def _imputeShared(in_df):
    """Apply the imputation steps shared by imputeVals0 and imputeVals1.

    Works on a copy of `in_df` and leaves LotFrontage untouched so that each
    caller can apply its own strategy afterwards. Reads the module-level
    `fillNone`, `fillZero`, `zoning` and `utilities`.

    Returns the imputed copy with the "Id" column dropped.
    """
    df = in_df.copy()
    for col in fillNone:
        df[col] = df[col].fillna("None")
    for col in fillZero:
        df["null_%s" % col] = df[col].isnull()       # mark which zeros are imputed
        df[col] = df[col].fillna(0)

    # Columns filled with one fixed category.
    constantFills = {
        "Electrical": "SBrkr",
        "Functional": "Typ",       # documentation instructs to assume "typical" unless otherwise noted
        "CentralAir": "Y",
        "KitchenQual": "Po",       # one house is missing kitchen data
        "SaleType": "Oth",         # only one missing value; "Oth" is the predefined "other" level
        "Exterior1st": "Other",    # the same house is missing both exterior columns;
        "Exterior2nd": "Other",    # "Other" is a predefined level
    }
    for col, value in constantFills.items():
        df[col] = df[col].fillna(value)

    # Columns filled from the per-neighbourhood lookup tables.
    df["MSZoning"] = df["MSZoning"].fillna(df["Neighborhood"].map(zoning))
    df["Utilities"] = df["Utilities"].fillna(df["Neighborhood"].map(utilities))
    return df.drop(columns=["Id"])


def imputeVals0(in_df):
    """Impute missing values; missing LotFrontage becomes 0 and is flagged.

    Adds a boolean "null_LotFrontage" column marking the rows whose
    LotFrontage was missing. The input frame is not modified.
    """
    df = _imputeShared(in_df)
    df["null_LotFrontage"] = df["LotFrontage"].isnull()
    df["LotFrontage"] = df["LotFrontage"].fillna(0)
    return df


def imputeVals1(in_df):
    """Impute missing values; missing LotFrontage comes from the neighbourhood table.

    Uses the module-level `frontage` mapping {Neighborhood: LotFrontage}.
    Unlike imputeVals0, no "null_LotFrontage" flag column is added.
    The input frame is not modified.
    """
    df = _imputeShared(in_df)
    df["LotFrontage"] = df["LotFrontage"].fillna(df["Neighborhood"].map(frontage))
    return df

In [None]:
# Log-transform the sale price, then standardise it.
# sp_mean and sp_std are kept because they are needed to invert the transform
# before submission.
normedPrice = np.log(trainData.SalePrice)
sp_mean = np.mean(normedPrice)
sp_std = np.std(normedPrice)
normedPrice = (normedPrice - sp_mean) / sp_std

In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline

In [9]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [None]:
# Time-related columns; handling is still undecided.
# Ideas: a harmonic (cyclical) transform of the month sold, and the
# difference between the year sold and the year built.
timeVars = ["MoSold","YrSold","YearBuilt"]

# Columns to include. CentralAir still needs to be examined for correlation.
# (It used to be listed twice; each column is now listed once.)
toInclude = ["LotFrontage","CentralAir","Electrical","Functional","MSZoning","Utilities",
            "KitchenQual","SaleType","Exterior1st","Exterior2nd"]

# Columns still to be examined. "TotRmsAbvGrd" is the dataset's spelling of the
# total-rooms-above-grade column (previously misspelled "TotsRmsAbvGr").
examine =  ["MSSubClass", "LotArea","Street","Alley","LotShape","LandContour",
          "LotConfig","LandSlope","Neighborhood","Condition1","Condition2","BldgType","HouseStyle",
          "OverallQual","OverallCond","RoofStyle","RoofMatl","ExterQual",
          "ExterCond","Foundation","Heating","HeatingQC","TotRmsAbvGrd","PavedDrive","EnclosedPorch",
          "3SsnPorch","ScreenPorch","MiscVal","SaleCondition"]

In [None]:
## Column groups for the linear model.
## NOTE: another cell in this notebook assigns the same names for nonlinear
## models; whichever of the two cells ran last determines the values in use.

# Columns whose missing values are filled with "None"
# ("MasVnrType" used to be listed twice; each column is now listed once).
fillNone = ["Alley","BsmtQual","BsmtCond","MasVnrType","BsmtExposure","BsmtFinType1","BsmtFinType2",
            "FireplaceQu","GarageType","GarageFinish","GarageQual","GarageCond",
            "PoolQC","Fence","MiscFeature"]

# ordinal categorical variables
fillZeroCat = ["BsmtFullBath","BsmtHalfBath","FullBath","HalfBath","BedroomAbvGr","GarageCars","Fireplaces"]

# continuous variables with missing values that are zero
fillZeroCont = ["MasVnrArea","GarageArea","GrLivArea","1stFlrSF","2ndFlrSF",
                "BsmtFinSF1","BsmtFinSF2","BsmtUnfSF","WoodDeckSF","OpenPorchSF","PoolArea"]

# (column, reference column) pairs to be engineered as differences
imputeDiff = [("GarageYrBlt","YearBuilt"),("YearRemodAdd","YearBuilt")]

# categories for which we need to know whether a value was imputed (none yet)
imputeUnknown = []

# columns to be dropped
dropList = ["TotalBsmtSF","Id"]



In [None]:
## Column groups for nonlinear models. They differ from the linear-model
## groups because linearly dependent features do not have to be removed here.
## NOTE: another cell in this notebook assigns the same names for the linear
## model; whichever of the two cells ran last determines the values in use.

# Columns whose missing values are filled with "None"
# ("MasVnrType" used to be listed twice; each column is now listed once).
fillNone = ["Alley","BsmtQual","BsmtCond","MasVnrType","BsmtExposure","BsmtFinType1","BsmtFinType2",
            "FireplaceQu","GarageType","GarageFinish","GarageQual","GarageCond",
            "PoolQC","Fence","MiscFeature"]

# ordinal categorical variables
fillZeroCat = ["BsmtFullBath","BsmtHalfBath","FullBath","HalfBath","BedroomAbvGr","GarageCars"]

# continuous variables with missing values that are zero
fillZeroCont = ["MasVnrArea","GarageArea","GrLivArea","1stFlrSF","2ndFlrSF",
                "BsmtFinSF1","BsmtFinSF2","BsmtUnfSF", "TotalBsmtSF"]

# (column, reference column) pairs to be engineered as differences
imputeDiff = [("GarageYrBlt","YearBuilt"),("YearRemodAdd","YearBuilt")]

# categories for which we need to know whether a value was imputed (none yet)
imputeUnknown = []

# columns to be dropped
dropList = ["Id"]



In [42]:

# Continuous columns: a missing value means "zero", so fill with 0 first and
# standardise afterwards. make_pipeline needs estimator *instances*; passing
# the bare classes (as before) fails as soon as the pipeline is fitted.
scalePipeline = make_pipeline(
    SimpleImputer(strategy="constant", fill_value=0),
    StandardScaler(),
)

# Column-wise preprocessing, using the column groups defined in earlier cells:
#   fillNone     -> missing replaced by the string "None"
#   fillZeroCat  -> missing replaced by 0
#   fillZeroCont -> missing replaced by 0, then standardised
# NOTE: columns not named in any group are dropped (ColumnTransformer's default).
# TODO: decide whether further columns need plain scaling or normalisation.
regressionPipeline = ColumnTransformer([
    ("setNone", SimpleImputer(strategy="constant", fill_value="None"), fillNone),
    ("setZero", SimpleImputer(strategy="constant", fill_value=0), fillZeroCat),
    ("transformed", scalePipeline, fillZeroCont),
])



In [38]:
# Fit `pipetest` on the training features and display the transformed array.
# NOTE(review): `pipetest` is not defined in any cell visible here, and `train_X`
# is only assigned in a later cell -- confirm this cell works on a fresh kernel.
pipetest.fit_transform(train_X)

array([['None', 'Gd', 'TA', ..., 548.0, 2.0, 2003.0],
       ['None', 'Gd', 'TA', ..., 460.0, 2.0, 1976.0],
       ['None', 'Gd', 'TA', ..., 608.0, 2.0, 2001.0],
       ...,
       ['None', 'TA', 'Gd', ..., 252.0, 1.0, 1941.0],
       ['None', 'TA', 'TA', ..., 240.0, 1.0, 1950.0],
       ['None', 'TA', 'TA', ..., 276.0, 1.0, 1965.0]], dtype=object)

In [46]:
# Separate the target column from the feature columns.
train_y = trainData.SalePrice
train_X = trainData.drop(columns=["SalePrice"])

# Push the features through `pipetest` and wrap the resulting array in a
# DataFrame labelled with the imputed column names, for inspection.
transformedValues = pipetest.fit_transform(train_X)
test = pd.DataFrame(transformedValues)
test.columns = fillNone + fillZero

test.head()

Unnamed: 0,Alley,BsmtQual,BsmtCond,MasVnrType,BsmtExposure,BsmtFinType1,BsmtFinType2,FireplaceQu,GarageType,GarageFinish,...,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,2ndFlrSF,GrLivArea,MasVnrArea,GarageArea,GarageCars,GarageYrBlt
0,,Gd,TA,BrkFace,No,GLQ,Unf,,Attchd,RFn,...,0,2,1,3,854,1710,196,548,2,2003
1,,Gd,TA,,Gd,ALQ,Unf,TA,Attchd,RFn,...,1,2,0,3,0,1262,0,460,2,1976
2,,Gd,TA,BrkFace,Mn,GLQ,Unf,TA,Attchd,RFn,...,0,2,1,3,866,1786,162,608,2,2001
3,,TA,Gd,,No,ALQ,Unf,Gd,Detchd,Unf,...,0,1,0,3,756,1717,0,642,3,1998
4,,Gd,TA,BrkFace,Av,GLQ,Unf,TA,Attchd,RFn,...,0,2,1,4,1053,2198,350,836,3,2000


In [None]:
class dictImputer(BaseEstimator,TransformerMixin):
    """Transformer meant to impute values from a lookup dictionary.

    The imputation itself is not implemented yet; only the scaffolding exists.

    Parameters
    ----------
    dict_ : the lookup dictionary to impute from (its exact layout is still
        to be decided -- TODO).
    """
    def __init__(self,dict_):
        # Stored under the constructor argument's own name: BaseEstimator.get_params()
        # (and therefore clone() and the estimator repr) reads every __init__
        # parameter back with getattr, which fails if it is kept as `_dict`.
        self.dict_ = dict_

    def fit(self,X,y=None):
        """Nothing is learned from the data; return self so calls can be chained."""
        return self

    def transform(self,X):
        """Not implemented yet.

        Raises NotImplementedError instead of silently returning None, so that an
        accidental use inside a pipeline fails here with a clear message.
        """
        ## Edit this
        raise NotImplementedError("dictImputer.transform is not implemented yet")
        
        
        
## can probably be deleted
class columnSelector(BaseEstimator,TransformerMixin):
    """Transformer that selects a fixed set of columns from a DataFrame.

    Parameters
    ----------
    columns : a column label or a list of column labels to keep.
    """
    def __init__(self,columns):
        self.columns = columns

    def fit(self,X,y=None):
        """Nothing is learned from the data; return self so calls can be chained."""
        return self

    def transform(self,X):
        """Return `X[self.columns]`.

        Raises
        ------
        AssertionError : if X is not a pandas DataFrame.
        KeyError : if any requested column is absent; the message lists the
            missing labels in the order they were requested.
        """
        assert isinstance(X,pd.DataFrame), "columnSelector expects a pandas DataFrame"

        try:
            return X[self.columns]
        except KeyError:
            # A single label given as a string must not be split into characters.
            requested = [self.columns] if isinstance(self.columns, str) else list(self.columns)
            # dict.fromkeys removes repeats while keeping the requested order,
            # so the message is deterministic (a set difference is not).
            cols_error = list(dict.fromkeys(c for c in requested if c not in X.columns))
            raise KeyError("The DataFrame does not include the columns: %s" % cols_error)
        
        
        