# Ames Iowa Housing Data Set

The goal in this notebook is to predict the housing prices in Ames Iowa from the years 2006 to 2010.

In [181]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso, Ridge
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, FunctionTransformer, Imputer
from sklearn.feature_extraction.text import CountVectorizer
%matplotlib inline

In [182]:
import warnings
warnings.filterwarnings('ignore')

Loading the data and doing some basic EDA to make it easier to work with.

In [183]:
test = pd.read_csv("test.csv")
test = test.drop("PID", axis=1)
test.head()

Unnamed: 0,Id,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,AllPub,...,0,0,0,,,,0,4,2006,WD
1,2718,90,RL,,9662,Pave,,IR1,Lvl,AllPub,...,0,0,0,,,,0,8,2006,WD
2,2414,60,RL,58.0,17104,Pave,,IR1,Lvl,AllPub,...,0,0,0,,,,0,9,2006,New
3,1989,30,RM,60.0,8520,Pave,,Reg,Lvl,AllPub,...,0,0,0,,,,0,7,2007,WD
4,625,20,RL,,9500,Pave,,IR1,Lvl,AllPub,...,0,185,0,,,,0,7,2009,WD


In [184]:
test.shape

(879, 79)

In [185]:
train = pd.read_csv("train.csv")
train = train.drop("PID", axis=1)
train.head()

Unnamed: 0,Id,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,109,60,RL,,13517,Pave,,IR1,Lvl,AllPub,...,0,,,,0,3,2010,WD,Normal,130500
1,544,60,RL,43.0,11492,Pave,,IR1,Lvl,AllPub,...,0,,,,0,4,2009,WD,Normal,220000
2,153,20,RL,68.0,7922,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2010,WD,Abnorml,109000
3,318,60,RL,73.0,9802,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,174000
4,255,50,RL,82.0,14235,Pave,,IR1,Lvl,AllPub,...,0,,,,0,3,2010,WD,Normal,138500


In [186]:
train.columns.sort_values()

Index(['1st Flr SF', '2nd Flr SF', '3Ssn Porch', 'Alley', 'Bedroom AbvGr',
       'Bldg Type', 'Bsmt Cond', 'Bsmt Exposure', 'Bsmt Full Bath',
       'Bsmt Half Bath', 'Bsmt Qual', 'Bsmt Unf SF', 'BsmtFin SF 1',
       'BsmtFin SF 2', 'BsmtFin Type 1', 'BsmtFin Type 2', 'Central Air',
       'Condition 1', 'Condition 2', 'Electrical', 'Enclosed Porch',
       'Exter Cond', 'Exter Qual', 'Exterior 1st', 'Exterior 2nd', 'Fence',
       'Fireplace Qu', 'Fireplaces', 'Foundation', 'Full Bath', 'Functional',
       'Garage Area', 'Garage Cars', 'Garage Cond', 'Garage Finish',
       'Garage Qual', 'Garage Type', 'Garage Yr Blt', 'Gr Liv Area',
       'Half Bath', 'Heating', 'Heating QC', 'House Style', 'Id',
       'Kitchen AbvGr', 'Kitchen Qual', 'Land Contour', 'Land Slope',
       'Lot Area', 'Lot Config', 'Lot Frontage', 'Lot Shape',
       'Low Qual Fin SF', 'MS SubClass', 'MS Zoning', 'Mas Vnr Area',
       'Mas Vnr Type', 'Misc Feature', 'Misc Val', 'Mo Sold', 'Neighborhood',
       

cell to turn my dummy columns into categories

In [187]:
train["Land Slope"] = train["Land Slope"].astype('category')
train["Neighborhood"] = train["Neighborhood"].astype('category')
train["Bldg Type"] = train["Bldg Type"].astype('category')
train["Overall Qual"] = train["Overall Qual"].astype('category')
test["Land Slope"] = test["Land Slope"].astype('category')
test["Neighborhood"] = test["Neighborhood"].astype('category')
test["Bldg Type"] = test["Bldg Type"].astype('category')
test["Overall Qual"] = test["Overall Qual"].astype('category')

# Train Test Split

In [188]:
X_train, X_test, y_train, y_test = train_test_split(train, train['SalePrice'],random_state = 33)

In [189]:
X_train.shape

(1538, 81)

In [190]:
y_train.shape

(1538,)

# Creating Features

In [191]:
def basic_features(df):
    return df[["Overall Qual","Garage Cars","Garage Area", "1st Flr SF", "Gr Liv Area","Total Bsmt SF"]]
basic_features_tf = FunctionTransformer(basic_features, validate=False)
basic_features(X_train).shape
#basic_features(X_train)

(1538, 6)

In [192]:
def num_features(df):
    return df[['MS SubClass', 'Lot Frontage', 'Lot Area', 'Overall Qual',
       'Overall Cond', 'Year Built', 'Year Remod/Add', 'Mas Vnr Area',
       'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
       '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area',
       'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath', 'Half Bath',
       'Bedroom AbvGr', 'Kitchen AbvGr', 'TotRms AbvGrd', 'Fireplaces',
       'Garage Yr Blt', 'Garage Cars', 'Garage Area', 'Wood Deck SF',
       'Open Porch SF', 'Enclosed Porch', '3Ssn Porch', 'Screen Porch',
       'Pool Area', 'Misc Val', 'Mo Sold', 'Yr Sold']]
num_features_tf = FunctionTransformer(num_features, validate=False)

In [193]:
def mis_features(df):
    misc = df["Misc Feature"].unique()
    return pd.DataFrame(df["Misc Feature"].fillna(0).replace(misc, 1))
mis_features_tf = FunctionTransformer(mis_features, validate=False)
print(mis_features(X_train).shape)
#mis_features(X_train)

(1538, 1)


In [194]:
def land_slope_d(df):
    columns = pd.get_dummies(train["Land Slope"]).columns
#    print(columns)
    return pd.get_dummies(df["Land Slope"],columns={"Land Slope":{'Gtl', 'Mod', 'Sev'}})
land_slope_d_tf = FunctionTransformer(land_slope_d, validate=False)
print(land_slope_d(X_train).shape)
#land_slope_d(X_train)

(1538, 3)


In [195]:
def bldg_type_d(df):
    columns = pd.get_dummies(train["Bldg Type"]).columns
#    print(columns)
    return pd.get_dummies(df["Bldg Type"],columns={"Bldg Type": {'1Fam', '2fmCon', 'Duplex', 'Twnhs', 'TwnhsE'}})
bldg_type_d_tf = FunctionTransformer(bldg_type_d, validate=False)
bldg_type_d(X_train).shape

(1538, 5)

In [196]:
def overall_qual_d(df):
    columns = pd.get_dummies(train["Overall Qual"]).columns
#    print(columns)
    return pd.get_dummies(df["Overall Qual"],columns={"Overall Qual":{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}})
overall_qual_d_tf = FunctionTransformer(overall_qual_d, validate=False)
overall_qual_d(X_train).shape

(1538, 10)

In [197]:
def add_baths(df):
    df["bathrooms"] = df["Full Bath"] + .5 * df["Half Bath"]
    return pd.DataFrame(df["bathrooms"])
add_baths_tf = FunctionTransformer(add_baths, validate=False)
add_baths(X_train).shape

(1538, 1)

In [198]:
def poly_sf(df):
    df_sf = df[["Lot Area", "1st Flr SF", "2nd Flr SF", "Bsmt Unf SF", "BsmtFin SF 1", "BsmtFin SF 2", "Gr Liv Area","Low Qual Fin SF","Total Bsmt SF","Wood Deck SF"]]
    df_sf.replace(0,1,inplace=True)
    df_sf.fillna(1,inplace=True)
    return df_sf
#i turned all of the zeros and NaN to 1's so what when they multiply with eachother they do not all come out to 0
#poly_sf(X_train).isnull().sum()
#poly_sf(X_train).info()
#poly_sf(X_train).shape

In [199]:
def poly_t6(df):
    df_t6 = df[["Overall Qual","Garage Cars","Garage Area", "1st Flr SF", "Gr Liv Area","Total Bsmt SF"]]
    df_t6.replace(0,1,inplace=True)
    df_t6.fillna(1,inplace=True)
    return df_t6

Function to make the test and train the same

In [200]:
def same_columns(train,test):
    feature_difference = set(train) - set(test)
    feature_difference_df = pd.DataFrame(data=np.zeros((test.shape[0], len(feature_difference))), columns=list(feature_difference))
    test = test.join(feature_difference_df)
    return test

# Linear model

In [201]:
lm = linear_model.LinearRegression()
ss = StandardScaler()
imputer = Imputer()
poly = PolynomialFeatures(include_bias=False)
vec = CountVectorizer()

In [205]:
featureslm = FeatureUnion([
     ("basic_features_tf", basic_features_tf),
     ("mis_features_tf", mis_features_tf),
     ("land_slope_d_tf", land_slope_d_tf),
     ("bldg_type_d_tf", bldg_type_d_tf),
#     ("overall_qual_d_tf", overall_qual_d_tf),
     ("add_baths_tf", add_baths_tf)
])
featureslm.transform(X_train)

array([[7, 2.0, 576.0, ..., 0, 0, 2.0],
       [8, 2.0, 575.0, ..., 0, 0, 2.5],
       [7, 2.0, 578.0, ..., 0, 0, 2.0],
       ...,
       [5, 1.0, 240.0, ..., 0, 0, 1.0],
       [8, 2.0, 499.0, ..., 0, 1, 2.0],
       [5, 1.0, 240.0, ..., 0, 0, 1.0]], dtype=object)

In [206]:
pipelm = Pipeline([
    ("featureslm", featureslm),
    ("imputer", imputer),
    ("ss", ss),
    ("lm", lm)
])

In [207]:
params = {
    "imputer__strategy" : ["mean", "median", "most_frequent"]
}
gs = GridSearchCV(pipelm, param_grid=params)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

0.7967203042275135


{'imputer__strategy': 'median'}

In [208]:
gs.score(X_test, y_test)

0.7017205956321881

In [209]:
X_test.shape

(513, 82)

In [210]:
#test["SalePrice"] = gs.predict(test)

Export data

In [211]:
#test[["Id", "SalePrice"]].to_csv("ames_regln2.csv", index=False)

# LASSO

In [212]:
lm = linear_model.LinearRegression()
ss = StandardScaler()
lasso = Lasso()
imputer = Imputer()
polyL = PolynomialFeatures(include_bias=False)

In [214]:
featuresL = FeatureUnion([
    ("basic_features_tf", basic_features_tf),
    ("num_features_tf", num_features_tf),
    ("mis_features_tf", mis_features_tf),
    ("land_slope_d_tf", land_slope_d_tf),
    ("bldg_type_d_tf", bldg_type_d_tf),
#    ("overall_qual_d_tf", overall_qual_d_tf),
    ("add_baths_tf", add_baths_tf)
])
featuresL.transform(X_train)

array([[7, 2.0, 576.0, ..., 0, 0, 2.0],
       [8, 2.0, 575.0, ..., 0, 0, 2.5],
       [7, 2.0, 578.0, ..., 0, 0, 2.0],
       ...,
       [5, 1.0, 240.0, ..., 0, 0, 1.0],
       [8, 2.0, 499.0, ..., 0, 1, 2.0],
       [5, 1.0, 240.0, ..., 0, 0, 1.0]], dtype=object)

In [215]:
pipeL = Pipeline([
    ("featuresL", featuresL),
    ("imputer", imputer),
    ("ss", ss),
    ("lasso", lasso)
])

In [216]:
params = {
    "imputer__strategy" : ["mean", "median", "most_frequent"],
    "lasso__alpha" : [1,2,3,4,5,6,7,8,9,10]
}
gs = GridSearchCV(pipeL, param_grid=params)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

0.8157641870102059


{'imputer__strategy': 'most_frequent', 'lasso__alpha': 10}

In [217]:
gs.score(X_test, y_test)

0.6978493769566345

In [218]:
gs.predict(test).shape

(879,)

In [219]:
#test["SalePrice"] = gs.predict(test)

In [220]:
#test[["Id", "SalePrice"]].to_csv("ames_regL1.csv", index=False)

# RIDGE

In [221]:
lm = linear_model.LinearRegression()
ss = StandardScaler()
ridge = Ridge()
imputer = Imputer()

In [222]:
featuresR = FeatureUnion([
    ("basic_features_tf", basic_features_tf),
    ("num_features_tf", num_features_tf),
    ("mis_features_tf", mis_features_tf),
    ("land_slope_d_tf", land_slope_d_tf),
    ("bldg_type_d_tf", bldg_type_d_tf),
    ("overall_qual_d_tf", overall_qual_d_tf),
   ("add_baths_tf", add_baths_tf)
])
featuresR.transform(X_train)

array([[7, 2.0, 576.0, ..., 0, 0, 2.0],
       [8, 2.0, 575.0, ..., 0, 0, 2.5],
       [7, 2.0, 578.0, ..., 0, 0, 2.0],
       ...,
       [5, 1.0, 240.0, ..., 0, 0, 1.0],
       [8, 2.0, 499.0, ..., 0, 0, 2.0],
       [5, 1.0, 240.0, ..., 0, 0, 1.0]], dtype=object)

In [223]:
pipeR = Pipeline([
    ("featuresR", featuresR),
    ("imputer", imputer),
    ("ss", ss),
    ("ridge", ridge)
])

In [224]:
pipeR_p = Pipeline([
    ("featuresR", featuresR),
    ("imputer", imputer),
    ("ss", ss),
])

In [225]:
params = {
    "imputer__strategy" : ["mean", "median", "most_frequent"],
    "ridge__alpha" : [1,2,3,4,5,6,7,8,9,10]
}
gs = GridSearchCV(pipeR, param_grid=params)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

0.8534776283532125


{'imputer__strategy': 'most_frequent', 'ridge__alpha': 10}

In [226]:
gs.score(X_test, y_test)

0.7284158000966925

In [227]:
gs.predict(X_test).shape

(513,)

In [228]:
#test["SalePrice"] = gs.predict(test)

In [229]:
#test[["Id", "SalePrice"]].to_csv("ames_regR1.csv", index=False)

I decided to use my Ridge regression as my final submission. Ridge had the highest score and was the one I felt most confident with. 