# House Price Regression Analysis
### by Charlie LaBarge

In [47]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer, StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor, GBTRegressor
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

from pyspark.sql.functions import col, avg

## Import training data

In [2]:
# Load the training CSV; the first line of the file supplies the column names.
train_df = spark.read.csv(path="train.csv", header=True)

## Basic description of the dataset

In [3]:
# Show the column names/types, then the number of rows in the training set.
train_df.printSchema()
# Parenthesised single-argument print behaves the same on Python 2 and Python 3.
print("Training set length: " + str(train_df.count()))

root
 |-- Id: string (nullable = true)
 |-- MSSubClass: string (nullable = true)
 |-- MSZoning: string (nullable = true)
 |-- LotFrontage: string (nullable = true)
 |-- LotArea: string (nullable = true)
 |-- Street: string (nullable = true)
 |-- Alley: string (nullable = true)
 |-- LotShape: string (nullable = true)
 |-- LandContour: string (nullable = true)
 |-- Utilities: string (nullable = true)
 |-- LotConfig: string (nullable = true)
 |-- LandSlope: string (nullable = true)
 |-- Neighborhood: string (nullable = true)
 |-- Condition1: string (nullable = true)
 |-- Condition2: string (nullable = true)
 |-- BldgType: string (nullable = true)
 |-- HouseStyle: string (nullable = true)
 |-- OverallQual: string (nullable = true)
 |-- OverallCond: string (nullable = true)
 |-- YearBuilt: string (nullable = true)
 |-- YearRemodAdd: string (nullable = true)
 |-- RoofStyle: string (nullable = true)
 |-- RoofMatl: string (nullable = true)
 |-- Exterior1st: string (nullable = true)
 |-- Exteri

## Feature shaping + pre-processing

### Helper functions for feature shaping

In [4]:
def convertColsToDbl(dataframe, col_names):
    """Cast the named columns to double, renaming each to "<name>_asDbl".

    For every name in ``col_names`` a new column "<name>_asDbl" holding the
    double-cast values is added and the original column is dropped.
    Returns the resulting dataframe.
    """
    converted = dataframe
    for name in col_names:
        as_double = converted[name].cast("double")
        converted = converted.withColumn(name + "_asDbl", as_double).drop(name)
    return converted

In [5]:
def getStringColumns(dataframe):
    """Return the names of the dataframe's string-typed columns, in schema order.

    The check uses ``dataframe.dtypes`` (pairs of column name and simple type
    name) rather than comparing ``str(dataType)`` with 'StringType': the
    textual form of a Spark data type differs between PySpark releases
    ('StringType' vs 'StringType()'), whereas the simple name 'string' does not.
    """
    return [name for name, type_name in dataframe.dtypes if type_name == 'string']

In [6]:
def stringIndexDf(dataframe, strcols):
    """Replace each column named in ``strcols`` with a "<name>_index" column.

    A StringIndexer is fitted per column on the dataframe as it stands at that
    point, its output is written to "<name>_index", and the source column is
    dropped. The index columns are the input expected by the one-hot encoder.
    Returns the resulting dataframe.
    """
    indexed = dataframe
    for name in strcols:
        fitted_indexer = StringIndexer(inputCol=name, outputCol=name + "_index").fit(indexed)
        indexed = fitted_indexer.transform(indexed)
        indexed = indexed.drop(name)
    return indexed

In [7]:
def oneHotDf(dataframe, strcols):
    """One-hot encode the "<name>_index" column of every name in ``strcols``.

    Each encoded vector is written to "<name>_onehot" with all categories kept
    (dropLast=False), and the "<name>_index" column is dropped afterwards.
    Returns the resulting dataframe.

    Works with both flavours of OneHotEncoder: the Spark 2.x Transformer, which
    is applied directly, and the Spark 3.x Estimator, which has to be fitted
    before it can transform.
    """
    for column in strcols:
        index_col = column + "_index"
        encoder = OneHotEncoder(inputCol=index_col, outputCol=column + "_onehot", dropLast=False)
        # Only the Estimator variant exposes fit(); the Transformer variant is used as-is.
        if hasattr(encoder, "fit"):
            encoder = encoder.fit(dataframe)
        dataframe = encoder.transform(dataframe).drop(index_col)

    return dataframe

### Dealing with null values

In [8]:
def fillNullVals(dataframe):
    """Fill null values: "Unknown" for string columns, the column mean for double columns.

    Columns of any other type are left untouched. A double column with no
    non-null values has no mean and is left as it is instead of raising.
    Returns the resulting dataframe.
    """
    # dtypes gives (column name, simple type name) pairs, whose spelling is
    # stable across PySpark releases, unlike str(dataType).
    column_types = dataframe.dtypes
    string_cols = [name for name, type_name in column_types if type_name == 'string']
    double_cols = [name for name, type_name in column_types if type_name == 'double']

    if string_cols:
        dataframe = dataframe.fillna("Unknown", string_cols)

    if double_cols:
        # avg() skips nulls, so a single aggregation yields every column mean at once.
        means = dataframe.agg(*[avg(col(name)) for name in double_cols]).first()
        # A mean of None means the column had no non-null values; fillna rejects None.
        replacements = {name: mean for name, mean in zip(double_cols, means) if mean is not None}
        if replacements:
            dataframe = dataframe.fillna(replacements)

    return dataframe

### Declare the initial columns to convert to double values, and convert them

In [9]:
# Names of the columns that get cast to double.
doubleCols = [
    'LotFrontage',
    'LotArea',
    'OverallQual',
    'OverallCond',
    'YearBuilt',
    'YearRemodAdd',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    '1stFlrSF',
    '2ndFlrSF',
    'LowQualFinSF',
    'GrLivArea',
    'GarageYrBlt',
    'GarageArea',
    'MiscVal',
    'YrSold',
]

In [10]:
# Cast every column listed in doubleCols to double (each one becomes "<name>_asDbl").
train_df = convertColsToDbl(dataframe=train_df, col_names=doubleCols)

### Deal with null values

In [11]:
# Replace the null values in the training set using the fillNullVals helper.
train_df = fillNullVals(dataframe=train_df)

### Index and one-hot encode string columns

In [12]:
# Keep the Id column in its own dataframe in case it is needed later,
# then remove it from the training data.
ids_df = train_df.select("Id")
train_df = train_df.drop("Id")

# The sale price, cast to double, becomes the "label" column;
# the original SalePrice column is then dropped.
train_df = train_df.withColumn("label", train_df.SalePrice.cast("Double"))
train_df = train_df.drop("SalePrice")

In [13]:
# extract list of string columns
string_cols = getStringColumns(train_df)
# don't want to index the label column; it is only removed when present, because
# a label that has been cast to double is not in the list of string columns and
# an unconditional list.remove("label") would raise ValueError
if "label" in string_cols:
    string_cols.remove("label")

# index the string columns, then one hot encode the indexed string columns
train_df = stringIndexDf(train_df, string_cols)
train_df = oneHotDf(train_df, string_cols)

In [14]:
# Every column except the label is fed to the assembler.
columns_less_label = train_df.columns[:]
columns_less_label.remove('label')

# Single-stage pipeline that packs the input columns into one "features" vector column.
feature_pipeline = Pipeline(
    stages=[
        VectorAssembler(inputCols=columns_less_label, outputCol="features"),
    ]
)

train_df_transformed = feature_pipeline.fit(train_df).transform(train_df)

## Model 1: Random Forest Regressor

In [29]:
# Random forest regressor reading the "features" and "label" columns (the library
# defaults, spelled out here), with trees allowed to grow to depth 30.
rf_model = RandomForestRegressor(featuresCol="features", labelCol="label", maxDepth=30)

In [31]:
# Sanity check ahead of the real cross-validation: fit on the whole training set
# and look at the first few label/prediction pairs.
rf_fitted_on_all = rf_model.fit(train_df_transformed)
(
    rf_fitted_on_all
    .transform(train_df_transformed)
    .select("label", "prediction")
    .show(5)
)

### 3-fold cross validation on RF model

In [37]:
# An empty grid yields a single parameter map, so the cross-validator simply
# scores rf_model as configured, over 3 folds.
paramGrid = ParamGridBuilder().build()
crossValidator = CrossValidator(
    numFolds=3,
    estimator=rf_model,
    estimatorParamMaps=paramGrid,
    # "rmse" is the evaluator's default metric, written out for clarity.
    evaluator=RegressionEvaluator(metricName="rmse"),
)

rf_validated = crossValidator.fit(train_df_transformed)

#### RMSE for the model

In [39]:
# Evaluation metric averaged over the folds, one value per entry in the param grid
# (identified as RMSE by the section heading).
rf_validated.avgMetrics

[29860.62315360208]

# Part 2: Different Feature Shaping Strategies and Impact on Performance

## Setup: getting training data back in unshaped form