In [109]:
from pyspark.sql import SparkSession, Row
import numpy as np
import pandas as pd
from pyspark.ml.stat import Correlation
from pyspark.sql.functions import mean, sum,bround,log, exp
from pyspark.sql.functions import isnull, when, count, col
from sklearn.metrics import accuracy_score
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.regression import RandomForestRegressor
#from pyspark.ml.evaluation import BinaryClassificationEvaluator
#from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline

In [166]:
spark = SparkSession.builder.appName('HousingPrices').getOrCreate()

# Load the data 
### Convention : use raw

In [62]:
# CV Set
rdt_prs_rlbl = spark.read.csv('train.csv', inferSchema=True, header=True)   # rdt: raw data  # prs: parasitics
#===============================================================

# Test Set
rdt_T_prs = spark.read.csv('test.csv', inferSchema=True, header=True) 

# Print Schema

In [63]:
rdt_prs_rlbl.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- MSSubClass: integer (nullable = true)
 |-- MSZoning: string (nullable = true)
 |-- LotFrontage: integer (nullable = true)
 |-- LotArea: integer (nullable = true)
 |-- Street: string (nullable = true)
 |-- Alley: string (nullable = true)
 |-- LotShape: string (nullable = true)
 |-- LandContour: string (nullable = true)
 |-- Utilities: string (nullable = true)
 |-- LotConfig: string (nullable = true)
 |-- LandSlope: string (nullable = true)
 |-- Neighborhood: string (nullable = true)
 |-- Condition1: string (nullable = true)
 |-- Condition2: string (nullable = true)
 |-- BldgType: string (nullable = true)
 |-- HouseStyle: string (nullable = true)
 |-- OverallQual: integer (nullable = true)
 |-- OverallCond: integer (nullable = true)
 |-- YearBuilt: integer (nullable = true)
 |-- YearRemodAdd: integer (nullable = true)
 |-- RoofStyle: string (nullable = true)
 |-- RoofMatl: string (nullable = true)
 |-- Exterior1st: string (nullable = true)
 |-

In [65]:
rdt_prs = rdt_prs_rlbl.withColumn('logSalePrice',log(rdt_prs_rlbl.SalePrice)).drop('SalePrice')


In [66]:
rdt_prs.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- MSSubClass: integer (nullable = true)
 |-- MSZoning: string (nullable = true)
 |-- LotFrontage: integer (nullable = true)
 |-- LotArea: integer (nullable = true)
 |-- Street: string (nullable = true)
 |-- Alley: string (nullable = true)
 |-- LotShape: string (nullable = true)
 |-- LandContour: string (nullable = true)
 |-- Utilities: string (nullable = true)
 |-- LotConfig: string (nullable = true)
 |-- LandSlope: string (nullable = true)
 |-- Neighborhood: string (nullable = true)
 |-- Condition1: string (nullable = true)
 |-- Condition2: string (nullable = true)
 |-- BldgType: string (nullable = true)
 |-- HouseStyle: string (nullable = true)
 |-- OverallQual: integer (nullable = true)
 |-- OverallCond: integer (nullable = true)
 |-- YearBuilt: integer (nullable = true)
 |-- YearRemodAdd: integer (nullable = true)
 |-- RoofStyle: string (nullable = true)
 |-- RoofMatl: string (nullable = true)
 |-- Exterior1st: string (nullable = true)
 |-

# Identify the Label and Index columns

In [68]:
LBL = 'logSalePrice'
IDX = 'Id'

# Have a look at the first few rows

In [69]:
pd.set_option('display.max_columns', 100)

pd.DataFrame(rdt_prs.head(8))





Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80
0,1,60,RL,65,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,12.247694
1,2,20,RL,80,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,12.109011
2,3,60,RL,68,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,12.317167
3,4,70,RL,60,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,11.849398
4,5,60,RL,84,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,12.429216
5,6,50,RL,85,14115,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,Norm,1Fam,1.5Fin,5,5,1993,1995,Gable,CompShg,VinylSd,VinylSd,,0,TA,TA,Wood,Gd,TA,No,GLQ,732,Unf,0,64,796,GasA,Ex,Y,SBrkr,796,566,0,1362,1,0,1,1,1,1,TA,5,Typ,0,,Attchd,1993,Unf,2,480,TA,TA,Y,40,30,0,320,0,0,,MnPrv,Shed,700,10,2009,WD,Normal,11.8706
6,7,20,RL,75,10084,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Somerst,Norm,Norm,1Fam,1Story,8,5,2004,2005,Gable,CompShg,VinylSd,VinylSd,Stone,186,Gd,TA,PConc,Ex,TA,Av,GLQ,1369,Unf,0,317,1686,GasA,Ex,Y,SBrkr,1694,0,0,1694,1,0,2,0,3,1,Gd,7,Typ,1,Gd,Attchd,2004,RFn,2,636,TA,TA,Y,255,57,0,0,0,0,,,,0,8,2007,WD,Normal,12.634603
7,8,60,RL,0,10382,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NWAmes,PosN,Norm,1Fam,2Story,7,6,1973,1973,Gable,CompShg,HdBoard,HdBoard,Stone,240,TA,TA,CBlock,Gd,TA,Mn,ALQ,859,BLQ,32,216,1107,GasA,Ex,Y,SBrkr,1107,983,0,2090,1,0,2,1,3,1,TA,7,Typ,2,TA,Attchd,1973,RFn,2,484,TA,TA,Y,235,204,228,0,0,0,,,Shed,350,11,2009,WD,Normal,12.206073


# Check the percentage of Null values in each column

# CV Set

In [70]:
rdt_prs.select([bround(((sum(when(isnull(c), 1).otherwise(0)))/rdt_prs.count()*100),2).alias(c)\
                   for c in rdt_prs.columns]).show()

+---+----------+--------+-----------+-------+------+-----+--------+-----------+---------+---------+---------+------------+----------+----------+--------+----------+-----------+-----------+---------+------------+---------+--------+-----------+-----------+----------+----------+---------+---------+----------+--------+--------+------------+------------+----------+------------+----------+---------+-----------+-------+---------+----------+----------+--------+--------+------------+---------+------------+------------+--------+--------+------------+------------+-----------+------------+----------+----------+-----------+----------+-----------+------------+----------+----------+----------+----------+----------+----------+-----------+-------------+---------+-----------+--------+------+-----+-----------+-------+------+------+--------+-------------+------------+
| Id|MSSubClass|MSZoning|LotFrontage|LotArea|Street|Alley|LotShape|LandContour|Utilities|LotConfig|LandSlope|Neighborhood|Condition1|Condit

# Test Set

In [71]:
rdt_T_prs.select([bround(((sum(when(isnull(c), 1).otherwise(0)))/rdt_prs.count()*100),2).alias(c)\
                   for c in rdt_T_prs.columns]).show()

+---+----------+--------+-----------+-------+------+-----+--------+-----------+---------+---------+---------+------------+----------+----------+--------+----------+-----------+-----------+---------+------------+---------+--------+-----------+-----------+----------+----------+---------+---------+----------+--------+--------+------------+------------+----------+------------+----------+---------+-----------+-------+---------+----------+----------+--------+--------+------------+---------+------------+------------+--------+--------+------------+------------+-----------+------------+----------+----------+-----------+----------+-----------+------------+----------+----------+----------+----------+----------+----------+-----------+-------------+---------+-----------+--------+------+-----+-----------+-------+------+------+--------+-------------+
| Id|MSSubClass|MSZoning|LotFrontage|LotArea|Street|Alley|LotShape|LandContour|Utilities|LotConfig|LandSlope|Neighborhood|Condition1|Condition2|BldgType

# Start creating a list to drop columns 

In [72]:
# The initial columns are the ones with large percentage of Nulls
clms_drop_list = []

# Print columns for ease of use 

In [73]:
rdt_prs.columns


['Id',
 'MSSubClass',
 'MSZoning',
 'LotFrontage',
 'LotArea',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'MasVnrArea',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinSF1',
 'BsmtFinType2',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'KitchenQual',
 'TotRmsAbvGrd',
 'Functional',
 'Fireplaces',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageCars',
 'GarageArea',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'WoodDeckSF',
 'OpenPorchSF'

# List of string columns

In [74]:
STR_clms_prs = [List_element[0] for List_element in rdt_prs.dtypes if List_element[1].startswith('string')]
STR_clms_prs

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

# Check out the uniqueness of values in each column
### The idea is to see if each string column is worthwhile for ML training and prediction
### Large ratios (close to 1) show that there are very few categorical variables that have been repeated
### Small ratios (close to 0) show that there are many examples to learn from for each distinct value 

In [75]:
#  Already ran through and checked 
'''
for c in STR_clms_prs:

    Distinct_ratio = round(rdt_prs.select(c).distinct().count()/rdt_prs.count(),2)
    print(c,  Distinct_ratio)
    if Distinct_ratio>0.2:
        clms_drop_list.append(c)     
clms_drop_list      
'''

'\nfor c in STR_clms_prs:\n\n    Distinct_ratio = round(rdt_prs.select(c).distinct().count()/rdt_prs.count(),2)\n    print(c,  Distinct_ratio)\n    if Distinct_ratio>0.2:\n        clms_drop_list.append(c)     \nclms_drop_list      \n'

# This listing filters out the categorical features with large unique values


In [76]:
selec_lst   = list (set(rdt_prs.columns)  - set(clms_drop_list ))   # List of acceptable columns
STR_clms    = list (set(STR_clms_prs)  - set(clms_drop_list ))      # List of all acceptabl string columns

selec_T_lst = list (set(rdt_T_prs.columns)  - set(clms_drop_list ))

In [77]:
# CV Set
rdt = rdt_prs.select(selec_lst)
rdt.show()

# Test Set
rdt_T = rdt_T_prs.select(selec_T_lst)
rdt_T.show()


+-------+--------+-----------+---------+----------+----------+---------+------------+-----------+------------+----------+------------------+----------+--------+----------+---+--------+------------+----------+----------+--------+-------+-----------+----------+--------+--------+------------+------+-----------+-----------+-----------+--------+-----------+----------+----------+----------+----------+------------+----------+----------+----------+---------+-------------+----------+-----------+------------+----------+-----------+------------+---------+-----+-----------+--------+---------+---------+----------+--------+-----------+-----+------+------------+---------+---------+------+---------+-------+---------+--------+------------+------+-----------+---------+--------+------------+-------------+-----------+----------+--------+------------+----------+------------+
|Heating|FullBath|FireplaceQu|RoofStyle|CentralAir|MasVnrType|GrLivArea|BsmtHalfBath|TotalBsmtSF|KitchenAbvGr|GarageArea|      logSal

+-------+--------+-----------+---------+----------+----------+---------+------------+-----------+------------+----------+----------+--------+----------+----+--------+------------+----------+----------+--------+-------+-----------+----------+--------+--------+------------+------+-----------+-----------+-----------+--------+-----------+----------+----------+----------+----------+------------+----------+----------+----------+---------+-------------+----------+-----------+------------+----------+-----------+------------+---------+-----+-----------+--------+---------+---------+----------+--------+-----------+-----+------+------------+---------+---------+------+---------+-------+---------+--------+------------+------+-----------+---------+--------+------------+-------------+-----------+----------+--------+------------+----------+------------+
|Heating|FullBath|FireplaceQu|RoofStyle|CentralAir|MasVnrType|GrLivArea|BsmtHalfBath|TotalBsmtSF|KitchenAbvGr|GarageArea|Functional|LotShape|WoodDeckSF

# List of numerical columns

In [78]:
NUM_clms = [item[0] for item in rdt.dtypes if item[1].startswith('int') or\
                                              item[1].startswith('double') or\
                                              item[1].startswith('float')]
print(NUM_clms)

# =============================================================================
NUM_T_clms = [item[0] for item in rdt_T.dtypes if item[1].startswith('int') or\
                                                item[1].startswith('double') or\
                                                item[1].startswith('float')]
print(NUM_T_clms)

['FullBath', 'GrLivArea', 'BsmtHalfBath', 'TotalBsmtSF', 'KitchenAbvGr', 'GarageArea', 'logSalePrice', 'WoodDeckSF', 'Id', 'BedroomAbvGr', 'PoolArea', 'LotArea', 'OverallQual', 'MasVnrArea', 'YearRemodAdd', 'OverallCond', 'LotFrontage', 'HalfBath', 'MSSubClass', 'BsmtFinSF1', 'GarageCars', 'TotRmsAbvGrd', 'GarageYrBlt', 'BsmtUnfSF', 'YearBuilt', 'YrSold', '3SsnPorch', 'MiscVal', 'MoSold', 'OpenPorchSF', '2ndFlrSF', 'LowQualFinSF', 'EnclosedPorch', 'ScreenPorch', 'BsmtFinSF2', '1stFlrSF', 'Fireplaces', 'BsmtFullBath']
['FullBath', 'GrLivArea', 'BsmtHalfBath', 'TotalBsmtSF', 'KitchenAbvGr', 'GarageArea', 'WoodDeckSF', 'Id', 'BedroomAbvGr', 'PoolArea', 'LotArea', 'OverallQual', 'MasVnrArea', 'YearRemodAdd', 'OverallCond', 'LotFrontage', 'HalfBath', 'MSSubClass', 'BsmtFinSF1', 'GarageCars', 'TotRmsAbvGrd', 'GarageYrBlt', 'BsmtUnfSF', 'YearBuilt', 'YrSold', '3SsnPorch', 'MiscVal', 'MoSold', 'OpenPorchSF', '2ndFlrSF', 'LowQualFinSF', 'EnclosedPorch', 'ScreenPorch', 'BsmtFinSF2', '1stFlrSF', 

# Convert string columns to numerical indexes

In [79]:
# Define the indexer for all string columns
indexers        = [StringIndexer(inputCol= c, outputCol= c+'_IDX', handleInvalid='keep').fit(rdt) for c in STR_clms]


# Create the pipeline
pipe_StrToIdx   = Pipeline(stages=indexers)


# Execute the pipeline to index the string columns
# Also drop the original string columns
sfdt_IDXed         = pipe_StrToIdx.fit(rdt).transform(rdt).drop(*STR_clms)      # CV set from train data 
sfdt_T_IDXed       = pipe_StrToIdx.fit(rdt_T).transform(rdt_T).drop(*STR_clms)  # Test set 
#sfdt_IDXed.show()
#sfdt_T_IDXed.show()

# Handling Null Values for numerical columns
### Null counts and fractions 

##### CV and Test set total count and fraction

In [80]:
print('CV Set---------------------')
NumRows = sfdt_IDXed.count()
print('No. of rows:', NumRows)
print("No. of Nulls", NumRows - sfdt_IDXed .dropna().count() )
print("Fraction of NULLs", np.around(1- sfdt_IDXed.dropna().count()/sfdt_IDXed .count(), 2) )
#==============================================================================================
print('\n')
print('Test Set-------------------')
NumRows = sfdt_T_IDXed.count()
print('No. of rows:', NumRows)
print("No. of Nulls", NumRows - sfdt_T_IDXed .dropna().count() )
print("Fraction of NULLs", np.around(1- sfdt_T_IDXed.dropna().count()/sfdt_T_IDXed .count(), 2) )

CV Set---------------------
No. of rows: 1460
No. of Nulls 8
Fraction of NULLs 0.01


Test Set-------------------
No. of rows: 1459
No. of Nulls 18
Fraction of NULLs 0.01


### Null Percentages per column

#### CV Set

In [81]:
sfdt_IDXed.select([bround(((sum(when(isnull(c), 1).otherwise(0)))/NumRows*100),2).alias(c)\
                   for c in sfdt_IDXed.columns]).show()

+--------+---------+------------+-----------+------------+----------+------------+----------+---+------------+--------+-------+-----------+----------+------------+-----------+-----------+--------+----------+----------+----------+------------+-----------+---------+---------+------+---------+-------+------+-----------+--------+------------+-------------+-----------+----------+--------+----------+------------+-----------+-------------+---------------+-----------------+-------------+--------------+--------------+---------------+--------------+--------------+---------------+----------------+-------------+---------+------------+--------------+-------------+--------------+------------+------------+---------------+---------+----------+------------+----------------+--------------+--------------+-------------+-------------+------------+------------+----------------+------------+----------+---------------+---------------+-------------+--------------+--------------+----------------+--------------+

#### Test Set

In [82]:
sfdt_T_IDXed.select([bround(((sum(when(isnull(c), 1).otherwise(0)))/NumRows*100),2).alias(c)\
                     for c in sfdt_T_IDXed.columns]).show()

+--------+---------+------------+-----------+------------+----------+----------+---+------------+--------+-------+-----------+----------+------------+-----------+-----------+--------+----------+----------+----------+------------+-----------+---------+---------+------+---------+-------+------+-----------+--------+------------+-------------+-----------+----------+--------+----------+------------+-----------+-------------+---------------+-----------------+-------------+--------------+--------------+---------------+--------------+--------------+---------------+----------------+-------------+---------+------------+--------------+-------------+--------------+------------+------------+---------------+---------+----------+------------+----------------+--------------+--------------+-------------+-------------+------------+------------+----------------+------------+----------+---------------+---------------+-------------+--------------+--------------+----------------+--------------+-------------

### Compute Null replacements if any 
#### Figure out the catogories and their frequencies

### Replacement values

In [83]:
# Calculate the median for numerical columns 
Med_dict = {}
for c in NUM_T_clms:
    Med_dict[c] = sfdt_IDXed.approxQuantile(c, [0.5], 0.01)[0]

Med_dict

{'FullBath': 2.0,
 'GrLivArea': 1456.0,
 'BsmtHalfBath': 0.0,
 'TotalBsmtSF': 988.0,
 'KitchenAbvGr': 1.0,
 'GarageArea': 477.0,
 'WoodDeckSF': 0.0,
 'Id': 722.0,
 'BedroomAbvGr': 3.0,
 'PoolArea': 0.0,
 'LotArea': 9416.0,
 'OverallQual': 6.0,
 'MasVnrArea': 0.0,
 'YearRemodAdd': 1993.0,
 'OverallCond': 5.0,
 'LotFrontage': 62.0,
 'HalfBath': 0.0,
 'MSSubClass': 50.0,
 'BsmtFinSF1': 377.0,
 'GarageCars': 2.0,
 'TotRmsAbvGrd': 6.0,
 'GarageYrBlt': 1977.0,
 'BsmtUnfSF': 468.0,
 'YearBuilt': 1972.0,
 'YrSold': 2008.0,
 '3SsnPorch': 0.0,
 'MiscVal': 0.0,
 'MoSold': 6.0,
 'OpenPorchSF': 24.0,
 '2ndFlrSF': 0.0,
 'LowQualFinSF': 0.0,
 'EnclosedPorch': 0.0,
 'ScreenPorch': 0.0,
 'BsmtFinSF2': 0.0,
 '1stFlrSF': 1080.0,
 'Fireplaces': 1.0,
 'BsmtFullBath': 0.0}

### Replace the Null values

In [84]:
# CV Set 
fdt = sfdt_IDXed.na.fill(Med_dict)
fdt.show()

# ===================================================================

fdt_T = sfdt_T_IDXed.na.fill(Med_dict)
fdt_T.show()


+--------+---------+------------+-----------+------------+----------+------------------+----------+---+------------+--------+-------+-----------+----------+------------+-----------+-----------+--------+----------+----------+----------+------------+-----------+---------+---------+------+---------+-------+------+-----------+--------+------------+-------------+-----------+----------+--------+----------+------------+-----------+-------------+---------------+-----------------+-------------+--------------+--------------+---------------+--------------+--------------+---------------+----------------+-------------+---------+------------+--------------+-------------+--------------+------------+------------+---------------+---------+----------+------------+----------------+--------------+--------------+-------------+-------------+------------+------------+----------------+------------+----------+---------------+---------------+-------------+--------------+--------------+----------------+---------

+--------+---------+------------+-----------+------------+----------+----------+----+------------+--------+-------+-----------+----------+------------+-----------+-----------+--------+----------+----------+----------+------------+-----------+---------+---------+------+---------+-------+------+-----------+--------+------------+-------------+-----------+----------+--------+----------+------------+-----------+-------------+---------------+-----------------+-------------+--------------+--------------+---------------+--------------+--------------+---------------+----------------+-------------+---------+------------+--------------+-------------+--------------+------------+------------+---------------+---------+----------+------------+----------------+--------------+--------------+-------------+-------------+------------+------------+----------------+------------+----------+---------------+---------------+-------------+--------------+--------------+----------------+--------------+------------

#### Check for any left over Null values

In [85]:
# CV SET
fdt.dropna().count()/fdt.count()

1.0

In [86]:
# Test Set
fdt_T.dropna().count()/fdt_T.count()

1.0

# Rename the processed dataframes as well as the labels column

In [87]:
# CV Set 
Xy = fdt.withColumnRenamed(LBL,'Labels')

# =============================================

# Test Set (No Labels)
X = fdt_T


# EDA
### EDA-1 summary statistics

In [88]:
# CV Set
for c in Xy.columns:
    #XY.select(c).describe().select(bround(c,2)).show()
    Xy.select(c).describe().select('summary',bround(c,2).alias(c)).show()



+-------+--------+
|summary|FullBath|
+-------+--------+
|  count|  1460.0|
|   mean|    1.57|
| stddev|    0.55|
|    min|     0.0|
|    max|     3.0|
+-------+--------+

+-------+---------+
|summary|GrLivArea|
+-------+---------+
|  count|   1460.0|
|   mean|  1515.46|
| stddev|   525.48|
|    min|    334.0|
|    max|   5642.0|
+-------+---------+

+-------+------------+
|summary|BsmtHalfBath|
+-------+------------+
|  count|      1460.0|
|   mean|        0.06|
| stddev|        0.24|
|    min|         0.0|
|    max|         2.0|
+-------+------------+

+-------+-----------+
|summary|TotalBsmtSF|
+-------+-----------+
|  count|     1460.0|
|   mean|    1057.43|
| stddev|     438.71|
|    min|        0.0|
|    max|     6110.0|
+-------+-----------+

+-------+------------+
|summary|KitchenAbvGr|
+-------+------------+
|  count|      1460.0|
|   mean|        1.05|
| stddev|        0.22|
|    min|         0.0|
|    max|         3.0|
+-------+------------+

+-------+----------+
|summary|Ga

+-------+--------------+
|summary|MasVnrType_IDX|
+-------+--------------+
|  count|        1460.0|
|   mean|          0.53|
| stddev|          0.74|
|    min|           0.0|
|    max|           4.0|
+-------+--------------+

+-------+---------------+
|summary|Exterior1st_IDX|
+-------+---------------+
|  count|         1460.0|
|   mean|           2.03|
| stddev|           2.26|
|    min|            0.0|
|    max|           14.0|
+-------+---------------+

+-------+--------------+
|summary|CentralAir_IDX|
+-------+--------------+
|  count|        1460.0|
|   mean|          0.07|
| stddev|          0.25|
|    min|           0.0|
|    max|           1.0|
+-------+--------------+

+-------+--------------+
|summary|GarageCond_IDX|
+-------+--------------+
|  count|        1460.0|
|   mean|          0.15|
| stddev|          0.54|
|    min|           0.0|
|    max|           5.0|
+-------+--------------+

+-------+---------------+
|summary|Exterior2nd_IDX|
+-------+---------------+
|  count|

### EDA-2 vectorize

In [89]:
# Drop the index column for the CV set 
Xy_clms = Xy.columns # XY labels 
Xy_clms.remove(IDX)
# Drop the index column for the test set 
X_clms  = X.columns
X_clms.remove(IDX)


EDA_Vectorizer = VectorAssembler(inputCols= Xy_clms , outputCol='features_labels')
Vectorized_lng = EDA_Vectorizer.transform(Xy)
#Vectorized_lng.show(truncate=False)
Vectorized_sht = Vectorized_lng.select('features_labels')
Vectorized_sht.show()

+--------------------+
|     features_labels|
+--------------------+
|(80,[0,1,3,4,5,6,...|
|(80,[0,1,2,3,4,5,...|
|(80,[0,1,3,4,5,6,...|
|(80,[0,1,3,4,5,6,...|
|(80,[0,1,3,4,5,6,...|
|(80,[0,1,3,4,5,6,...|
|(80,[0,1,3,4,5,6,...|
|(80,[0,1,3,4,5,6,...|
|(80,[0,1,3,4,5,6,...|
|(80,[0,1,3,4,5,6,...|
|(80,[0,1,3,4,5,6,...|
|(80,[0,1,3,4,5,6,...|
|(80,[0,1,3,4,5,6,...|
|(80,[0,1,3,4,5,6,...|
|(80,[0,1,3,4,5,6,...|
|(80,[0,1,3,4,5,6,...|
|(80,[0,1,3,4,5,6,...|
|(80,[0,1,4,5,6,8,...|
|(80,[0,1,3,4,5,6,...|
|(80,[0,1,3,4,5,6,...|
+--------------------+
only showing top 20 rows



### EDA-3 correlations

In [90]:
# Calculate the pairwise correlations
CorrMat_Spark = Correlation.corr(Vectorized_sht,'features_labels').head()
#print("Pearson correlation matrix:\n" + str(CorrMat_Spark[0]))

#=======================================================================

# Find out the correlation matrix size

CorrMat_Spark[0]

DenseMatrix(80, 80, [1.0, 0.63, -0.0545, 0.3237, 0.1331, 0.4057, 0.5948, 0.1877, ..., 0.0082, 0.0039, 0.0255, -0.0181, -0.0148, 0.1351, -0.0221, 1.0], False)

#### Record the size

In [91]:
N_corr = 80

#### EDA-4 Convert to Pandas Dataframe For Visual Checks 

In [92]:
CorrMat_np = -2*np.ones( (N_corr,N_corr) )
for ii in range(N_corr):
    for jj in range(N_corr):
            CorrMat_np[ii,jj] = CorrMat_Spark[0][ii,jj]
CorrMat_np = np.around(CorrMat_np,2)
#CorrMat_np

# ----------------------------------------------------

CorrMat_pd = pd.DataFrame(CorrMat_np, columns = enumerate(Xy_clms))
CorrMat_pd


Unnamed: 0,"(0, FullBath)","(1, GrLivArea)","(2, BsmtHalfBath)","(3, TotalBsmtSF)","(4, KitchenAbvGr)","(5, GarageArea)","(6, Labels)","(7, WoodDeckSF)","(8, BedroomAbvGr)","(9, PoolArea)","(10, LotArea)","(11, OverallQual)","(12, MasVnrArea)","(13, YearRemodAdd)","(14, OverallCond)","(15, LotFrontage)","(16, HalfBath)","(17, MSSubClass)","(18, BsmtFinSF1)","(19, GarageCars)","(20, TotRmsAbvGrd)","(21, GarageYrBlt)","(22, BsmtUnfSF)","(23, YearBuilt)","(24, YrSold)","(25, 3SsnPorch)","(26, MiscVal)","(27, MoSold)","(28, OpenPorchSF)","(29, 2ndFlrSF)","(30, LowQualFinSF)","(31, EnclosedPorch)","(32, ScreenPorch)","(33, BsmtFinSF2)","(34, 1stFlrSF)","(35, Fireplaces)","(36, BsmtFullBath)","(37, Heating_IDX)","(38, LandSlope_IDX)","(39, FireplaceQu_IDX)","(40, SaleCondition_IDX)","(41, RoofStyle_IDX)","(42, HouseStyle_IDX)","(43, MasVnrType_IDX)","(44, Exterior1st_IDX)","(45, CentralAir_IDX)","(46, GarageCond_IDX)","(47, Exterior2nd_IDX)","(48, BsmtFinType1_IDX)","(49, HeatingQC_IDX)","(50, Alley_IDX)","(51, RoofMatl_IDX)","(52, Functional_IDX)","(53, ExterQual_IDX)","(54, Condition1_IDX)","(55, MSZoning_IDX)","(56, LotShape_IDX)","(57, LandContour_IDX)","(58, Fence_IDX)","(59, Street_IDX)","(60, BsmtQual_IDX)","(61, Neighborhood_IDX)","(62, Foundation_IDX)","(63, GarageType_IDX)","(64, ExterCond_IDX)","(65, Utilities_IDX)","(66, BsmtCond_IDX)","(67, SaleType_IDX)","(68, BsmtExposure_IDX)","(69, BldgType_IDX)","(70, PoolQC_IDX)","(71, MiscFeature_IDX)","(72, KitchenQual_IDX)","(73, LotConfig_IDX)","(74, PavedDrive_IDX)","(75, Electrical_IDX)","(76, BsmtFinType2_IDX)","(77, GarageQual_IDX)","(78, GarageFinish_IDX)","(79, Condition2_IDX)"
0,1.00,0.63,-0.05,0.32,0.13,0.41,0.59,0.19,0.36,0.05,0.13,0.55,0.27,0.44,-0.19,0.12,0.14,0.13,0.06,0.47,0.55,0.14,0.29,0.47,-0.02,0.04,-0.01,0.06,0.26,0.42,-0.00,-0.12,-0.01,-0.08,0.38,0.24,-0.06,-0.03,-0.05,0.20,0.03,-0.01,-0.03,0.27,-0.17,-0.11,-0.14,-0.14,-0.24,-0.28,-0.01,0.01,-0.01,0.40,0.02,-0.06,0.18,-0.03,-0.19,-0.05,0.32,0.09,-0.35,-0.12,-0.09,-0.03,-0.02,0.05,0.09,0.06,0.06,-0.02,0.30,0.06,-0.14,-0.16,-0.07,-0.11,0.28,0.06
1,0.63,1.00,-0.02,0.45,0.10,0.47,0.70,0.25,0.52,0.17,0.26,0.59,0.39,0.29,-0.08,0.22,0.42,0.07,0.21,0.47,0.83,0.16,0.24,0.20,-0.04,0.02,-0.00,0.05,0.33,0.69,0.13,0.01,0.10,-0.01,0.57,0.46,0.03,-0.02,0.04,0.32,0.03,0.15,0.14,0.23,-0.01,-0.09,-0.11,-0.01,-0.15,-0.22,-0.00,0.19,0.06,0.37,0.08,-0.09,0.21,0.05,-0.10,-0.04,0.19,0.12,-0.20,-0.07,-0.05,-0.01,-0.05,0.04,0.07,-0.07,0.15,-0.02,0.31,0.06,-0.07,-0.12,-0.06,-0.06,0.19,0.09
2,-0.05,-0.02,1.00,-0.00,-0.04,-0.02,-0.01,0.04,0.05,0.02,0.05,-0.04,0.03,-0.01,0.12,-0.03,-0.01,-0.00,0.07,-0.02,-0.02,0.02,-0.10,-0.04,-0.05,0.04,-0.01,0.03,-0.03,-0.02,-0.01,-0.01,0.03,0.07,0.00,0.03,-0.15,-0.01,0.07,0.07,0.05,0.04,0.05,0.02,0.02,-0.04,-0.04,0.05,0.07,0.07,-0.04,0.02,-0.01,-0.07,-0.02,-0.06,0.06,0.03,0.03,-0.02,-0.05,0.01,0.03,-0.01,0.05,0.10,0.00,-0.00,0.02,-0.02,0.03,0.02,-0.03,0.04,-0.01,-0.02,0.06,-0.05,-0.05,-0.02
3,0.32,0.45,-0.00,1.00,-0.07,0.49,0.61,0.23,0.05,0.13,0.26,0.54,0.36,0.29,-0.17,0.24,-0.05,-0.24,0.52,0.43,0.29,0.18,0.42,0.39,-0.01,0.04,-0.02,0.01,0.25,-0.17,-0.03,-0.10,0.08,0.10,0.82,0.34,0.31,-0.14,0.03,0.20,0.04,0.16,-0.27,0.36,-0.09,-0.21,-0.15,-0.07,-0.20,-0.27,-0.11,0.20,-0.03,0.39,-0.00,-0.20,0.20,0.06,-0.11,-0.01,0.04,0.04,-0.38,-0.28,-0.13,-0.01,-0.33,0.09,-0.01,-0.13,0.10,-0.06,0.28,0.04,-0.19,-0.18,-0.11,-0.13,0.20,0.05
4,0.13,0.10,-0.04,-0.07,1.00,-0.06,-0.15,-0.09,0.20,-0.01,-0.02,-0.18,-0.04,-0.15,-0.09,0.03,-0.07,0.28,-0.08,-0.05,0.26,-0.16,0.03,-0.17,0.03,-0.02,0.06,0.03,-0.07,0.06,0.01,0.04,-0.05,-0.04,0.07,-0.12,-0.04,0.08,-0.04,-0.11,0.12,0.01,0.07,-0.06,0.10,0.25,0.14,0.12,0.10,0.14,0.02,0.02,0.03,-0.09,0.04,0.09,-0.09,0.02,-0.06,-0.01,0.07,-0.06,0.20,0.22,0.01,-0.01,0.22,0.03,0.10,0.50,-0.01,0.05,-0.10,-0.03,0.11,0.16,0.07,0.14,-0.04,0.15
5,0.41,0.47,-0.02,0.49,-0.06,1.00,0.65,0.22,0.07,0.06,0.18,0.56,0.37,0.37,-0.15,0.20,0.16,-0.10,0.30,0.88,0.34,0.56,0.18,0.48,-0.03,0.04,-0.03,0.03,0.24,0.14,-0.07,-0.12,0.05,-0.02,0.49,0.27,0.18,-0.10,0.00,0.23,-0.00,0.08,-0.12,0.34,-0.19,-0.23,-0.30,-0.17,-0.19,-0.28,-0.06,0.09,-0.07,0.41,0.02,-0.10,0.17,0.04,-0.12,0.05,0.25,-0.01,-0.34,-0.31,-0.16,0.01,-0.12,0.05,0.12,-0.13,0.06,-0.04,0.27,0.07,-0.24,-0.21,-0.04,-0.28,-0.03,0.05
6,0.59,0.70,-0.01,0.61,-0.15,0.65,1.00,0.33,0.21,0.07,0.26,0.82,0.43,0.57,-0.04,0.18,0.31,-0.07,0.37,0.68,0.53,0.35,0.22,0.59,-0.04,0.05,-0.02,0.06,0.32,0.32,-0.04,-0.15,0.12,0.00,0.60,0.49,0.24,-0.15,0.04,0.36,-0.04,0.13,-0.09,0.39,-0.22,-0.35,-0.28,-0.19,-0.25,-0.41,-0.09,0.07,-0.13,0.52,-0.02,-0.24,0.29,0.06,-0.16,-0.06,0.26,0.09,-0.49,-0.33,-0.17,-0.01,-0.23,0.06,0.11,-0.18,0.07,-0.07,0.42,0.11,-0.26,-0.29,-0.10,-0.22,0.26,-0.00
7,0.19,0.25,0.04,0.23,-0.09,0.22,0.33,1.00,0.05,0.07,0.17,0.24,0.16,0.21,-0.00,-0.02,0.11,-0.01,0.20,0.23,0.17,0.12,-0.01,0.22,0.02,-0.03,-0.01,0.02,0.06,0.09,-0.03,-0.13,-0.07,0.07,0.24,0.20,0.18,-0.08,0.10,0.20,-0.05,0.07,-0.04,0.11,-0.08,-0.15,-0.09,-0.04,-0.11,-0.10,-0.12,0.07,-0.01,0.17,-0.01,-0.15,0.16,0.09,0.03,0.02,0.11,0.13,-0.18,-0.14,-0.03,-0.02,-0.09,-0.02,0.11,-0.07,0.04,0.01,0.13,0.06,-0.10,-0.14,0.01,-0.08,0.14,-0.00
8,0.36,0.52,0.05,0.05,0.20,0.07,0.21,0.05,1.00,0.07,0.12,0.10,0.10,-0.04,0.01,0.14,0.23,-0.02,-0.11,0.09,0.68,-0.01,0.17,-0.07,-0.04,-0.02,0.01,0.05,0.09,0.50,0.11,0.04,0.04,-0.02,0.13,0.11,-0.15,-0.02,-0.05,0.08,0.02,0.03,0.14,0.03,-0.01,-0.01,-0.00,0.00,0.02,0.00,-0.01,-0.02,0.01,-0.05,0.09,-0.11,0.06,-0.04,0.01,-0.03,-0.10,-0.06,0.04,0.06,0.02,0.00,0.01,-0.01,-0.05,-0.04,0.08,0.03,-0.03,0.04,0.03,-0.05,-0.02,-0.02,-0.00,0.00
9,0.05,0.17,0.02,0.13,-0.01,0.06,0.07,0.07,0.07,1.00,0.08,0.07,0.01,0.01,-0.00,0.11,0.02,0.01,0.14,0.02,0.08,0.02,-0.04,0.00,-0.06,-0.01,0.03,-0.03,0.06,0.08,0.06,0.05,0.05,0.04,0.13,0.10,0.07,-0.01,-0.02,0.11,0.10,0.06,0.03,-0.00,0.07,-0.02,0.05,0.08,-0.01,0.04,-0.02,0.18,-0.02,0.03,0.03,-0.03,0.05,-0.01,0.10,-0.00,0.01,0.00,-0.01,-0.04,0.03,-0.00,-0.02,-0.01,0.00,-0.03,0.90,0.13,0.05,0.03,-0.02,-0.02,0.03,0.05,0.01,-0.01


# ML Vectorize

In [93]:
# CV Set 
ML_Vectorizer   = VectorAssembler(inputCols= X_clms , outputCol='features')
ML_Vectorized   = ML_Vectorizer.transform(Xy)

# ===================================================================================

# Test Set 
ML_Vectorizer_T = VectorAssembler(inputCols= X_clms , outputCol='features')
ML_Vectorized_T = ML_Vectorizer.transform(X)


In [94]:
ML_Vectorized.show()

+--------+---------+------------+-----------+------------+----------+------------------+----------+---+------------+--------+-------+-----------+----------+------------+-----------+-----------+--------+----------+----------+----------+------------+-----------+---------+---------+------+---------+-------+------+-----------+--------+------------+-------------+-----------+----------+--------+----------+------------+-----------+-------------+---------------+-----------------+-------------+--------------+--------------+---------------+--------------+--------------+---------------+----------------+-------------+---------+------------+--------------+-------------+--------------+------------+------------+---------------+---------+----------+------------+----------------+--------------+--------------+-------------+-------------+------------+------------+----------------+------------+----------+---------------+---------------+-------------+--------------+--------------+----------------+---------

In [95]:
ML_Vectorized_T.show()

+--------+---------+------------+-----------+------------+----------+----------+----+------------+--------+-------+-----------+----------+------------+-----------+-----------+--------+----------+----------+----------+------------+-----------+---------+---------+------+---------+-------+------+-----------+--------+------------+-------------+-----------+----------+--------+----------+------------+-----------+-------------+---------------+-----------------+-------------+--------------+--------------+---------------+--------------+--------------+---------------+----------------+-------------+---------+------------+--------------+-------------+--------------+------------+------------+---------------+---------+----------+------------+----------------+--------------+--------------+-------------+-------------+------------+------------+----------------+------------+----------+---------------+---------------+-------------+--------------+--------------+----------------+--------------+------------

# Perform Feature Scaling if needed

In [96]:
'''
scaler = StandardScaler(inputCol="uns_features", outputCol="features",
                        withStd=True, withMean=False)
Vec_scaled = scaler.fit(Vectorized).transform(Vectorized)
'''

'\nscaler = StandardScaler(inputCol="uns_features", outputCol="features",\n                        withStd=True, withMean=False)\nVec_scaled = scaler.fit(Vectorized).transform(Vectorized)\n'

# Define the ML Data Set

#### CV

In [127]:
cv_set = ML_Vectorized.select('features','Labels')
cv_set.show()

+--------------------+------------------+
|            features|            Labels|
+--------------------+------------------+
|(79,[0,1,3,4,5,7,...|12.247694320220994|
|(79,[0,1,2,3,4,5,...|12.109010932687042|
|(79,[0,1,3,4,5,7,...| 12.31716669303576|
|(79,[0,1,3,4,5,7,...| 11.84939770159144|
|(79,[0,1,3,4,5,6,...|12.429216196844383|
|(79,[0,1,3,4,5,6,...|11.870599909242044|
|(79,[0,1,3,4,5,6,...|12.634603026569334|
|(79,[0,1,3,4,5,6,...|12.206072645530174|
|(79,[0,1,3,4,5,6,...| 11.77452020265869|
|(79,[0,1,3,4,5,7,...|11.678439903447801|
|(79,[0,1,3,4,5,7,...|11.771436160121729|
|(79,[0,1,3,4,5,6,...|12.751299696013497|
|(79,[0,1,3,4,5,6,...|11.877568578558138|
|(79,[0,1,3,4,5,6,...|12.540757571577291|
|(79,[0,1,3,4,5,7,...|11.964001084330445|
|(79,[0,1,3,4,5,6,...|11.790557201568507|
|(79,[0,1,3,4,5,7,...|11.911701584927597|
|(79,[0,1,4,5,7,9,...|11.407564949312402|
|(79,[0,1,3,4,5,7,...|11.976659481202368|
|(79,[0,1,3,4,5,7,...|11.842229212112828|
+--------------------+------------

#### Test 

In [98]:
test_set = ML_Vectorized_T.select('Id','features')


# Define the ML model 

In [99]:
mdl_RF  = RandomForestRegressor(featuresCol='features', labelCol='Labels')
#mdl_GBT = GBTClassifier(featuresCol='features', labelCol='Labels') 
# mdl_LG =  LogisticRegression(featuresCol='features', labelCol='Labels')

# Create the evaluation object
#### Options: rmse - root mean squared error (default)   |   mse - mean squared error  |  r2 - r^2 metric   |  mae - mean absolute error

In [128]:
eval1_obj = RegressionEvaluator(predictionCol='prediction', labelCol='Labels', metricName='rmse')

# Parameter Grids

In [154]:
paramGrid_RF = ParamGridBuilder()\
    .addGrid(mdl_RF.maxDepth, [5,7,10])\
    .addGrid(mdl_RF.maxBins, [32])\
    .addGrid(mdl_RF.impurity, ['variance'])\
    .addGrid(mdl_RF.minInstancesPerNode, [1,5])\
    .addGrid(mdl_RF.checkpointInterval, [10])\
    .addGrid(mdl_RF.subsamplingRate, [1.0])\
    .addGrid(mdl_RF.numTrees, [20,40,60])\
    .addGrid(mdl_RF.featureSubsetStrategy, ['auto'])\
    .build()


# Best 
#0.139 [10, 32, 'variance', 1, 10, 1.0, 40, 'auto']
#0.139 [10, 32, 'variance', 1, 10, 1.0, 60, 'auto']
#0.139 [10, 32, 'variance', 5, 10, 1.0, 60, 'auto']

Np_RF = len(paramGrid_RF)
#Options for subsamplingRate auto, all, onethird, sqrt, log2 

# -------------------------------------
'''
paramGrid_GBT = ParamGridBuilder() \
    .addGrid(mdl_GBT.maxDepth, [5]) \
    .addGrid(mdl_GBT.maxIter,  [6,8,12,20]) \
    .addGrid(mdl_GBT.stepSize, [0.01,0.05,0.1,0.2]) \
    .build()
Np_GBT = len(paramGrid_GBT)
'''
# -------------------------------------
'''
paramGrid_LG = ParamGridBuilder() \
    .addGrid(mdl_LG.regParam, [0.01, 0.1, 0.5]) \
    .addGrid(mdl_LG.maxIter, [100,200]) \
    .build()
Np_LG = len(paramGrid_LG)'''

'\nparamGrid_LG = ParamGridBuilder()     .addGrid(mdl_LG.regParam, [0.01, 0.1, 0.5])     .addGrid(mdl_LG.maxIter, [100,200])     .build()\nNp_LG = len(paramGrid_LG)'

# Cross Validator 

In [155]:
xval_RF = CrossValidator(estimator         = mdl_RF,
                         estimatorParamMaps= paramGrid_RF,
                         evaluator         = eval1_obj,
                         numFolds          = 7,
                         parallelism       = 8)
# ---------------------------------------------------------
'''
xval_GBT = CrossValidator(estimator         = mdl_GBT,
                         estimatorParamMaps= paramGrid_GBT,
                         evaluator         = eval1_obj,
                         numFolds          = 10,
                         parallelism       = 8)
'''
'''
# ---------------------------------------------------------
xval_LG = CrossValidator(estimator         = mdl_LG,
                         estimatorParamMaps= paramGrid_LG,
                         evaluator         = eval1_obj,
                         numFolds          = 10,
                         parallelism       = 8)
'''

'\n# ---------------------------------------------------------\nxval_LG = CrossValidator(estimator         = mdl_LG,\n                         estimatorParamMaps= paramGrid_LG,\n                         evaluator         = eval1_obj,\n                         numFolds          = 10,\n                         parallelism       = 8)\n'

# Fit the model (with CV)

In [156]:
mdl_xv_RF_ftd = xval_RF.fit(cv_set)

In [420]:
#mdl_xv_GBT_ftd = xval_GBT.fit(cv_set)

In [421]:
#mdl_xv_LG_ftd = xval_LG.fit(cv_set)

# Metric AVGs for Cross Validation Across Parameter Grid 

### RF

In [157]:
print('RF CV Scores')
for pp in range(Np_RF):
    L = []
    for key in paramGrid_RF[pp]:
        L.append(paramGrid_RF[pp][key])
        
    print(round(mdl_xv_RF_ftd.avgMetrics[pp],3), L)

RF CV Scores
0.157 [5, 32, 'variance', 1, 10, 1.0, 20, 'auto']
0.155 [5, 32, 'variance', 1, 10, 1.0, 40, 'auto']
0.156 [5, 32, 'variance', 1, 10, 1.0, 60, 'auto']
0.158 [5, 32, 'variance', 5, 10, 1.0, 20, 'auto']
0.155 [5, 32, 'variance', 5, 10, 1.0, 40, 'auto']
0.155 [5, 32, 'variance', 5, 10, 1.0, 60, 'auto']
0.147 [7, 32, 'variance', 1, 10, 1.0, 20, 'auto']
0.143 [7, 32, 'variance', 1, 10, 1.0, 40, 'auto']
0.144 [7, 32, 'variance', 1, 10, 1.0, 60, 'auto']
0.147 [7, 32, 'variance', 5, 10, 1.0, 20, 'auto']
0.145 [7, 32, 'variance', 5, 10, 1.0, 40, 'auto']
0.143 [7, 32, 'variance', 5, 10, 1.0, 60, 'auto']
0.144 [10, 32, 'variance', 1, 10, 1.0, 20, 'auto']
0.139 [10, 32, 'variance', 1, 10, 1.0, 40, 'auto']
0.139 [10, 32, 'variance', 1, 10, 1.0, 60, 'auto']
0.144 [10, 32, 'variance', 5, 10, 1.0, 20, 'auto']
0.141 [10, 32, 'variance', 5, 10, 1.0, 40, 'auto']
0.139 [10, 32, 'variance', 5, 10, 1.0, 60, 'auto']


### GBT

In [423]:
'''print('GBT CV Scores')
for pp in range(Np_GBT):
    L = []
    for key in paramGrid_GBT[pp]:
        L.append(paramGrid_GBT[pp][key])
    print(round(mdl_xv_GBT.avgMetrics[pp],3), L)'''

"print('GBT CV Scores')\nfor pp in range(Np_GBT):\n    L = []\n    for key in paramGrid_GBT[pp]:\n        L.append(paramGrid_GBT[pp][key])\n    print(round(mdl_xv_GBT.avgMetrics[pp],3), L)"

### LG

In [424]:
'''print('LG CV Scores')
for pp in range(Np_LG):
    L = []
    for key in paramGrid_LG[pp]:
        L.append(paramGrid_LG[pp][key])
    print(round(mdl_xv_LG.avgMetrics[pp],3), L)'''

"print('LG CV Scores')\nfor pp in range(Np_LG):\n    L = []\n    for key in paramGrid_LG[pp]:\n        L.append(paramGrid_LG[pp][key])\n    print(round(mdl_xv_LG.avgMetrics[pp],3), L)"

# Test Data

In [158]:
mdl_xv_RF_preds_T = mdl_xv_RF_ftd.transform(test_set) 

In [159]:
mdl_xv_RF_preds_T = mdl_xv_RF_preds_T.withColumnRenamed('prediction','logSalePrice')
mdl_xv_RF_preds_T.columns

['Id', 'features', 'logSalePrice']

In [160]:
fnl = mdl_xv_RF_preds_T.select('Id',exp('logSalePrice').alias('SalePrice'))
fnl.show()

+----+------------------+
|  Id|         SalePrice|
+----+------------------+
|1461|121767.25838797801|
|1462|149620.23785502763|
|1463|183180.54367539813|
|1464|185624.66314030168|
|1465|194913.08397500706|
|1466|179694.09272782734|
|1467|173277.47132964653|
|1468|177112.04318261804|
|1469|178860.97069498844|
|1470| 129114.6847481411|
|1471| 198355.6540644318|
|1472| 96602.51610606069|
|1473|104201.64284616966|
|1474|150738.31433337452|
|1475|139112.48366522155|
|1476| 368300.6031031953|
|1477|  252170.160885334|
|1478|314359.30141901504|
|1479| 300328.8730666479|
|1480| 481839.6713941411|
+----+------------------+
only showing top 20 rows



In [161]:
Sub_PId = fnl.rdd.map(lambda r: r.Id).collect()
Sub_Prd = fnl.rdd.map(lambda r: r.SalePrice).collect()

In [162]:
Submission = pd.DataFrame(columns=['Id', 'SalePrice'])

In [163]:
Submission['Id'] = Sub_PId 
Submission['SalePrice']    = Sub_Prd

In [164]:
Submission 

Unnamed: 0,Id,SalePrice
0,1461,121767.258388
1,1462,149620.237855
2,1463,183180.543675
3,1464,185624.663140
4,1465,194913.083975
5,1466,179694.092728
6,1467,173277.471330
7,1468,177112.043183
8,1469,178860.970695
9,1470,129114.684748


In [165]:
Submission.to_csv('Submission.csv')

# Don't forget to remove the index column