In [1]:
#The purpose of this notebook is to practice using Apache Spark (PySpark). The dataset used is Airbnb prices in New York.

## Overview

This notebook will show you how to create and query a table or DataFrame that you uploaded to DBFS. [DBFS](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) is a Databricks File System that allows you to store data for querying inside of Databricks. This notebook assumes that you have a file already inside of DBFS that you would like to read from.

This notebook is written in **Python** so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` syntax. Python, Scala, SQL, and R are all supported.

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Airbnb_prices_pyspark').getOrCreate()

In [4]:
# File location and type
file_location = "/FileStore/tables/AB_NYC_2019.csv"
file_type = "csv"

# CSV options
infer_schema = "True"
first_row_is_header = "True"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

display(df)

id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149.0,1.0,9,2018-10-19,0.21,6.0,365.0
2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225.0,1.0,45,2019-05-21,0.38,2.0,355.0
3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150.0,3.0,0,,,1.0,365.0
3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89.0,1.0,270,2019-07-05,4.64,1.0,194.0
5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80.0,10.0,9,2018-11-19,0.1,1.0,0.0
5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.975,Entire home/apt,200.0,3.0,74,2019-06-22,0.59,1.0,129.0
5121,BlissArtsSpace!,7356,Garon,Brooklyn,Bedford-Stuyvesant,40.68688,-73.95596,Private room,60.0,45.0,49,2017-10-05,0.4,1.0,0.0
5178,Large Furnished Room Near B'way,8967,Shunichi,Manhattan,Hell's Kitchen,40.76489,-73.98493,Private room,79.0,2.0,430,2019-06-24,3.47,1.0,220.0
5203,Cozy Clean Guest Room - Family Apt,7490,MaryEllen,Manhattan,Upper West Side,40.80178,-73.96723,Private room,79.0,2.0,118,2017-07-21,0.99,1.0,0.0
5238,Cute & Cozy Lower East Side 1 bdrm,7549,Ben,Manhattan,Chinatown,40.71344,-73.99037,Entire home/apt,150.0,1.0,160,2019-06-09,1.33,4.0,188.0


In [5]:
df.printSchema()

In [6]:
# Create a view or table

temp_table_name = "AB_NYC_2019_csv"

df.createOrReplaceTempView(temp_table_name)

In [7]:
%sql

/* Query the created temp table in a SQL cell */
select 
count(distinct neighbourhood)

from `AB_NYC_2019_csv`


count(DISTINCT neighbourhood)
382


In [8]:
#this code below is to practice using SQL syntacs. creating and using temp tables

t1 = spark.sql("""
select 

room_type,
count(*) as Count_1

from `AB_NYC_2019_csv`

group by room_type

order by count(*) desc
""")
t1.createOrReplaceTempView('t1')

t2 = spark.sql("""
select 

room_type,
count(*) as Count_2

from `AB_NYC_2019_csv`

group by room_type

order by count(*) desc
""")
t2.createOrReplaceTempView('t2')

t3 = spark.sql("""
select 

t1.room_type,
t1.Count_1,
t2.Count_2

from `t1` as t1
left join `t2` as t2 on t1.room_type = t2.room_type

""")
t3.show()

In [9]:
#Using the groupby function to find mean, max of different columns within the dataframe

from pyspark.sql.functions import mean, max, col
mean_price = df.groupBy("room_type").agg(mean("price").alias("price"), max("minimum_nights").alias("minimum_nights"))
mean_price.filter(col('room_type').isin(['Entire home/apt','Private room','Shared room'])).show()

In [10]:
#find all the room types that are legit, there are room types in number format that is incorrect

all_room_types = df.select("room_type").distinct().collect()
to_list = [str(row.room_type) for row in all_room_types]
to_list = set(to_list)
to_list_new = []
for i in to_list:
  if i.replace('.','',1).replace('-','',1).isdigit():
    continue 
  else:
    to_list_new.append(i)

In [11]:
to_list_new

In [12]:
from pyspark.sql.functions import mean, max, col
mean_price = df.groupBy("room_type").agg(mean("price").alias("price"), max("minimum_nights").alias("minimum_nights"))
mean_price.filter(col('room_type').isin(to_list_new)).show()

In [13]:
#only include data that is legit in the room type column
df = df.dropna(how = 'any', subset = ['room_type'])
df = df.filter(col('room_type').isin('Shared room','Entire home/apt','Private room'))

In [14]:
display(df)

id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0
5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.975,Entire home/apt,200,3,74,2019-06-22,0.59,1,129
5121,BlissArtsSpace!,7356,Garon,Brooklyn,Bedford-Stuyvesant,40.68688,-73.95596,Private room,60,45,49,2017-10-05,0.4,1,0
5178,Large Furnished Room Near B'way,8967,Shunichi,Manhattan,Hell's Kitchen,40.76489,-73.98493,Private room,79,2,430,2019-06-24,3.47,1,220
5203,Cozy Clean Guest Room - Family Apt,7490,MaryEllen,Manhattan,Upper West Side,40.80178,-73.96723,Private room,79,2,118,2017-07-21,0.99,1,0
5238,Cute & Cozy Lower East Side 1 bdrm,7549,Ben,Manhattan,Chinatown,40.71344,-73.99037,Entire home/apt,150,1,160,2019-06-09,1.33,4,188


In [15]:
df = df.fillna(0, subset=['reviews_per_month'])
df = df.fillna('NA', subset=['name'])

In [16]:
#check if theres any missing values within each column
from pyspark.sql.functions import count, when, isnan, col
df_null = df.select([count(when(isnan(i) | \
                                   #col(i).contains('NA') | \
                                   #col(i).contains('NULL') | \
                                   col(i).isNull(), i)).alias(i) \
                    for i in df.columns])

display(df_null)

id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,0,0,21,0,0,0,0,0,0,0,0,10029,10029,0,0


In [17]:
#casting the columns to the correct format
from pyspark.sql.types import IntegerType, FloatType
df = df.select('name','neighbourhood_group','neighbourhood','latitude','longitude','room_type','price','minimum_nights','number_of_reviews','calculated_host_listings_count','availability_365')
df = df.withColumn("price",df["price"].cast(IntegerType()).alias("price"))
df = df.withColumn("latitude",df["latitude"].cast(IntegerType()).alias("latitude"))
df = df.withColumn("longitude",df["longitude"].cast(IntegerType()).alias("longitude"))
df = df.withColumn("minimum_nights",df["minimum_nights"].cast(IntegerType()).alias("minimum_nights"))
df = df.withColumn("number_of_reviews",df["number_of_reviews"].cast(IntegerType()).alias("number_of_reviews"))
df = df.withColumn("calculated_host_listings_count",df["calculated_host_listings_count"].cast(IntegerType()).alias("calculated_host_listings_count"))
df = df.withColumn("availability_365",df["availability_365"].cast(IntegerType()).alias("availability_365"))

In [18]:
df.printSchema()

In [19]:
train, test = df.randomSplit([0.7, 0.3])

In [20]:
from pyspark.ml.feature import VectorAssembler, OneHotEncoderEstimator, StringIndexer, StopWordsRemover, Tokenizer, CountVectorizer, IDF, StandardScaler
from pyspark.ml.regression import RandomForestRegressor, DecisionTreeRegressor, GBTRegressor, LinearRegression
from pyspark.ml import Pipeline

In [21]:
#Preparing the pipeline, feature engineered text, catogorical, and continuous features independently. Followed by vector assembler to concat all the features together. 

#text 
tokenizer = Tokenizer(inputCol= 'name', outputCol='name_text')
stop_remove = StopWordsRemover(inputCol = 'name_text', outputCol = 'stop_text')
count_vec = CountVectorizer(inputCol = 'stop_text', outputCol = 'count_text', minDF  = 10)
IDF_ = IDF(inputCol = 'count_text', outputCol = 'idf_text', minDocFreq = 10)

#categorical
indexers = [StringIndexer(inputCol=column, handleInvalid = 'keep', outputCol=column+"_index") for column in ['neighbourhood_group','neighbourhood','room_type']]
ohe = [OneHotEncoderEstimator(inputCols=[column], outputCols=[column+"_ohe"]) for column in ['neighbourhood_group_index','neighbourhood_index','room_type_index']]
pipeline_categorical = Pipeline(stages=indexers + ohe)

#continuous
va = [VectorAssembler(inputCols=[column], outputCol=column+"_vec") for column in ['price','latitude','longitude']]
ss = [StandardScaler(inputCol=column, outputCol=column+"_ss") for column in ['price_vec','latitude_vec','longitude_vec']]
pipeline_continuous = Pipeline(stages=va + ss)

#vector assembler
assembler = VectorAssembler(inputCols=['idf_text','neighbourhood_group_index_ohe','room_type_index_ohe','latitude_vec_ss','longitude_vec_ss','minimum_nights','number_of_reviews','calculated_host_listings_count','availability_365'], outputCol = 'features')

In [22]:
#staging the pipeline

final_pipeline = Pipeline(stages = [tokenizer,stop_remove,count_vec,IDF_,pipeline_categorical,pipeline_continuous, assembler])

In [23]:
train_final_pipeline_result = final_pipeline.fit(train).transform(train)
test_final_pipeline_result = final_pipeline.fit(train).transform(test)

In [24]:
display(train_final_pipeline_result)

name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,calculated_host_listings_count,availability_365,name_text,stop_text,count_text,idf_text,neighbourhood_group_index,neighbourhood_index,room_type_index,neighbourhood_group_index_ohe,neighbourhood_index_ohe,room_type_index_ohe,price_vec,latitude_vec,longitude_vec,price_vec_ss,latitude_vec_ss,longitude_vec_ss,features
1 Bed Apt in Utopic Williamsburg,Brooklyn,Williamsburg,40,-73,Entire home/apt,155,2,8,1,0,"List(, 1, bed, apt, in, utopic, williamsburg)","List(, 1, bed, apt, utopic, williamsburg)","List(0, 1214, List(6, 9, 18, 29, 35), List(1.0, 1.0, 1.0, 1.0, 1.0))","List(0, 1214, List(6, 9, 18, 29, 35), List(2.6132749236909545, 2.7013082731762776, 3.056982090764344, 3.543506683451132, 3.614099438924168))",1.0,0.0,0.0,"List(0, 5, List(1), List(1.0))","List(0, 214, List(0), List(1.0))","List(0, 3, List(0), List(1.0))","List(1, 1, List(), List(155.0))","List(1, 1, List(), List(40.0))","List(1, 1, List(), List(-73.0))","List(1, 1, List(), List(0.6421936748593362))","List(1, 1, List(), List(0.0))","List(1, 1, List(), List(-278.3029593126578))","List(0, 1228, List(6, 9, 18, 29, 35, 1215, 1219, 1223, 1224, 1225, 1226), List(2.6132749236909545, 2.7013082731762776, 3.056982090764344, 3.543506683451132, 3.614099438924168, 1.0, 1.0, -278.3029593126578, 2.0, 8.0, 1.0))"
2-3 bedroom UWS garden triplex,Manhattan,Upper West Side,40,-73,Entire home/apt,300,30,4,12,310,"List(, 2-3, bedroom, uws, garden, triplex)","List(, 2-3, bedroom, uws, garden, triplex)","List(0, 1214, List(1, 29, 53, 102, 525), List(1.0, 1.0, 1.0, 1.0, 1.0))","List(0, 1214, List(1, 29, 53, 102, 525), List(1.915723331606579, 3.543506683451132, 4.0522997861922, 4.781626623329648, 6.907755278982137))",0.0,4.0,0.0,"List(0, 5, List(0), List(1.0))","List(0, 214, List(4), List(1.0))","List(0, 3, List(0), List(1.0))","List(1, 1, List(), List(300.0))","List(1, 1, List(), List(40.0))","List(1, 1, List(), List(-73.0))","List(1, 1, List(), List(1.2429554997277474))","List(1, 1, List(), List(0.0))","List(1, 1, List(), List(-278.3029593126578))","List(0, 1228, List(1, 29, 53, 102, 525, 1214, 1219, 1223, 1224, 1225, 1226, 1227), List(1.915723331606579, 3.543506683451132, 4.0522997861922, 4.781626623329648, 6.907755278982137, 1.0, 1.0, -278.3029593126578, 30.0, 4.0, 12.0, 310.0))"
"2E""",Queens,Corona,40,-73,Shared room,27,2,13,2,361,"List(, 2e"")","List(, 2e"")","List(0, 1214, List(29), List(1.0))","List(0, 1214, List(29), List(3.543506683451132))",2.0,77.0,2.0,"List(0, 5, List(2), List(1.0))","List(0, 214, List(77), List(1.0))","List(0, 3, List(2), List(1.0))","List(1, 1, List(), List(27.0))","List(1, 1, List(), List(40.0))","List(1, 1, List(), List(-73.0))","List(1, 1, List(), List(0.11186599497549726))","List(1, 1, List(), List(0.0))","List(1, 1, List(), List(-278.3029593126578))","List(0, 1228, List(29, 1216, 1221, 1223, 1224, 1225, 1226, 1227), List(3.543506683451132, 1.0, 1.0, -278.3029593126578, 2.0, 13.0, 2.0, 361.0))"
3 bedroom loft in Williamsburg,Brooklyn,Williamsburg,40,-73,Entire home/apt,500,2,48,1,365,"List(, 3, bedroom, loft, in, williamsburg)","List(, 3, bedroom, loft, williamsburg)","List(0, 1214, List(1, 18, 22, 29, 50), List(1.0, 1.0, 1.0, 1.0, 1.0))","List(0, 1214, List(1, 18, 22, 29, 50), List(1.915723331606579, 3.056982090764344, 3.274046596002172, 3.543506683451132, 3.935833654121865))",1.0,0.0,0.0,"List(0, 5, List(1), List(1.0))","List(0, 214, List(0), List(1.0))","List(0, 3, List(0), List(1.0))","List(1, 1, List(), List(500.0))","List(1, 1, List(), List(40.0))","List(1, 1, List(), List(-73.0))","List(1, 1, List(), List(2.0715924995462456))","List(1, 1, List(), List(0.0))","List(1, 1, List(), List(-278.3029593126578))","List(0, 1228, List(1, 18, 22, 29, 50, 1215, 1219, 1223, 1224, 1225, 1226, 1227), List(1.915723331606579, 3.056982090764344, 3.274046596002172, 3.543506683451132, 3.935833654121865, 1.0, 1.0, -278.3029593126578, 2.0, 48.0, 1.0, 365.0))"
A charming Space in Brooklyn,Brooklyn,Bedford-Stuyvesant,40,-73,Private room,95,2,44,5,47,"List(, a, charming, space, in, brooklyn)","List(, charming, space, brooklyn)","List(0, 1214, List(7, 29, 36, 73), List(1.0, 1.0, 1.0, 1.0))","List(0, 1214, List(7, 29, 36, 73), List(2.619312374108939, 3.543506683451132, 3.611918412977808, 4.410668210637265))",1.0,1.0,1.0,"List(0, 5, List(1), List(1.0))","List(0, 214, List(1), List(1.0))","List(0, 3, List(1), List(1.0))","List(1, 1, List(), List(95.0))","List(1, 1, List(), List(40.0))","List(1, 1, List(), List(-73.0))","List(1, 1, List(), List(0.3936025749137867))","List(1, 1, List(), List(0.0))","List(1, 1, List(), List(-278.3029593126578))","List(0, 1228, List(7, 29, 36, 73, 1215, 1220, 1223, 1224, 1225, 1226, 1227), List(2.619312374108939, 3.543506683451132, 3.611918412977808, 4.410668210637265, 1.0, 1.0, -278.3029593126578, 2.0, 44.0, 5.0, 47.0))"
AMAZING TIME SQUARE!!BRICK WALLS!!,Manhattan,Hell's Kitchen,40,-73,Entire home/apt,115,30,3,52,342,"List(, amazing, time, square!!brick, walls!!)","List(, amazing, time, square!!brick, walls!!)","List(0, 1214, List(29, 60, 164), List(1.0, 1.0, 1.0))","List(0, 1214, List(29, 60, 164), List(3.543506683451132, 4.137006483664363, 5.280824209100519))",0.0,5.0,0.0,"List(0, 5, List(0), List(1.0))","List(0, 214, List(5), List(1.0))","List(0, 3, List(0), List(1.0))","List(1, 1, List(), List(115.0))","List(1, 1, List(), List(40.0))","List(1, 1, List(), List(-73.0))","List(1, 1, List(), List(0.4764662748956365))","List(1, 1, List(), List(0.0))","List(1, 1, List(), List(-278.3029593126578))","List(0, 1228, List(29, 60, 164, 1214, 1219, 1223, 1224, 1225, 1226, 1227), List(3.543506683451132, 4.137006483664363, 5.280824209100519, 1.0, 1.0, -278.3029593126578, 30.0, 3.0, 52.0, 342.0))"
Affordable & Cozy,Bronx,University Heights,40,-73,Private room,37,4,117,1,232,"List(, affordable, &, cozy)","List(, affordable, &, cozy)","List(0, 1214, List(4, 13, 29, 246), List(1.0, 1.0, 1.0, 1.0))","List(0, 1214, List(4, 13, 29, 246), List(2.3604011624884413, 2.8641881483556464, 3.543506683451132, 5.809142990314028))",3.0,123.0,1.0,"List(0, 5, List(3), List(1.0))","List(0, 214, List(123), List(1.0))","List(0, 3, List(1), List(1.0))","List(1, 1, List(), List(37.0))","List(1, 1, List(), List(40.0))","List(1, 1, List(), List(-73.0))","List(1, 1, List(), List(0.15329784496642218))","List(1, 1, List(), List(0.0))","List(1, 1, List(), List(-278.3029593126578))","List(0, 1228, List(4, 13, 29, 246, 1217, 1220, 1223, 1224, 1225, 1226, 1227), List(2.3604011624884413, 2.8641881483556464, 3.543506683451132, 5.809142990314028, 1.0, 1.0, -278.3029593126578, 4.0, 117.0, 1.0, 232.0))"
Beautiful Room In Gramercy!!!,Manhattan,Gramercy,40,-73,Private room,64,26,47,1,331,"List(, beautiful, room, in, gramercy!!!)","List(, beautiful, room, gramercy!!!)","List(0, 1214, List(0, 16, 29), List(1.0, 1.0, 1.0))","List(0, 1214, List(0, 16, 29), List(1.6604216572138548, 3.0451697059798617, 3.543506683451132))",0.0,34.0,1.0,"List(0, 5, List(0), List(1.0))","List(0, 214, List(34), List(1.0))","List(0, 3, List(1), List(1.0))","List(1, 1, List(), List(64.0))","List(1, 1, List(), List(40.0))","List(1, 1, List(), List(-73.0))","List(1, 1, List(), List(0.26516383994191944))","List(1, 1, List(), List(0.0))","List(1, 1, List(), List(-278.3029593126578))","List(0, 1228, List(0, 16, 29, 1214, 1220, 1223, 1224, 1225, 1226, 1227), List(1.6604216572138548, 3.0451697059798617, 3.543506683451132, 1.0, 1.0, -278.3029593126578, 26.0, 47.0, 1.0, 331.0))"
"Brooklyn""",Brooklyn,Crown Heights,40,-73,Private room,100,1,0,1,0,"List(, brooklyn"")","List(, brooklyn"")","List(0, 1214, List(29), List(1.0))","List(0, 1214, List(29), List(3.543506683451132))",1.0,8.0,1.0,"List(0, 5, List(1), List(1.0))","List(0, 214, List(8), List(1.0))","List(0, 3, List(1), List(1.0))","List(1, 1, List(), List(100.0))","List(1, 1, List(), List(40.0))","List(1, 1, List(), List(-73.0))","List(1, 1, List(), List(0.4143184999092491))","List(1, 1, List(), List(0.0))","List(1, 1, List(), List(-278.3029593126578))","List(0, 1228, List(29, 1215, 1220, 1223, 1224, 1226), List(3.543506683451132, 1.0, 1.0, -278.3029593126578, 1.0, 1.0))"
Heart & Soul of Greenwich Village,Manhattan,Greenwich Village,40,-73,Entire home/apt,850,3,107,1,249,"List(, heart, &, soul, of, greenwich, village)","List(, heart, &, soul, greenwich, village)","List(0, 1214, List(13, 20, 21, 29, 191), List(1.0, 1.0, 1.0, 1.0, 1.0))","List(0, 1214, List(13, 20, 21, 29, 191), List(2.8641881483556464, 3.1771130965062255, 3.177818563907618, 3.543506683451132, 5.450509181889962))",0.0,28.0,0.0,"List(0, 5, List(0), List(1.0))","List(0, 214, List(28), List(1.0))","List(0, 3, List(0), List(1.0))","List(1, 1, List(), List(850.0))","List(1, 1, List(), List(40.0))","List(1, 1, List(), List(-73.0))","List(1, 1, List(), List(3.5217072492286174))","List(1, 1, List(), List(0.0))","List(1, 1, List(), List(-278.3029593126578))","List(0, 1228, List(13, 20, 21, 29, 191, 1214, 1219, 1223, 1224, 1225, 1226, 1227), List(2.8641881483556464, 3.1771130965062255, 3.177818563907618, 3.543506683451132, 5.450509181889962, 1.0, 1.0, -278.3029593126578, 3.0, 107.0, 1.0, 249.0))"


In [25]:
#selected a randomforest and using default settings
model = RandomForestRegressor(featuresCol = 'features', labelCol = 'price')
result = model.fit(train_final_pipeline_result).transform(test_final_pipeline_result)

In [26]:
from pyspark.ml.evaluation import RegressionEvaluator
result = result.select('prediction','price')
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="price",metricName="rmse")
rmse = evaluator.evaluate(result)
rmse

In [27]:
#using a random forest again but this time using a gridsearch for the best hyper parameter
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

model = RandomForestRegressor(featuresCol = 'features', labelCol = 'price')
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="price",metricName="rmse")

paramGrid = ParamGridBuilder()\
    .addGrid(model.maxDepth, [4,5,6]) \
    .addGrid(model.numTrees, [20,30])\
    .build()

# In this case the estimator is simply the linear regression.
# A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
tvs = TrainValidationSplit(estimator=model,
                           estimatorParamMaps=paramGrid,
                           evaluator=evaluator,
                           # 80% of the data will be used for training, 20% for validation.
                           trainRatio=0.8)

best_model = tvs.fit(train_final_pipeline_result)
rsme = evaluator.evaluate(best_model.transform(test_final_pipeline_result))
rsme