In [1]:
# Bootstrap cell: findspark.init() is run here, before the cell that imports
# pyspark. NOTE(review): presumably this is what makes `pyspark` importable
# in this kernel (locating the Spark install) — confirm against the environment.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor

In [3]:
# Obtain the Spark session used by every later cell (an already-running
# session is reused by getOrCreate instead of building a second one).
spark_session = (
    SparkSession.builder
    .appName('DTree_House_Pricing')
    .getOrCreate()
)

In [4]:
# Configure a CSV reader that treats the first line as the header and lets
# Spark infer the column types, then load the real-estate dataset with it.
csv_reader = (
    spark_session.read
    .option('header', 'true')
    .option('inferSchema', 'true')
)
data = csv_reader.csv('../../data/realestate.csv')

# Preview the first ten rows.
data.show(10)

+---+---------------+--------+-------------+-----------------------+--------+---------+---------------+
| No|TransactionDate|HouseAge|DistanceToMRT|NumberConvenienceStores|Latitude|Longitude|PriceOfUnitArea|
+---+---------------+--------+-------------+-----------------------+--------+---------+---------------+
|  1|       2012.917|    32.0|     84.87882|                     10|24.98298|121.54024|           37.9|
|  2|       2012.917|    19.5|     306.5947|                      9|24.98034|121.53951|           42.2|
|  3|       2013.583|    13.3|     561.9845|                      5|24.98746|121.54391|           47.3|
|  4|         2013.5|    13.3|     561.9845|                      5|24.98746|121.54391|           54.8|
|  5|       2012.833|     5.0|     390.5684|                      5|24.97937|121.54245|           43.1|
|  6|       2012.667|     7.1|      2175.03|                      3|24.96305|121.51254|           32.1|
|  7|       2012.667|    34.5|     623.4731|                    

In [5]:
# Fixed seed so the train/test partition (and therefore the fitted tree and
# every prediction shown below) is the same on each Restart & Run All.
SPLIT_SEED = 42

# Pack the three predictor columns into the single vector column that
# Spark ML estimators consume.
assembler = VectorAssembler(
    inputCols=['HouseAge', 'DistanceToMRT', 'NumberConvenienceStores'],
    outputCol='features',
)

# Keep only the label and the assembled feature vector.
df = assembler.transform(data).select('PriceOfUnitArea', 'features')

# 80/20 split; without an explicit seed randomSplit draws a new random
# partition on every run, which makes the notebook's outputs irreproducible.
train_test_split = df.randomSplit([0.8, 0.2], seed=SPLIT_SEED)
train_df, test_df = train_test_split

In [6]:
# Decision-tree regressor reading the assembled 'features' vector and
# predicting the 'PriceOfUnitArea' label.
dtree = DecisionTreeRegressor(
    featuresCol='features',
    labelCol='PriceOfUnitArea',
)

# Fit on the training partition only.
trained_dtree = dtree.fit(train_df)

In [9]:
# Score the held-out partition; cached because it is the base for the
# projection collected below.
full_predictions = trained_dtree.transform(test_df).cache()

# Project prediction and label together so each pair comes from the same row.
# Zipping two separately derived RDDs only works when both have identical
# partitioning and per-partition element counts, which is fragile to rely on.
prediction_and_label = [
    (row['prediction'], row['PriceOfUnitArea'])
    for row in full_predictions.select('prediction', 'PriceOfUnitArea').collect()
]

# Display the first ten (prediction, label) pairs.
prediction_and_label[:10]

[(18.8, 12.2),
 (18.4, 12.8),
 (23.22941176470588, 12.8),
 (23.22941176470588, 12.9),
 (18.4, 16.7),
 (18.8, 17.4),
 (23.22941176470588, 18.2),
 (23.22941176470588, 20.9),
 (23.30000000000001, 21.8),
 (36.55000000000007, 23.5)]

In [10]:
# Shut down the Spark session now that the analysis is finished; cells that
# use `spark_session` cannot be re-run after this without recreating it.
spark_session.stop()