In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.getOrCreate()

In [3]:
filepath = "housing.csv"

In [4]:
df = spark.read.csv(filepath, header=True, inferSchema=True)

In [5]:
df.printSchema()

root
 |-- Suburb: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Rooms: integer (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Method: string (nullable = true)
 |-- SellerG: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Distance: string (nullable = true)
 |-- Postcode: string (nullable = true)
 |-- Bedroom2: integer (nullable = true)
 |-- Bathroom: integer (nullable = true)
 |-- Car: integer (nullable = true)
 |-- Landsize: integer (nullable = true)
 |-- BuildingArea: double (nullable = true)
 |-- YearBuilt: integer (nullable = true)
 |-- CouncilArea: string (nullable = true)
 |-- Lattitude: double (nullable = true)
 |-- Longtitude: double (nullable = true)
 |-- Regionname: string (nullable = true)
 |-- Propertycount: string (nullable = true)



In [6]:
df.select("Bedroom2","Bathroom","Price").show()

+--------+--------+-------+
|Bedroom2|Bathroom|  Price|
+--------+--------+-------+
|       2|       1|   null|
|       2|       1|1480000|
|       2|       1|1035000|
|       3|       2|   null|
|       3|       2|1465000|
|       3|       2| 850000|
|       3|       1|1600000|
|       3|       2|   null|
|       4|       1|   null|
|       3|       2|   null|
|       2|       1| 941000|
|       4|       2|1876000|
|       2|       2|   null|
|       6|       2|   null|
|       2|       1|1636000|
|    null|    null|1000000|
|    null|    null| 745000|
|       1|       1| 300000|
|       3|       1|1097000|
|    null|    null| 542000|
+--------+--------+-------+
only showing top 20 rows



In [7]:
df = df.na.drop()

In [8]:
trainDF, testDF = df.randomSplit([.8,.2], seed= 42)

In [9]:
trainDF.show(5)

+----------+-------------------+-----+----+-------+------+-------+----------+--------+--------+--------+--------+---+--------+------------+---------+-----------+---------+----------+--------------------+-------------+
|    Suburb|            Address|Rooms|Type|  Price|Method|SellerG|      Date|Distance|Postcode|Bedroom2|Bathroom|Car|Landsize|BuildingArea|YearBuilt|CouncilArea|Lattitude|Longtitude|          Regionname|Propertycount|
+----------+-------------------+-----+----+-------+------+-------+----------+--------+--------+--------+--------+---+--------+------------+---------+-----------+---------+----------+--------------------+-------------+
|Abbotsford| 1/43 Abbotsford St|    2|   u| 505000|    PI| Nelson|27-05-2017|       3|    3067|       2|       1|  1|       0|        38.0|     1970|      Yarra|-37.80206| 145.00015|Northern Metropol...|         4019|
|Abbotsford|      10 Valiant St|    2|   h|1097000|     S| Biggin|08-10-2016|     2.5|    3067|       3|       1|  2|     220|  

In [10]:
# "Bedroom2","Bathroom","Price"

In [11]:
trainDF.show(2)

+----------+------------------+-----+----+-------+------+-------+----------+--------+--------+--------+--------+---+--------+------------+---------+-----------+---------+----------+--------------------+-------------+
|    Suburb|           Address|Rooms|Type|  Price|Method|SellerG|      Date|Distance|Postcode|Bedroom2|Bathroom|Car|Landsize|BuildingArea|YearBuilt|CouncilArea|Lattitude|Longtitude|          Regionname|Propertycount|
+----------+------------------+-----+----+-------+------+-------+----------+--------+--------+--------+--------+---+--------+------------+---------+-----------+---------+----------+--------------------+-------------+
|Abbotsford|1/43 Abbotsford St|    2|   u| 505000|    PI| Nelson|27-05-2017|       3|    3067|       2|       1|  1|       0|        38.0|     1970|      Yarra|-37.80206| 145.00015|Northern Metropol...|         4019|
|Abbotsford|     10 Valiant St|    2|   h|1097000|     S| Biggin|08-10-2016|     2.5|    3067|       3|       1|  2|     220|       

In [18]:
from pyspark.ml.feature import VectorAssembler

In [19]:
vecAssembler = VectorAssembler(inputCols=["Rooms"], outputCol="features") # Transformer

In [20]:
vecTrainDF = vecAssembler.transform(trainDF)

In [21]:
vecTrainDF.printSchema()

root
 |-- Suburb: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Rooms: integer (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Method: string (nullable = true)
 |-- SellerG: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Distance: string (nullable = true)
 |-- Postcode: string (nullable = true)
 |-- Bedroom2: integer (nullable = true)
 |-- Bathroom: integer (nullable = true)
 |-- Car: integer (nullable = true)
 |-- Landsize: integer (nullable = true)
 |-- BuildingArea: double (nullable = true)
 |-- YearBuilt: integer (nullable = true)
 |-- CouncilArea: string (nullable = true)
 |-- Lattitude: double (nullable = true)
 |-- Longtitude: double (nullable = true)
 |-- Regionname: string (nullable = true)
 |-- Propertycount: string (nullable = true)
 |-- features: vector (nullable = true)



In [22]:
vecTrainDF.select("Rooms","features").show(5)

+-----+--------+
|Rooms|features|
+-----+--------+
|    2|   [2.0]|
|    2|   [2.0]|
|    2|   [2.0]|
|    3|   [3.0]|
|    3|   [3.0]|
+-----+--------+
only showing top 5 rows



In [24]:
#vec2_train.printSchema()

In [26]:
#vec2_train.select("features").show()

In [27]:
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol="features", labelCol= "Price")
lrModel =  lr.fit(vecTrainDF)
vecTestDF = vecAssembler.transform(testDF)
predDF = lrModel.transform(vecTestDF)

In [28]:
predDF.select("Rooms","prediction").show(5)

+-----+------------------+
|Rooms|        prediction|
+-----+------------------+
|    1| 344859.1327814109|
|    2| 719317.6125773378|
|    3|1093776.0923732645|
|    2| 719317.6125773378|
|    3|1093776.0923732645|
+-----+------------------+
only showing top 5 rows



In [29]:
lr_multi = LinearRegression(featuresCol="features", labelCol= "Price")

In [30]:
vecTrainDF.select("Rooms","Bathroom", "Price").show(2)

+-----+--------+-------+
|Rooms|Bathroom|  Price|
+-----+--------+-------+
|    2|       1| 505000|
|    2|       1|1097000|
+-----+--------+-------+
only showing top 2 rows



In [31]:
vecAssembler2 = VectorAssembler(inputCols=["Rooms", "Bathroom"], outputCol="features") # Transformer

In [32]:
vec2_train = vecAssembler2.transform(trainDF)

In [33]:
lrModel_multi =  lr_multi.fit(vec2_train)

In [34]:
vec_test_multi = vecAssembler2.transform(testDF)

In [35]:
vec_test_multi.select("features","Price").show(2)

+---------+-------+
| features|  Price|
+---------+-------+
|[1.0,1.0]| 470000|
|[2.0,2.0]|1190000|
+---------+-------+
only showing top 2 rows



In [36]:
lrModel_multi.transform(vec_test_multi).printSchema()

root
 |-- Suburb: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Rooms: integer (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Method: string (nullable = true)
 |-- SellerG: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Distance: string (nullable = true)
 |-- Postcode: string (nullable = true)
 |-- Bedroom2: integer (nullable = true)
 |-- Bathroom: integer (nullable = true)
 |-- Car: integer (nullable = true)
 |-- Landsize: integer (nullable = true)
 |-- BuildingArea: double (nullable = true)
 |-- YearBuilt: integer (nullable = true)
 |-- CouncilArea: string (nullable = true)
 |-- Lattitude: double (nullable = true)
 |-- Longtitude: double (nullable = true)
 |-- Regionname: string (nullable = true)
 |-- Propertycount: string (nullable = true)
 |-- features: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [37]:
lrModel_multi.transform(vec_test_multi).select("features","prediction").show(2)

+---------+-----------------+
| features|       prediction|
+---------+-----------------+
|[1.0,1.0]|423807.8790699811|
|[2.0,2.0]|950739.7443373943|
+---------+-----------------+
only showing top 2 rows



In [38]:
from pyspark.ml import Pipeline

In [39]:
pipeline = Pipeline(stages= [vecAssembler,lr])

In [40]:
pip_model = pipeline.fit(trainDF)

In [41]:
predicted_pipe = pip_model.transform(testDF)

In [42]:
predicted_pipe.select("Rooms","Price","prediction").show(2)

+-----+-------+-----------------+
|Rooms|  Price|       prediction|
+-----+-------+-----------------+
|    1| 470000|344859.1327814109|
|    2|1190000|719317.6125773378|
+-----+-------+-----------------+
only showing top 2 rows



In [43]:
pipeline_multi = Pipeline(stages= [vecAssembler2,lr_multi])

In [44]:
pip_model_multi = pipeline.fit(trainDF)

In [45]:
predicted_pipe_multi = pip_model.transform(testDF)

In [46]:
predicted_pipe_multi.printSchema()

root
 |-- Suburb: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Rooms: integer (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Method: string (nullable = true)
 |-- SellerG: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Distance: string (nullable = true)
 |-- Postcode: string (nullable = true)
 |-- Bedroom2: integer (nullable = true)
 |-- Bathroom: integer (nullable = true)
 |-- Car: integer (nullable = true)
 |-- Landsize: integer (nullable = true)
 |-- BuildingArea: double (nullable = true)
 |-- YearBuilt: integer (nullable = true)
 |-- CouncilArea: string (nullable = true)
 |-- Lattitude: double (nullable = true)
 |-- Longtitude: double (nullable = true)
 |-- Regionname: string (nullable = true)
 |-- Propertycount: string (nullable = true)
 |-- features: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [47]:
predicted_pipe_multi.select("Rooms","Bathroom" , "Price", "prediction").show(2)

+-----+--------+-------+-----------------+
|Rooms|Bathroom|  Price|       prediction|
+-----+--------+-------+-----------------+
|    1|       1| 470000|344859.1327814109|
|    2|       2|1190000|719317.6125773378|
+-----+--------+-------+-----------------+
only showing top 2 rows



In [48]:
df.show(2)

+----------+---------------+-----+----+-------+------+-------+----------+--------+--------+--------+--------+---+--------+------------+---------+-----------+---------+----------+--------------------+-------------+
|    Suburb|        Address|Rooms|Type|  Price|Method|SellerG|      Date|Distance|Postcode|Bedroom2|Bathroom|Car|Landsize|BuildingArea|YearBuilt|CouncilArea|Lattitude|Longtitude|          Regionname|Propertycount|
+----------+---------------+-----+----+-------+------+-------+----------+--------+--------+--------+--------+---+--------+------------+---------+-----------+---------+----------+--------------------+-------------+
|Abbotsford|25 Bloomburg St|    2|   h|1035000|     S| Biggin|04-02-2016|     2.5|    3067|       2|       1|  0|     156|        79.0|     1900|      Yarra| -37.8079|  144.9934|Northern Metropol...|         4019|
|Abbotsford|   5 Charles St|    3|   h|1465000|    SP| Biggin|04-03-2017|     2.5|    3067|       3|       2|  0|     134|       150.0|     1900

In [49]:
trainDF.dtypes

[('Suburb', 'string'),
 ('Address', 'string'),
 ('Rooms', 'int'),
 ('Type', 'string'),
 ('Price', 'int'),
 ('Method', 'string'),
 ('SellerG', 'string'),
 ('Date', 'string'),
 ('Distance', 'string'),
 ('Postcode', 'string'),
 ('Bedroom2', 'int'),
 ('Bathroom', 'int'),
 ('Car', 'int'),
 ('Landsize', 'int'),
 ('BuildingArea', 'double'),
 ('YearBuilt', 'int'),
 ('CouncilArea', 'string'),
 ('Lattitude', 'double'),
 ('Longtitude', 'double'),
 ('Regionname', 'string'),
 ('Propertycount', 'string')]

In [50]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

In [51]:
cat_cols = [c for (c, d) in trainDF.dtypes if d == "string" ]  # string columns

In [52]:
indexed_Col = [col_name + "_indexed" for col_name in cat_cols] 

In [53]:
ohe_Col = [col_name + "_ohe" for col_name in cat_cols]   

In [54]:
num_col = [c for (c , d) in trainDF.dtypes if (d == "int") | (d == "double") &(c != "Price")]   

In [55]:
assemb_col = ohe_Col + num_col 
assemb_col

['Suburb_ohe',
 'Address_ohe',
 'Type_ohe',
 'Method_ohe',
 'SellerG_ohe',
 'Date_ohe',
 'Distance_ohe',
 'Postcode_ohe',
 'CouncilArea_ohe',
 'Regionname_ohe',
 'Propertycount_ohe',
 'Rooms',
 'Price',
 'Bedroom2',
 'Bathroom',
 'Car',
 'Landsize',
 'BuildingArea',
 'YearBuilt',
 'Lattitude',
 'Longtitude']

In [56]:
strInd = StringIndexer(inputCols= cat_cols , outputCols= indexed_Col) 

In [57]:
ohe = OneHotEncoder(inputCols= indexed_Col , outputCols= ohe_Col)

In [58]:
vec_assembler = VectorAssembler(inputCols=assemb_col, outputCol="features")

In [59]:
lr_pipe = LinearRegression(featuresCol= "features", labelCol= "Price")

In [60]:
pipeline =  Pipeline(stages=[strInd , ohe, vec_assembler, lr_pipe])

In [61]:
lr_pipeline_trained  =  pipeline.fit(trainDF)

In [62]:
lr_pipeline_trained.transform(trainDF).select("Price","prediction").show(5, truncate= False)

+-------+------------------+
|Price  |prediction        |
+-------+------------------+
|505000 |505196.1115161292 |
|1097000|1097004.0368161835|
|750000 |749987.4301009141 |
|1876000|1875947.1418727376|
|1000000|1000000.6280348562|
+-------+------------------+
only showing top 5 rows



In [63]:
lr_pipeline_trained.transform(testDF).printSchema()

root
 |-- Suburb: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Rooms: integer (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Method: string (nullable = true)
 |-- SellerG: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Distance: string (nullable = true)
 |-- Postcode: string (nullable = true)
 |-- Bedroom2: integer (nullable = true)
 |-- Bathroom: integer (nullable = true)
 |-- Car: integer (nullable = true)
 |-- Landsize: integer (nullable = true)
 |-- BuildingArea: double (nullable = true)
 |-- YearBuilt: integer (nullable = true)
 |-- CouncilArea: string (nullable = true)
 |-- Lattitude: double (nullable = true)
 |-- Longtitude: double (nullable = true)
 |-- Regionname: string (nullable = true)
 |-- Propertycount: string (nullable = true)
 |-- Regionname_indexed: double (nullable = false)
 |-- Postcode_indexed: double (nullable = false)
 |-- Method_indexed: double (nullable = false)
 |-- Sub

In [65]:
#lr_pipeline_trained.transform(testDF).show(2)