In [2]:
from pyspark.sql import SparkSession

# Get (or reuse) the SparkSession that every later cell reads from.
spark = (
    SparkSession.builder
    .appName("Exercise for price estimation")
    .getOrCreate()
)

25/08/31 17:19:58 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
# Load the house-sales CSV: the first row is the header and column types
# are inferred from the file contents.
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("data/kc_house_data.csv")
)

# Show the inferred column names and types.
data.printSchema()

root
 |-- id: long (nullable = true)
 |-- date: string (nullable = true)
 |-- price: double (nullable = true)
 |-- bedrooms: integer (nullable = true)
 |-- bathrooms: double (nullable = true)
 |-- sqft_living: integer (nullable = true)
 |-- sqft_lot: integer (nullable = true)
 |-- floors: double (nullable = true)
 |-- waterfront: integer (nullable = true)
 |-- view: integer (nullable = true)
 |-- condition: integer (nullable = true)
 |-- grade: integer (nullable = true)
 |-- sqft_above: integer (nullable = true)
 |-- sqft_basement: integer (nullable = true)
 |-- yr_built: integer (nullable = true)
 |-- yr_renovated: integer (nullable = true)
 |-- zipcode: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- sqft_living15: integer (nullable = true)
 |-- sqft_lot15: integer (nullable = true)



In [4]:
import pyspark.sql.functions as F

# Sanity check: display every row whose `id` is NULL.
# An empty table means no row is missing its id.
data.where(F.col("id").isNull()).show()

+---+----+-----+--------+---------+-----------+--------+------+----------+----+---------+-----+----------+-------------+--------+------------+-------+---+----+-------------+----------+
| id|date|price|bedrooms|bathrooms|sqft_living|sqft_lot|floors|waterfront|view|condition|grade|sqft_above|sqft_basement|yr_built|yr_renovated|zipcode|lat|long|sqft_living15|sqft_lot15|
+---+----+-----+--------+---------+-----------+--------+------+----------+----+---------+-----+----------+-------------+--------+------------+-------+---+----+-------------+----------+
+---+----+-----+--------+---------+-----------+--------+------+----------+----+---------+-----+----------+-------------+--------+------------+-------+---+----+-------------+----------+



In [6]:
# Per-column count of non-null values; count() over a column skips NULLs,
# so any column whose figure is below the row count has missing data.
(
    data
    .select([F.count(F.col(name)).alias(name) for name in data.columns])
    .show()
)

+-----+-----+-----+--------+---------+-----------+--------+------+----------+-----+---------+-----+----------+-------------+--------+------------+-------+-----+-----+-------------+----------+
|   id| date|price|bedrooms|bathrooms|sqft_living|sqft_lot|floors|waterfront| view|condition|grade|sqft_above|sqft_basement|yr_built|yr_renovated|zipcode|  lat| long|sqft_living15|sqft_lot15|
+-----+-----+-----+--------+---------+-----------+--------+------+----------+-----+---------+-----+----------+-------------+--------+------------+-------+-----+-----+-------------+----------+
|21613|21613|21613|   21613|    21613|      21613|   21613| 21613|     21613|21613|    21613|21613|     21613|        21613|   21613|       21613|  21613|21613|21613|        21613|     21613|
+-----+-----+-----+--------+---------+-----------+--------+------+----------+-----+---------+-----+----------+-------------+--------+------------+-------+-----+-----+-------------+----------+



In [54]:
# Random 80/20 train/test split with a fixed seed.
train_data, test_data = data.randomSplit(weights=[0.8, 0.2], seed=42)

# Report how many rows landed in each split.
for label, split in (("Train size: ", train_data), ("Test size: ", test_data)):
    print(label, split.count())

Train size:  17349
Test size:  4264


In [55]:
# Peek at the first two rows with every column.
data.show(2)

+----------+---------------+--------+--------+---------+-----------+--------+------+----------+----+---------+-----+----------+-------------+--------+------------+-------+-------+--------+-------------+----------+
|        id|           date|   price|bedrooms|bathrooms|sqft_living|sqft_lot|floors|waterfront|view|condition|grade|sqft_above|sqft_basement|yr_built|yr_renovated|zipcode|    lat|    long|sqft_living15|sqft_lot15|
+----------+---------------+--------+--------+---------+-----------+--------+------+----------+----+---------+-----+----------+-------------+--------+------------+-------+-------+--------+-------------+----------+
|7129300520|20141013T000000|221900.0|       3|      1.0|       1180|    5650|   1.0|         0|   0|        3|    7|      1180|            0|    1955|           0|  98178|47.5112|-122.257|         1340|      5650|
|6414100192|20141209T000000|538000.0|       3|     2.25|       2570|    7242|   2.0|         0|   0|        3|    7|      2170|          400|   

In [56]:
from pyspark.ml.feature import VectorAssembler

# Numeric predictors packed into the feature vector.
# id, date, zipcode, lat and long are not included.
feature_cols = [
    'bedrooms', 'bathrooms', 'sqft_living',
    'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement',
    'yr_built', 'yr_renovated',
    'sqft_living15', 'sqft_lot15',
]
assembler = VectorAssembler(inputCols=feature_cols, outputCol='unscaled_features')

# This cell rebinds `train_data`, so on a second run the frame already holds
# the assembler's output column and transform() would raise
# "Output column ... already exists". Dropping it first keeps the cell
# re-runnable; drop() is a no-op when the column is absent.
train_data = assembler.transform(train_data.drop(assembler.getOutputCol()))
train_data.select("unscaled_features").show(truncate=False)

                

+-----------------------------------------------------------------------------------+
|unscaled_features                                                                  |
+-----------------------------------------------------------------------------------+
|[6.0,3.0,2400.0,9373.0,2.0,0.0,0.0,3.0,7.0,2400.0,0.0,1991.0,0.0,2060.0,7316.0]    |
|[6.0,3.0,2400.0,9373.0,2.0,0.0,0.0,3.0,7.0,2400.0,0.0,1991.0,0.0,2060.0,7316.0]    |
|[3.0,1.0,1460.0,43000.0,1.0,0.0,0.0,3.0,7.0,1460.0,0.0,1952.0,0.0,2250.0,20023.0]  |
|[3.0,1.0,1430.0,7599.0,1.5,0.0,0.0,4.0,6.0,1010.0,420.0,1930.0,0.0,1290.0,10320.0] |
|[4.0,2.0,1650.0,3504.0,1.0,0.0,0.0,3.0,7.0,760.0,890.0,1951.0,2013.0,1480.0,3504.0]|
|[5.0,1.5,1990.0,18200.0,1.0,0.0,0.0,3.0,7.0,1990.0,0.0,1960.0,0.0,1860.0,8658.0]   |
|[3.0,1.0,1340.0,21336.0,1.5,0.0,0.0,4.0,5.0,1340.0,0.0,1945.0,0.0,1340.0,37703.0]  |
|[4.0,2.0,1980.0,10585.0,1.5,0.0,0.0,2.0,6.0,1980.0,0.0,1924.0,0.0,1360.0,7810.0]   |
|[2.0,1.0,840.0,12750.0,1.0,0.0,0.0,3.0,6.0,840.0,0.0,

In [57]:
from pyspark.ml.feature import StandardScaler

# Standardise each feature (centre with the mean, scale by the standard
# deviation); the statistics are fitted on the training split only.
scaler = StandardScaler(
    inputCol='unscaled_features',
    outputCol='features',
    withMean=True,
    withStd=True,
)
scaler_model = scaler.fit(train_data)

# Written to a new name so `train_data` itself is left untouched.
transformed_train_data = scaler_model.transform(train_data)
transformed_train_data.select("features").show()

+--------------------+
|            features|
+--------------------+
|[2.81501044818592...|
|[2.81501044818592...|
|[-0.3989773725645...|
|[-0.3989773725645...|
|[0.67235190101892...|
|[1.74368117460242...|
|[-0.3989773725645...|
|[0.67235190101892...|
|[-1.4703066461480...|
|[-1.4703066461480...|
|[-0.3989773725645...|
|[-0.3989773725645...|
|[1.74368117460242...|
|[-1.4703066461480...|
|[-1.4703066461480...|
|[-0.3989773725645...|
|[-0.3989773725645...|
|[-0.3989773725645...|
|[-0.3989773725645...|
|[1.74368117460242...|
+--------------------+
only showing top 20 rows


In [58]:
from pyspark.ml.regression import LinearRegression

# Linear regression of `price` on the scaled feature vector.
# No regParam is passed; Spark warns about that when fitting.
lr = LinearRegression(
    featuresCol='features',
    labelCol='price',
)

# Fit on the scaled training split.
model = lr.fit(transformed_train_data)

25/08/31 18:11:03 WARN Instrumentation: [0505a8c3] regParam is zero, which might cause numerical instability and overfitting.


In [59]:
# Apply the assembler and the scaler fitted on the training split to the
# test split.
# This cell rebinds `test_data`, so on a second run the frame already holds
# both output columns and transform() would raise "Output column ... already
# exists". Dropping them first keeps the cell re-runnable; drop() ignores
# columns that are absent.
test_data = assembler.transform(
    test_data.drop(assembler.getOutputCol(), scaler_model.getOutputCol())
)
test_data = scaler_model.transform(test_data)

In [60]:
# Score the prepared test split with the fitted model; the result is bound to
# a new name, so `test_data` is not modified and the cell can be re-run.
test_predictions = model.transform(test_data)
# test_predictions.show()

In [61]:
from pyspark.ml.evaluation import RegressionEvaluator

# Mean absolute error between the `price` label and the model's `prediction`
# column, computed on the test-split predictions.
evaluator_mae = RegressionEvaluator(
    metricName='mae',
    labelCol='price',
    predictionCol='prediction',
)
mae = evaluator_mae.evaluate(test_predictions)
print(f"Mean Absolute Error (MAE): {mae}")

Mean Absolute Error (MAE): 139509.86010701608


In [62]:
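# Test-split Mean Absolute Error (MAE) for the feature sets tried:
#   With all (current feature_cols): 139509.86010701608
#   Without NN:                      140029.72305802908
#   Without Enum Cols:               161911.9106498095
# Note: the score is the same with and without StandardScaler. This is expected:
# with regParam=0 the model is ordinary least squares, whose predictions do not
# change when the features are rescaled.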