In [0]:
%pyspark
airbnbDF = (spark.read.format('csv')
                    .option('header', 'true')
                    .option('inferSchema', 'true')
                    .option('sep', ',')
                    .load('hdfs:///education/ece/big-data/2020/fall/bda/resources/lab5/airbnb_clean.csv'))

airbnbDF.printSchema()

root
 |-- host_is_superhost: string (nullable = true)
 |-- cancellation_policy: string (nullable = true)
 |-- instant_bookable: string (nullable = true)
 |-- host_total_listings_count: integer (nullable = true)
 |-- neighbourhood_cleansed: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- property_type: string (nullable = true)
 |-- room_type: string (nullable = true)
 |-- accommodates: integer (nullable = true)
 |-- bathrooms: double (nullable = true)
 |-- bedrooms: integer (nullable = true)
 |-- beds: integer (nullable = true)
 |-- bed_type: string (nullable = true)
 |-- minimum_nights: integer (nullable = true)
 |-- number_of_reviews: integer (nullable = true)
 |-- review_scores_rating: integer (nullable = true)
 |-- review_scores_accuracy: integer (nullable = true)
 |-- review_scores_cleanliness: integer (nullable = true)
 |-- review_scores_checkin: integer (nullable = true)
 |-- review_scores_communication: integer (null

## Demo: Linear Regression

In [2]:
%pyspark
# Split dataset to train and test

trainDF, testDF = airbnbDF.randomSplit([.8, .2], seed=123)
print 'No. of rows in train DF:', trainDF.count()
print 'No. of rows in test DF:', testDF.count()

No. of rows in train DF: 802
No. of rows in test DF: 198


In [3]:
%pyspark
# We will try to predict the price in function of how many people can stay at the property

trainDF.select("price", "accommodates").show()

+-----+------------+
|price|accommodates|
+-----+------------+
|  130|           3|
|  250|           4|
|  405|           6|
|  185|           2|
|  135|           2|
|  150|           3|
|  155|           4|
|  160|           1|
| 2281|           2|
|  297|           2|
|   81|           4|
|   76|           5|
|  155|           2|
|  150|           2|
|   75|           2|
|  162|           2|
|  200|           3|
|  115|           2|
|  125|           2|
|  159|           3|
+-----+------------+
only showing top 20 rows



In [4]:
%pyspark
# We can first summarize the DF to see if it's really clean

trainDF.select("price", "accommodates").summary().show()

+-------+------------------+------------------+
|summary|             price|      accommodates|
+-------+------------------+------------------+
|  count|               802|               802|
|   mean|213.15960099750623|3.2768079800498753|
| stddev|203.85681852899708| 1.990786715309765|
|    min|                29|                 1|
|    25%|               100|                 2|
|    50%|               150|                 2|
|    75%|               249|                 4|
|    max|              2281|                15|
+-------+------------------+------------------+



In [5]:
%pyspark
# Define linear regression estimator and fit the model
# Why doesn't it work?

from pyspark.ml.regression import LinearRegression

lr = LinearRegression(featuresCol="accommodates", labelCol="price")
#lrModel = lr.fit(trainDF)



In [6]:
%pyspark
# Vectorize the column 'accommodates'
# Pay attention to .transform()

from pyspark.ml.feature import VectorAssembler

vecAssembler = VectorAssembler(inputCols=["accommodates"], outputCol="features")
vecTrainDF = vecAssembler.transform(trainDF)
vecTrainDF.select(["accommodates", "features"]).show()

+------------+--------+
|accommodates|features|
+------------+--------+
|           3|   [3.0]|
|           4|   [4.0]|
|           6|   [6.0]|
|           2|   [2.0]|
|           2|   [2.0]|
|           3|   [3.0]|
|           4|   [4.0]|
|           1|   [1.0]|
|           2|   [2.0]|
|           2|   [2.0]|
|           4|   [4.0]|
|           5|   [5.0]|
|           2|   [2.0]|
|           2|   [2.0]|
|           2|   [2.0]|
|           2|   [2.0]|
|           3|   [3.0]|
|           2|   [2.0]|
|           2|   [2.0]|
|           3|   [3.0]|
+------------+--------+
only showing top 20 rows



In [7]:
%pyspark
# Let's try with two input columns

vecAssembler_2 = VectorAssembler(inputCols=["accommodates", "bedrooms"], outputCol="features")
vecTrainDF_2 = vecAssembler_2.transform(trainDF)
vecTrainDF_2.select(["accommodates", "bedrooms", "features"]).show()

+------------+--------+---------+
|accommodates|bedrooms| features|
+------------+--------+---------+
|           3|       1|[3.0,1.0]|
|           4|       2|[4.0,2.0]|
|           6|       3|[6.0,3.0]|
|           2|       1|[2.0,1.0]|
|           2|       1|[2.0,1.0]|
|           3|       1|[3.0,1.0]|
|           4|       1|[4.0,1.0]|
|           1|       1|[1.0,1.0]|
|           2|       1|[2.0,1.0]|
|           2|       1|[2.0,1.0]|
|           4|       1|[4.0,1.0]|
|           5|       2|[5.0,2.0]|
|           2|       1|[2.0,1.0]|
|           2|       1|[2.0,1.0]|
|           2|       1|[2.0,1.0]|
|           2|       1|[2.0,1.0]|
|           3|       1|[3.0,1.0]|
|           2|       1|[2.0,1.0]|
|           2|       1|[2.0,1.0]|
|           3|       1|[3.0,1.0]|
+------------+--------+---------+
only showing top 20 rows



In [8]:
%pyspark
# Which parameters can the estimator take?

from pyspark.ml.regression import LinearRegression

lr = LinearRegression(featuresCol="features", labelCol="price")
lr.explainParams().split('\n')


['aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)', 'elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)', 'epsilon: The shape parameter to control the amount of robustness. Must be > 1.0. Only valid when loss is huber (default: 1.35)', 'featuresCol: features column name. (default: features, current: features)', 'fitIntercept: whether to fit an intercept term. (default: True)', 'labelCol: label column name. (default: label, current: price)', 'loss: The loss function to be optimized. Supported options: squaredError, huber. (default: squaredError)', 'maxIter: max number of iterations (>= 0). (default: 100)', 'predictionCol: prediction column name. (default: prediction)', 'regParam: regularization parameter (>= 0). (default: 0.0)', 'solver: The solver algorithm for optimization. Supported options: auto, normal, l-bfgs. (default: auto)', 'standardization: 

In [9]:
%pyspark
# Fit the estimator to the data

lrModel = lr.fit(vecTrainDF)
lrModel

LinearRegression_4d59b3f133110b6f7a88


In [10]:
%pyspark
# Print the coefficients of the model

k = lrModel.coefficients[0]
n = lrModel.intercept

print 'y =', k, '* x +', n

y = 62.627665173598565 * x + 7.94076798477


In [11]:
%pyspark
# Perform the cross validation (and optionally grid search)

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline

# Define the params for cv (and grid search: commented line)
params = (ParamGridBuilder()
          #.addGrid(lr.elasticNetParam, [0.0, 1.0])
          .build())
evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="r2")

cv = CrossValidator(estimator=lr, evaluator=evaluator, estimatorParamMaps=params, numFolds=10, seed=11)
cvModel = cv.fit(vecTrainDF)

print 'Avg. metrics:', cvModel.avgMetrics
print 'Best model slope:', cvModel.bestModel.coefficients[0]
print 'Best model intercept:', cvModel.bestModel.intercept

Avg. mertics: [0.3780198471330412]
Best model slope: 62.627665173598565
Best model intercept: 7.94076798477


In [12]:
%pyspark
# Apply the model to the test set. But, first we need to process the test data the same way we processed the train data.

vecTestDF = vecAssembler.transform(testDF)
predDF = cvModel.transform(vecTestDF)

predDF.select("accommodates", "features", "price", "prediction").show()

+------------+--------+-----+------------------+
|accommodates|features|price|        prediction|
+------------+--------+-----+------------------+
|           4|   [4.0]|  350|  258.451428679161|
|           2|   [2.0]|  250|133.19609833196387|
|           6|   [6.0]|  275|383.70675902635816|
|           2|   [2.0]|  115|133.19609833196387|
|           8|   [8.0]|  145| 508.9620893735553|
|           2|   [2.0]|  135|133.19609833196387|
|           1|   [1.0]|   67| 70.56843315836531|
|           2|   [2.0]|  141|133.19609833196387|
|           2|   [2.0]|  157|133.19609833196387|
|           2|   [2.0]|   50|133.19609833196387|
|           2|   [2.0]|   45|133.19609833196387|
|           1|   [1.0]|   45| 70.56843315836531|
|           2|   [2.0]|  165|133.19609833196387|
|           4|   [4.0]|  195|  258.451428679161|
|           3|   [3.0]|  143|195.82376350556245|
|           3|   [3.0]|  150|195.82376350556245|
|           6|   [6.0]|  425|383.70675902635816|
|           3|   [3.

In [13]:
%pyspark
# ALTERNATIVE
# Create a pipeline from all of the stages before (data preparation + model definition. Don't put cv in the pipeline.)
# Apply the pipeline to test frame

from pyspark.ml import Pipeline

vecAssembler = VectorAssembler(inputCols=["accommodates"], outputCol="features")
lr = LinearRegression(featuresCol="features", labelCol="price")
pipeline = Pipeline(stages=[vecAssembler, lr])

pipelineModel = pipeline.fit(trainDF)

predDF = pipelineModel.transform(testDF)
predDF.select("accommodates", "features", "price", "prediction").show()

+------------+--------+-----+------------------+
|accommodates|features|price|        prediction|
+------------+--------+-----+------------------+
|           4|   [4.0]|  350|  258.451428679161|
|           2|   [2.0]|  250|133.19609833196387|
|           6|   [6.0]|  275|383.70675902635816|
|           2|   [2.0]|  115|133.19609833196387|
|           8|   [8.0]|  145| 508.9620893735553|
|           2|   [2.0]|  135|133.19609833196387|
|           1|   [1.0]|   67| 70.56843315836531|
|           2|   [2.0]|  141|133.19609833196387|
|           2|   [2.0]|  157|133.19609833196387|
|           2|   [2.0]|   50|133.19609833196387|
|           2|   [2.0]|   45|133.19609833196387|
|           1|   [1.0]|   45| 70.56843315836531|
|           2|   [2.0]|  165|133.19609833196387|
|           4|   [4.0]|  195|  258.451428679161|
|           3|   [3.0]|  143|195.82376350556245|
|           3|   [3.0]|  150|195.82376350556245|
|           6|   [6.0]|  425|383.70675902635816|
|           3|   [3.

In [14]:
%pyspark
# How good is the model?
# RegressionEvaluator will compute the differences between the predicted price and the real one and calculate the metric of overall model quality (R2)

from pyspark.ml.evaluation import RegressionEvaluator

regressionEvaluator = RegressionEvaluator(predictionCol="prediction", labelCol="price", metricName="r2")

r2 = regressionEvaluator.evaluate(predDF)
print 'R2 of the linear model is:', r2

R2 of the linear model is: 0.427506233551


## Lab: Random Forest Regression
Build and evaluate a Random Forest model the same way we did with Linear Regression.


In [16]:
%pyspark
# Spit data to train and test set. Use the same split and seed as for linear regression, so we can compare the models at the end.

trainDF, testDF = airbnbDF.randomSplit([.8, .2], seed=123)
print 'No. of rows in train DF:', trainDF.count()
print 'No. of rows in test DF:', testDF.count()




In [17]:
%pyspark
# Select only three atributes, since we didn't have time to look at the one-hot encoding - accommodates, bathrooms, review_scores_rating. 
# Predict the price as before
# Name the dataframe trainDF 

cols =  ['accommodates', 'bathrooms', 'review_scores_rating', 'price']
trainDF = trainDF.select(cols)
trainDF.show(10)




In [18]:
%pyspark
# Vectorize the attribute (feature) columns with VectorAssembler

from pyspark.ml.feature import VectorAssembler

input_cols =  ['accommodates', 'bathrooms', 'review_scores_rating']
vecAssembler = VectorAssembler(inputCols=input_cols, outputCol="features")

vecTrainDF = vecAssembler.transform(trainDF)
vecTrainDF.select(["accommodates", "features"]).show(10)


+------------+---------------+
|accommodates|       features|
+------------+---------------+
|           3| [3.0,1.0,88.0]|
|           4| [4.0,2.0,74.0]|
|           6|[6.0,2.5,100.0]|
|           2| [2.0,1.0,96.0]|
|           2| [2.0,1.0,95.0]|
|           3| [3.0,1.0,92.0]|
|           4| [4.0,1.0,99.0]|
|           1| [1.0,1.0,80.0]|
|           2| [2.0,1.0,99.0]|
|           2| [2.0,1.0,97.0]|
|           4| [4.0,1.0,98.0]|
|           5| [5.0,1.0,93.0]|
|           2| [2.0,1.0,99.0]|
|           2| [2.0,1.0,93.0]|
|           2| [2.0,2.0,91.0]|
|           2| [2.0,1.0,98.0]|
|           3| [3.0,1.0,76.0]|
|           2| [2.0,1.0,98.0]|
|           2| [2.0,1.0,89.0]|
|           3| [3.0,1.0,94.0]|
+------------+---------------+
only showing top 20 rows



In [19]:
%pyspark
# Build a RandomForestRegressor. The parameters that you need to define are labelCol (as before) and maxBins=40
# You can read about maxBins using rf.explainParams().split('\n')

from pyspark.ml.regression import RandomForestRegressor

rf = RandomForestRegressor(labelCol="price", maxBins=40)
rf.explainParams().split('\n')


['cacheNodeIds: If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval. (default: False)', 'checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext. (default: 10)', 'featureSubsetStrategy: The number of features to consider for splits at each tree node. Supported options: auto, all, onethird, sqrt, log2, (0.0-1.0], [1-n]. (default: auto)', 'featuresCol: features column name. (default: features)', 'impurity: Criterion used for information gain calculation (case-insensitive). Supported options: variance (default: variance)', 'labelCol: label column name. (default: lab

In [20]:
%pyspark
# Fit the model to the dataframe with the vectorized features

rfModel = rf.fit(vecTrainDF)
rfModel


RandomForestRegressionModel (uid=RandomForestRegressor_4397bded437b6bf46466) with 20 trees


In [21]:
%pyspark
# Cross Validation 

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

params = (ParamGridBuilder()
          .addGrid(rf.numTrees, [1, 2])
          .build())
evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="r2")

cv = CrossValidator(estimator=rf, evaluator=evaluator, estimatorParamMaps=params, numFolds=10, seed=11)
cvModel = cv.fit(vecTrainDF)
cvModel.avgMetrics
cvModel.bestModel

RandomForestRegressionModel (uid=RandomForestRegressor_4397bded437b6bf46466) with 2 trees


In [22]:
%pyspark
# Prepare test frame

testDF = testDF.select(cols)
vecTestDF = vecAssembler.transform(testDF)
vecTestDF.show(10)

+------------+---------+--------------------+-----+---------------+
|accommodates|bathrooms|review_scores_rating|price|       features|
+------------+---------+--------------------+-----+---------------+
|           4|      2.0|                  98|  350| [4.0,2.0,98.0]|
|           2|      2.0|                 100|  250|[2.0,2.0,100.0]|
|           6|      1.0|                  99|  275| [6.0,1.0,99.0]|
|           2|      1.0|                  94|  115| [2.0,1.0,94.0]|
|           8|      1.0|                  94|  145| [8.0,1.0,94.0]|
|           2|      1.0|                  98|  135| [2.0,1.0,98.0]|
|           1|      1.5|                  98|   67| [1.0,1.5,98.0]|
|           2|      1.0|                 100|  141|[2.0,1.0,100.0]|
|           2|      1.0|                  87|  157| [2.0,1.0,87.0]|
|           2|      3.0|                  82|   50| [2.0,3.0,82.0]|
+------------+---------+--------------------+-----+---------------+
only showing top 10 rows



In [23]:
%pyspark
# Apply the cvModel to the test data (first you need to select the correct columns and vectorize them)

predDF = cvModel.transform(vecTestDF)
predDF.show()

+------------+---------+--------------------+-----+---------------+------------------+
|accommodates|bathrooms|review_scores_rating|price|       features|        prediction|
+------------+---------+--------------------+-----+---------------+------------------+
|           4|      2.0|                  98|  350| [4.0,2.0,98.0]|275.92905405405406|
|           2|      2.0|                 100|  250|[2.0,2.0,100.0]|193.79044117647058|
|           6|      1.0|                  99|  275| [6.0,1.0,99.0]| 336.6859077716865|
|           2|      1.0|                  94|  115| [2.0,1.0,94.0]| 141.5931572386375|
|           8|      1.0|                  94|  145| [8.0,1.0,94.0]| 195.0746440354228|
|           2|      1.0|                  98|  135| [2.0,1.0,98.0]|156.73839694917575|
|           1|      1.5|                  98|   67| [1.0,1.5,98.0]| 130.6401515151515|
|           2|      1.0|                 100|  141|[2.0,1.0,100.0]|162.26366279069765|
|           2|      1.0|                  8

In [24]:
%pyspark
# ALTERNATIVE
# Create a pipeline from all of the stages before (data preparation + model definition. Don't put cv in the pipeline.)
# Apply the pipeline to test frame

from pyspark.ml import Pipeline

input_cols =  ['accommodates', 'bathrooms', 'review_scores_rating']
vecAssembler = VectorAssembler(inputCols=input_cols, outputCol="features")
rf = RandomForestRegressor(labelCol="price")
pipeline = Pipeline(stages=[vecAssembler, rf])

pipelineModel = pipeline.fit(trainDF)

predDF = pipelineModel.transform(testDF)
predDF.select("accommodates", "features", "price", "prediction").show(10)

+------------+---------------+-----+------------------+
|accommodates|       features|price|        prediction|
+------------+---------------+-----+------------------+
|           4| [4.0,2.0,98.0]|  350|303.69934110457683|
|           2|[2.0,2.0,100.0]|  250|174.01891090100565|
|           6| [6.0,1.0,99.0]|  275| 322.4187212630626|
|           2| [2.0,1.0,94.0]|  115| 126.6057939396339|
|           8| [8.0,1.0,94.0]|  145|238.99663866513734|
|           2| [2.0,1.0,98.0]|  135|156.43191315337307|
|           1| [1.0,1.5,98.0]|   67|167.21924844906218|
|           2|[2.0,1.0,100.0]|  141|173.05749456837674|
|           2| [2.0,1.0,87.0]|  157|124.09803193740763|
|           2| [2.0,3.0,82.0]|   50| 65.20045977621189|
+------------+---------------+-----+------------------+
only showing top 10 rows



In [25]:
%pyspark
# Evaluate the quality of the model using R2 and RMSE (which model are we evaluating?)

from pyspark.ml.evaluation import RegressionEvaluator

regressionEvaluator_r2 = RegressionEvaluator(predictionCol="prediction", labelCol="price", metricName="r2")
regressionEvaluator_rmse = RegressionEvaluator(predictionCol="prediction", labelCol="price", metricName="rmse")

r2 = regressionEvaluator_r2.evaluate(predDF)
rmse = regressionEvaluator_rmse.evaluate(predDF)

print 'R2 of the random forest regression is:', r2
print 'RMSE of the random forest regression is:', rmse

R2 of the random forest regression is: 0.532970919226
RMSE of the random forest regression is: 103.130671995


In [26]:
%pyspark
predDF_1 = cvModel.transform(vecTestDF)

r2 = regressionEvaluator_r2.evaluate(predDF_1)
rmse = regressionEvaluator_rmse.evaluate(predDF_1)

print 'R2 of the random forest regression is:', r2
print 'RMSE of the random forest regression is:', rmse

R2 of the random forest regression is: 0.394755488148
RMSE of the random forest regression is: 117.403573408


In [27]:
%pyspark
# Which feature is the most important for our model?

print 'Feature importance for rfModel:', rfModel.featureImportances
print 'Feature importance for cvModel:', cvModel.bestModel.featureImportances

Feature importance for rfModel: (3,[0,1,2],[0.5759106280987623,0.28976990197264957,0.13431946992858818])
Feature importance for cvModel: (3,[0,1,2],[0.5960604831032423,0.25322249301072597,0.15071702388603164])


In [28]:
%pyspark
import pandas as pd

featuresDF = pd.DataFrame(list(zip(vecAssembler.getInputCols(), rfModel.featureImportances)), columns=["feature", "importance"])
featuresDF

                feature  importance
0          accommodates    0.575911
1             bathrooms    0.289770
2  review_scores_rating    0.134319


In [29]:
%pyspark


