# Bike Rental DataSet from UCI Machine Learning Repository
## Citations
Fanaee-T, Hadi, and Gama, Joao, 'Event labeling combining ensemble detectors and background knowledge', Progress in Artificial Intelligence (2013): pp. 1-15, Springer Berlin Heidelber
## Attributes on original data


- season : season (1:spring, 2:summer, 3:fall, 4:winter)
- yr : year (0: 2011, 1:2012)
- mnth : month ( 1 to 12)
- hr : hour (0 to 23)
- holiday : weather day is holiday or not (extracted from [Web Link])
- weekday : day of the week
- workingday : if day is neither weekend nor holiday is 1, otherwise is 0.
- weathersit : 
 - 1: Clear, Few clouds, Partly cloudy, Partly cloudy
 - 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
 - 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
 - 4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog
- temp : Normalized temperature in Celsius. The values are derived via (t-t_min)/(t_max-t_min), t_min=-8, t_max=+39 (only in hourly scale)
- hum: Normalized humidity. The values are divided to 100 (max)
- windspeed: Normalized wind speed. The values are divided to 67 (max)


## URL:
https://archive.ics.uci.edu/ml/datasets/Bike+Sharing+Dataset

In [0]:
rowData = spark.read.csv("/FileStore/tables/Bike_Rental_UCI_dataset-1.csv", inferSchema=True, header = True)

In [0]:
rowData.show(n=15)

+------+---+----+---+-------+----------+----------+----+----+---------+---------+----+------+
|season| yr|mnth| hr|holiday|workingday|weathersit|temp| hum|windspeed|dayOfWeek|days|demand|
+------+---+----+---+-------+----------+----------+----+----+---------+---------+----+------+
|     1|  0|   1|  0|      0|         0|         1|0.24|0.81|      0.0|      Sat|   0|    16|
|     1|  0|   1|  1|      0|         0|         1|0.22| 0.8|      0.0|      Sat|   0|    40|
|     1|  0|   1|  2|      0|         0|         1|0.22| 0.8|      0.0|      Sat|   0|    32|
|     1|  0|   1|  3|      0|         0|         1|0.24|0.75|      0.0|      Sat|   0|    13|
|     1|  0|   1|  4|      0|         0|         1|0.24|0.75|      0.0|      Sat|   0|     1|
|     1|  0|   1|  5|      0|         0|         2|0.24|0.75|   0.0896|      Sat|   0|     1|
|     1|  0|   1|  6|      0|         0|         1|0.22| 0.8|      0.0|      Sat|   0|     2|
|     1|  0|   1|  7|      0|         0|         1| 0.2|0.86

In [0]:
rowData.where(rowData.dayOfWeek=="Mon").show()

+------+---+----+---+-------+----------+----------+----+----+-------------------+---------+----+------+
|season| yr|mnth| hr|holiday|workingday|weathersit|temp| hum|          windspeed|dayOfWeek|days|demand|
+------+---+----+---+-------+----------+----------+----+----+-------------------+---------+----+------+
|     1|  0|   1|  0|      0|         1|         1|0.22|0.44|             0.3582|      Mon|   1|     5|
|     1|  0|   1|  1|      0|         1|         1| 0.2|0.44|             0.4179|      Mon|   2|     2|
|     1|  0|   1|  4|      0|         1|         1|0.16|0.47|             0.3881|      Mon|   2|     1|
|     1|  0|   1|  5|      0|         1|         1|0.16|0.47|             0.2836|      Mon|   2|     3|
|     1|  0|   1|  6|      0|         1|         1|0.14| 0.5|             0.3881|      Mon|   2|    30|
|     1|  0|   1|  7|      0|         1|         1|0.14| 0.5|0.19399999999999998|      Mon|   2|    64|
|     1|  0|   1|  8|      0|         1|         1|0.14| 0.5|   

In [0]:
rowData.printSchema()

root
 |-- season: integer (nullable = true)
 |-- yr: integer (nullable = true)
 |-- mnth: integer (nullable = true)
 |-- hr: integer (nullable = true)
 |-- holiday: integer (nullable = true)
 |-- workingday: integer (nullable = true)
 |-- weathersit: integer (nullable = true)
 |-- temp: double (nullable = true)
 |-- hum: double (nullable = true)
 |-- windspeed: double (nullable = true)
 |-- dayOfWeek: string (nullable = true)
 |-- days: integer (nullable = true)
 |-- demand: integer (nullable = true)



In [0]:
rowData.groupBy('dayOfWeek').count().show()

+---------+-----+
|dayOfWeek|count|
+---------+-----+
|      Sun| 2502|
|      Mon| 2479|
|      Sat| 2512|
|      Wed| 2475|
|      Tue| 2453|
|      Fri| 2487|
|      Thr| 2471|
+---------+-----+



In [0]:
rowData.groupBy('mnth').count().show()

+----+-----+
|mnth|count|
+----+-----+
|  12| 1483|
|   1| 1429|
|   6| 1440|
|   3| 1473|
|   5| 1488|
|   9| 1437|
|   4| 1437|
|   8| 1475|
|   7| 1488|
|  10| 1451|
|  11| 1437|
|   2| 1341|
+----+-----+



In [0]:
rowData.select("days").distinct().count()

Out[7]: 725

In [0]:
rowData.groupBy('yr').count().show()

+---+-----+
| yr|count|
+---+-----+
|  1| 8734|
|  0| 8645|
+---+-----+



In [0]:
rowData.groupBy('season').count().show()

+------+-----+
|season|count|
+------+-----+
|     1| 4242|
|     3| 4496|
|     4| 4232|
|     2| 4409|
+------+-----+



In [0]:
from pyspark.ml.feature import StringIndexer

In [0]:
indexer = StringIndexer(inputCol='dayOfWeek', outputCol='day_cat')

In [0]:
indexed_data =indexer.fit(rowData).transform(rowData)

In [0]:
indexed_data.show()

+------+---+----+---+-------+----------+----------+----+----+---------+---------+----+------+-------+
|season| yr|mnth| hr|holiday|workingday|weathersit|temp| hum|windspeed|dayOfWeek|days|demand|day_cat|
+------+---+----+---+-------+----------+----------+----+----+---------+---------+----+------+-------+
|     1|  0|   1|  0|      0|         0|         1|0.24|0.81|      0.0|      Sat|   0|    16|    0.0|
|     1|  0|   1|  1|      0|         0|         1|0.22| 0.8|      0.0|      Sat|   0|    40|    0.0|
|     1|  0|   1|  2|      0|         0|         1|0.22| 0.8|      0.0|      Sat|   0|    32|    0.0|
|     1|  0|   1|  3|      0|         0|         1|0.24|0.75|      0.0|      Sat|   0|    13|    0.0|
|     1|  0|   1|  4|      0|         0|         1|0.24|0.75|      0.0|      Sat|   0|     1|    0.0|
|     1|  0|   1|  5|      0|         0|         2|0.24|0.75|   0.0896|      Sat|   0|     1|    0.0|
|     1|  0|   1|  6|      0|         0|         1|0.22| 0.8|      0.0|      Sat| 

In [0]:
indexed_data.select('day_cat').distinct().orderBy('day_cat').show()

+-------+
|day_cat|
+-------+
|    0.0|
|    1.0|
|    2.0|
|    3.0|
|    4.0|
|    5.0|
|    6.0|
+-------+



In [0]:
indexed_data.groupBy('day_cat', 'dayOfWeek').count().orderBy('day_cat').show()

+-------+---------+-----+
|day_cat|dayOfWeek|count|
+-------+---------+-----+
|    0.0|      Sat| 2512|
|    1.0|      Sun| 2502|
|    2.0|      Fri| 2487|
|    3.0|      Mon| 2479|
|    4.0|      Wed| 2475|
|    5.0|      Thr| 2471|
|    6.0|      Tue| 2453|
+-------+---------+-----+



In [0]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

In [0]:
indexed_data.columns

Out[17]: ['season',
 'yr',
 'mnth',
 'hr',
 'holiday',
 'workingday',
 'weathersit',
 'temp',
 'hum',
 'windspeed',
 'dayOfWeek',
 'days',
 'demand',
 'day_cat']

In [0]:
vec = VectorAssembler(
  inputCols= [
    'season',
    'yr',
    'mnth',
    'hr',
    'holiday',
    'workingday',
    'weathersit',
    'temp',
    'hum',
    'windspeed',
    'day_cat'
    ],
   outputCol = 'features'                  
 )

In [0]:
data = vec.transform(indexed_data)

In [0]:
data.display()

season,yr,mnth,hr,holiday,workingday,weathersit,temp,hum,windspeed,dayOfWeek,days,demand,day_cat,features
1,0,1,0,0,0,1,0.24,0.81,0.0,Sat,0,16,0.0,"Map(vectorType -> sparse, length -> 11, indices -> List(0, 2, 6, 7, 8), values -> List(1.0, 1.0, 1.0, 0.24, 0.81))"
1,0,1,1,0,0,1,0.22,0.8,0.0,Sat,0,40,0.0,"Map(vectorType -> sparse, length -> 11, indices -> List(0, 2, 3, 6, 7, 8), values -> List(1.0, 1.0, 1.0, 1.0, 0.22, 0.8))"
1,0,1,2,0,0,1,0.22,0.8,0.0,Sat,0,32,0.0,"Map(vectorType -> sparse, length -> 11, indices -> List(0, 2, 3, 6, 7, 8), values -> List(1.0, 1.0, 2.0, 1.0, 0.22, 0.8))"
1,0,1,3,0,0,1,0.24,0.75,0.0,Sat,0,13,0.0,"Map(vectorType -> sparse, length -> 11, indices -> List(0, 2, 3, 6, 7, 8), values -> List(1.0, 1.0, 3.0, 1.0, 0.24, 0.75))"
1,0,1,4,0,0,1,0.24,0.75,0.0,Sat,0,1,0.0,"Map(vectorType -> sparse, length -> 11, indices -> List(0, 2, 3, 6, 7, 8), values -> List(1.0, 1.0, 4.0, 1.0, 0.24, 0.75))"
1,0,1,5,0,0,2,0.24,0.75,0.0896,Sat,0,1,0.0,"Map(vectorType -> dense, length -> 11, values -> List(1.0, 0.0, 1.0, 5.0, 0.0, 0.0, 2.0, 0.24, 0.75, 0.0896, 0.0))"
1,0,1,6,0,0,1,0.22,0.8,0.0,Sat,0,2,0.0,"Map(vectorType -> sparse, length -> 11, indices -> List(0, 2, 3, 6, 7, 8), values -> List(1.0, 1.0, 6.0, 1.0, 0.22, 0.8))"
1,0,1,7,0,0,1,0.2,0.86,0.0,Sat,0,3,0.0,"Map(vectorType -> sparse, length -> 11, indices -> List(0, 2, 3, 6, 7, 8), values -> List(1.0, 1.0, 7.0, 1.0, 0.2, 0.86))"
1,0,1,8,0,0,1,0.24,0.75,0.0,Sat,0,8,0.0,"Map(vectorType -> sparse, length -> 11, indices -> List(0, 2, 3, 6, 7, 8), values -> List(1.0, 1.0, 8.0, 1.0, 0.24, 0.75))"
1,0,1,9,0,0,1,0.32,0.76,0.0,Sat,0,14,0.0,"Map(vectorType -> sparse, length -> 11, indices -> List(0, 2, 3, 6, 7, 8), values -> List(1.0, 1.0, 9.0, 1.0, 0.32, 0.76))"


In [0]:
data.take(1)

Out[21]: [Row(season=1, yr=0, mnth=1, hr=0, holiday=0, workingday=0, weathersit=1, temp=0.24, hum=0.81, windspeed=0.0, dayOfWeek='Sat', days=0, demand=16, day_cat=0.0, features=SparseVector(11, {0: 1.0, 2: 1.0, 6: 1.0, 7: 0.24, 8: 0.81}))]

In [0]:
for item in data.take(1)[0]:
    print (item)

1
0
1
0
0
0
1
0.24
0.81
0.0
Sat
0
16
0.0
(11,[0,2,6,7,8],[1.0,1.0,1.0,0.24,0.81])


In [0]:
for item in data.take(3):
  print (item)
  print('\n')

Row(season=1, yr=0, mnth=1, hr=0, holiday=0, workingday=0, weathersit=1, temp=0.24, hum=0.81, windspeed=0.0, dayOfWeek='Sat', days=0, demand=16, day_cat=0.0, features=SparseVector(11, {0: 1.0, 2: 1.0, 6: 1.0, 7: 0.24, 8: 0.81}))


Row(season=1, yr=0, mnth=1, hr=1, holiday=0, workingday=0, weathersit=1, temp=0.22, hum=0.8, windspeed=0.0, dayOfWeek='Sat', days=0, demand=40, day_cat=0.0, features=SparseVector(11, {0: 1.0, 2: 1.0, 3: 1.0, 6: 1.0, 7: 0.22, 8: 0.8}))


Row(season=1, yr=0, mnth=1, hr=2, holiday=0, workingday=0, weathersit=1, temp=0.22, hum=0.8, windspeed=0.0, dayOfWeek='Sat', days=0, demand=32, day_cat=0.0, features=SparseVector(11, {0: 1.0, 2: 1.0, 3: 2.0, 6: 1.0, 7: 0.22, 8: 0.8}))




In [0]:
display(data.select("hr", "demand"))

hr,demand
0,16
1,40
2,32
3,13
4,1
5,1
6,2
7,3
8,8
9,14


In [0]:
modelData = data.select('features', 'demand') 

In [0]:
modelData.show(truncate =False)

+---------------------------------------------------+------+
|features                                           |demand|
+---------------------------------------------------+------+
|(11,[0,2,6,7,8],[1.0,1.0,1.0,0.24,0.81])           |16    |
|(11,[0,2,3,6,7,8],[1.0,1.0,1.0,1.0,0.22,0.8])      |40    |
|(11,[0,2,3,6,7,8],[1.0,1.0,2.0,1.0,0.22,0.8])      |32    |
|(11,[0,2,3,6,7,8],[1.0,1.0,3.0,1.0,0.24,0.75])     |13    |
|(11,[0,2,3,6,7,8],[1.0,1.0,4.0,1.0,0.24,0.75])     |1     |
|[1.0,0.0,1.0,5.0,0.0,0.0,2.0,0.24,0.75,0.0896,0.0] |1     |
|(11,[0,2,3,6,7,8],[1.0,1.0,6.0,1.0,0.22,0.8])      |2     |
|(11,[0,2,3,6,7,8],[1.0,1.0,7.0,1.0,0.2,0.86])      |3     |
|(11,[0,2,3,6,7,8],[1.0,1.0,8.0,1.0,0.24,0.75])     |8     |
|(11,[0,2,3,6,7,8],[1.0,1.0,9.0,1.0,0.32,0.76])     |14    |
|[1.0,0.0,1.0,10.0,0.0,0.0,1.0,0.38,0.76,0.2537,0.0]|36    |
|[1.0,0.0,1.0,11.0,0.0,0.0,1.0,0.36,0.81,0.2836,0.0]|56    |
|[1.0,0.0,1.0,12.0,0.0,0.0,1.0,0.42,0.77,0.2836,0.0]|84    |
|[1.0,0.0,1.0,13.0,0.0,0

In [0]:
trainData, testData = modelData.randomSplit([0.7, 0.3])

In [0]:
modelData.describe().show()

+-------+------------------+
|summary|            demand|
+-------+------------------+
|  count|             17379|
|   mean|189.46308763450142|
| stddev| 181.3875990918646|
|    min|                 1|
|    max|               977|
+-------+------------------+



In [0]:
trainData.describe().show()

+-------+------------------+
|summary|            demand|
+-------+------------------+
|  count|             12342|
|   mean|189.57510938259603|
| stddev|181.43243518422335|
|    min|                 1|
|    max|               977|
+-------+------------------+



In [0]:
testData.describe().show()

+-------+------------------+
|summary|            demand|
+-------+------------------+
|  count|              5037|
|   mean|  189.188604327973|
| stddev|181.29540532631282|
|    min|                 1|
|    max|               976|
+-------+------------------+



In [0]:
help(LinearRegression)

Help on class LinearRegression in module pyspark.ml.regression:

class LinearRegression(_JavaRegressor, _LinearRegressionParams, pyspark.ml.util.JavaMLWritable, pyspark.ml.util.JavaMLReadable)
 |  LinearRegression(*, featuresCol: str = 'features', labelCol: str = 'label', predictionCol: str = 'prediction', maxIter: int = 100, regParam: float = 0.0, elasticNetParam: float = 0.0, tol: float = 1e-06, fitIntercept: bool = True, standardization: bool = True, solver: str = 'auto', weightCol: Optional[str] = None, aggregationDepth: int = 2, loss: str = 'squaredError', epsilon: float = 1.35, maxBlockSizeInMB: float = 0.0)
 |  
 |  Linear regression.
 |  
 |  The learning objective is to minimize the specified loss function, with regularization.
 |  This supports two kinds of loss:
 |  
 |  * squaredError (a.k.a squared loss)
 |  * huber (a hybrid of squared error for relatively small errors and absolute error for     relatively large ones, and we estimate the scale parameter from training data

In [0]:
# Create a regression object
lr = LinearRegression(labelCol='demand')

In [0]:
lr.explainParam("elasticNetParam")

Out[33]: 'elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)'

In [0]:
lr.explainParams()

Out[34]: 'aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)\nelasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)\nepsilon: The shape parameter to control the amount of robustness. Must be > 1.0. Only valid when loss is huber (default: 1.35)\nfeaturesCol: features column name. (default: features)\nfitIntercept: whether to fit an intercept term. (default: True)\nlabelCol: label column name. (default: label, current: demand)\nloss: The loss function to be optimized. Supported options: squaredError, huber. (default: squaredError)\nmaxBlockSizeInMB: maximum memory in MB for stacking input data into blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. Default 0.0 represents choosing optimal value, depends on specific algorithm. Must be >= 0. (default: 0.0)\nmaxIter: max number of itera

In [0]:
trainData.cache()

Out[35]: DataFrame[features: vector, demand: int]

In [0]:
testData.cache()

Out[36]: DataFrame[features: vector, demand: int]

In [0]:
# Train the linear model on training data
lrModel = lr.fit(trainData)

In [0]:
# Extract the summary from the returned LinearRegressionModel instance trained
summary = lrModel.summary

In [0]:
print("Coefficients: %s" % str(lrModel.coefficients))
print("Intercept: %s" % str(lrModel.intercept))

Coefficients: [19.201085291756335,80.30462893667222,0.026463621110472357,7.716216799529801,-14.406009511759342,15.14689621868093,-3.8930464905213777,280.14722583404557,-204.34837208795327,15.847450780785945,-2.8447670796494893]
Intercept: 2.0581205329857233


In [0]:
summary.explainedVariance

Out[40]: 12864.092555428157

In [0]:
summary.meanAbsoluteError

Out[41]: 105.58942637571246

In [0]:
data.select('demand').describe().show()

+-------+------------------+
|summary|            demand|
+-------+------------------+
|  count|             17379|
|   mean|189.46308763450142|
| stddev| 181.3875990918646|
|    min|                 1|
|    max|               977|
+-------+------------------+



In [0]:
summary.r2

Out[43]: 0.3908269347193586

In [0]:
summary.predictions.show(n=20, truncate = False)

+-----------------------------------------------+------+-------------------+
|features                                       |demand|prediction         |
+-----------------------------------------------+------+-------------------+
|(11,[0,1,2,6,7,8],[1.0,1.0,12.0,2.0,0.24,0.7]) |26.0  |18.286778972300848 |
|(11,[0,1,2,6,7,8],[2.0,1.0,3.0,1.0,0.58,0.68]) |156.0 |140.47976238991885 |
|(11,[0,1,2,6,7,8],[3.0,1.0,6.0,1.0,0.64,0.83]) |116.0 |145.91681628185634 |
|(11,[0,1,2,6,7,8],[3.0,1.0,8.0,2.0,0.7,0.61])  |135.0 |203.84217244294834 |
|(11,[0,2,3,6,7,8],[1.0,1.0,1.0,1.0,0.22,0.8])  |40.0  |-76.73746823201165 |
|(11,[0,2,3,6,7,8],[1.0,1.0,2.0,1.0,0.22,0.8])  |32.0  |-69.02125143248185 |
|(11,[0,2,3,6,7,8],[1.0,1.0,2.0,2.0,0.18,0.55]) |16.0  |-33.03309393437674 |
|(11,[0,2,3,6,7,8],[1.0,1.0,3.0,1.0,0.24,0.75]) |13.0  |-45.484671511873444|
|(11,[0,2,3,6,7,8],[1.0,1.0,3.0,2.0,0.16,0.59]) |8.0   |-39.093756535045955|
|(11,[0,2,3,6,7,8],[1.0,1.0,4.0,1.0,0.24,0.75]) |1.0   |-37.76845471234364 |

In [0]:
print ("explainedVariance={}".format(summary.explainedVariance))
print ("meanAbsoluteError=%g" %summary.meanAbsoluteError)

explainedVariance=12864.092555428157
meanAbsoluteError=105.589


In [0]:
testResults = lrModel.evaluate(testData)

In [0]:
testResults.residuals.show(n=10)

+-------------------+
|          residuals|
+-------------------+
|  37.51662166437225|
|  45.09229507490272|
|-2.5919148881339993|
| 36.377539735516166|
|-6.5359978637366964|
|  99.29329359524613|
|  31.61512831809584|
|-117.51508552735845|
| 39.715920214599805|
|-128.41467309163718|
+-------------------+
only showing top 10 rows



In [0]:
testResults.residuals.groupBy().avg().show() 

+------------------+
|    avg(residuals)|
+------------------+
|-0.728335418090562|
+------------------+



- The average of the residuals does not reflect the reality as the residuals can be negative
- The mean absolute error is the average of the absolute values of the residuals

In [0]:
from pyspark.sql.functions import abs

df = testResults.residuals
df.select(abs(df.residuals)).groupBy().avg().show()

+-------------------+
|avg(abs(residuals))|
+-------------------+
| 107.09112625134705|
+-------------------+



In [0]:
print ("r2=%g"%testResults.r2)   # my model explains x % of the variance of the data
print ("rootMeanSquaredError=%g"%testResults.rootMeanSquaredError)

r2=0.378508
rootMeanSquaredError=142.91


In [0]:
print ("meanAbsoluteError=%g"%testResults.meanAbsoluteError)

meanAbsoluteError=107.091


## Underfitting !
- decrease regularization parameter? 
- Add more features? Feature Engineering?
- Polynomial Regression? other algorithms? Trees? 

### Anyway let's get some insights from our data !

In [0]:
data.printSchema()

root
 |-- season: integer (nullable = true)
 |-- yr: integer (nullable = true)
 |-- mnth: integer (nullable = true)
 |-- hr: integer (nullable = true)
 |-- holiday: integer (nullable = true)
 |-- workingday: integer (nullable = true)
 |-- weathersit: integer (nullable = true)
 |-- temp: double (nullable = true)
 |-- hum: double (nullable = true)
 |-- windspeed: double (nullable = true)
 |-- dayOfWeek: string (nullable = true)
 |-- days: integer (nullable = true)
 |-- demand: integer (nullable = true)
 |-- day_cat: double (nullable = false)
 |-- features: vector (nullable = true)



In [0]:
insights = lrModel.evaluate(data)

In [0]:
pred = insights.predictions

In [0]:
pred.show()

+------+---+----+---+-------+----------+----------+----+----+---------+---------+----+------+-------+--------------------+-------------------+
|season| yr|mnth| hr|holiday|workingday|weathersit|temp| hum|windspeed|dayOfWeek|days|demand|day_cat|            features|         prediction|
+------+---+----+---+-------+----------+----------+----+----+---------+---------+----+------+-------+--------------------+-------------------+
|     1|  0|   1|  0|      0|         0|         1|0.24|0.81|      0.0|      Sat|   0|    16|    0.0|(11,[0,2,6,7,8],[...| -80.89422423574007|
|     1|  0|   1|  1|      0|         0|         1|0.22| 0.8|      0.0|      Sat|   0|    40|    0.0|(11,[0,2,3,6,7,8]...| -76.73746823201165|
|     1|  0|   1|  2|      0|         0|         1|0.22| 0.8|      0.0|      Sat|   0|    32|    0.0|(11,[0,2,3,6,7,8]...| -69.02125143248185|
|     1|  0|   1|  3|      0|         0|         1|0.24|0.75|      0.0|      Sat|   0|    13|    0.0|(11,[0,2,3,6,7,8]...|-45.484671511873444|

In [0]:
pred.take(1)

Out[56]: [Row(season=1, yr=0, mnth=1, hr=0, holiday=0, workingday=0, weathersit=1, temp=0.24, hum=0.81, windspeed=0.0, dayOfWeek='Sat', days=0, demand=16, day_cat=0.0, features=SparseVector(11, {0: 1.0, 2: 1.0, 6: 1.0, 7: 0.24, 8: 0.81}), prediction=-80.89422423574007)]

In [0]:
pred_res = pred.withColumn('res_abs', abs(pred.prediction-pred.demand))

In [0]:
pred_res.show()

+------+---+----+---+-------+----------+----------+----+----+---------+---------+----+------+-------+--------------------+-------------------+------------------+
|season| yr|mnth| hr|holiday|workingday|weathersit|temp| hum|windspeed|dayOfWeek|days|demand|day_cat|            features|         prediction|           res_abs|
+------+---+----+---+-------+----------+----------+----+----+---------+---------+----+------+-------+--------------------+-------------------+------------------+
|     1|  0|   1|  0|      0|         0|         1|0.24|0.81|      0.0|      Sat|   0|    16|    0.0|(11,[0,2,6,7,8],[...| -80.89422423574007| 96.89422423574007|
|     1|  0|   1|  1|      0|         0|         1|0.22| 0.8|      0.0|      Sat|   0|    40|    0.0|(11,[0,2,3,6,7,8]...| -76.73746823201165|116.73746823201165|
|     1|  0|   1|  2|      0|         0|         1|0.22| 0.8|      0.0|      Sat|   0|    32|    0.0|(11,[0,2,3,6,7,8]...| -69.02125143248185|101.02125143248185|
|     1|  0|   1|  3|      0

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

# Calculate the RMSE
RegressionEvaluator(labelCol='demand', metricName='rmse').evaluate(pred_res)

Out[59]: 141.981837801861

In [0]:
for item in pred_res.take(1)[0]:
  print(item)

1
0
1
0
0
0
1
0.24
0.81
0.0
Sat
0
16
0.0
(11,[0,2,6,7,8],[1.0,1.0,1.0,0.24,0.81])
-80.89422423574007
96.89422423574007


In [0]:
pred_res.take(1)

Out[61]: [Row(season=1, yr=0, mnth=1, hr=0, holiday=0, workingday=0, weathersit=1, temp=0.24, hum=0.81, windspeed=0.0, dayOfWeek='Sat', days=0, demand=16, day_cat=0.0, features=SparseVector(11, {0: 1.0, 2: 1.0, 6: 1.0, 7: 0.24, 8: 0.81}), prediction=-80.89422423574007, res_abs=96.89422423574007)]

## Insights from the results

In [0]:
from pyspark.sql.functions import avg, stddev, format_number

In [0]:
from pyspark.sql.functions import format_number

# Grouping data by hr
pred_res.groupBy('hr').agg(format_number(avg('res_abs'), 2).alias('avg_abs_residual'), 
                           format_number(avg('demand'), 2).alias('avg_demand'), 
                           format_number(stddev('prediction'), 2).alias('stddev_prediction'), 
                           format_number(stddev('demand'), 2).alias('stddev_demand')
                          ).sort('hr').show()

+---+----------------+----------+-----------------+-------------+
| hr|avg_abs_residual|avg_demand|stddev_prediction|stddev_demand|
+---+----------------+----------+-----------------+-------------+
|  0|           59.91|     53.90|            77.23|        42.31|
|  1|           71.37|     33.38|            76.14|        33.54|
|  2|           79.09|     22.87|            74.75|        26.58|
|  3|           89.08|     11.73|            72.42|        13.24|
|  4|           95.96|      6.35|            71.60|         4.14|
|  5|           86.72|     19.89|            72.30|        13.20|
|  6|           53.10|     76.04|            73.12|        55.08|
|  7|          142.46|    212.06|            76.31|       161.44|
|  8|          248.64|    359.01|            81.06|       235.19|
|  9|           76.97|    219.31|            83.90|        93.70|
| 10|           69.92|    173.67|            87.39|       102.21|
| 11|           80.86|    208.14|            89.08|       127.50|
| 12|     

In [0]:
# Grouping data by season
pred_res.groupBy('season').agg(format_number(avg('res_abs'), 2).alias('avg_abs_residual'), 
                           format_number(avg('demand'), 2).alias('avg_demand'), 
                           format_number(stddev('prediction'), 2).alias('stddev_prediction'), 
                           format_number(stddev('demand'), 2).alias('stddev_demand')
                          ).sort('season').show()

+------+----------------+----------+-----------------+-------------+
|season|avg_abs_residual|avg_demand|stddev_prediction|stddev_demand|
+------+----------------+----------+-----------------+-------------+
|     1|           77.99|    111.11|            98.63|       119.22|
|     2|          109.20|    208.34|           107.96|       188.36|
|     3|          127.83|    236.02|           101.51|       197.71|
|     4|          107.66|    198.87|            95.99|       182.97|
+------+----------------+----------+-----------------+-------------+



In [0]:
# Grouping data by weathersit
pred_res.groupBy('weathersit').agg(format_number(avg('res_abs'), 2).alias('avg_abs_residual'), 
                           format_number(avg('demand'), 2).alias('avg_demand'), 
                           format_number(stddev('prediction'), 2).alias('stddev_prediction'), 
                           format_number(stddev('demand'), 2).alias('stddev_demand')
                          ).sort('weathersit').show()

+----------+----------------+----------+-----------------+-------------+
|weathersit|avg_abs_residual|avg_demand|stddev_prediction|stddev_demand|
+----------+----------------+----------+-----------------+-------------+
|         1|          110.83|    204.87|           113.27|       189.49|
|         2|           99.33|    175.17|           102.60|       165.43|
|         3|           88.94|    111.58|           101.10|       133.78|
|         4|           46.58|     74.33|            78.05|        77.93|
+----------+----------------+----------+-----------------+-------------+



In [0]:
# Grouping data by holiday
pred_res.groupBy('holiday').agg(format_number(avg('res_abs'), 2).alias('avg_abs_residual'), 
                           format_number(avg('demand'), 2).alias('avg_demand'), 
                           format_number(stddev('prediction'), 2).alias('stddev_prediction'), 
                           format_number(stddev('demand'), 2).alias('stddev_demand')
                          ).sort('holiday').show()

+-------+----------------+----------+-----------------+-------------+
|holiday|avg_abs_residual|avg_demand|stddev_prediction|stddev_demand|
+-------+----------------+----------+-----------------+-------------+
|      0|          106.71|    190.43|           112.81|       181.98|
|      1|           82.86|    156.87|           107.64|       156.76|
+-------+----------------+----------+-----------------+-------------+



## Adding dummy variables and Cross validation

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [0]:
# Identify the categorical variables in the dataset
categorical_cols = ['season', 'holiday', 'weathersit', 'dayOfWeek', 'hr', 'mnth']

# Convert categorical variables to numerical values using StringIndexer
indexed = [StringIndexer(inputCol = col, outputCol= col + '_idx')
            for col in categorical_cols]

In [0]:
# Convert indexed categorical variables to dummy variables using OneHotEncoder
# Create an instance of the one hot encoder
encoded = [OneHotEncoder(dropLast = False, inputCol = col + '_idx', outputCol = col + '_dum')
            for col in categorical_cols]

In [0]:
# Combine the dummy variables with the original dataset
assembler = VectorAssembler(inputCols = ['yr', 'workingday', 'temp', 'hum', 'windspeed'] + [col + '_dum' for col in categorical_cols], outputCol = 'features')

In [0]:
# Create a LinearRegression model
lr = LinearRegression(featuresCol="features", labelCol="demand")

In [0]:
# Create an empty parameter grid
params = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1, 1.0]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()

In [0]:
# Create object to evaluate the regression model
evaluator = RegressionEvaluator(metricName="r2", labelCol=lr.getLabelCol(), predictionCol=lr.getPredictionCol())

In [0]:
# Create a cross validator
cv = CrossValidator(estimator=lr, estimatorParamMaps=params, evaluator=evaluator)

In [0]:
trainData, testData = rowData.randomSplit([0.7, 0.3])

In [0]:
# Construct a pipeline
pipelineLR = Pipeline(stages=indexed + encoded + [assembler, cv])

In [0]:
# Train the Pipeline
pipelineModelLR = pipelineLR.fit(trainData)

In [0]:
predictionsLR = pipelineModelLR.transform(testData)

In [0]:
featuresCols = predictionsLR.columns

In [0]:
display(predictionsLR.select("demand", "prediction", *featuresCols))

demand,prediction,season,yr,mnth,hr,holiday,workingday,weathersit,temp,hum,windspeed,dayOfWeek,days,demand.1,season_idx,holiday_idx,weathersit_idx,dayOfWeek_idx,hr_idx,mnth_idx,season_dum,holiday_dum,weathersit_dum,dayOfWeek_dum,hr_dum,mnth_dum,features,prediction.1
13,-97.69567602287364,1,0,1,0,0,0,1,0.04,0.45,0.2537,Sat,19,13,2.0,0.0,0.0,0.0,20.0,9.0,"Map(vectorType -> sparse, length -> 4, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 7, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 24, indices -> List(20), values -> List(1.0))","Map(vectorType -> sparse, length -> 12, indices -> List(9), values -> List(1.0))","Map(vectorType -> sparse, length -> 58, indices -> List(2, 3, 4, 7, 9, 11, 15, 42, 55), values -> List(0.04, 0.45, 0.2537, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",-97.69567602287364
28,-74.16476813021306,1,0,1,0,0,0,1,0.22,0.64,0.3582,Sat,25,28,2.0,0.0,0.0,0.0,20.0,9.0,"Map(vectorType -> sparse, length -> 4, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 7, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 24, indices -> List(20), values -> List(1.0))","Map(vectorType -> sparse, length -> 12, indices -> List(9), values -> List(1.0))","Map(vectorType -> sparse, length -> 58, indices -> List(2, 3, 4, 7, 9, 11, 15, 42, 55), values -> List(0.22, 0.64, 0.3582, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",-74.16476813021306
25,-77.00532788799164,1,0,1,0,0,0,2,0.18,0.51,0.1642,Sat,6,25,2.0,0.0,1.0,0.0,20.0,9.0,"Map(vectorType -> sparse, length -> 4, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 7, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 24, indices -> List(20), values -> List(1.0))","Map(vectorType -> sparse, length -> 12, indices -> List(9), values -> List(1.0))","Map(vectorType -> sparse, length -> 58, indices -> List(2, 3, 4, 7, 9, 12, 15, 42, 55), values -> List(0.18, 0.51, 0.1642, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",-77.00532788799164
12,-81.8723037319654,1,0,1,0,0,1,1,0.14,0.59,0.1045,Tue,9,12,2.0,0.0,0.0,2.0,20.0,9.0,"Map(vectorType -> sparse, length -> 4, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 7, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 24, indices -> List(20), values -> List(1.0))","Map(vectorType -> sparse, length -> 12, indices -> List(9), values -> List(1.0))","Map(vectorType -> sparse, length -> 58, indices -> List(1, 2, 3, 4, 7, 9, 11, 17, 42, 55), values -> List(1.0, 0.14, 0.59, 0.1045, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",-81.8723037319654
7,-106.60488706285427,1,0,1,0,0,1,2,0.16,0.86,0.0896,Wed,10,7,2.0,0.0,1.0,1.0,20.0,9.0,"Map(vectorType -> sparse, length -> 4, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 7, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 24, indices -> List(20), values -> List(1.0))","Map(vectorType -> sparse, length -> 12, indices -> List(9), values -> List(1.0))","Map(vectorType -> sparse, length -> 58, indices -> List(1, 2, 3, 4, 7, 9, 12, 16, 42, 55), values -> List(1.0, 0.16, 0.86, 0.0896, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",-106.60488706285427
17,-82.42496358132887,1,0,1,0,0,1,2,0.2,0.64,0.1939999999999999,Fri,5,17,2.0,0.0,1.0,6.0,20.0,9.0,"Map(vectorType -> sparse, length -> 4, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 7, indices -> List(6), values -> List(1.0))","Map(vectorType -> sparse, length -> 24, indices -> List(20), values -> List(1.0))","Map(vectorType -> sparse, length -> 12, indices -> List(9), values -> List(1.0))","Map(vectorType -> sparse, length -> 58, indices -> List(1, 2, 3, 4, 7, 9, 12, 21, 42, 55), values -> List(1.0, 0.2, 0.64, 0.19399999999999998, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",-82.42496358132887
21,-79.77985875396634,1,0,1,0,0,1,2,0.24,0.7,0.2537,Fri,18,21,2.0,0.0,1.0,6.0,20.0,9.0,"Map(vectorType -> sparse, length -> 4, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 7, indices -> List(6), values -> List(1.0))","Map(vectorType -> sparse, length -> 24, indices -> List(20), values -> List(1.0))","Map(vectorType -> sparse, length -> 12, indices -> List(9), values -> List(1.0))","Map(vectorType -> sparse, length -> 58, indices -> List(1, 2, 3, 4, 7, 9, 12, 21, 42, 55), values -> List(1.0, 0.24, 0.7, 0.2537, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",-79.77985875396634
17,-98.89102895672237,1,0,1,0,1,0,2,0.2,0.47,0.2239,Mon,15,17,2.0,1.0,1.0,5.0,20.0,9.0,"Map(vectorType -> sparse, length -> 4, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 7, indices -> List(5), values -> List(1.0))","Map(vectorType -> sparse, length -> 24, indices -> List(20), values -> List(1.0))","Map(vectorType -> sparse, length -> 12, indices -> List(9), values -> List(1.0))","Map(vectorType -> sparse, length -> 58, indices -> List(2, 3, 4, 7, 10, 12, 20, 42, 55), values -> List(0.2, 0.47, 0.2239, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",-98.89102895672237
13,-136.71918026360686,1,0,1,1,0,0,1,0.04,0.57,0.1045,Sun,20,13,2.0,0.0,0.0,3.0,17.0,9.0,"Map(vectorType -> sparse, length -> 4, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 7, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 24, indices -> List(17), values -> List(1.0))","Map(vectorType -> sparse, length -> 12, indices -> List(9), values -> List(1.0))","Map(vectorType -> sparse, length -> 58, indices -> List(2, 3, 4, 7, 9, 11, 18, 39, 55), values -> List(0.04, 0.57, 0.1045, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",-136.71918026360686
12,-123.3593478256059,1,0,1,1,0,0,1,0.1,0.42,0.4627,Sun,7,12,2.0,0.0,0.0,3.0,17.0,9.0,"Map(vectorType -> sparse, length -> 4, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 7, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 24, indices -> List(17), values -> List(1.0))","Map(vectorType -> sparse, length -> 12, indices -> List(9), values -> List(1.0))","Map(vectorType -> sparse, length -> 58, indices -> List(2, 3, 4, 7, 9, 11, 18, 39, 55), values -> List(0.1, 0.42, 0.4627, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",-123.3593478256059


In [0]:
rmse = evaluator.evaluate(predictionsLR)
print("RMSE on our test set: %g" % rmse)

# Evaluate the predictions using the RegressionEvaluator
r2 = evaluator.evaluate(predictionsLR, {evaluator.metricName: "r2"})
print("R^2 score on test set = %g" % r2)

mse = evaluator.evaluate(predictionsLR, {evaluator.metricName: "mse"})
print("MSE score on test set = %g" % mse)

RMSE on our test set: 0.676705
R^2 score on test set = 0.676705
MSE score on test set = 10473


In [0]:
results = []

results.append({'model': "LinearRegression", 'r2': evaluator.evaluate(predictionsLR, {evaluator.metricName: "r2"})})

## Random Forest Regressor

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [0]:
rowData.show()

+------+---+----+---+-------+----------+----------+----+----+---------+---------+----+------+
|season| yr|mnth| hr|holiday|workingday|weathersit|temp| hum|windspeed|dayOfWeek|days|demand|
+------+---+----+---+-------+----------+----------+----+----+---------+---------+----+------+
|     1|  0|   1|  0|      0|         0|         1|0.24|0.81|      0.0|      Sat|   0|    16|
|     1|  0|   1|  1|      0|         0|         1|0.22| 0.8|      0.0|      Sat|   0|    40|
|     1|  0|   1|  2|      0|         0|         1|0.22| 0.8|      0.0|      Sat|   0|    32|
|     1|  0|   1|  3|      0|         0|         1|0.24|0.75|      0.0|      Sat|   0|    13|
|     1|  0|   1|  4|      0|         0|         1|0.24|0.75|      0.0|      Sat|   0|     1|
|     1|  0|   1|  5|      0|         0|         2|0.24|0.75|   0.0896|      Sat|   0|     1|
|     1|  0|   1|  6|      0|         0|         1|0.22| 0.8|      0.0|      Sat|   0|     2|
|     1|  0|   1|  7|      0|         0|         1| 0.2|0.86

In [0]:
# Identify the categorical variables in the dataset
categorical_vars = ['season', 'holiday', 'weathersit', 'dayOfWeek', 'hr', 'mnth']

In [0]:
# Convert categorical variables to numerical values using StringIndexer
indexers = [StringIndexer(inputCol = var, outputCol = var + '_idx')
            for var in categorical_vars]

In [0]:
# Convert indexed categorical variables to dummy variables using OneHotEncoder
encoders = [OneHotEncoder(dropLast = False, inputCol = var + '_idx', outputCol = var + '_dum')
            for var in categorical_vars]

In [0]:
# Combine the dummy variables with the original dataset
assembler = VectorAssembler(inputCols = ['yr', 'workingday', 'temp', 'hum', 'windspeed'] + [var + '_dum' for var in categorical_vars], outputCol = 'features')

In [0]:
# Define the RandomForestRegressor model
rf = RandomForestRegressor(featuresCol = 'features', labelCol = 'demand')

In [0]:
# Define the parameter grid to search over
paramGrid = ParamGridBuilder() \
    .addGrid(rf.maxDepth, [5, 10, 15]) \
    .addGrid(rf.numTrees, [10, 20, 30]) \
    .build()

In [0]:
# Define the evaluator to use for the model
evaluator = RegressionEvaluator(metricName="rmse", labelCol=rf.getLabelCol(), predictionCol=rf.getPredictionCol())

In [0]:
# Define the cross-validator
cv = CrossValidator(estimator = rf, estimatorParamMaps = paramGrid, evaluator = evaluator)

In [0]:
# Split the dataset randomly into 70% for training and 30% for testing.
train, test = rowData.randomSplit([0.7, 0.3])

In [0]:
# Construct a pipeline
pipeline = Pipeline(stages=indexers + encoders + [assembler, cv])

In [0]:
# Train the Pipeline
pipelineModel = pipeline.fit(train)

In [0]:
predictions = pipelineModel.transform(test)

In [0]:
featuresCols = predictions.columns

In [0]:
# Evaluate the predictions using the RegressionEvaluator
rmse = evaluator.evaluate(predictions)
print("RMSE on our test set: %g" % rmse)

r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})
print("R^2 score on test set = %g" % r2)

mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
print("MSE score on test set = %g" % mse)

RMSE on our test set: 68.0505
R^2 score on test set = 0.856748
MSE score on test set = 4630.87


In [0]:
display(predictions.select("demand", "prediction", *featuresCols))

demand,prediction,season,yr,mnth,hr,holiday,workingday,weathersit,temp,hum,windspeed,dayOfWeek,days,demand.1,season_idx,holiday_idx,weathersit_idx,dayOfWeek_idx,hr_idx,mnth_idx,season_dum,holiday_dum,weathersit_dum,dayOfWeek_dum,hr_dum,mnth_dum,features,prediction.1
28,29.78884421004601,1,0,1,0,0,0,1,0.22,0.64,0.3582,Sat,25,28,3.0,0.0,0.0,0.0,2.0,10.0,"Map(vectorType -> sparse, length -> 4, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 7, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 24, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 12, indices -> List(10), values -> List(1.0))","Map(vectorType -> sparse, length -> 58, indices -> List(2, 3, 4, 8, 9, 11, 15, 24, 56), values -> List(0.22, 0.64, 0.3582, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",29.78884421004601
25,38.14460020914546,1,0,1,0,0,0,2,0.18,0.51,0.1642,Sat,6,25,3.0,0.0,1.0,0.0,2.0,10.0,"Map(vectorType -> sparse, length -> 4, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 7, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 24, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 12, indices -> List(10), values -> List(1.0))","Map(vectorType -> sparse, length -> 58, indices -> List(2, 3, 4, 8, 9, 12, 15, 24, 56), values -> List(0.18, 0.51, 0.1642, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",38.14460020914546
17,59.65174448208528,1,0,1,0,0,0,2,0.46,0.88,0.2985,Sun,1,17,3.0,0.0,1.0,1.0,2.0,10.0,"Map(vectorType -> sparse, length -> 4, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 7, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 24, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 12, indices -> List(10), values -> List(1.0))","Map(vectorType -> sparse, length -> 58, indices -> List(2, 3, 4, 8, 9, 12, 16, 24, 56), values -> List(0.46, 0.88, 0.2985, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",59.65174448208528
7,34.34153063440526,1,0,1,0,0,1,1,0.06,0.41,0.1939999999999999,Mon,21,7,3.0,0.0,0.0,4.0,2.0,10.0,"Map(vectorType -> sparse, length -> 4, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 7, indices -> List(4), values -> List(1.0))","Map(vectorType -> sparse, length -> 24, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 12, indices -> List(10), values -> List(1.0))","Map(vectorType -> sparse, length -> 58, indices -> List(1, 2, 3, 4, 8, 9, 11, 19, 24, 56), values -> List(1.0, 0.06, 0.41, 0.19399999999999998, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",34.34153063440526
14,31.21122845337406,1,0,1,0,0,1,1,0.12,0.5,0.1939999999999999,Fri,12,14,3.0,0.0,0.0,5.0,2.0,10.0,"Map(vectorType -> sparse, length -> 4, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 7, indices -> List(5), values -> List(1.0))","Map(vectorType -> sparse, length -> 24, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 12, indices -> List(10), values -> List(1.0))","Map(vectorType -> sparse, length -> 58, indices -> List(1, 2, 3, 4, 8, 9, 11, 20, 24, 56), values -> List(1.0, 0.12, 0.5, 0.19399999999999998, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",31.21122845337406
7,27.361405935259462,1,0,1,0,0,1,1,0.14,0.59,0.2836,Thr,11,7,3.0,0.0,0.0,6.0,2.0,10.0,"Map(vectorType -> sparse, length -> 4, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 7, indices -> List(6), values -> List(1.0))","Map(vectorType -> sparse, length -> 24, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 12, indices -> List(10), values -> List(1.0))","Map(vectorType -> sparse, length -> 58, indices -> List(1, 2, 3, 4, 8, 9, 11, 21, 24, 56), values -> List(1.0, 0.14, 0.59, 0.2836, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",27.361405935259462
6,27.794812076631448,1,0,1,0,0,1,1,0.2,0.64,0.0,Wed,3,6,3.0,0.0,0.0,3.0,2.0,10.0,"Map(vectorType -> sparse, length -> 4, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 7, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 24, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 12, indices -> List(10), values -> List(1.0))","Map(vectorType -> sparse, length -> 58, indices -> List(1, 2, 3, 8, 9, 11, 18, 24, 56), values -> List(1.0, 0.2, 0.64, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",27.794812076631448
5,34.881463677922966,1,0,1,0,0,1,1,0.22,0.44,0.3582,Mon,1,5,3.0,0.0,0.0,4.0,2.0,10.0,"Map(vectorType -> sparse, length -> 4, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 7, indices -> List(4), values -> List(1.0))","Map(vectorType -> sparse, length -> 24, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 12, indices -> List(10), values -> List(1.0))","Map(vectorType -> sparse, length -> 58, indices -> List(1, 2, 3, 4, 8, 9, 11, 19, 24, 56), values -> List(1.0, 0.22, 0.44, 0.3582, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",34.881463677922966
9,27.152973930344857,1,0,1,0,0,1,2,0.16,0.69,0.2836,Tue,22,9,3.0,0.0,1.0,2.0,2.0,10.0,"Map(vectorType -> sparse, length -> 4, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 7, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 24, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 12, indices -> List(10), values -> List(1.0))","Map(vectorType -> sparse, length -> 58, indices -> List(1, 2, 3, 4, 8, 9, 12, 17, 24, 56), values -> List(1.0, 0.16, 0.69, 0.2836, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",27.152973930344857
3,26.848067208285062,1,0,1,0,0,1,2,0.22,0.93,0.0,Wed,17,3,3.0,0.0,1.0,3.0,2.0,10.0,"Map(vectorType -> sparse, length -> 4, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 4, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 7, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 24, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 12, indices -> List(10), values -> List(1.0))","Map(vectorType -> sparse, length -> 58, indices -> List(1, 2, 3, 8, 9, 12, 18, 24, 56), values -> List(1.0, 0.22, 0.93, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",26.848067208285062


## Compare results

In [0]:
results.append({'model': "RandomForestRegressor", 'r2': evaluator.evaluate(predictions, {evaluator.metricName: "r2"})})

In [0]:
results_df = spark.createDataFrame(results)
results_df.select("model", "r2").orderBy(results_df.r2.desc()).show(truncate=False)

+---------------------+------------------+
|model                |r2                |
+---------------------+------------------+
|RandomForestRegressor|0.8567482320683689|
|LinearRegression     |0.6767048651364174|
+---------------------+------------------+

