In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('lrexample').getOrCreate()

21/12/09 00:23:46 WARN Utils: Your hostname, ortiz-VirtualBox resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
21/12/09 00:23:46 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/12/09 00:23:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
df = spark.read.csv("carDetail.csv", header="true", inferSchema="true")
df.cache()
df.show()

                                                                                

+------+----------+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+------+--------+-----+
|number|       day|season| yr|mnth| hr|holiday|weekday|workingday|weathersit|temp| atemp| hum|windspeed|casual|register|total|
+------+----------+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+------+--------+-----+
|     1|2011-01-01|     1|  0|   1|  0|      0|      6|         0|         1|0.24|0.2879|0.81|      0.0|     3|      13|   16|
|     2|2011-01-01|     1|  0|   1|  1|      0|      6|         0|         1|0.22|0.2727| 0.8|      0.0|     8|      32|   40|
|     3|2011-01-01|     1|  0|   1|  2|      0|      6|         0|         1|0.22|0.2727| 0.8|      0.0|     5|      27|   32|
|     4|2011-01-01|     1|  0|   1|  3|      0|      6|         0|         1|0.24|0.2879|0.75|      0.0|     3|      10|   13|
|     5|2011-01-01|     1|  0|   1|  4|      0|      6|         0|         1|0.24|0.2879|0.75|      0.0|     0|

In [19]:
df = df.drop("number").drop("day").drop("casual").drop("register")
df.printSchema()

root
 |-- season: integer (nullable = true)
 |-- yr: integer (nullable = true)
 |-- mnth: integer (nullable = true)
 |-- hr: integer (nullable = true)
 |-- weekday: integer (nullable = true)
 |-- workingday: integer (nullable = true)
 |-- weathersit: integer (nullable = true)
 |-- temp: integer (nullable = true)
 |-- windspeed: integer (nullable = true)
 |-- total: integer (nullable = true)



In [20]:
from pyspark.sql.functions import col  
df = df.select([col(c).cast("int").alias(c) for c in df.columns])
df.printSchema()

root
 |-- season: integer (nullable = true)
 |-- yr: integer (nullable = true)
 |-- mnth: integer (nullable = true)
 |-- hr: integer (nullable = true)
 |-- weekday: integer (nullable = true)
 |-- workingday: integer (nullable = true)
 |-- weathersit: integer (nullable = true)
 |-- temp: integer (nullable = true)
 |-- windspeed: integer (nullable = true)
 |-- total: integer (nullable = true)



In [21]:
train, test = df.randomSplit([0.7, 0.3])
print("We have %d training examples and %d test examples." % (train.count(), test.count()))

                                                                                

We have 7761 training examples and 3237 test examples.


In [22]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer
featuresCols = df.columns
featuresCols.remove('total')
vectorAssembler = VectorAssembler(inputCols=featuresCols, outputCol="rawFeatures")
vectorIndexer = VectorIndexer(inputCol="rawFeatures", outputCol="features", maxCategories=4)

In [23]:
from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor(labelCol="total")

In [24]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

paramGrid = ParamGridBuilder()\
  .addGrid(gbt.maxDepth, [2, 5])\
  .addGrid(gbt.maxIter, [10, 100])\
  .build()
evaluator = RegressionEvaluator(metricName="rmse", labelCol=gbt.getLabelCol(), predictionCol=gbt.getPredictionCol())
cv = CrossValidator(estimator=gbt, evaluator=evaluator, estimatorParamMaps=paramGrid)

In [25]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, cv])

In [26]:
pipelineModel = pipeline.fit(train)

                                                                                

In [27]:
predictions = pipelineModel.transform(test)
(predictions.select("total", "prediction", *featuresCols)).show()

+-----+-------------------+------+---+----+---+-------+----------+----------+----+---------+
|total|         prediction|season| yr|mnth| hr|weekday|workingday|weathersit|temp|windspeed|
+-----+-------------------+------+---+----+---+-------+----------+----------+----+---------+
|   22| 23.076537025007802|     1|  0|   1|  0|      0|         0|         1|   0|        0|
|   39| 23.076537025007802|     1|  0|   1|  0|      0|         0|         1|   0|        0|
|   17| 24.982277968497186|     1|  0|   1|  0|      1|         0|         2|   0|        0|
|    5|  7.145147306133547|     1|  0|   1|  0|      2|         1|         1|   0|        0|
|   11|  7.088469398020875|     1|  0|   1|  0|      4|         1|         1|   0|        0|
|   13|  7.088469398020875|     1|  0|   1|  0|      4|         1|         1|   0|        0|
|    9| 24.288630300128073|     1|  0|   1|  0|      5|         1|         2|   0|        0|
|   17| 24.288630300128073|     1|  0|   1|  0|      5|         1|    

In [28]:
rmse = evaluator.evaluate(predictions)
print("RMSE on our test set: %g" % rmse)

[Stage 5709:>                                                       (0 + 1) / 1]

RMSE on our test set: 43.4065


                                                                                