In [0]:
%sql
-- Show every column of the powerplant table.
SELECT *
FROM powerplant

AT,V,AP,RH,PE
14.96,41.76,1024.07,73.17,463.26
25.18,62.96,1020.04,59.08,444.37
5.11,39.4,1012.16,92.14,488.56
20.86,57.32,1010.24,76.64,446.48
10.82,37.5,1009.23,96.62,473.9
26.27,59.44,1012.23,58.77,443.67
15.89,43.96,1014.02,75.24,467.35
9.48,44.71,1019.12,66.43,478.42
14.64,45.0,1021.78,41.25,475.98
11.74,43.56,1015.14,70.72,477.5


In [0]:
%sql

-- Pair the AT column (shown as Temperature) with the PE column (shown as Power).
SELECT
  AT AS Temperature,
  PE AS Power
FROM PowerPlant

Temperature,Power
14.96,463.26
25.18,444.37
5.11,488.56
20.86,446.48
10.82,473.9
26.27,443.67
15.89,467.35
9.48,478.42
14.64,475.98
11.74,477.5


In [0]:
%sql

-- Pair the PE column (shown as Power) with the V column (shown as ExhaustVacuum).
SELECT
  PE AS Power,
  V AS ExhaustVacuum
FROM PowerPlant

Power,ExhaustVacuum
463.26,41.76
444.37,62.96
488.56,39.4
446.48,57.32
473.9,37.5
443.67,59.44
467.35,43.96
478.42,44.71
475.98,45.0
477.5,43.56


In [0]:
%sql

-- Pair the PE column (shown as Power) with the AP column (shown as Pressure).
SELECT
  PE AS Power,
  AP AS Pressure
FROM PowerPlant

Power,Pressure
463.26,1024.07
444.37,1020.04
488.56,1012.16
446.48,1010.24
473.9,1009.23
443.67,1012.23
467.35,1014.02
478.42,1019.12
475.98,1021.78
477.5,1015.14


In [0]:
%sql

-- Pair the PE column (shown as Power) with the RH column (shown as Humidity).
SELECT
  PE AS Power,
  RH AS Humidity
FROM PowerPlant

Power,Humidity
463.26,73.17
444.37,59.08
488.56,92.14
446.48,76.64
473.9,96.62
443.67,58.77
467.35,75.24
478.42,66.43
475.98,41.25
477.5,70.72


In [0]:
%scala
// Load the CSV file into a DataFrame: the first row supplies the column
// names and Spark is asked to infer each column's type from the data.
val csv = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("/FileStore/tables/Folds5x2_pp-2.csv")

// Print the first rows as a quick sanity check of the load.
csv.show()

In [0]:
%scala

// Fixed seed so the train/test partition -- and therefore the fitted model
// and the reported RMSE -- is the same on every run of the notebook.
val splitSeed = 42L

// Randomly partition the rows with weights 0.7 / 0.3; the resulting sizes
// are approximate, not exact.
val splits = csv.randomSplit(Array(0.7, 0.3), splitSeed)
val train = splits(0)
val test = splits(1)

// Each count() is a Spark action that evaluates its split.
val train_rows = train.count()
val test_rows = test.count()
println(s"Training Rows: $train_rows Testing Rows: $test_rows")

In [0]:
%scala

import org.apache.spark.ml.feature.VectorAssembler

// Packs the four input columns AT, V, AP and RH into one vector column
// named "features".
val assembler = new VectorAssembler()
  .setInputCols(Array("AT", "V", "AP", "RH"))
  .setOutputCol("features")

// Training data: the assembled feature vector plus PE renamed to "label".
val training = assembler
  .transform(train)
  .select($"features", $"PE".alias("label"))

// Display the rows without truncating wide cell values.
training.show(truncate = false)

In [0]:
%scala

import org.apache.spark.ml.regression.LinearRegression

// Linear regression reading "features" and "label", capped at 10
// iterations with a regularization parameter of 0.3.
val lr = new LinearRegression()
  .setFeaturesCol("features")
  .setLabelCol("label")
  .setRegParam(0.3)
  .setMaxIter(10)

// Fit the estimator on the training DataFrame.
val model = lr.fit(training)
println("Model Trained!")

In [0]:
%scala

// Test data: same feature assembly as the training set, but PE is kept
// under the name "trueLabel".
val testing = assembler
  .transform(test)
  .select($"features", $"PE".alias("trueLabel"))

// Display the rows without truncating wide cell values.
testing.show(truncate = false)

In [0]:
%scala

// Run the fitted model over the test rows.
val prediction = model.transform(testing)

// Keep the features, the model's prediction and the known value side by side.
val predicted = prediction.select(
  "features",
  "prediction",
  "trueLabel"
)
predicted.show()

In [0]:
%scala

// Register the predictions as a temporary view so they can be queried by
// name from SQL.
val predictionsViewName = "regressionPredictions"
predicted.createOrReplaceTempView(predictionsViewName)

In [0]:
%sql

-- Compare each known value with the model's prediction.
SELECT
  trueLabel,
  prediction
FROM regressionPredictions

trueLabel,prediction
490.55,492.100728904644
490.34,492.2491113503037
488.69,491.9199739771639
481.29,487.76812353216985
489.38,482.6746627758744
487.19,488.119687152689
489.11,483.00962895665066
488.98,486.3546019941244
490.02,486.0649200714178
489.04,487.5647025334486


In [0]:
%scala
import org.apache.spark.ml.evaluation.RegressionEvaluator

// Evaluator configured for the "rmse" metric, comparing the "prediction"
// column against "trueLabel".
val evaluator = new RegressionEvaluator()
  .setMetricName("rmse")
  .setLabelCol("trueLabel")
  .setPredictionCol("prediction")

// Score the model's output on the test rows.
val rmse = evaluator.evaluate(prediction)
println(s"Root Mean Square Error (RMSE): $rmse")