# Electric power consumption prediction model

Let's start by importing the libraries we need.

In [1]:
from pyspark.sql import SparkSession

Next, create a local Spark session that runs in parallel on all available cores (`local[*]`).

In [2]:
# Build (or reuse) a Spark session that runs locally; the "local[*]" master
# asks Spark to use as many worker threads as there are logical cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Power electricity prediction")
    .getOrCreate()
)

Then we read the train and test dataframes that the prediction will be applied to.

In [3]:
# Load the train and test CSV files. The first row of each file is treated as
# the header and column types are inferred from the data.
train = spark.read.csv(
    '../data/engineered/powerelectricity_train.csv',
    header=True,
    inferSchema=True,
)
test = spark.read.csv(
    '../data/engineered/powerelectricity_test.csv',
    header=True,
    inferSchema=True,
)

Now we will test some regression models and evaluate them using RMSE, R² and MAE.

In [4]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor, GBTRegressor, LinearRegression
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler

In [5]:
# Remove the two raw measurement columns from both splits; 'delta_y' stays in
# the frames because it is used as the label column later on.
# (A second variant that would keep 'rms_current' and drop only
# 'electric_power' was sketched here previously but is not used.)
train_ep = train.drop('rms_current', 'electric_power')
test_ep = test.drop('rms_current', 'electric_power')

# Model inputs: every remaining column except the label.
features_ep = list(train_ep.columns)
features_ep.remove('delta_y')

In [6]:
# Pack the input columns listed in `features_ep` into a single vector column
# named "features", for both the train and the test split.
#
# `train_ep` / `test_ep` are rebound to the assembled frames, so a second run of
# this cell would otherwise fail because the output column already exists.
# Dropping "features" first keeps the cell re-runnable; DataFrame.drop is a
# no-op when the column is not present, so the first run is unaffected.
assembler_pe_train = VectorAssembler(inputCols=features_ep, outputCol="features")
train_ep = assembler_pe_train.transform(train_ep.drop("features"))
assembler_pe_test = VectorAssembler(inputCols=features_ep, outputCol="features")
test_ep = assembler_pe_test.transform(test_ep.drop("features"))

Let's start by predicting electric_power (the label column used by the model below is `delta_y`).

In [47]:
# Gradient-boosted trees regressor reading the "features" vector column and
# using 'delta_y' as the label.
# RandomForestRegressor configurations were tried here before; the `rf_pe`
# name is kept because the fitting cell refers to it.
rf_pe = GBTRegressor(
    featuresCol="features",
    labelCol='delta_y',
    maxDepth=4,
    maxIter=50,
    seed=12345,
)

In [48]:
%%time

# Fit the configured regressor (`rf_pe`) on the assembled training frame.
model_pe = rf_pe.fit(train_ep)

CPU times: user 7.5 ms, sys: 24.8 ms, total: 32.3 ms
Wall time: 32.4 s


In [49]:
# Apply the fitted model to the training frame itself (in-sample predictions).
predictions = model_pe.transform(train_ep)

In [50]:
print('################################ TRAIN ################################')

# Compare the "prediction" column against the 'delta_y' label using three
# metrics. `predictions` is whatever the most recent transform cell produced,
# so run the train-set transform immediately before this cell.
evaluator_rmse = RegressionEvaluator(
    labelCol="delta_y", predictionCol="prediction", metricName="rmse")
rmse = evaluator_rmse.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on train data = %g" % rmse)

evaluator_r2 = RegressionEvaluator(
    labelCol="delta_y", predictionCol="prediction", metricName="r2")
r2 = evaluator_r2.evaluate(predictions)
print("R squared (r2) on train data = %g" % r2)

evaluator_mae = RegressionEvaluator(
    labelCol="delta_y", predictionCol="prediction", metricName="mae")
mae = evaluator_mae.evaluate(predictions)
# The "mae" metric is the mean absolute error; the label previously read
# "Mean Average Error (RMSE)", which misnamed the metric.
print("Mean Absolute Error (MAE) on train data = %g" % mae)

################################ TRAIN ################################
Root Mean Squared Error (RMSE) on train data = 0.0888002
R squared (r2) on train data = 0.888913
Mean Average Error (RMSE) on train data = 0.0644947


In [51]:
# Apply the fitted model to the test frame; this rebinds `predictions`,
# replacing the train-set output held under the same name.
predictions = model_pe.transform(test_ep)

In [52]:
print('################################ TEST ################################')

# Compare the "prediction" column against the 'delta_y' label using three
# metrics. `predictions` is whatever the most recent transform cell produced,
# so run the test-set transform immediately before this cell.
evaluator_rmse = RegressionEvaluator(
    labelCol="delta_y", predictionCol="prediction", metricName="rmse")
rmse = evaluator_rmse.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

evaluator_r2 = RegressionEvaluator(
    labelCol="delta_y", predictionCol="prediction", metricName="r2")
r2 = evaluator_r2.evaluate(predictions)
print("R squared (r2) on test data = %g" % r2)

evaluator_mae = RegressionEvaluator(
    labelCol="delta_y", predictionCol="prediction", metricName="mae")
mae = evaluator_mae.evaluate(predictions)
# The "mae" metric is the mean absolute error; the label previously read
# "Mean Average Error (RMSE)", which misnamed the metric.
print("Mean Absolute Error (MAE) on test data = %g" % mae)

################################ TEST ################################
Root Mean Squared Error (RMSE) on test data = 0.1108
R squared (r2) on test data = 0.679817
Mean Average Error (RMSE) on test data = 0.0872837


In [53]:
# Show a small sample of label vs. prediction for a visual sanity check.
# The previous limit of 100000 rows printed a very large table into the
# notebook output; raise the row count temporarily if more rows are needed.
predictions.select(["delta_y", "prediction"]).show(20)

+--------------------+--------------------+
|             delta_y|          prediction|
+--------------------+--------------------+
| 0.13542984739455566| 0.15873521425644935|
| 0.10223147182542958|  0.1752515666215361|
| 0.10223147182542958|  0.1752515666215361|
| 0.10047543741876241|  0.1752515666215361|
| 0.11687876778953177| 0.15873521425644935|
|  0.1402871488975146|  0.1665999925982097|
| 0.13941973094290905| 0.15873521425644935|
| 0.14617985509566012|  0.1665999925982097|
| 0.14581591602597355|  0.1665999925982097|
| 0.13315439502869486| 0.15873521425644935|
| 0.13466045762174916| 0.15873521425644935|
| 0.13670606614062697| 0.15873521425644935|
| 0.12490898599912725| 0.15873521425644935|
| 0.08871918264429146| 0.04209860831160711|
|0.015976191967638886|0.033089066043052555|
| 0.12122555941193047| 0.16604737194061278|
|  0.1233782552524123| 0.15873521425644935|
| 0.14951330321559153| 0.17453465388501982|
|  0.1290556275596777| 0.17488636836217056|
| 0.13453243221773858| 0.1587352

Finally, save the predictions to a file.

In [54]:
import os

# Make sure the output directory exists. `exist_ok=True` tolerates a directory
# that is already there, while real failures (e.g. missing permissions) are
# raised here instead of being silently swallowed by a bare `except: pass`.
os.makedirs('../predictions/', exist_ok=True)

# Keep only the model output column and write it to CSV through pandas.
preds = predictions.select(["prediction"])
preds.toPandas().to_csv('../predictions/preds.csv')