# Electric power consumption prediction model

Let's start by importing the libraries we need.

In [1]:
from pyspark.sql import SparkSession

Next, we create a local Spark session that runs in parallel on all available cores (`local[*]`).

In [2]:
# Get (or reuse) a Spark session running locally with the "local[*]" master.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Power electricity prediction")
    .getOrCreate()
)

Then we read the train and test dataframes we will run the predictions on

In [3]:
# Load the train and test splits; the first row of each file is the header
# and column types are inferred by Spark.
train = spark.read.csv(
    '../data/engineered/powerelectricity_train.csv',
    header=True,
    inferSchema=True,
)
test = spark.read.csv(
    '../data/engineered/powerelectricity_test.csv',
    header=True,
    inferSchema=True,
)

Now we will try some regression models and evaluate them using RMSE, R², and MAE

In [4]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor, GBTRegressor, LinearRegression
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler

In [5]:
# Two targets are modelled separately: `electric_power` (suffix _ep) and
# `rms_current` (suffix _rms). Each frame has the *other* target column removed.
train_ep, test_ep = train.drop('rms_current'), test.drop('rms_current')
train_rms, test_rms = train.drop('electric_power'), test.drop('electric_power')

# Feature lists: every remaining column except the label itself.
# `.remove` raises ValueError if the label column is missing from the data.
features_ep = list(train_ep.columns)
features_ep.remove('electric_power')

features_rms = list(train_rms.columns)
features_rms.remove('rms_current')

In [6]:
# Pack the electric_power feature columns into the single vector column
# ("features") that the regressors read.
assembler_pe_train = VectorAssembler(inputCols=features_ep, outputCol="features")

# This cell overwrites `train_ep`, so on a second run the frame already carries a
# "features" column and VectorAssembler would fail because its output column
# exists. Dropping it first makes the cell safe to re-run; `drop` is a no-op
# when the column is absent (first run).
train_ep = assembler_pe_train.transform(train_ep.drop("features"))

Let's start by predicting electric_power

In [7]:
# Regressor used for the electric_power target. The commented-out lines are
# alternative estimators that can be swapped into the same variable.
rf_pe = RandomForestRegressor(
    featuresCol="features",
    labelCol='electric_power',
    seed=42,
)
#rf_pe = GBTRegressor(featuresCol="features",labelCol='electric_power',seed=42)
#rf_pe = LinearRegression(featuresCol="features",labelCol='electric_power',maxIter=10, regParam=0.3, elasticNetParam=0.8)

In [8]:
%%time

# Fit the electric_power regressor on the assembled training frame.
model_pe = rf_pe.fit(train_ep)

CPU times: user 21.4 ms, sys: 991 µs, total: 22.4 ms
Wall time: 32.4 s


In [9]:
# Assemble the test set with the same feature columns used for training so the
# fitted model sees an identically laid-out "features" vector.
assembler_pe_test = VectorAssembler(inputCols=features_ep, outputCol="features")

# This cell overwrites `test_ep`; drop any "features" column left by a previous
# run so re-executing the cell does not fail on an existing output column.
# `drop` is a no-op when the column is absent (first run).
test_ep = assembler_pe_test.transform(test_ep.drop("features"))

In [10]:
# Apply the fitted model to the assembled test frame.
predictions = model_pe.transform(test_ep)

In [11]:
# Score the electric_power predictions on the test set with three metrics.
# Each evaluator compares the "electric_power" label with the "prediction" column.

# Root mean squared error
evaluator_rmse = RegressionEvaluator(
    labelCol="electric_power", predictionCol="prediction", metricName="rmse")
rmse = evaluator_rmse.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

# Coefficient of determination
evaluator_r2 = RegressionEvaluator(
    labelCol="electric_power", predictionCol="prediction", metricName="r2")
r2 = evaluator_r2.evaluate(predictions)
print("R squared (r2) on test data = %g" % r2)

# Mean absolute error. The label previously read "Mean Average Error (RMSE)",
# which misnamed the metric that `metricName="mae"` computes.
evaluator_mae = RegressionEvaluator(
    labelCol="electric_power", predictionCol="prediction", metricName="mae")
mae = evaluator_mae.evaluate(predictions)
print("Mean Absolute Error (MAE) on test data = %g" % mae)

Root Mean Squared Error (RMSE) on test data = 1.05799
R squared (r2) on test data = 0.685377
Mean Average Error (RMSE) on test data = 0.751476


In [12]:
# Print actual vs. predicted electric_power side by side (up to 1000 rows).
predictions.select("electric_power", "prediction").show(1000)

+------------------+------------------+
|    electric_power|        prediction|
+------------------+------------------+
| 11.16869558013398|12.126702750042389|
| 11.16869558013398|12.364624902699848|
|11.635412790096973| 12.47891197244581|
|11.028763593911533|12.415777057727874|
| 10.45244273157684|12.339560212921679|
|10.398312926096299|12.290507799897757|
|10.520052909291724|12.163423855450493|
|   10.537602002974|12.163423855450493|
|10.540344473617575|12.163423855450493|
|10.338987888945667|12.163423855450493|
|10.317805125711827| 12.26118705578294|
|10.307470801778932|12.221345379588957|
|10.334486171731601|12.221345379588957|
|10.124113071132333|12.221345379588957|
|10.227571439149177|12.155598048776245|
|10.637223553189525|12.155598048776245|
|  10.6197281868661|12.341809423537173|
|10.732319295286425|12.341809423537173|
|10.385465595017276|12.341809423537173|
|10.437615146599233|12.569847698745509|
|10.348054410695367|12.341809423537173|
|  10.2071218900698|12.341809423537173|
