# Power electricity power consumption prediction model

Let's first start by importing the needed libraries

In [1]:
from pyspark.sql import SparkSession

And create a local parallel spark session

In [2]:
spark = SparkSession.builder \
        .master("local[*]") \
        .appName("Power electricity prediction") \
        .getOrCreate()

The we read the dataframe we need to apply the prediction on

In [3]:
train = spark.read.csv('../data/engineered/powerelectricity_train.csv', header=True, inferSchema=True)
test = spark.read.csv('../data/engineered/powerelectricity_test.csv', header=True, inferSchema=True)

Now we will test some regression models and evaluate them using rmse

In [4]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor, GBTRegressor, LinearRegression
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler

In [5]:
train_ep = train.drop('rms_current')
train_ep = train.drop('electric_power')

#train_rms = train.drop('electric_power')

test_ep = test.drop('rms_current')
test_ep = test.drop('electric_power')

#test_rms = test.drop('electric_power')

features_ep = train_ep.columns.copy()
#features_ep.remove('electric_power')
features_ep.remove('delta_y')

#features_rms = train_rms.columns.copy()
#features_rms.remove('rms_current')

In [6]:
assembler_pe_train = VectorAssembler(inputCols=features_ep,outputCol="features")
train_ep = assembler_pe_train.transform(train_ep)

Let's start by predicting electric_power

In [7]:
rf_pe = RandomForestRegressor(featuresCol="features",labelCol='delta_y',numTrees=12, maxDepth=12,seed=42)
#rf_pe = GBTRegressor(featuresCol="features",labelCol='electric_power',seed=42)
#rf_pe = LinearRegression(featuresCol="features",labelCol='electric_power',maxIter=10, regParam=0.3, elasticNetParam=0.8)

In [8]:
%%time

model_pe = rf_pe.fit(train_ep)

CPU times: user 17.7 ms, sys: 4.64 ms, total: 22.4 ms
Wall time: 19.3 s


In [9]:
assembler_pe_test = VectorAssembler(inputCols=features_ep,outputCol="features")
test_ep = assembler_pe_test.transform(test_ep)

In [10]:
predictions = model_pe.transform(test_ep)

In [11]:
evaluator_rmse = RegressionEvaluator(
    labelCol="delta_y", predictionCol="prediction", metricName="rmse")
rmse = evaluator_rmse.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

evaluator_r2 = RegressionEvaluator(
    labelCol="delta_y", predictionCol="prediction", metricName="r2")
r2 = evaluator_r2.evaluate(predictions)
print("R squared (r2) on test data = %g" % r2)

evaluator_mae = RegressionEvaluator(
    labelCol="delta_y", predictionCol="prediction", metricName="mae")
mae = evaluator_mae.evaluate(predictions)
print("Mean Average Error (RMSE) on test data = %g" % mae)

Root Mean Squared Error (RMSE) on test data = 0.399826
R squared (r2) on test data = 0.932422
Mean Average Error (RMSE) on test data = 0.28259


In [12]:
predictions.select(["delta_y", "prediction"]).show(100000)

+--------------------+--------------------+
|             delta_y|          prediction|
+--------------------+--------------------+
|  0.5582151680453507|  0.6888365314369685|
|  0.5582151680453507|  0.5967030902409791|
|  0.5582151680453507|  0.6055864786425792|
|  0.3476407540450346| 0.40369740690775235|
| 0.06735910139058454| 0.26954453402799194|
|  0.1766774216981588| 0.32534163513654574|
| 0.35730156822812376| 0.31788228616246145|
|  0.5643760041147257|  0.4715048925983932|
|  0.8536479705886499|  1.0780284803855242|
|  0.4529802355210659|  0.5698209192251361|
|  0.9974610877921606|   1.116584425532255|
|  0.5335925996353765|  0.5890587559773152|
|  0.6322382652544452|  0.6503783236756192|
|   1.126864443971563|   1.072876885166761|
| 0.11070358722432516| 0.29507671981378547|
| 0.46950997077134105| 0.41524791646757375|
| 0.44433379024100006|  0.3542054224823672|
|  0.8483083163917122|  0.6599711210461178|
|  0.7641504408145607|  0.6173206790639251|
|  0.5069511356203176|    0.4334