In [1]:
import findspark

findspark.init()
findspark.find()

'/usr/local/opt/apache-spark/libexec'

In [3]:
from pyspark.sql import SparkSession
from pyspark.context import SparkContext
import pyspark

spark = SparkSession.builder.appName("NV Energy Project").config('spark.ui.showConsoleProgress', 'false').getOrCreate()

sc = spark.sparkContext.getOrCreate()
sc.setLogLevel('OFF')

In [24]:
sw_df = spark.read.option("header", "true").option("inferSchema", "true").csv("CleanData/solarAndWeather.csv")


DataFrame[Timestamp: timestamp, minutes: double, weekNumber: double, GlobalHoriz: double, RelHumidity: double, AvgWindSpeed: double, Precipitation: double, Power100: double, Power150: double, Power200: double]

In [6]:
#Create test and train for RF ML models

solar_weather_train_df, solar_weather_test_df = sw_df.randomSplit([.8, .2], seed=42)

In [21]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

#Model for 100MW Farm

sw_vec_assembler = VectorAssembler(inputCols=["minutes", "weekNumber","GlobalHoriz", "RelHumidity", "AvgWindSpeed", "Precipitation"], outputCol="features")

sw_rf = RandomForestRegressor(featuresCol="features", labelCol="Power100", numTrees = 100, maxDepth = 10)

pipeline = Pipeline(stages=[sw_vec_assembler, sw_rf])

sw_100_pipeline_model = pipeline.fit(solar_weather_train_df);

                                                                                

'r2: 0.8790679964415425 rmse: 9.283995342605921'

In [10]:
#Model for 150MW Farm

sw_rf = RandomForestRegressor(featuresCol="features", labelCol="Power150", numTrees = 100, maxDepth = 10)

pipeline = Pipeline(stages=[sw_vec_assembler, sw_rf])

sw_150_pipeline_model = pipeline.fit(solar_weather_train_df);

                                                                                

'r2: 0.8861426596079328 rmse: 13.462028921090889'

In [23]:
#Model for 200MW Farm

sw_rf = RandomForestRegressor(featuresCol="features", labelCol="Power200", numTrees = 100, maxDepth = 10)

pipeline = Pipeline(stages=[sw_vec_assembler, sw_rf])

sw_200_pipeline_model = pipeline.fit(solar_weather_train_df)

pred_df = sw_200_pipeline_model.transform(solar_weather_test_df)

                                                                                

'r2: 0.7836649353286952 rmse: 23.16203285483346'

In [14]:
def getConsumption(year):
    #Formula from DataBricks Consumption LR 
    return  year * 675250.9602239017 - 1325137830.201505

In [34]:
def getSolar(farm_size, time, week_num, ghi, humidity, wind, precipitation):
    
    #Convert time to minutes
    hours = time.hour
    minutes = time.minute
    total_minutes = hours * 60 + minutes
    
    columns = ["minutes", "weekNumber","GlobalHoriz", "RelHumidity", "AvgWindSpeed", "Precipitation"]
    data = [(total_minutes, week_num, ghi, humidity, wind, precipitation)]
    selection_df = spark.createDataFrame(data=data, schema=columns)

    #Determine model to use for prediction
    if farm_size == '100 MW':
        pred_df = sw_100_pipeline_model.transform(selection_df)
    elif farm_size == '150 MW':
        pred_df = sw_150_pipeline_model.transform(selection_df)
    else:
        pred_df = sw_200_pipeline_model.transform(selection_df)
    
    prediction = pred_df.collect()[0][7]
    
    return prediction

+-------+----------+-----------+-----------+------------+-------------+--------------------+----------------+
|minutes|weekNumber|GlobalHoriz|RelHumidity|AvgWindSpeed|Precipitation|            features|      prediction|
+-------+----------+-----------+-----------+------------+-------------+--------------------+----------------+
|     60|         2|          2|          2|           2|            2|[60.0,2.0,2.0,2.0...|9.69780010868273|
+-------+----------+-----------+-----------+------------+-------------+--------------------+----------------+

9.69780010868273
