In [1]:
import findspark

# Initialise findspark before any pyspark import in the cells below
# (presumably this makes the local Spark install importable — confirm against the findspark docs).
findspark.init()
# Last expression of the cell, so the notebook displays the Spark home path findspark resolved.
findspark.find()

'/usr/local/opt/apache-spark/libexec'

In [2]:
from pyspark.sql import SparkSession
from pyspark.context import SparkContext
import pyspark

# Build (or reuse) the notebook's Spark session with console progress output disabled.
spark = (
    SparkSession.builder
    .appName("NV Energy Project")
    .config('spark.ui.showConsoleProgress', 'false')
    .getOrCreate()
)

# Get the Spark context behind the session and turn Spark logging off.
sc = spark.sparkContext.getOrCreate()
sc.setLogLevel('OFF')

23/03/31 09:21:14 WARN Utils: Your hostname, Courtneys-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.0.6 instead (on interface en0)
23/03/31 09:21:14 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/31 09:21:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/03/31 09:21:15 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Load the combined solar/weather CSV; the first row is the header and column types are inferred.
sw_df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("../CleanData/solarAndWeather.csv")
)


In [4]:
# Split the data 80/20 into train and test sets for the random-forest models.
# The seed is fixed so the split is repeatable.
(solar_weather_train_df,
 solar_weather_test_df) = sw_df.randomSplit([0.8, 0.2], seed=42)

In [5]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

#Model for 100MW Farm

# Assemble the predictor columns into a single "features" vector column.
sw_vec_assembler = VectorAssembler(
    inputCols=["minutes", "weekNumber", "GlobalHoriz", "RelHumidity", "AvgWindSpeed", "Precipitation"],
    outputCol="features",
)

# Random forest regressing the Power100 column on the assembled features.
# The seed is pinned (matching the seeded train/test split) so that re-running
# the notebook on the same training data fits a reproducible forest.
sw_rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="Power100",
    numTrees=100,
    maxDepth=10,
    seed=42,
)

# Chain feature assembly and the regressor so one fit/transform handles both.
pipeline = Pipeline(stages=[sw_vec_assembler, sw_rf])

sw_100_pipeline_model = pipeline.fit(solar_weather_train_df)

In [6]:
#Model for 150MW Farm

# Random forest regressing the Power150 column on the assembled features.
# The seed is pinned (matching the seeded train/test split) so that re-running
# the notebook on the same training data fits a reproducible forest.
sw_rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="Power150",
    numTrees=100,
    maxDepth=10,
    seed=42,
)

# Reuses the existing sw_vec_assembler stage in front of the new regressor.
pipeline = Pipeline(stages=[sw_vec_assembler, sw_rf])

sw_150_pipeline_model = pipeline.fit(solar_weather_train_df)

In [7]:
#Model for 200MW Farm

# Random forest regressing the Power200 column on the assembled features.
# The seed is pinned (matching the seeded train/test split) so that re-running
# the notebook on the same training data fits a reproducible forest.
sw_rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="Power200",
    numTrees=100,
    maxDepth=10,
    seed=42,
)

# Reuses the existing sw_vec_assembler stage in front of the new regressor.
pipeline = Pipeline(stages=[sw_vec_assembler, sw_rf])

sw_200_pipeline_model = pipeline.fit(solar_weather_train_df)

# Score the held-out test split with the 200 MW model.
pred_df = sw_200_pipeline_model.transform(solar_weather_test_df)

In [8]:
def getConsumption(year):
    """Return the consumption estimate for ``year`` from a linear formula.

    The coefficients are taken from the DataBricks Consumption LR
    (linear regression); the result is ``year * slope - offset``.
    """
    slope = 675250.9602239017
    offset = 1325137830.201505
    return year * slope - offset

In [9]:
def getSolar(farm_size, time, week_num, ghi, humidity, wind, precipitation):
    """Predict solar output for a single observation using the model for ``farm_size``.

    Args:
        farm_size: ``'100 MW'`` or ``'150 MW'`` selects the matching pipeline
            model; any other value falls back to the 200 MW model.
        time: Object exposing ``.hour`` and ``.minute``; converted to minutes
            since midnight for the "minutes" feature.
        week_num: Value for the "weekNumber" feature.
        ghi: Value for the "GlobalHoriz" feature.
        humidity: Value for the "RelHumidity" feature.
        wind: Value for the "AvgWindSpeed" feature.
        precipitation: Value for the "Precipitation" feature.

    Returns:
        The value of the "prediction" column for the single input row.
    """
    # Convert the time of day to minutes since midnight.
    total_minutes = time.hour * 60 + time.minute

    # One-row DataFrame using the same column names the feature assembler reads.
    columns = ["minutes", "weekNumber", "GlobalHoriz", "RelHumidity", "AvgWindSpeed", "Precipitation"]
    data = [(total_minutes, week_num, ghi, humidity, wind, precipitation)]
    selection_df = spark.createDataFrame(data=data, schema=columns)

    # Pick the fitted pipeline for the requested farm size (200 MW is the fallback).
    if farm_size == '100 MW':
        model = sw_100_pipeline_model
    elif farm_size == '150 MW':
        model = sw_150_pipeline_model
    else:
        model = sw_200_pipeline_model

    # Read the prediction by column name rather than by position, so the lookup
    # keeps working if columns are added to or reordered in the input frame.
    row = model.transform(selection_df).first()
    return row["prediction"]