In [2]:
try:
    sc.stop()
except:
    pass
from pyspark import SparkContext


sc = SparkContext(appName='ModelSelection').getOrCreate()
from pyspark.sql import SparkSession
spark = SparkSession(sparkContext = sc)
spark.conf.set("spark.sql.session.timeZone", "UTC")

In [3]:
df = spark.read.csv(f"hdfs://localhost:9000/solar_data/processed/Arkansas_1.csv", inferSchema=True, header=True)

In [4]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor, GBTRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler


In [5]:
spark

In [6]:
###Random Forest Regressor
cols = df.columns
cols.remove('sum(GHI)')

#vectorize features
assembler = VectorAssembler(inputCols=cols, outputCol='features')
data = assembler.transform(df)

#train/test split
train, test = data.randomSplit([.8, .2])

##Configuring ML pipeline (might use format later)
rf = RandomForestRegressor(featuresCol='features', labelCol='sum(GHI)')
pipeline = Pipeline(stages=[rf])


paramGrid = (ParamGridBuilder()
                 .addGrid(rf.numTrees, [5, 10, 48]) #Extremely buggy, couldn't tune too many params
                 .build())

crossval = CrossValidator(estimator=pipeline,
                         estimatorParamMaps=paramGrid,
                         evaluator=RegressionEvaluator(labelCol='sum(GHI)'),
                         numFolds=3)

rfmodel = crossval.fit(train)


In [7]:
rf1 = rfmodel.bestModel.stages[0] 

In [8]:
rf1.save("/home/dyllanjr/Solar_Irradiance_Prediction/models/Random_Forest")

In [9]:
evaluator = RegressionEvaluator(labelCol='sum(GHI)')
train_preds = rf1.transform(train)
print("Train RMSE: ", evaluator.evaluate(train_preds))
evaluator = RegressionEvaluator(labelCol='sum(GHI)', metricName='r2')
print("Train r2: ", evaluator.evaluate(train_preds))

Train RMSE:  1312.3685013328789
Train r2:  0.9097137847533137


In [10]:
evaluator = RegressionEvaluator(labelCol='sum(GHI)')
test_preds = rf1.transform(test)
print("Test RMSE: ", evaluator.evaluate(test_preds))
evaluator = RegressionEvaluator(labelCol='sum(GHI)', metricName='r2')
print("Test r2: ", evaluator.evaluate(test_preds))

Test RMSE:  1359.0105543262655
Test r2:  0.9043913763361462


In [13]:
###Gradient Boosted Tree Regressor
cols = df.columns
cols.remove('sum(GHI)')

#vectorize features
assembler = VectorAssembler(inputCols=cols, outputCol='features')
data = assembler.transform(df)

#train/test split
train, test = data.randomSplit([.8, .2])

##Configuring ML pipeline (might use format later)
gbf = GBTRegressor(featuresCol='features', labelCol='sum(GHI)')
pipeline = Pipeline(stages=[gbf])


paramGrid = (ParamGridBuilder()
                 .addGrid(gbf.maxDepth, [5, 10, 15])
                 .build())

crossval = CrossValidator(estimator=pipeline,
                         estimatorParamMaps=paramGrid,
                         evaluator=RegressionEvaluator(labelCol='sum(GHI)'),
                         numFolds=3)

gbfmodel = crossval.fit(train)

In [14]:
gbf1 = gbfmodel.bestModel.stages[0] 
gbf1.save("/home/dyllanjr/Solar_Irradiance_Prediction/models/Gradient_Boosted")

In [15]:
evaluator = RegressionEvaluator(labelCol='sum(GHI)')
train_preds = gbf1.transform(train)
print("Train RMSE: ", evaluator.evaluate(train_preds))
evaluator = RegressionEvaluator(labelCol='sum(GHI)', metricName='r2')
print("Train r2: ", evaluator.evaluate(train_preds))

Train RMSE:  1042.9539560552705
Train r2:  0.9430340956812803


In [16]:
evaluator = RegressionEvaluator(labelCol='sum(GHI)')
test_preds = gbf1.transform(test)
print("Test RMSE: ", evaluator.evaluate(test_preds))
evaluator = RegressionEvaluator(labelCol='sum(GHI)', metricName='r2')
print("Test r2: ", evaluator.evaluate(test_preds))

Test RMSE:  1372.364667046965
Test r2:  0.9020236492288102
