In [None]:
import findspark

# Set up the environment so that pyspark can be imported in this kernel.
findspark.init()
# Last expression of the cell: its return value (the Spark location findspark
# resolved) is what the notebook displays as the cell output.
findspark.find()

In [None]:
from pyspark.sql import SparkSession
import pyspark
from pyspark.sql.functions import translate, col

# Reuse an already-running session if there is one; otherwise start a new
# session registered under this application name.
spark = (
    SparkSession.builder
    .appName("NV Energy Project")
    .getOrCreate()
)

In [7]:
#Import consumption data

from pyspark.sql.functions import translate, col

# Load the annual-consumption CSV: the first row is treated as the header and
# Spark infers the column types.
nv_ac_df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("CleanData/nv_annual_consumption60-21.csv")
)

# Remove every "," from TotalConsumption (presumably thousands separators --
# confirm against the CSV) and cast the remaining text to an integer column.
nv_ac_df2 = nv_ac_df.withColumn(
    "TotalConsumption",
    translate(col("TotalConsumption"), ",", "").cast("integer"),
)


In [8]:
#Convert data to millions scale for visualization

import pyspark
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

# Spark UDF (DoubleType result) that applies convertMill to a column value.
# The lambda wrapper is deliberate: it postpones the lookup of `convertMill`
# until after this line has run, which is why the UDF can be declared before
# the function definition that follows it.
convertUDF = udf(lambda z: convertMill(z), DoubleType())

def convertMill(reading):
    """Scale a raw reading down to millions.

    Args:
        reading: Numeric value in base units, or ``None``. When this function
            is called through a Spark UDF, a NULL cell arrives as ``None``.

    Returns:
        ``reading / 1000000`` as a float, or ``None`` when ``reading`` is
        ``None`` so that NULLs propagate instead of aborting the Spark job
        with ``TypeError``.
    """
    if reading is None:
        return None
    return reading / 1000000

# Add the consumption expressed in millions. The alias is applied to the UDF
# *result* so the column is named "Actual Consumption" directly. Aliasing the
# UDF argument instead leaves the result with a Spark-generated "<lambda>(...)"
# name, and withColumnRenamed silently does nothing if that generated name does
# not match exactly.
nv_ac_df2 = nv_ac_df2.select(
    "YEAR",
    "TotalConsumption",
    convertUDF(col("TotalConsumption")).alias("Actual Consumption"),
)

# Exclude the 1990 and 2010 rows (reason not stated in this notebook --
# TODO document why these years are dropped). The condition uses col("YEAR")
# so it refers to the DataFrame being filtered rather than to another
# DataFrame's column object.
nv_ac_df3 = nv_ac_df2.filter((col("YEAR") != 1990) & (col("YEAR") != 2010))


In [9]:
#Predictions from the consumption linear-regression (LR) model for visualization

from pyspark.sql.functions import col, udf

# Rebinds the name convertUDF (used in the previous cell for the millions
# conversion) to a UDF with a DoubleType result that applies predict().
# The lambda postpones the lookup of `predict`, which is defined right after
# this line.
convertUDF = udf(lambda z: predict(z), DoubleType())

def predict(x, slope=675250.9602239017, intercept=-1325137830.201505):
    """Evaluate the linear consumption model ``slope * x + intercept``.

    Args:
        x: Model input (the notebook passes the YEAR column), or ``None``.
            A NULL cell arrives as ``None`` when called through a Spark UDF.
        slope: Linear coefficient. The default is the value hard-coded in this
            notebook (presumably from the fitted LR model -- confirm the source).
        intercept: Constant term; the default is the notebook's hard-coded value.

    Returns:
        The predicted value as a float, or ``None`` when ``x`` is ``None`` so
        that NULLs propagate instead of raising ``TypeError`` inside the UDF.
    """
    if x is None:
        return None
    return x * slope + intercept

# Append the model prediction for each YEAR. The alias is applied to the UDF
# *result*, so the column is named "ModelPred" directly instead of depending
# on Spark's generated "<lambda>(...)" column name, which withColumnRenamed
# would silently ignore if it did not match exactly.
nv_ac_df3 = nv_ac_df3.select(
    "YEAR",
    "TotalConsumption",
    "Actual Consumption",
    convertUDF(col("YEAR")).alias("ModelPred"),
)

#Convert predictions to millions scale

# UDF with a DoubleType result that applies convertMill() to a column value;
# used below to express the model predictions in millions.
convertUDF2 = udf(lambda z: convertMill(z), DoubleType())

def convertMill(reading):
    """Scale a raw reading down to millions.

    NOTE: this repeats the convertMill definition from the earlier
    "Convert data to millions" cell; it is kept so this cell still runs on
    its own.

    Args:
        reading: Numeric value in base units, or ``None``. When this function
            is called through a Spark UDF, a NULL cell arrives as ``None``.

    Returns:
        ``reading / 1000000`` as a float, or ``None`` when ``reading`` is
        ``None`` so that NULLs propagate instead of aborting the Spark job
        with ``TypeError``.
    """
    if reading is None:
        return None
    return reading / 1000000

# Add the prediction expressed in millions and sort chronologically. The alias
# is applied to the UDF *result*, so the column is named "Model Predictions"
# directly instead of depending on Spark's generated "<lambda>(...)" column
# name, which withColumnRenamed would silently ignore if it did not match.
nv_ac_df4 = nv_ac_df3.select(
    "YEAR",
    "TotalConsumption",
    "Actual Consumption",
    "ModelPred",
    convertUDF2(col("ModelPred")).alias("Model Predictions"),
).orderBy("YEAR")


In [10]:
#Create pandas DataFrame for visualization

def createNVCFrame():
    """Return the notebook-level Spark DataFrame ``nv_ac_df4`` as a pandas DataFrame.

    Takes no arguments; it reads the global ``nv_ac_df4`` and hands back the
    result of calling ``toPandas()`` on it.
    """
    return nv_ac_df4.toPandas()