# Start Server and import modules

In [122]:
# REFERENCE: https://www.kaggle.com/code/sercanyesiloz/pyspark-tutorial/notebook
import os
import warnings
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, FloatType
from pyspark.sql.functions import split, count, when, isnan, col, regexp_replace

# importing the necessary modules

import sys
import pyspark
import random
import pandas as pd
import findspark
findspark.init()

from pyspark.ml.regression import LinearRegression
from pyspark.ml.stat import Correlation
import pyspark.sql.functions as F

# Make PySpark use the interpreter that runs this kernel, and keep the driver
# option string set to "notebook".
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON_OPTS': "notebook",
})

In [123]:
!pip3 install kaggle
import kaggle
import zipfile
from os.path import exists

# Authenticate
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()

# check for file and if it does not exist, download and extract dataset
if os.path.exists('./darksky') is False:
    api.dataset_download_files('jeanmidev/smart-meters-in-london', path = './')
    with zipfile.ZipFile('./smart-meters-in-london.zip', 'r') as zipref:
        zipref.extractall('./darksky')

# list files
!ls -R ./darksky

acorn_details.csv                     informations_households.csv
[34mdaily_dataset[m[m                         uk_bank_holidays.csv
darksky_parameters_documentation.html weather_daily_darksky.csv
[34mhalfhourly_dataset[m[m                    weather_hourly_darksky.csv

./darksky/daily_dataset:
[34mdaily_dataset[m[m

./darksky/daily_dataset/daily_dataset:
block_105.csv block_111.csv block_18.csv  block_24.csv  block_30.csv
block_106.csv block_12.csv  block_19.csv  block_25.csv  block_31.csv
block_107.csv block_13.csv  block_2.csv   block_26.csv  block_32.csv
block_108.csv block_14.csv  block_20.csv  block_27.csv  block_33.csv
block_109.csv block_15.csv  block_21.csv  block_28.csv  block_34.csv
block_11.csv  block_16.csv  block_22.csv  block_29.csv
block_110.csv block_17.csv  block_23.csv  block_3.csv

./darksky/halfhourly_dataset:
[34mhalfhourly_dataset[m[m

./darksky/halfhourly_dataset/halfhourly_dataset:
block_62.csv block_69.csv block_75.csv block_81.csv block_88.csv blo

In [124]:
# importing the necessary modules

import os

import findspark
findspark.init()

# Initialize a SparkSession
from pyspark.sql import SparkSession

# Path of the SQL Server JDBC driver jar handed to Spark. The default is the
# original machine-specific location; set the MSSQL_JDBC_JAR environment
# variable to point at the jar on machines with a different layout.
mssql_jdbc_jar = os.environ.get(
    "MSSQL_JDBC_JAR",
    "/Library/Frameworks/Python.framework/Versions/3.11/bin/sqljdbc_12.2/enu/mssql-jdbc-12.2.0.jre8.jar",
)

# Creating SparkSession object (getOrCreate returns the existing session if one is already running)
spark = (
    SparkSession.builder
    .appName('TeamDarkSky')
    .config("spark.jars", mssql_jdbc_jar)
    .getOrCreate()
)

# Calling the session variable object so the notebook displays it

spark

# Import Datasets

In [125]:
# Read the CSV files in the daily_dataset folder into one Spark DataFrame.
# The first line of each file is treated as the header and column types are inferred.
dailyEnergyDf = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('./darksky/daily_dataset/daily_dataset')
)

# Print the inferred column names and types.
dailyEnergyDf.printSchema()



[Stage 433:>                                                        (0 + 8) / 8]

root
 |-- LCLid: string (nullable = true)
 |-- day: timestamp (nullable = true)
 |-- energy_median: double (nullable = true)
 |-- energy_mean: double (nullable = true)
 |-- energy_max: double (nullable = true)
 |-- energy_count: integer (nullable = true)
 |-- energy_std: double (nullable = true)
 |-- energy_sum: double (nullable = true)
 |-- energy_min: double (nullable = true)



                                                                                

In [126]:
# Preview the first rows of the daily energy data.
dailyEnergyDf.show()

+---------+-------------------+-------------------+-------------------+-------------------+------------+--------------------+------------------+--------------------+
|    LCLid|                day|      energy_median|        energy_mean|         energy_max|energy_count|          energy_std|        energy_sum|          energy_min|
+---------+-------------------+-------------------+-------------------+-------------------+------------+--------------------+------------------+--------------------+
|MAC000048|2011-12-08 00:00:00|              0.107|0.15921739130434787| 0.5760000000000001|          23| 0.11802116779387856| 3.662000000000001|               0.087|
|MAC000048|2011-12-09 00:00:00|              0.092|            0.12575|               0.57|          48| 0.08927664391617872|             6.036|               0.087|
|MAC000048|2011-12-10 00:00:00|             0.0925|0.20964583541666668| 1.3219999999999998|          48|  0.2882244672915173|        10.0630001|               0.087|
|MAC

In [127]:
# Read the hourly weather CSV into a Spark DataFrame, using the header row for
# column names and letting Spark infer the column types.
weatherHourlyDf = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('./darksky/weather_hourly_darksky.csv')
)

# Print the inferred column names and types.
weatherHourlyDf.printSchema()

root
 |-- visibility: double (nullable = true)
 |-- windBearing: integer (nullable = true)
 |-- temperature: double (nullable = true)
 |-- time: timestamp (nullable = true)
 |-- dewPoint: double (nullable = true)
 |-- pressure: double (nullable = true)
 |-- apparentTemperature: double (nullable = true)
 |-- windSpeed: double (nullable = true)
 |-- precipType: string (nullable = true)
 |-- icon: string (nullable = true)
 |-- humidity: double (nullable = true)
 |-- summary: string (nullable = true)



In [128]:
# Preview the first rows of the hourly weather data.
weatherHourlyDf.show()

+----------+-----------+-----------+-------------------+--------+--------+-------------------+---------+----------+-------------------+--------+-------------+
|visibility|windBearing|temperature|               time|dewPoint|pressure|apparentTemperature|windSpeed|precipType|               icon|humidity|      summary|
+----------+-----------+-----------+-------------------+--------+--------+-------------------+---------+----------+-------------------+--------+-------------+
|      5.97|        104|      10.24|2011-11-11 00:00:00|    8.86| 1016.76|              10.24|     2.77|      rain|partly-cloudy-night|    0.91|Partly Cloudy|
|      4.88|         99|       9.76|2011-11-11 01:00:00|    8.83| 1016.63|               8.24|     2.95|      rain|partly-cloudy-night|    0.94|Partly Cloudy|
|       3.7|         98|       9.46|2011-11-11 02:00:00|    8.79| 1016.36|               7.76|     3.17|      rain|partly-cloudy-night|    0.96|Partly Cloudy|
|      3.12|         99|       9.23|2011-11-11

In [129]:
# Read the UK bank-holiday calendar (header row supplies the column names,
# types are inferred).
holidaysDf = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('./darksky/uk_bank_holidays.csv')
)

# Print up to 1000 rows instead of the default preview length.
# NOTE(review): several displayed dates look day/month swapped (e.g. Easter
# Monday is listed as 2012-09-04) — verify the parsed dates before relying on
# them to filter out holidays.
holidaysDf.show(1000)

+-------------------+--------------------+
|      Bank holidays|                Type|
+-------------------+--------------------+
|2012-12-26 00:00:00|          Boxing Day|
|2012-12-25 00:00:00|       Christmas Day|
|2012-08-27 00:00:00| Summer bank holiday|
|2012-05-06 00:00:00|Queen?s Diamond J...|
|2012-04-06 00:00:00|Spring bank holid...|
|2012-07-05 00:00:00|Early May bank ho...|
|2012-09-04 00:00:00|       Easter Monday|
|2012-06-04 00:00:00|         Good Friday|
|2012-01-02 00:00:00|New Year?s Day (s...|
|2013-12-26 00:00:00|          Boxing Day|
|2013-12-25 00:00:00|       Christmas Day|
|2013-08-26 00:00:00| Summer bank holiday|
|2013-05-27 00:00:00| Spring bank holiday|
|2013-06-05 00:00:00|Early May bank ho...|
|2013-01-04 00:00:00|       Easter Monday|
|2013-03-29 00:00:00|         Good Friday|
|2013-01-01 00:00:00|      New Year?s Day|
|2014-12-26 00:00:00|          Boxing Day|
|2014-12-25 00:00:00|       Christmas Day|
|2014-08-25 00:00:00| Summer bank holiday|
|2014-05-26

X acorn_details.csv                       X informations_households.csv
X daily_dataset                           X uk_bank_holidays.csv
/ darksky_parameters_documentation.html   X weather_daily_darksky.csv
X halfhourly_dataset                      X weather_hourly_darksky.csv
X hhblock_dataset

In [130]:
# cache datasets
# Ask Spark to cache each of the three source DataFrames.
dailyEnergyDf.cache()

holidaysDf.cache()

# Last expression of the cell: its return value (the DataFrame itself) is what
# the notebook prints as the cell result.
weatherHourlyDf.cache()

23/06/11 02:39:42 WARN CacheManager: Asked to cache already cached data.
23/06/11 02:39:42 WARN CacheManager: Asked to cache already cached data.
23/06/11 02:39:42 WARN CacheManager: Asked to cache already cached data.


DataFrame[visibility: double, windBearing: int, temperature: double, time: timestamp, dewPoint: double, pressure: double, apparentTemperature: double, windSpeed: double, precipType: string, icon: string, humidity: double, summary: string]

# Pre-process data to get ready for aggregation, filters, and joins

In [131]:
from pyspark.sql.functions import to_date

# Give every dataset a "Date" column derived from its own timestamp column,
# so the three datasets can later be matched on the same key.
dailyEnergyDf = dailyEnergyDf.withColumn("Date", to_date("day"))
weatherHourlyDf = weatherHourlyDf.withColumn("Date", to_date("time"))
holidaysDf = holidaysDf.withColumn("Date", to_date("Bank holidays"))


In [132]:
# Holiday calendar reduced to its Date column; used as the exclusion list below.
datesDf = holidaysDf.select("Date")

# A "leftanti" join keeps only the left-side rows that have no matching Date in
# datesDf, i.e. it removes the bank-holiday dates from each dataset.
filtereddailyEnergyDf = dailyEnergyDf.join(
    datesDf,
    on=dailyEnergyDf.Date == datesDf.Date,
    how="leftanti",
)

filteredweatherHourlyDf = weatherHourlyDf.join(
    datesDf,
    on=weatherHourlyDf.Date == datesDf.Date,
    how="leftanti",
)


In [133]:
# Preview the energy rows that remain after the bank-holiday dates were removed.
filtereddailyEnergyDf.show()

+---------+-------------------+-------------------+-------------------+-------------------+------------+--------------------+------------------+--------------------+----------+
|    LCLid|                day|      energy_median|        energy_mean|         energy_max|energy_count|          energy_std|        energy_sum|          energy_min|      Date|
+---------+-------------------+-------------------+-------------------+-------------------+------------+--------------------+------------------+--------------------+----------+
|MAC000048|2011-12-08 00:00:00|              0.107|0.15921739130434787| 0.5760000000000001|          23| 0.11802116779387856| 3.662000000000001|               0.087|2011-12-08|
|MAC000048|2011-12-09 00:00:00|              0.092|            0.12575|               0.57|          48| 0.08927664391617872|             6.036|               0.087|2011-12-09|
|MAC000048|2011-12-10 00:00:00|             0.0925|0.20964583541666668| 1.3219999999999998|          48|  0.2882244

In [134]:
# Keep only the hourly weather observations flagged as night-time.
from pyspark.sql.functions import avg  # used by the aggregation that follows

# A row counts as night-time when its "icon" text contains "night".
# NOTE(review): presumably some icon values carry no day/night marker, in which
# case those hours are dropped even when they fall at night — confirm this is acceptable.
filteredweatherHourlyDf = filteredweatherHourlyDf.where(
    col("icon").contains("night")
)

filteredweatherHourlyDf.show()

+----------+-----------+-----------+-------------------+--------+--------+-------------------+---------+----------+-------------------+--------+-------------+----------+
|visibility|windBearing|temperature|               time|dewPoint|pressure|apparentTemperature|windSpeed|precipType|               icon|humidity|      summary|      Date|
+----------+-----------+-----------+-------------------+--------+--------+-------------------+---------+----------+-------------------+--------+-------------+----------+
|      5.97|        104|      10.24|2011-11-11 00:00:00|    8.86| 1016.76|              10.24|     2.77|      rain|partly-cloudy-night|    0.91|Partly Cloudy|2011-11-11|
|      4.88|         99|       9.76|2011-11-11 01:00:00|    8.83| 1016.63|               8.24|     2.95|      rain|partly-cloudy-night|    0.94|Partly Cloudy|2011-11-11|
|       3.7|         98|       9.46|2011-11-11 02:00:00|    8.79| 1016.36|               7.76|     3.17|      rain|partly-cloudy-night|    0.96|Partly

In [135]:
# Collapse the night-time hourly weather rows to one row per Date by averaging
# each continuous column. Only the visibility average gets an explicit name;
# the others keep Spark's default "avg(<column>)" names.
aggWeatherHourlyDf = filteredweatherHourlyDf.groupBy("Date").agg(
    avg("visibility").alias("DarkSky (visibility)"),
    *[
        avg(column_name)
        for column_name in (
            "windBearing",
            "temperature",
            "dewPoint",
            "pressure",
            "apparentTemperature",
            "windSpeed",
            "humidity",
        )
    ]
)

aggWeatherHourlyDf.show()

+----------+--------------------+------------------+--------------------+-------------------+------------------+------------------------+------------------+------------------+
|      Date|DarkSky (visibility)|  avg(windBearing)|    avg(temperature)|      avg(dewPoint)|     avg(pressure)|avg(apparentTemperature)|    avg(windSpeed)|     avg(humidity)|
+----------+--------------------+------------------+--------------------+-------------------+------------------+------------------------+------------------+------------------+
|2012-04-17|  13.532999999999998|             192.4|               6.009| 1.3800000000000001|1003.9019999999998|                     2.8|             4.603|             0.724|
|2012-10-06|   9.793076923076924|112.61538461538461|   8.657692307692308|  7.343846153846154|1012.3853846153847|       7.777692307692307|1.7769230769230768|0.9161538461538462|
|2013-03-26|                13.1| 38.18181818181818|  1.1945454545454548| -4.476363636363636|1013.1272727272728|     -2.

In [136]:
# Combine the per-date weather averages with the daily energy rows of the same
# date, keeping only the numeric weather and energy columns.
# NOTE(review): the join key is Date alone, so one weather row is repeated for
# every energy row (household) sharing that date — confirm this duplication is
# intended before using the result for modelling.
weatherJoinEnergy = (
    aggWeatherHourlyDf
    .join(
        filtereddailyEnergyDf,
        on=aggWeatherHourlyDf.Date == filtereddailyEnergyDf.Date,
        how="inner",
    )
    .select(
        "DarkSky (visibility)",
        "avg(windBearing)",
        "avg(temperature)",
        "avg(dewPoint)",
        "avg(pressure)",
        "avg(apparentTemperature)",
        "avg(windSpeed)",
        "avg(humidity)",
        "energy_median",
        "energy_mean",
        "energy_max",
        "energy_std",
        "energy_sum",
        "energy_min",
    )
)

# Discard every row that has a null in any of the selected columns.
weatherJoinEnergy = weatherJoinEnergy.na.drop()

weatherJoinEnergy.show()

+--------------------+------------------+------------------+-------------------+------------------+------------------------+------------------+------------------+-------------------+-------------------+-------------------+--------------------+------------------+--------------------+
|DarkSky (visibility)|  avg(windBearing)|  avg(temperature)|      avg(dewPoint)|     avg(pressure)|avg(apparentTemperature)|    avg(windSpeed)|     avg(humidity)|      energy_median|        energy_mean|         energy_max|          energy_std|        energy_sum|          energy_min|
+--------------------+------------------+------------------+-------------------+------------------+------------------------+------------------+------------------+-------------------+-------------------+-------------------+--------------------+------------------+--------------------+
|  13.435000000000002|             227.0|           5.33625|             2.4925|1018.7537500000001|                  2.3175|           3.97125|0.820

# Model all of the energy and weather variables together

In [137]:
from pyspark.ml.feature import VectorAssembler

# Pack the seven averaged weather columns and the six energy statistics into a
# single vector column called "Independent Features" (numeric columns only).
featureassembler = VectorAssembler(
    outputCol="Independent Features",
    inputCols=[
        "avg(windBearing)",
        "avg(temperature)",
        "avg(dewPoint)",
        "avg(pressure)",
        "avg(apparentTemperature)",
        "avg(windSpeed)",
        "avg(humidity)",
        "energy_median",
        "energy_mean",
        "energy_max",
        "energy_std",
        "energy_sum",
        "energy_min",
    ],
)

In [138]:
# transforming
# Append the assembled "Independent Features" vector column to the joined table.
output = featureassembler.transform(weatherJoinEnergy)

output.show()

+--------------------+------------------+------------------+-------------------+------------------+------------------------+------------------+------------------+-------------------+-------------------+-------------------+--------------------+------------------+--------------------+--------------------+
|DarkSky (visibility)|  avg(windBearing)|  avg(temperature)|      avg(dewPoint)|     avg(pressure)|avg(apparentTemperature)|    avg(windSpeed)|     avg(humidity)|      energy_median|        energy_mean|         energy_max|          energy_std|        energy_sum|          energy_min|Independent Features|
+--------------------+------------------+------------------+-------------------+------------------+------------------------+------------------+------------------+-------------------+-------------------+-------------------+--------------------+------------------+--------------------+--------------------+
|  13.435000000000002|             227.0|           5.33625|             2.4925|1018.

In [139]:
# Select the assembled feature vector and the dependent variable (label),
# which here is the averaged night-time visibility.
finalized_data = output.select("Independent Features", "DarkSky (visibility)")

finalized_data.show()

+--------------------+--------------------+
|Independent Features|DarkSky (visibility)|
+--------------------+--------------------+
|[227.0,5.33625,2....|  13.435000000000002|
|[256.5,4.68812499...|  12.920000000000002|
|[240.352941176470...|  12.968823529411766|
|[204.117647058823...|   11.94764705882353|
|[235.615384615384...|  12.796923076923076|
|[231.555555555555...|  13.064444444444442|
|[217.470588235294...|  13.297058823529412|
|[192.5,5.50857142...|  12.532857142857141|
|[213.352941176470...|  11.042941176470588|
|[289.470588235294...|   11.22235294117647|
|[295.235294117647...|  13.042352941176471|
|[242.647058823529...|  10.803529411764705|
|[265.058823529411...|  13.011176470588234|
|[247.117647058823...|   9.331764705882353|
|[253.058823529411...|   13.05764705882353|
|[261.375,8.848125...|  12.755625000000002|
|[256.6875,5.67687...|             13.4225|
|[204.7,9.3,5.5319...|  13.366999999999999|
|[232.176470588235...|                12.6|
|[263.705882352941...|  13.08176

In [140]:
# Splitting data into train and test
# Random split with weights 0.7 / 0.3; the fixed seed keeps the split repeatable.
train_data, test_data = finalized_data.randomSplit([0.7, 0.3], seed=42)


In [141]:
# Preview the first rows of the training split.
train_data.show()

+--------------------+--------------------+
|Independent Features|DarkSky (visibility)|
+--------------------+--------------------+
|(13,[0,1,2,3,4,5,...|                3.67|
|(13,[0,1,2,3,4,5,...|             13.1525|
|(13,[0,1,2,3,4,5,...|  10.231666666666667|
|(13,[0,1,2,3,4,5,...|  12.449166666666665|
|(13,[0,1,2,3,4,5,...|  12.449166666666665|
|(13,[0,1,2,3,4,5,...|   4.464999999999999|
|(13,[0,1,2,3,4,5,...|  12.233333333333334|
|(13,[0,1,2,3,4,5,...|   8.384615384615385|
|(13,[0,1,2,3,4,5,...|   8.384615384615385|
|(13,[0,1,2,3,4,5,...|  13.432142857142859|
|(13,[0,1,2,3,4,5,...|            12.50375|
|(13,[0,1,2,3,4,5,...|            13.27125|
|(13,[0,1,2,3,4,5,...|              12.528|
|(13,[0,1,2,3,4,5,...|               12.81|
|(13,[0,1,2,3,4,5,...|  11.827000000000002|
|(13,[0,1,2,3,4,5,...|  13.272499999999999|
|(13,[0,1,2,3,4,5,...|  12.891428571428571|
|(13,[0,1,2,3,4,5,...|   5.974285714285714|
|(13,[0,1,2,3,4,5,...|             14.0025|
|(13,[0,1,2,3,4,5,...|  13.05272

                                                                                

In [142]:
# Linear regression configured to read the assembled vector column as features
# and the averaged visibility column as the label.
regressor = LinearRegression(
    labelCol="DarkSky (visibility)",
    featuresCol="Independent Features",
)

In [143]:
# training our model
# Fit the regression on the training split only.
trained_model = regressor.fit(train_data)

23/06/11 02:39:45 WARN Instrumentation: [610e9f75] regParam is zero, which might cause numerical instability and overfitting.


                                                                                

In [144]:
# evaluating the trained model with train data
# The returned summary object exposes the fit metrics (r2 is read from it below).
train_results = trained_model.evaluate(train_data)

                                                                                

In [145]:
# Calculating Rsquared and printing value
# Note: R-squared is the share of label variance explained by the model; the
# percentage printed as "accuracy" below is R-squared * 100, not a hit rate.
print("The value of Rsquared is:", train_results.r2)
print("The model accuracy is {0:.0f}% with train data".format(train_results.r2*100))

The value of Rsquared is: 0.4925177622457696
The model accuracy is 49% with train data


In [146]:
# evaluate the trained model on test data
# Same evaluation as for the training split, applied to the held-out rows.
test_results = trained_model.evaluate(test_data)

                                                                                

In [147]:
# Calculating Rsquared and printing value
# Note: R-squared is the share of label variance explained by the model; the
# percentage printed as "accuracy" below is R-squared * 100, not a hit rate.
print("The value of Rsquared is:", test_results.r2)
print("The model accuracy is {0:.0f}% with test data".format(test_results.r2*100))

The value of Rsquared is: 0.49211412077897465
The model accuracy is 49% with test data


In [148]:
# Predicting the amount of visibility
# transform() returns the test rows with an added "prediction" column.
predictions = trained_model.transform(test_data)

In [149]:
# display the predicted visibility next to the actual visibility
# (long cell values such as the feature vectors are shortened in the printout)
predictions.show(truncate=True)

+--------------------+--------------------+------------------+
|Independent Features|DarkSky (visibility)|        prediction|
+--------------------+--------------------+------------------+
|(13,[0,1,2,3,4,5,...|  13.264545454545452|11.566463168282468|
|(13,[0,1,2,3,4,5,...|               8.216| 8.094285488675396|
|(13,[0,1,2,3,4,5,...|  13.925714285714283|11.530212828341533|
|(13,[0,1,2,3,4,5,...|             13.7625|11.624231652424129|
|(13,[0,1,2,3,4,5,...|   7.235714285714287|10.433614867174782|
|(13,[0,1,2,3,4,5,...|  10.509333333333334| 9.787300089813101|
|(13,[0,1,2,3,4,5,...|  13.432142857142859|12.978985375917972|
|(13,[0,1,2,3,4,5,...|    8.58230769230769| 7.292479367632467|
|(13,[0,1,2,3,4,5,...|  12.452857142857143|11.117368230779846|
|(13,[0,1,2,3,4,5,...|  11.827000000000002|12.447852539525257|
|(13,[0,1,2,3,4,5,...|  11.827000000000002|12.447852539525257|
|(13,[0,1,2,3,4,5,...|  12.783846153846154|11.058616136870157|
|(13,[0,1,2,3,4,5,...|  12.783846153846154|11.058616136

                                                                                

In [150]:
from pyspark.ml.feature import VectorAssembler

# Second feature set: only four of the averaged weather columns (temperature,
# dew point, pressure, humidity) are packed into "Independent Features".
weatherfeatureassembler = VectorAssembler(
    outputCol="Independent Features",
    inputCols=[
        "avg(temperature)",
        "avg(dewPoint)",
        "avg(pressure)",
        "avg(humidity)",
    ],
)

In [151]:
# transforming
# Append the four-column weather feature vector to the joined table
# (this rebinds `output`, replacing the earlier assembled frame).
output = weatherfeatureassembler.transform(weatherJoinEnergy)

output.show()

+--------------------+------------------+------------------+-------------------+------------------+------------------------+------------------+------------------+-------------------+-------------------+-------------------+--------------------+------------------+--------------------+--------------------+
|DarkSky (visibility)|  avg(windBearing)|  avg(temperature)|      avg(dewPoint)|     avg(pressure)|avg(apparentTemperature)|    avg(windSpeed)|     avg(humidity)|      energy_median|        energy_mean|         energy_max|          energy_std|        energy_sum|          energy_min|Independent Features|
+--------------------+------------------+------------------+-------------------+------------------+------------------------+------------------+------------------+-------------------+-------------------+-------------------+--------------------+------------------+--------------------+--------------------+
|  13.435000000000002|             227.0|           5.33625|             2.4925|1018.

In [152]:
# Select the assembled feature vector and the dependent variable (label),
# which here is the averaged night-time visibility.
finalized_data = output.select("Independent Features", "DarkSky (visibility)")

finalized_data.show()

+--------------------+--------------------+
|Independent Features|DarkSky (visibility)|
+--------------------+--------------------+
|[5.33625,2.4925,1...|  13.435000000000002|
|[4.68812499999999...|  12.920000000000002|
|[1.98000000000000...|  12.968823529411766|
|[5.73058823529411...|   11.94764705882353|
|[5.65846153846153...|  12.796923076923076|
|[5.84222222222222...|  13.064444444444442|
|[4.95294117647058...|  13.297058823529412|
|[5.50857142857143...|  12.532857142857141|
|[3.59117647058823...|  11.042941176470588|
|[2.76235294117647...|   11.22235294117647|
|[1.50999999999999...|  13.042352941176471|
|[2.93705882352941...|  10.803529411764705|
|[6.37411764705882...|  13.011176470588234|
|[9.12588235294117...|   9.331764705882353|
|[9.77705882352941...|   13.05764705882353|
|[8.848125,6.23937...|  12.755625000000002|
|[5.67687500000000...|             13.4225|
|[9.3,5.5319999999...|  13.366999999999999|
|[8.11176470588235...|                12.6|
|[6.84411764705882...|  13.08176

In [153]:
# Splitting data into train and test
# Random split with weights 0.7 / 0.3; the fixed seed keeps the split repeatable.
train_data, test_data = finalized_data.randomSplit([0.7, 0.3], seed=42)

In [154]:
# Linear regression configured to read the assembled vector column as features
# and the averaged visibility column as the label.
regressor = LinearRegression(
    labelCol="DarkSky (visibility)",
    featuresCol="Independent Features",
)

In [155]:
# training our model
# Fit the regression on the training split only (rebinds `trained_model`).
trained_model = regressor.fit(train_data)

23/06/11 02:39:56 WARN Instrumentation: [9e13b480] regParam is zero, which might cause numerical instability and overfitting.


                                                                                

In [156]:
# evaluating the trained model with train data
train_results = trained_model.evaluate(train_data)

# Calculating Rsquared and printing value
# Note: R-squared is the share of label variance explained by the model; the
# percentage printed as "accuracy" below is R-squared * 100, not a hit rate.
print("The value of Rsquared is:", train_results.r2)
print("The model accuracy is {0:.0f}% with train data".format(train_results.r2*100))

[Stage 548:>                                                        (0 + 8) / 8]

The value of Rsquared is: 0.38493383057940067
The model accuracy is 38% with train data


                                                                                

In [157]:
from pyspark.ml.feature import VectorAssembler

# Third feature set: only the six daily energy statistics are packed into
# "Independent Features".
energyfeatureassembler = VectorAssembler(
    outputCol="Independent Features",
    inputCols=[
        "energy_median",
        "energy_mean",
        "energy_max",
        "energy_std",
        "energy_sum",
        "energy_min",
    ],
)

In [158]:
# transforming
# Append the energy-only feature vector to the joined table
# (this rebinds `output`, replacing the earlier assembled frame).
output = energyfeatureassembler.transform(weatherJoinEnergy)

output.show()

+--------------------+------------------+------------------+-------------------+------------------+------------------------+------------------+------------------+-------------------+-------------------+-------------------+--------------------+------------------+--------------------+--------------------+
|DarkSky (visibility)|  avg(windBearing)|  avg(temperature)|      avg(dewPoint)|     avg(pressure)|avg(apparentTemperature)|    avg(windSpeed)|     avg(humidity)|      energy_median|        energy_mean|         energy_max|          energy_std|        energy_sum|          energy_min|Independent Features|
+--------------------+------------------+------------------+-------------------+------------------+------------------------+------------------+------------------+-------------------+-------------------+-------------------+--------------------+------------------+--------------------+--------------------+
|  13.435000000000002|             227.0|           5.33625|             2.4925|1018.

In [159]:
# Select the assembled feature vector and the dependent variable (label),
# which here is the averaged night-time visibility.
finalized_data = output.select("Independent Features", "DarkSky (visibility)")

finalized_data.show()

+--------------------+--------------------+
|Independent Features|DarkSky (visibility)|
+--------------------+--------------------+
|[0.107,0.15921739...|  13.435000000000002|
|[0.092,0.12575,0....|  12.920000000000002|
|[0.0925,0.2096458...|  12.968823529411766|
|[0.11599999999999...|   11.94764705882353|
|[0.095,0.12318750...|  12.796923076923076|
|[0.08900000000000...|  13.064444444444442|
|[0.08900000000000...|  13.297058823529412|
|[0.1005,0.1716249...|  12.532857142857141|
|[0.532,0.61254166...|  11.042941176470588|
|[0.098,0.2043125,...|   11.22235294117647|
|[0.36649999999999...|  13.042352941176471|
|[0.55149999999999...|  10.803529411764705|
|[0.1335,0.1788541...|  13.011176470588234|
|[0.08249999999999...|   9.331764705882353|
|[0.117,0.1611875,...|   13.05764705882353|
|[0.105,0.22202083...|  12.755625000000002|
|[0.1225,0.1929375...|             13.4225|
|[0.113,0.11768750...|  13.366999999999999|
|[0.11149999999999...|                12.6|
|[0.111,0.11622916...|  13.08176

In [160]:
# Splitting data into train and test
# Random split with weights 0.7 / 0.3; the fixed seed keeps the split repeatable.
train_data, test_data = finalized_data.randomSplit([0.7, 0.3], seed=42)

In [161]:
# Linear regression configured to read the assembled vector column as features
# and the averaged visibility column as the label.
regressor = LinearRegression(
    labelCol="DarkSky (visibility)",
    featuresCol="Independent Features",
)

In [162]:
# training our model
# Fit the regression on the training split only (rebinds `trained_model`).
trained_model = regressor.fit(train_data)

23/06/11 02:40:03 WARN Instrumentation: [609a6ade] regParam is zero, which might cause numerical instability and overfitting.


                                                                                

In [163]:
# evaluating the trained model with train data
train_results = trained_model.evaluate(train_data)

# Calculating Rsquared and printing value
# Note: R-squared is the share of label variance explained by the model; the
# percentage printed as "accuracy" below is R-squared * 100, not a hit rate.
print("The value of Rsquared is:", train_results.r2)
print("The model accuracy is {0:.0f}% with train data".format(train_results.r2*100))

[Stage 586:>                                                        (0 + 8) / 8]

The value of Rsquared is: 0.003394646251341449
The model accuracy is 0% with train data


                                                                                

In [167]:
from pyspark.ml.feature import VectorAssembler

# Fourth feature set: the seven averaged weather columns (visibility excluded)
# are packed into "Independent Features". This rebinds `featureassembler`.
featureassembler = VectorAssembler(
    outputCol="Independent Features",
    inputCols=[
        "avg(windBearing)",
        "avg(temperature)",
        "avg(dewPoint)",
        "avg(pressure)",
        "avg(apparentTemperature)",
        "avg(windSpeed)",
        "avg(humidity)",
    ],
)

In [168]:
# transforming
# Append the assembled "Independent Features" vector column to the joined table
# (this rebinds `output`, replacing the earlier assembled frame).
output = featureassembler.transform(weatherJoinEnergy)

output.show()

+--------------------+------------------+------------------+-------------------+------------------+------------------------+------------------+------------------+-------------------+-------------------+-------------------+--------------------+------------------+--------------------+--------------------+
|DarkSky (visibility)|  avg(windBearing)|  avg(temperature)|      avg(dewPoint)|     avg(pressure)|avg(apparentTemperature)|    avg(windSpeed)|     avg(humidity)|      energy_median|        energy_mean|         energy_max|          energy_std|        energy_sum|          energy_min|Independent Features|
+--------------------+------------------+------------------+-------------------+------------------+------------------------+------------------+------------------+-------------------+-------------------+-------------------+--------------------+------------------+--------------------+--------------------+
|  13.435000000000002|             227.0|           5.33625|             2.4925|1018.

In [169]:
# Select the assembled feature vector and the dependent variable (label),
# which for this model is the daily energy_sum column.
finalized_data = output.select("Independent Features", "energy_sum")

finalized_data.show()

+--------------------+------------------+
|Independent Features|        energy_sum|
+--------------------+------------------+
|[227.0,5.33625,2....| 3.662000000000001|
|[256.5,4.68812499...|             6.036|
|[240.352941176470...|        10.0630001|
|[204.117647058823...|        11.7689999|
|[235.615384615384...| 5.913000000000001|
|[231.555555555555...| 4.551000000000002|
|[217.470588235294...| 4.812000000000003|
|[192.5,5.50857142...| 8.237999900000004|
|[213.352941176470...|29.401999900000003|
|[289.470588235294...|             9.807|
|[295.235294117647...|        22.9910001|
|[242.647058823529...|24.404999900000004|
|[265.058823529411...| 8.584999999999999|
|[247.117647058823...|6.0560000000000045|
|[253.058823529411...| 7.737000000000001|
|[261.375,8.848125...|10.656999900000002|
|[256.6875,5.67687...| 9.261000000000003|
|[204.7,9.3,5.5319...| 5.649000000000001|
|[232.176470588235...| 5.583000000000002|
|[263.705882352941...| 5.578999999999999|
+--------------------+------------

In [170]:
# Splitting data into train and test
# Random split with weights 0.7 / 0.3; the fixed seed keeps the split repeatable.
train_data, test_data = finalized_data.randomSplit([0.7, 0.3], seed=42)

In [171]:
# Linear regression configured to read the assembled vector column as features
# and the energy_sum column as the label.
regressor = LinearRegression(
    labelCol="energy_sum",
    featuresCol="Independent Features",
)

In [172]:
# training our model
# Fit the regression on the training split only (rebinds `trained_model`).
trained_model = regressor.fit(train_data)

23/06/11 02:47:59 WARN Instrumentation: [f91145c4] regParam is zero, which might cause numerical instability and overfitting.


                                                                                

In [173]:
# evaluating the trained model with train data
train_results = trained_model.evaluate(train_data)

# Calculating Rsquared and printing value
# Note: R-squared is the share of label variance explained by the model; the
# percentage printed as "accuracy" below is R-squared * 100, not a hit rate.
print("The value of Rsquared is:", train_results.r2)
print("The model accuracy is {0:.0f}% with train data".format(train_results.r2*100))

[Stage 636:>                                                        (0 + 8) / 8]

The value of Rsquared is: 0.03223064821031474
The model accuracy is 3% with train data


                                                                                