In [None]:
pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/45/b0/9d6860891ab14a39d4bddf80ba26ce51c2f9dc4805e5c6978ac0472c120a/pyspark-3.1.1.tar.gz (212.3MB)
[K     |████████████████████████████████| 212.3MB 70kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 52.1MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.1-py2.py3-none-any.whl size=212767604 sha256=7a7f10548f26b1392b3768752dbbe584378ee4151dd99d816a47a7a316fd30bb
  Stored in directory: /root/.cache/pip/wheels/0b/90/c0/01de724414ef122bd05f056541fb6a0ecf47c7ca655f8b3c0f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.1


Now that pyspark is installed, we need to:

*   Import the necessary libraries
*   Create our Spark session



In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql import *

# Create the session, or reuse the one already running in this kernel.
# getOrCreate() keeps this cell re-runnable: constructing a SparkContext
# directly fails on a second run because only one context may be active.
spark = SparkSession.builder.master('local[*]').getOrCreate()

# The context and its configuration are taken from the session so that all
# three names refer to the same running Spark application.
sc = spark.sparkContext
conf = sc.getConf()

Next, we create DataFrames for the energy and weather datasets.

In [None]:
# Options shared by both reads: the first row holds the column names, and
# inferSchema asks Spark to detect each column's datatype.
csv_options = dict(inferSchema=True, header=True)

# Energy dataset
energy_df = spark.read.csv('energy_dataset.csv', **csv_options)
energy_df.show(10)

# Weather dataset
weather_df = spark.read.csv('weather_features.csv', **csv_options)
weather_df.show(10)


+--------------------+------------------+------------------------------------+----------------------------------+---------------------+---------------------------+---------------------+---------------------------+----------------------+---------------------+------------------------------------------+-------------------------------------------+------------------------------------------+--------------------------------+-----------------+------------------+----------------+--------------------------+----------------+----------------+------------------------+-----------------------+------------------------+---------------------------------+-------------------------------+-------------------+-----------------+---------------+------------+
|                time|generation biomass|generation fossil brown coal/lignite|generation fossil coal-derived gas|generation fossil gas|generation fossil hard coal|generation fossil oil|generation fossil oil shale|generation fossil peat|generation geotherma

As we can see, the 'time' column of the energy dataframe and the 'dt_iso' column of the weather dataframe have similar entries. We want to join the energy and weather tables on these two columns so that we can look for correlations between the two datasets.

In [None]:
# Inner-join the weather rows to the energy rows wherever the weather
# 'dt_iso' value equals the energy 'time' value.
join_condition = weather_df['dt_iso'] == energy_df['time']
combined_df = weather_df.join(energy_df, on=join_condition, how='inner')

# Preview the first 10 joined rows
combined_df.show(10)

+--------------------+---------+------------------+------------------+------------------+--------+--------+----------+--------+-------+-------+-------+----------+----------+------------+-------------------+------------+--------------------+------------------+------------------------------------+----------------------------------+---------------------+---------------------------+---------------------+---------------------------+----------------------+---------------------+------------------------------------------+-------------------------------------------+------------------------------------------+--------------------------------+-----------------+------------------+----------------+--------------------------+----------------+----------------+------------------------+-----------------------+------------------------+---------------------------------+-------------------------------+-------------------+-----------------+---------------+------------+
|              dt_iso|city_name|      

We wanted to do some analysis using the K-nearest neighbors algorithm to try to make some predictions with our datasets. We experimented with a few different sets of analysis (feature) data to see which produced a better accuracy score. A better accuracy score could indicate a stronger relationship between the analysis data and the target data. This gives us a better idea of which weather features have more of an impact on different energy features. First, we wanted to see if different renewable energy generation data could be used to predict what the main weather description was for that day. For example, would high solar generation times correspond to days when the weather description was 'clear'?

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Columns used as model inputs, and the label we want to predict
feature_cols = ['generation solar', 'generation wind onshore', 'generation wind offshore']
target_col = 'weather_main'

# keep only the selected columns and drop rows that contain null values in them
knn_df = combined_df.select(*feature_cols, target_col)
knn_df_drop = knn_df.dropna()

# Collect ONCE so every feature row stays paired with its own label. Running
# two separate collect() actions re-executes the join twice, and Spark does
# not guarantee that both runs return the rows in the same order.
knn_rows = knn_df_drop.collect()

#create feature and target arrays (target is already 1-D, no np.ravel() needed)
knn_data = np.array([[row[col] for col in feature_cols] for row in knn_rows])
knn_target = np.array([row[target_col] for row in knn_rows])

#split into train and test sets (fixed random_state keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(knn_data, knn_target, test_size=0.2, random_state=42)

#create the model
knn = KNeighborsClassifier(n_neighbors = 7)
knn.fit(X_train, y_train)

#test the model
print(knn.predict(X_test))

#get the accuracy score for the model
print(knn.score(X_test, y_test))


['clear' 'clouds' 'clouds' ... 'clouds' 'clouds' 'clear']
0.4859794739498626


This model, which predicted the main weather description from onshore wind generation, offshore wind generation, and solar generation, had an accuracy score of 0.486.

Next we will run the same KNN model, but this time we will see if we are able to use energy prices to predict the temperature. This time we will look at just one city, since variations in average temperatures between cities would impact our data. We will also restrict the data to a short time window, since pricing fluctuates over time. For this example, we will use data from Madrid in November 2017 (the code below keeps rows whose 'dt_iso' starts with '2017-11').

In [None]:
from sklearn import preprocessing
from sklearn import utils

# Restrict to Madrid, November 2017 BEFORE narrowing the columns. Filtering on
# 'city_name' / 'dt_iso' after select() has dropped them only works if Spark
# implicitly re-adds the missing columns, which is fragile and hard to read.
madrid_nov_2017 = combined_df.filter(
    (combined_df['city_name'] == 'Madrid') & (combined_df['dt_iso'].like('2017-11%'))
)

# keep only the selected columns and drop rows that contain null values in them
knn_df = madrid_nov_2017.select('price actual', 'temp')
knn_df_drop = knn_df.dropna()

# Collect ONCE so every price stays paired with its own temperature. Two
# separate collect() actions re-run the join twice, and Spark does not
# guarantee that both runs return the rows in the same order.
knn_rows = knn_df_drop.collect()

#create feature and target arrays (single feature -> shape (n_rows, 1))
knn_data = np.array([[row['price actual']] for row in knn_rows])
knn_target = np.array([row['temp'] for row in knn_rows])

#convert target data to categorical. KNN classification does not allow float values in the target set
# NOTE(review): temperature is continuous, so every distinct value becomes its
# own class and accuracy only counts exact matches. A KNeighborsRegressor would
# likely fit this question better; the classifier is kept so the reported score
# stays comparable.
le = preprocessing.LabelEncoder()
knn_target = le.fit_transform(knn_target)

#split into train and test sets (fixed random_state keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(knn_data, knn_target, test_size=0.2, random_state=42)

#create the model
knn = KNeighborsClassifier(n_neighbors = 7)
knn.fit(X_train, y_train)

#test the model
print(knn.predict(X_test))

#get the accuracy score for the model
print(knn.score(X_test, y_test))

[  3  64  15  20   0  77  24  63  62  46  77   4  51  29  74 139 114 139
  24 236  42  18  58  15 222 117  24  18  29 222  85 236  57 126  15  24
  15 236 117 125  38 114  42 130  81  15  15  20  96 128  15  64  96 122
   6  10  29  68  63  15  19 126  43 139  80  29  62  19  42 103 126  20
  71  42  28  62  23  23  96  15 128 236  68  42  89  29 103  42  46  73
 126  15  29  15 126  96 176 133 236 243  29  46 122 105 120  71 114  70
  83  15   5 114  28  37  15 126  29  15  42  57  96   1  73 243 120  50
 126  18  15 126  77 139  18  73 146  15  78  35  62  42 139  80  15 103
  24  29]
0.02054794520547945


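This model had a significantly lower accuracy score of 0.0205. This means that energy price alone was not useful in predicting the temperature. Part of the reason is that temperature is a continuous value: after label encoding there are more than 200 distinct temperature classes (the predicted labels above range up to 243), and the accuracy score only counts exact matches. It could also be due to Madrid not having large temperature fluctuations.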

Next we will try looking at how strong the relationship is between energy features such as price, total load, and waste generation and the main weather description for the day. Again we will look at Madrid in November 2017.

In [None]:
# Columns used as model inputs, and the label we want to predict
feature_cols = ['price actual', 'generation waste', 'total load actual']
target_col = 'weather_main'

# Restrict to Madrid, November 2017 BEFORE narrowing the columns. Filtering on
# 'city_name' / 'dt_iso' after select() has dropped them only works if Spark
# implicitly re-adds the missing columns, which is fragile and hard to read.
madrid_nov_2017 = combined_df.filter(
    (combined_df['city_name'] == 'Madrid') & (combined_df['dt_iso'].like('2017-11%'))
)

# keep only the selected columns and drop rows that contain null values in them
knn_df = madrid_nov_2017.select(*feature_cols, target_col)
knn_df_drop = knn_df.dropna()

# Collect ONCE so every feature row stays paired with its own label. Two
# separate collect() actions re-run the join twice, and Spark does not
# guarantee that both runs return the rows in the same order.
knn_rows = knn_df_drop.collect()

#create feature and target arrays (target is already 1-D, no np.ravel() needed)
knn_data = np.array([[row[col] for col in feature_cols] for row in knn_rows])
knn_target = np.array([row[target_col] for row in knn_rows])

#split into train and test sets (fixed random_state keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(knn_data, knn_target, test_size=0.2, random_state=42)

#create the model
knn = KNeighborsClassifier(n_neighbors = 7)
knn.fit(X_train, y_train)

#test the model
print(knn.predict(X_test))

#get the accuracy score for the model
print(knn.score(X_test, y_test))

['clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear'
 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear'
 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear'
 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clouds' 'clear'
 'clear' 'clouds' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear'
 'clouds' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear'
 'clear' 'clouds' 'clear' 'clouds' 'clear' 'clouds' 'clear' 'clear'
 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear'
 'clear' 'clear' 'clouds' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear'
 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear'
 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clouds'
 'clear' 'clouds' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear'
 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear'
 'clear' 'clear' 'clear' 'clouds' 'clear' 'clear' 

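This model had an accuracy of .596. This suggests that there may be a relationship between the weather and features like energy price and use. Note, however, that most of the predictions above are simply 'clear', so the score should be compared against how often 'clear' occurs in the data before drawing a firm conclusion. It is likely that a location with more drastic weather changes - or a location like Texas that is only prepared for one end of the temperature spectrum - would see a stronger correlation between these variables.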

In [None]:
# Columns used as model inputs, and the label we want to predict
feature_cols = ['price actual', 'price day ahead']
target_col = 'weather_main'

# Restrict to Madrid, November 2017 BEFORE narrowing the columns. Filtering on
# 'city_name' / 'dt_iso' after select() has dropped them only works if Spark
# implicitly re-adds the missing columns, which is fragile and hard to read.
madrid_nov_2017 = combined_df.filter(
    (combined_df['city_name'] == 'Madrid') & (combined_df['dt_iso'].like('2017-11%'))
)

# keep only the selected columns and drop rows that contain null values in them
knn_df = madrid_nov_2017.select(*feature_cols, target_col)
knn_df_drop = knn_df.dropna()

# Collect ONCE so every feature row stays paired with its own label. Two
# separate collect() actions re-run the join twice, and Spark does not
# guarantee that both runs return the rows in the same order.
knn_rows = knn_df_drop.collect()

#create feature and target arrays (target is already 1-D, no np.ravel() needed)
knn_data = np.array([[row[col] for col in feature_cols] for row in knn_rows])
knn_target = np.array([row[target_col] for row in knn_rows])

#split into train and test sets (fixed random_state keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(knn_data, knn_target, test_size=0.2, random_state=42)

#create the model
knn = KNeighborsClassifier(n_neighbors = 7)
knn.fit(X_train, y_train)

#test the model
print(knn.predict(X_test))

#get the accuracy score for the model
print(knn.score(X_test, y_test))

['clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clouds' 'clear' 'clouds'
 'clear' 'clear' 'clear' 'clear' 'clear' 'clouds' 'clouds' 'clear' 'clear'
 'clear' 'clear' 'clear' 'clouds' 'clear' 'clear' 'clouds' 'clear' 'clear'
 'clouds' 'clear' 'clear' 'clear' 'clouds' 'clouds' 'clouds' 'clear'
 'clouds' 'clear' 'clouds' 'clouds' 'clear' 'clear' 'clear' 'clear'
 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear'
 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clouds' 'clouds' 'clear'
 'clear' 'clear' 'clear' 'clear' 'clear' 'clouds' 'clear' 'clear' 'clear'
 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clouds' 'clear' 'clear'
 'clear' 'clear' 'clear' 'clear' 'clear' 'clouds' 'clear' 'clear' 'clear'
 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear'
 'clear' 'clear' 'clouds' 'clouds' 'clear' 'clear' 'clear' 'clear'
 'clouds' 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'clouds' 'clear'
 'clear' 'clear' 'clear' 'clear' 'clear' 'clear' 'cle

The model using 'price day ahead' and 'price actual' was by far the best at predicting the 'weather_main' variable - with an accuracy score of 0.7397.