In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.svm import SVR

In [2]:
# Load the raw half-hourly electricity data.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which is what pandas recommends to avoid the
# mixed-type DtypeWarning this load otherwise emits.
df = pd.read_csv("Resources/electricity_prices.csv", low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Preview the first rows of the raw data
df.head()

Unnamed: 0,DateTime,Holiday,HolidayFlag,DayOfWeek,WeekOfYear,Day,Month,Year,PeriodOfDay,ForecastWindProduction,SystemLoadEA,SMPEA,ORKTemperature,ORKWindspeed,CO2Intensity,ActualWindProduction,SystemLoadEP2,SMPEP2
0,01/11/2011 00:00,,0,1,44,1,11,2011,0,315.31,3388.77,49.26,6.0,9.3,600.71,356.0,3159.6,54.32
1,01/11/2011 00:30,,0,1,44,1,11,2011,1,321.8,3196.66,49.26,6.0,11.1,605.42,317.0,2973.01,54.23
2,01/11/2011 01:00,,0,1,44,1,11,2011,2,328.57,3060.71,49.1,5.0,11.1,589.97,311.0,2834.0,54.23
3,01/11/2011 01:30,,0,1,44,1,11,2011,3,335.6,2945.56,48.04,6.0,9.3,585.94,313.0,2725.99,53.47
4,01/11/2011 02:00,,0,1,44,1,11,2011,4,342.9,2849.34,33.75,6.0,11.1,571.52,346.0,2655.64,39.87


In [4]:
# Check dtypes: several measurement columns are loaded as `object` rather than numeric
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38014 entries, 0 to 38013
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   DateTime                38014 non-null  object
 1   Holiday                 38014 non-null  object
 2   HolidayFlag             38014 non-null  int64 
 3   DayOfWeek               38014 non-null  int64 
 4   WeekOfYear              38014 non-null  int64 
 5   Day                     38014 non-null  int64 
 6   Month                   38014 non-null  int64 
 7   Year                    38014 non-null  int64 
 8   PeriodOfDay             38014 non-null  int64 
 9   ForecastWindProduction  38014 non-null  object
 10  SystemLoadEA            38014 non-null  object
 11  SMPEA                   38014 non-null  object
 12  ORKTemperature          38014 non-null  object
 13  ORKWindspeed            38014 non-null  object
 14  CO2Intensity            38014 non-null  object
 15  Ac

In [5]:
# Coerce each measurement column to a numeric dtype; any entry that cannot be
# parsed as a number becomes NaN instead of raising.
for column in (
    'ForecastWindProduction',
    'SystemLoadEA',
    'SMPEA',
    'ORKTemperature',
    'ORKWindspeed',
    'CO2Intensity',
    'ActualWindProduction',
    'SystemLoadEP2',
    'SMPEP2',
):
    df[column] = pd.to_numeric(df[column], errors='coerce')

In [6]:
# Convert DateTime from text to a datetime dtype. dayfirst=True because the
# source strings are day/month/year (e.g. '01/11/2011 00:00' is 1 November 2011).
df['DateTime']=pd.to_datetime(df['DateTime'],dayfirst=True)

In [7]:
# Confirm the conversions: measurement columns should now be float64 and
# DateTime datetime64[ns]; non-null counts below 38014 reveal the coerced NaNs
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38014 entries, 0 to 38013
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   DateTime                38014 non-null  datetime64[ns]
 1   Holiday                 38014 non-null  object        
 2   HolidayFlag             38014 non-null  int64         
 3   DayOfWeek               38014 non-null  int64         
 4   WeekOfYear              38014 non-null  int64         
 5   Day                     38014 non-null  int64         
 6   Month                   38014 non-null  int64         
 7   Year                    38014 non-null  int64         
 8   PeriodOfDay             38014 non-null  int64         
 9   ForecastWindProduction  38009 non-null  float64       
 10  SystemLoadEA            38012 non-null  float64       
 11  SMPEA                   38012 non-null  float64       
 12  ORKTemperature          37719 non-null  float6

In [8]:
# Count missing values per column
df.isna().sum()

DateTime                    0
Holiday                     0
HolidayFlag                 0
DayOfWeek                   0
WeekOfYear                  0
Day                         0
Month                       0
Year                        0
PeriodOfDay                 0
ForecastWindProduction      5
SystemLoadEA                2
SMPEA                       2
ORKTemperature            295
ORKWindspeed              299
CO2Intensity                7
ActualWindProduction        5
SystemLoadEP2               2
SMPEP2                      2
dtype: int64

In [9]:
# Drop rows with a missing value in any column except Holiday. Holiday is a
# text label that is discarded before modelling, so a missing label should
# never cause a row to be thrown away.
# NOTE(review): newer pandas releases read the literal string "None" as missing
# by default; if Holiday uses that token (TODO confirm against the CSV), an
# unrestricted dropna() would silently discard every non-holiday row.
required_columns = list(df.columns.difference(["Holiday"]))
df = df.dropna(subset=required_columns)

In [10]:
# Give the terse source column names more descriptive labels; columns not
# listed in the mapping keep their original names.
df_cleaned = df.rename(
    columns={
        "DateTime": "Date",
        "PeriodOfDay": "HalfHourPeriod",
        "SystemLoadEA": "NationalLoadForecast",
        "SMPEA": "PriceForecast",
        "ORKTemperature": "Temperature (C)",
        "ORKWindspeed": "Windspeed (km/h)",
        "CO2Intensity": "CO2Intensity (g/kWh)",
        "ActualWindProduction": "WindProduction (MWh)",
        "SystemLoadEP2": "NationalSystemLoad (MWh)",
        "SMPEP2": "Price ($)",
    }
)

In [11]:
# Correlation of every numeric column with the target, strongest first.
# Non-numeric columns (the Holiday text, the Date timestamps) are filtered out
# explicitly: recent pandas releases no longer drop them silently inside
# corr() and instead raise on the string column.
(
    df_cleaned
    .select_dtypes(include='number')
    .corr()[['Price ($)']]
    .sort_values(by='Price ($)', ascending=False)
)

Unnamed: 0,Price ($)
Price ($),1.0
PriceForecast,0.618158
NationalSystemLoad (MWh),0.517081
NationalLoadForecast,0.491096
HalfHourPeriod,0.32349
Year,0.045456
HolidayFlag,-0.001838
Temperature (C),-0.009087
Day,-0.012801
Month,-0.014918


In [12]:
# Remove the text Holiday column so only non-text columns remain
df_dropped = df_cleaned.drop(columns=["Holiday"])

In [13]:
# Preview the cleaned frame that the features and target are taken from
df_dropped.head()

Unnamed: 0,Date,HolidayFlag,DayOfWeek,WeekOfYear,Day,Month,Year,HalfHourPeriod,ForecastWindProduction,NationalLoadForecast,PriceForecast,Temperature (C),Windspeed (km/h),CO2Intensity (g/kWh),WindProduction (MWh),NationalSystemLoad (MWh),Price ($)
0,2011-11-01 00:00:00,0,1,44,1,11,2011,0,315.31,3388.77,49.26,6.0,9.3,600.71,356.0,3159.6,54.32
1,2011-11-01 00:30:00,0,1,44,1,11,2011,1,321.8,3196.66,49.26,6.0,11.1,605.42,317.0,2973.01,54.23
2,2011-11-01 01:00:00,0,1,44,1,11,2011,2,328.57,3060.71,49.1,5.0,11.1,589.97,311.0,2834.0,54.23
3,2011-11-01 01:30:00,0,1,44,1,11,2011,3,335.6,2945.56,48.04,6.0,9.3,585.94,313.0,2725.99,53.47
4,2011-11-01 02:00:00,0,1,44,1,11,2011,4,342.9,2849.34,33.75,6.0,11.1,571.52,346.0,2655.64,39.87


In [14]:
# Features: every column except the target and the raw timestamp.
X = df_dropped.drop(columns=["Price ($)", "Date"])
# Target: the price column.
y = df_dropped["Price ($)"]

In [15]:
def test_model(model, data):
    X_train_scaled, X_test_scaled, y_train, y_test = data
    reg = model.fit(X_train_scaled, y_train)
    print(f'Model: {type(reg).__name__}')
    print(f'Train score: {reg.score(X_train_scaled, y_train)}')
    print(f'Test Score: {reg.score(X_test_scaled, y_test)}\n')
    plt.show()    

In [16]:
# Hold out a test set (default split size, fixed seed), then standardise the
# features using statistics learned from the training rows only.
# NOTE(review): the rows are time-ordered half-hourly observations and are
# split at random here - consider whether a chronological split is more
# appropriate for this data.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=6)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Bundled in the order test_model unpacks it.
data = [X_train_scaled, X_test_scaled, y_train, y_test]

In [17]:
# Compare several regressors on the same scaled train/test split.
# The tree ensembles and AdaBoost are randomised, so they get a fixed
# random_state (the same value used for the train/test split) to make the
# reported scores reproducible across reruns.
for model in (
    LinearRegression(),
    KNeighborsRegressor(),
    RandomForestRegressor(random_state=6),
    ExtraTreesRegressor(random_state=6),
    AdaBoostRegressor(random_state=6),
    SVR(C=1.0, epsilon=0.2),
):
    test_model(model, data)

Model: LinearRegression
Train score: 0.44048735487652135
Test Score: 0.44985277271361546

Model: KNeighborsRegressor
Train score: 0.707840317079194
Test Score: 0.5944539220663292

Model: RandomForestRegressor
Train score: 0.9466235129365198
Test Score: 0.64739773585417

Model: ExtraTreesRegressor
Train score: 0.9999999999973193
Test Score: 0.6861431090067439

Model: AdaBoostRegressor
Train score: 0.19527965656056345
Test Score: 0.07457721202957313

Model: SVR
Train score: 0.3798016494864296
Test Score: 0.4097960317930893

