In [38]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
import tensorflow as tf

from matplotlib.dates import DateFormatter
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

# -> Loading the raw energy and weather tables.
energy = pd.read_csv("energy_dataset.csv")
weather = pd.read_csv("weather_features.csv")


# -> Checking for null values and types in dataframe.
#energy.info()
#energy.isna().sum()


# -> Temporarily setting the "time" column aside to better handle the nulls;
#    it is added back to the frame later on.
time_df = energy["time"].to_frame()
#time_df


# -> Dropping columns we don't need ("time" is kept separately in time_df).
energy = energy.drop(
    columns=[
        "price day ahead",
        "time",
        "generation fossil coal-derived gas",
        "generation fossil oil shale",
        "generation fossil peat",
        "generation geothermal",
        "generation marine",
        "generation wind offshore",
        "generation hydro pumped storage aggregated",
        "forecast wind offshore eday ahead",
    ]
)


# -> Impute missing values function.
def impute_missing_values(energy):
    """Replace missing values in every numeric column with that column's mean.

    The frame is modified in place and also returned, so the function can be
    called either for its side effect or for its result.

    Parameters
    ----------
    energy : pandas.DataFrame
        Frame whose numeric columns may contain NaN values.

    Returns
    -------
    pandas.DataFrame
        The same ``energy`` object, with numeric gaps filled.

    Notes
    -----
    Non-numeric columns (text, datetimes, ...) are left untouched because a
    mean is not defined for them. A numeric column that is entirely NaN stays
    NaN, since its mean is NaN as well.
    """
    # Restrict the loop to numeric columns so .mean() is always well defined.
    for column in energy.select_dtypes(include="number").columns:
        column_mean = energy[column].mean()  # NaNs are skipped by default
        energy[column] = energy[column].fillna(column_mean)
    return energy
# -> Filling the gaps; the helper updates the frame in place and returns it.
energy = impute_missing_values(energy)


# -> Dividing all values by 5, the number of cities.
#    NOTE(review): presumably compensates for each energy row being repeated
#    once per city in the merge below -- confirm this is intended for every
#    column.
energy = energy.div(5)


# -> Adding the "time" column back to our cleaned dataset (index-aligned join).
energy = time_df.join(energy)


# -> Dropping weather columns we don't need.
weather = weather.drop(columns=["weather_icon", "weather_id", "weather_description"])


# -> Merging the data frames on the timestamp, then dropping "dt_iso",
#    which duplicates "time" after the merge.
sample = energy.merge(weather, left_on="time", right_on="dt_iso").drop(columns=["dt_iso"])


# -> Making column names uniform: spaces become underscores.
sample.columns = sample.columns.str.replace(" ", "_")


# -> OneHotEncoding Function.
def oneHotEncode(sample, colNames):
    """One-hot encode the text columns named in ``colNames``.

    Each listed column that holds text is replaced by indicator columns named
    ``<col>_<value>``, appended after the remaining columns. Listed columns
    with any other dtype are left as they are.

    Parameters
    ----------
    sample : pandas.DataFrame
        Frame containing the columns to encode. It is not modified.
    colNames : iterable of str
        Names of the candidate columns; each must exist in ``sample``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the text columns replaced by their indicator
        columns, or ``sample`` itself when nothing needed encoding.
    """
    for col in colNames:
        col_dtype = sample[col].dtype
        # Text can be stored as plain ``object`` or as pandas' dedicated
        # string dtype; the latter does not compare equal to ``object``,
        # so both have to be checked or such columns are silently skipped.
        holds_text = col_dtype == np.dtype("object") or isinstance(col_dtype, pd.StringDtype)
        if not holds_text:
            continue
        dummies = pd.get_dummies(sample[col], prefix=col)
        # Drop the encoded column and append its indicator columns.
        sample = pd.concat([sample.drop(columns=[col]), dummies], axis=1)
    return sample


# -> Checking unique values for one hot encoding.
#sample["weather_main"].unique()
sample = oneHotEncode(sample, ["weather_main"])


# -> Temporarily setting the "city_name" column aside to better handle the data for the model.
# TODO: add the city names back in the last part of the code.
city_name_df = sample["city_name"].to_frame()
sample = sample.drop(columns=["city_name"])


# -> Converting the "time" column from object to a UTC datetime type.
sample["time"] = pd.to_datetime(sample["time"], utc=True)


# -> Separating the time values into single columns.
#    "time" is already a datetime column here, so the parts are read directly.
sample["Month"] = sample["time"].dt.month
sample["Day"] = sample["time"].dt.day
sample["Hours"] = sample["time"].dt.hour


# -> Dropping the time column now that its parts have been extracted.
sample = sample.drop(columns=["time"])

In [2]:
################ ADD CITY NAMES AT THE END #####################

# -> Adding the column back to our cleaned dataset.
#sample = city_name_df.join(sample)

In [32]:
#sample
#sample.info()

In [4]:
# Correlation Heatmap Energy

#plt.figure(figsize=(16, 6))
#heatmap = sns.heatmap(energy.corr(), vmin=-1, vmax=1, annot=True, cmap='BrBG')
#heatmap.set_title('Correlation Heatmap Energy', fontdict={'fontsize':18}, pad=12);

In [5]:
# Correlation Heatmap Weather

#plt.figure(figsize=(16, 6))
#heatmap = sns.heatmap(weather.corr(), vmin=-1, vmax=1, annot=True, cmap='BrBG')
#heatmap.set_title('Correlation Heatmap Weather', fontdict={'fontsize':18}, pad=12);

In [6]:
# Model Selection Test and Train

In [28]:
# Every column except the target "price_actual" is used as a model input.
features = sample.drop(columns=["price_actual"])

# Positional split point: rows before it train the models, the rest test them.
# NOTE(review): hard-coded row count, presumably about 70% of the merged
# rows -- confirm against len(sample) if the input data changes.
n_train_rows = 124877

# Features train set
X_train = features.iloc[:n_train_rows]

# Label train set
y_train = sample["price_actual"].iloc[:n_train_rows]

# Features test set
X_test = features.iloc[n_train_rows:]

# Label test set
y_test = sample["price_actual"].iloc[n_train_rows:]

In [34]:
# Fully connected regression network: three ReLU layers and one linear output.
NN_model = Sequential(
    [
        # First dense layer; it also declares the input width
        # (one input per feature column of X_train).
        Dense(44, kernel_initializer="normal", input_dim=X_train.shape[1], activation="relu"),
        # Two further hidden layers.
        Dense(30, kernel_initializer="normal", activation="relu"),
        Dense(30, kernel_initializer="normal", activation="relu"),
        # Output layer: a single linear unit for the predicted price.
        Dense(1, kernel_initializer="normal", activation="linear"),
    ]
)

# Compile the network: mean absolute error is both the loss and the reported metric.
NN_model.compile(
    loss="mean_absolute_error",
    optimizer="adam",
    metrics=["mean_absolute_error"],
)
NN_model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_24 (Dense)             (None, 44)                1980      
_________________________________________________________________
dense_25 (Dense)             (None, 30)                1350      
_________________________________________________________________
dense_26 (Dense)             (None, 30)                930       
_________________________________________________________________
dense_27 (Dense)             (None, 1)                 31        
Total params: 4,291
Trainable params: 4,291
Non-trainable params: 0
_________________________________________________________________


In [30]:
# File name pattern for saved checkpoints; the epoch number and validation
# loss are filled into the placeholders.
# NOTE(review): newer Keras releases restrict the accepted checkpoint file
# extensions -- confirm ".hdf5" is accepted by the installed version.
checkpoint_name = "Weights-{epoch:03d}--{val_loss:.5f}.hdf5"

# Save only when the monitored validation loss improves.
checkpoint = ModelCheckpoint(
    checkpoint_name,
    monitor="val_loss",
    verbose=1,
    save_best_only=True,
    mode="auto",
)
callbacks_list = [checkpoint]

In [31]:
# Train for 20 epochs in batches of 32, holding out 20% of the training data
# for validation; the checkpoint callback runs during training.
NN_model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    callbacks=callbacks_list,
)

In [43]:
# Baseline: a decision tree with default settings, scored by 5-fold
# cross-validation on the training set.
# NOTE(review): no random_state is set, so repeated runs may not reproduce
# the exact same score.
tree_model = DecisionTreeRegressor()

scores = cross_val_score(
    tree_model,
    X_train,
    y_train,
    cv=5,
    scoring="neg_mean_absolute_error",
)
# The scorer reports negated MAE, hence the negative mean.
print(scores.mean())

-2.425647202045094
