
https://towardsdatascience.com/7-ways-to-handle-missing-values-in-machine-learning-1a6326adf79e
https://www.cienciadedatos.net/documentos/py29-forecasting-electricity-power-demand-python.html

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from keras.layers import Dropout
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import math
from sklearn.metrics import mean_squared_error
from google.colab import files

In [None]:
!git clone https://github.com/Ashutosh-Vermaa/mlpInternshipAss.git

fatal: destination path 'mlpInternshipAss' already exists and is not an empty directory.


In [None]:
# Load the assignment dataset from the cloned repository.
DATA_CSV = "/content/mlpInternshipAss/MLInternshipAssignmentData.csv"
data = pd.read_csv(DATA_CSV)

In [None]:
def print_summary(title, value):
    """Print `title` framed by two 80-character dashed rules, followed by `value`.

    Each of the four items is written on its own line.
    """
    rule = '-' * 80
    for item in (rule, title, rule, value):
        print(item)

def summarize_dataframe(frame):
    """Print an overview of `frame` using `print_summary`.

    Output, in order: `frame.describe()`, the per-column null counts, and then
    for every column its unique values followed by the number of entries equal to 0.
    """
    frame_level_reports = (
        ('Dataset Summary', frame.describe()),
        ('Null Values Count', frame.isnull().sum()),
    )
    for heading, report in frame_level_reports:
        print_summary(heading, report)

    # Per-column details: distinct values and how many entries are exactly zero.
    for col in frame.columns:
        column = frame[col]
        print_summary(f'Unique values in column "{col}"', column.unique())
        print_summary(f'Number of Zeros in Column "{col}"', (column == 0).sum())

In [None]:
# Print summary statistics, null counts, and per-column unique values / zero counts for the loaded data.
summarize_dataframe(data)

In [None]:
# Drop the 'date' column. errors='ignore' keeps the cell idempotent: re-running it after the
# column has already been removed no longer raises KeyError.
data = data.drop(columns='date', errors='ignore')

In [None]:
# Replace zeros in these columns with NaN so that they can be interpolated.
# NOTE(review): a zero may be a legitimate reading for wind speed / cloud cover - confirm with the data source.
for column in ('wind_speed', 'cloud_cover'):
    data[column] = data[column].replace(0, np.nan)

In [None]:
# Fill the missing wind_speed and cloud_cover values by linear interpolation, filling in both directions.
interpolation_options = dict(method='linear', limit_direction='both', axis=0)
for column in ("wind_speed", "cloud_cover"):
    data[column] = data[column].interpolate(**interpolation_options)

In [None]:
# Rows whose load is missing are the ones to predict (test data); all remaining rows are training data.
load_missing = data['load'].isna()
test_set = data[load_missing]
train_set = data[~load_missing]

In [None]:
# Display the column labels of the rows held out for prediction.
test_set.columns

In [None]:
# Held-out rows whose row id ('Unnamed: 0') is at least 103488.
# Per the original note this corresponds to 14 December 2020 - TODO confirm against the data.
from_row_103488 = test_set['Unnamed: 0'] >= 103488
task1_test_set = test_set[from_row_103488]
# Timestamps of the first 1536 of those rows.
dateTime1 = task1_test_set['datetime'].iloc[:1536]

In [None]:
# Parse the 'datetime' column and use it as the index of the training rows.
train_set = (
    train_set
    .assign(datetime=pd.to_datetime(train_set["datetime"]))
    .set_index("datetime")
)

In [None]:
# Save the raw timestamps of the first 1536 held-out rows before 'datetime' becomes the index.
dateTime = test_set['datetime'].iloc[:1536]
# Parse the 'datetime' column and use it as the index of the held-out rows.
test_set = (
    test_set
    .assign(datetime=pd.to_datetime(test_set['datetime']))
    .set_index('datetime')
)

In [None]:
# Index the full dataset by its timestamps, parsed to datetimes.
data = data.set_index("datetime")
data.index = pd.to_datetime(data.index)

In [None]:
# Plot the load series over time. `data` is indexed by datetime at this point, so the x-values
# are taken from the index; there is no 'datetime' column left to select.
fig, ax = plt.subplots(figsize=(16, 10))
ax.plot(data.index, data['load'])
ax.set_xlabel("datetime", fontsize=20)
ax.set_ylabel("load", fontsize=20)
ax.tick_params(axis='both', labelsize=20)
ax.set_title('Power Consumption ', fontsize=20);

In [None]:
# Standardise the load values: fit the scaler on the training rows only,
# then apply the same transform to both the training and the held-out rows.
scaler = StandardScaler().fit(train_set[['load']])
for split in (train_set, test_set):
    split['load'] = scaler.transform(split[['load']])

# Number of previous time steps used as input for each predicted value.
last_n = 96

In [None]:
# Builds the (x, y) sliding-window arrays used as model inputs and targets.
def to_sequences(x, y, seq_size=1):
    """Turn aligned feature rows and targets into sliding-window samples.

    Sample ``i`` consists of the ``seq_size`` consecutive rows ``x[i : i + seq_size]``
    and is paired with the target that immediately follows the window, ``y[i + seq_size]``.

    Parameters
    ----------
    x : pandas.DataFrame or pandas.Series
        Feature rows, one per time step.
    y : pandas.Series
        Target values aligned row-for-row with ``x``.
    seq_size : int, default 1
        Window length, i.e. number of past rows per sample. The window length is taken
        from this argument rather than from any module-level variable.

    Returns
    -------
    tuple of numpy.ndarray
        ``(x_values, y_values)`` with ``len(x) - seq_size`` samples each
        (empty arrays when ``x`` has no more than ``seq_size`` rows).

    Raises
    ------
    ValueError
        If ``seq_size`` is smaller than 1.
    """
    if seq_size < 1:
        raise ValueError("seq_size must be a positive integer")

    # Convert once up front; slicing a NumPy array is far cheaper than .iloc per window.
    features = x.to_numpy()
    targets = y.to_numpy()

    window_starts = range(len(features) - seq_size)
    x_values = [features[start:start + seq_size] for start in window_starts]
    y_values = [targets[start + seq_size] for start in window_starts]
    return np.array(x_values), np.array(y_values)

In [None]:
# Show the index range, column dtypes and non-null counts of the training rows.
train_set.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 101952 entries, 2018-01-01 00:00:00 to 2020-12-12 23:45:00
Data columns (total 8 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Unnamed: 0            101952 non-null  int64  
 1   load                  101952 non-null  float64
 2   apparent_temperature  101952 non-null  float64
 3   temperature           101952 non-null  float64
 4   humidity              101952 non-null  float64
 5   dew_point             101952 non-null  float64
 6   wind_speed            101952 non-null  float64
 7   cloud_cover           101952 non-null  float64
dtypes: float64(7), int64(1)
memory usage: 7.0 MB


In [None]:

# Weather columns used as model inputs; the (scaled) load column is the target.
feature_columns = [
    'apparent_temperature',
    "temperature",
    'humidity',
    'dew_point',
    'wind_speed',
    "cloud_cover",
]
x_train, y_train = to_sequences(train_set[feature_columns], train_set['load'], last_n)
x_test, y_test = to_sequences(test_set[feature_columns], test_set['load'], last_n)

In [None]:
# Display the windowed inputs built from the held-out rows.
x_test

In [None]:
# LSTM model for predicting the load: five stacked LSTM layers (with dropout after the first two)
# followed by a single-unit dense output. Input windows are `last_n` steps of 6 features.
layer_stack = [
    LSTM(380, return_sequences=True, input_shape=(last_n, 6)),
    Dropout(0.2),
    LSTM(190, return_sequences=True),
    Dropout(0.2),
    LSTM(100, return_sequences=True),
    LSTM(50, return_sequences=True),
    LSTM(16),
    Dense(1),
]
model = Sequential(layer_stack)
# NOTE(review): run_eagerly=True disables graph compilation and is documented by Keras as a
# debugging setting - consider removing it for faster training once confirmed unnecessary.
model.compile(optimizer='adam', loss='mean_squared_error', run_eagerly=True)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 96, 380)           588240    
                                                                 
 dropout (Dropout)           (None, 96, 380)           0         
                                                                 
 lstm_1 (LSTM)               (None, 96, 190)           433960    
                                                                 
 dropout_1 (Dropout)         (None, 96, 190)           0         
                                                                 
 lstm_2 (LSTM)               (None, 96, 100)           116400    
                                                                 
 lstm_3 (LSTM)               (None, 96, 50)            30200     
                                                                 
 lstm_4 (LSTM)               (None, 16)                4

In [None]:
# Early stopping to limit overfitting: halt training once val_loss has not improved
# for 5 epochs and restore the weights from the best epoch.
early_stopping_options = {
    "monitor": "val_loss",
    "mode": "min",
    "min_delta": 0,
    "patience": 5,
    "baseline": None,
    "restore_best_weights": True,
    "verbose": 0,
}
callback = tf.keras.callbacks.EarlyStopping(**early_stopping_options)

In [None]:
# Train for up to 32 epochs, validating on 10% of the training samples with early stopping.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=200,
    epochs=32,
    validation_split=0.1,
    callbacks=[callback],
)

Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32


In [None]:
# Training vs. validation loss per epoch.
loss_curves = (('loss', 'Training loss'), ('val_loss', 'Validation loss'))
for history_key, curve_label in loss_curves:
    plt.plot(history.history[history_key], label=curve_label)
plt.legend()

In [None]:
# TASK 2 - forecast the load for the rows where the load values are NaN.
test_predict = model.predict(x=x_test)

# Map the predictions from the standardised scale back to the original load scale.
predict_test = scaler.inverse_transform(X=test_predict)

In [None]:
# Pair every forecast with the timestamp it actually refers to. Window i covers held-out rows
# i .. i+last_n-1 and targets row i+last_n, so the first prediction belongs to the row at
# offset last_n - not to the first held-out row. test_set is indexed by datetime here.
forecast_times = test_set.index[last_n:]
obt = pd.DataFrame({
    'datetime': forecast_times,
    'load': predict_test.ravel(),  # model output has shape (n, 1); flatten to one value per row
})

In [None]:
# Write the forecasts to CSV and trigger a browser download from Colab.
TASK2_CSV = 'task2.csv'
obt.to_csv(TASK2_CSV, encoding='utf-8-sig')
files.download(TASK2_CSV)