# Loading the initial dataset and importing libraries

In [1]:
from torch import nn
from tqdm import tqdm
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler


import torch
from torch import nn
from tqdm.notebook import tqdm
import os
from torch.utils.tensorboard import SummaryWriter
from IPython import display

In [2]:
!gdown --id 1GZxDATmFkkXhX2mjBq0q5OKoT5dT22Rc -O dataset.csv 
# Read the downloaded CSV from the Colab working directory.
df = pd.read_csv('/content/dataset.csv')
# Use the 'time' column as the row index.
df = df.set_index(['time'])
# Remove the 'P_GEN' column; the remaining columns are the features used below.
df = df.drop(columns = ['P_GEN'])

Downloading...
From: https://drive.google.com/uc?id=1GZxDATmFkkXhX2mjBq0q5OKoT5dT22Rc
To: /content/dataset.csv
  0% 0.00/497k [00:00<?, ?B/s]100% 497k/497k [00:00<00:00, 60.9MB/s]


In [3]:
# Preview the first rows to check the 'time' index and the remaining columns.
df.head()

Unnamed: 0_level_0,TempOut,WindSpeed,WindRun,SolarRad,SolarEnergy,HeatD-D,CoolD-D,OutHum
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2014-06-10 02:30:00,15.7,0,0.0,0,0.0,0.056,0.0,89
2014-06-10 03:00:00,15.2,0,0.0,0,0.0,0.065,0.0,90
2014-06-10 03:30:00,14.9,0,0.0,0,0.0,0.072,0.0,90
2014-06-10 04:00:00,14.4,0,0.0,0,0.0,0.081,0.0,89
2014-06-10 04:30:00,14.2,0,0.0,0,0.0,0.087,0.0,90


In [4]:
def train_validate_test_split(df, train_percent=.6, val_percent=.2):
    """Cut ``df`` into three consecutive positional slices: train, validation, test.

    Rows are not shuffled; the slices follow the existing row order.

    Args:
        df: Object exposing ``.index`` and ``.iloc`` (e.g. a pandas DataFrame).
        train_percent: Fraction of rows assigned to the first (train) slice.
        val_percent: Fraction of rows assigned to the second (validation) slice.

    Returns:
        Tuple ``(train, val, test)``; ``test`` receives every row left after
        the first two slices.
    """
    n_rows = len(df.index)
    # Both boundaries are truncated towards zero, so leftover rows end up in the test slice.
    first_cut = int(train_percent * n_rows)
    second_cut = int(val_percent * n_rows) + first_cut

    return (
        df.iloc[:first_cut],
        df.iloc[first_cut:second_cut],
        df.iloc[second_cut:],
    )

In [5]:
# Positional 80% / 10% split in row order; the rows left over form the test slice.
train, val, test = train_validate_test_split(df, train_percent=.8, val_percent=.1)

Deleting generation column, scaling


In [6]:
# Fit the min/max statistics on the training split ONLY, then apply that same
# mapping to the validation and test splits. Re-fitting on each split would
# (a) leak validation/test statistics into preprocessing and (b) scale the
# three splits inconsistently, so the model would see differently-normalised
# inputs at evaluation time than during training.
# Consequence: validation/test values may fall slightly outside [0, 1].
scaler = MinMaxScaler()

train = scaler.fit_transform(train)
val = scaler.transform(val)
test = scaler.transform(test)

# Creating a dataset for time-series data

Y is shifted relative to X by `predict_dim` time steps.

In [7]:
def create_inout_sequences(training_set_scaled,n_future = 1,n_past = 60): # https://pythobyte.com/time-series-prediction-using-lstm-with-pytorch-in-python-521ce3ed/
  """Build sliding-window (input, target) pairs from a 2-D time series.

  Window ``i`` uses rows ``[i, i + n_past)`` as input and the following
  ``n_future`` rows as target; all columns are kept in both.

  Args:
      training_set_scaled: 2-D array-like of shape ``(time_steps, n_features)``.
      n_future: Number of rows in each target window.
      n_past: Number of rows in each input window.

  Returns:
      Tuple ``(x, y)`` of numpy arrays with shapes
      ``(n_windows, n_past, n_features)`` and
      ``(n_windows, n_future, n_features)``, where
      ``n_windows = max(0, time_steps - n_past - n_future + 1)``.
  """
  data = np.asarray(training_set_scaled)
  # Take the feature count from the data instead of hard-coding 8 columns.
  n_features = data.shape[1]
  n_windows = len(data) - n_past - n_future + 1

  # range() of a non-positive count is empty, so short inputs yield no windows.
  x_train = [data[i : i + n_past, :] for i in range(n_windows)]
  y_train = [data[i + n_past : i + n_past + n_future, :] for i in range(n_windows)]

  if x_train:
      x_train, y_train = np.array(x_train), np.array(y_train)
  else:
      # Keep the 3-D layout even when the series is too short for one window.
      x_train = np.empty((0, n_past, n_features), dtype=data.dtype)
      y_train = np.empty((0, n_future, n_features), dtype=data.dtype)

  # Shape print kept so the notebook still shows the size of each split.
  print(x_train.shape)

  return x_train,y_train

# Number of future rows in each target window (passed positionally as n_future below).
predict_dim = 1
# Build the windowed inputs/targets for every split; n_past keeps its default of 60.
x_train,y_train = create_inout_sequences(train, predict_dim)
x_val,y_val = create_inout_sequences(val,predict_dim)
x_test,y_test = create_inout_sequences(test,predict_dim)

(6184, 60, 8)
(720, 60, 8)
(721, 60, 8)


In [8]:
class Dataset:
    """Indexable container that pairs each input item with its target item."""

    def __init__(self, X_data_in, X_data_out):
        # Both collections are stored as given; no copy or length check is made.
        self.X_data_in, self.X_data_out = X_data_in, X_data_out

    def __len__(self):
        """Number of samples, measured on the input collection."""
        return len(self.X_data_in)

    def __getitem__(self, index):
        """Return the ``(input, target)`` pair found at ``index``."""
        features = self.X_data_in[index]
        target = self.X_data_out[index]
        return features, target

# Wrap each split's numpy windows as float32 tensors (from_numpy + .float()).
train_dataset = Dataset(torch.from_numpy(x_train).float(), 
                                  torch.from_numpy(y_train).float())

val_dataset = Dataset(torch.from_numpy(x_val).float(), 
                                  torch.from_numpy(y_val).float())

test_dataset = Dataset(torch.from_numpy(x_test).float(), 
                                  torch.from_numpy(y_test).float())


In [9]:
# Sanity check: shape of the test input windows.
test_dataset.X_data_in.shape

torch.Size([721, 60, 8])

In [10]:
# Sanity check: shape of the test target windows.
test_dataset.X_data_out.shape

torch.Size([721, 1, 8])

# Defining the model

In [11]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

In [12]:
class LSTMModel(nn.Module):
    """Two stacked LSTM blocks followed by two linear read-out layers.

    The first linear layer maps the last hidden state to ``output_dim``
    features; the second expands a length-1 axis to ``predict_dim`` steps.

    Args:
        input_dim: Number of features per time step of the input.
        hidden_dim: Hidden size of both LSTM blocks.
        layer_dim: Number of layers inside each LSTM block.
        predict_dim: Number of time steps produced per sample.
        output_dim: Number of features produced per predicted step.
        device: Stored as ``self.device`` for callers. When omitted, resolves
            to ``cuda:0`` if CUDA is available, else CPU. It is resolved here
            rather than bound to a notebook global at class-definition time,
            so the class can be defined in any cell order.
    """

    def __init__(self, input_dim, hidden_dim, layer_dim,predict_dim, output_dim = 8,device = None):
        super(LSTMModel, self).__init__()
        if device is None:
            device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

        # Hidden dimensions
        self.hidden_dim = hidden_dim

        # Number of hidden layers
        self.layer_dim = layer_dim
        self.device = device

        self.lstm1 = nn.LSTM(input_dim, hidden_dim, layer_dim, bidirectional = False,batch_first=True)
        self.lstm2 = nn.LSTM(hidden_dim, hidden_dim, layer_dim, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        # Not used in forward(); kept so the attribute remains available.
        self.relu = nn.ReLU()

        # Readout layers
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.fc2 = nn.Linear(1, predict_dim)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq, input_dim) to (batch, predict_dim, output_dim)."""
        batch_size = x.size(0)
        # Zero initial states created directly on the input's device and dtype,
        # so the forward pass works wherever the model and data actually live.
        h0 = x.new_zeros(self.layer_dim, batch_size, self.hidden_dim)
        c0 = x.new_zeros(self.layer_dim, batch_size, self.hidden_dim)

        # First block returns the full sequence of hidden states.
        out, (hn, cn) = self.lstm1(x, (h0, c0))

        # Second block is seeded with the first block's final states. They are
        # detached, so no gradient flows back through this seeding path
        # (gradients still reach lstm1 through `out`).
        out2, _ = self.lstm2(self.dropout(out), (hn.detach(), cn.detach()))
        out2 = out2[:, -1, :]                 # keep the last time step only
        out2 = self.fc(self.dropout(out2))    # (batch, output_dim)
        out2 = torch.unsqueeze(out2, -1)      # (batch, output_dim, 1)

        out2 = self.fc2(self.dropout(out2))   # (batch, output_dim, predict_dim)
        return out2.permute(0, 2, 1)          # (batch, predict_dim, output_dim)

# Useful functions - train, test, plot


In [13]:
def lstm_model_run(model, dataloader, optimizer, loss_fun, phase='train', epoch=0, scheduller=None, writer=None):
    """Run one full pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Module to run; put in train mode when ``phase == 'train'``,
            otherwise in eval mode.
        dataloader: Iterable of batches where ``batch[0]`` is the input and
            ``batch[1]`` the target; must support ``len()``.
        optimizer: Optimizer stepped after every batch (training phase only).
        loss_fun: Callable ``loss_fun(prediction, target)`` returning a scalar tensor.
        phase: ``'train'`` enables gradients and updates; any other value evaluates only.
        epoch: Epoch index, used to compute the global step for ``writer``.
        scheduller: Optional LR scheduler, stepped once per training batch.
        writer: Optional object with ``add_scalar(tag, value, step)``.

    Returns:
        Mean of the per-batch loss values for this pass.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    is_train = (phase == 'train')
    model.train(is_train)

    n_batches = len(dataloader)
    if n_batches == 0:
        # Fail early with a clear message instead of a ZeroDivisionError below.
        raise ValueError(f"dataloader for phase '{phase}' contains no batches")

    epoch_loss = 0

    with torch.set_grad_enabled(is_train):
        for i, batch in tqdm(enumerate(dataloader), total=n_batches):
            epoch_i = epoch * n_batches + i  # global step across epochs

            inputs, targets = batch[0].to(device), batch[1].to(device)
            # Call the module (not .forward) so registered hooks run as well.
            predictions = model(inputs)

            loss = loss_fun(predictions, targets)

            if is_train:
                # Clear gradients first so nothing left over from earlier
                # calls can leak into this update.
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                if scheduller:
                    scheduller.step()

            batch_loss = loss.item()

            if writer is not None and is_train:
                writer.add_scalar(f"loss/{phase}", batch_loss, epoch_i)
                # NOTE(review): this tag is named "accuracy" but records the loss
                # value; kept unchanged so existing logs stay comparable.
                writer.add_scalar(f"accuracy/{phase}", batch_loss, epoch_i)

            epoch_loss += batch_loss

    average_loss = epoch_loss / n_batches

    if writer is not None:
        writer.add_scalar(f"loss_epoch/{phase}", average_loss, epoch)

    return average_loss


In [14]:

def test_run(best_model, dataloader, loss_fun):
    """Evaluate ``best_model`` on ``dataloader`` without gradient tracking.

    Args:
        best_model: Module to evaluate. It is moved to ``device`` and switched
            to eval mode. (The argument itself is used, not the notebook-level
            ``model`` variable.)
        dataloader: Iterable of batches where ``batch[0]`` is the input and
            ``batch[1]`` the target; must support ``len()``.
        loss_fun: Callable ``loss_fun(prediction, target)`` returning a scalar tensor.

    Returns:
        Tuple ``(average_loss, last_predictions)``: the square root of the mean
        per-batch loss, and the model output for the LAST batch only.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    n_batches = len(dataloader)
    if n_batches == 0:
        raise ValueError("test dataloader contains no batches")

    best_model.to(device)
    best_model.eval()
    epoch_loss = 0
    predictions = None

    with torch.no_grad():
        for i, batch in tqdm(enumerate(dataloader), total=n_batches):

            inputs, targets = batch[0].to(device), batch[1].to(device)
            # Call the module (not .forward) so registered hooks run as well.
            predictions = best_model(inputs)
            loss = loss_fun(predictions, targets)

            epoch_loss += loss.item()

    # Square root of the mean batch loss (an RMSE when loss_fun is an MSE).
    average_loss = np.sqrt(epoch_loss / n_batches)
    return average_loss, predictions

In [15]:
from time import time

def plot(train_loss, val_loss, clear_output=True):
    """Draw the training and validation loss curves on a log-scaled y axis.

    Args:
        train_loss: Sequence of training loss values, one per point on the x axis.
        val_loss: Sequence of validation loss values, same convention.
        clear_output: When true, clear the current cell output before drawing.
    """
    if clear_output:
        display.clear_output(wait=True)
    fig, ax = plt.subplots(1, 1, figsize=(16, 6))

    # Label both curves so they can be told apart in the legend.
    ax.semilogy(train_loss, label='train')
    ax.semilogy(val_loss, label='val')
    ax.set_title('Train/Val loss')
    # The training loop passes one averaged loss per epoch, so the x axis
    # counts epochs rather than batches.
    ax.set_xlabel('# epochs processed')
    ax.set_ylabel('loss value')
    ax.legend()
    plt.show()

def train_lstm_model(model,
                     train_dataloader, val_dataloader, test_dataloader,
                      num_epochs=20, batch_size=32, normalize=False):
    """Train ``model``, keep the best-validation weights, then evaluate on the test set.

    After training, the weights from the epoch with the lowest validation loss
    are saved to ``lstm_model.pth`` and loaded back into ``model``, so the test
    evaluation (and any later use of ``model``) runs on the best checkpoint
    rather than on the last epoch.

    Args:
        model: Module to train; moved to ``device``.
        train_dataloader: Batches used for optimisation.
        val_dataloader: Batches used for model selection.
        test_dataloader: Batches used for the final evaluation.
        num_epochs: Number of passes over ``train_dataloader``.
        batch_size: Accepted for interface compatibility; not used here
            (batch sizes are set on the dataloaders).
        normalize: Accepted for interface compatibility; not used here.

    Returns:
        Tuple ``(input_inv, Time)``: the predictions returned by ``test_run``
        and the training duration in seconds.
    """
    import copy  # local import: only needed to snapshot the best weights

    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fun = torch.nn.MSELoss()

    logs_base_dir = "/content/drive/My Drive/lstm_model/writer/"
    os.makedirs(logs_base_dir, exist_ok=True)

    writer_name = 'logs'
    writer_path = os.path.join(logs_base_dir, writer_name)

    # The writer is created as before, but per-step logging stays disabled
    # (writer=None is passed to the run function below).
    writer = SummaryWriter(writer_path)

    best_val_loss = float('+inf')
    best_state = None

    path = 'lstm_model.pth'

    loss_train = []
    loss_vall = []

    st = time()  # start of the timed section, right before the epoch loop

    try:
        for epoch in range(0, num_epochs):
            train_loss = lstm_model_run(model, train_dataloader, optimizer, loss_fun, phase='train', epoch=epoch, scheduller=None, writer=None)
            val_loss = lstm_model_run(model, val_dataloader, None, loss_fun, phase='val', epoch=epoch, scheduller=None, writer=None)

            loss_train.append(train_loss)
            loss_vall.append(val_loss)

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                # state_dict() returns references to the live parameters, so a
                # deep copy is required; otherwise the "best" weights would
                # silently follow every later optimizer step.
                best_state = copy.deepcopy(model.state_dict())

            print(f'Epoch: {epoch+1:02}')
            print(f'\tTrain Loss: {train_loss:.10f}')
            print(f'\t Val. Loss: {val_loss:.10f}')
    finally:
        # Release the event-file handle even if training is interrupted.
        writer.close()

    print('Train time %.2fs' % (time() - st))  # right after the epoch loop
    Time = time() - st

    if best_state is not None:
        torch.save(best_state, path)
        # Evaluate (and hand back) the best checkpoint, not the last epoch.
        model.load_state_dict(best_state)
    plot(loss_train, loss_vall, clear_output=True)

    test_loss, input_inv = test_run(model, test_dataloader, loss_fun)

    print(input_inv)
    print(f'\tTest rMSE Loss: {test_loss:.3f}')

    return input_inv, Time


# Training

In [16]:
#model = LSTM()
# Hyper-parameters handed to LSTMModel below.
# input_dim / output_dim equal the 8 feature columns left in df.
input_dim = 8
hidden_dim = 100
layer_dim = 2  # ONLY CHANGE IS HERE FROM ONE LAYER TO TWO LAYER
output_dim = 8

# Same device selection as earlier in the notebook (GPU if available, else CPU).
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# predict_dim comes from the sequence-building cell above.
model = LSTMModel(input_dim, hidden_dim, layer_dim, predict_dim, output_dim, device)


In [17]:
from torch.utils.data import DataLoader
# shuffle=False keeps the windows in their original order for every split.
# drop_last=True (train/val) discards a final incomplete batch; the test loader keeps it.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=48, shuffle=False,drop_last=True) 
val_dataloader = DataLoader(dataset=val_dataset, batch_size=8,shuffle=False,drop_last=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size = 8,shuffle=False,drop_last=False)

In [None]:
# Train for 80 epochs, then evaluate on the test loader.
# NOTE(review): batch_size=64 is never read inside train_lstm_model; the
# effective batch sizes are the ones configured on the dataloaders.
input_inv,Time = train_lstm_model(model, train_dataloader, 
                                val_dataloader, test_dataloader,
                                num_epochs=80, batch_size=64)

In [19]:
# Training duration in seconds, as returned by train_lstm_model.
print('Time of training:', Time)

Time of training: 149.99049949645996


In [20]:
def Predict_Y(loader, model, scaler):
    """Collect targets and model predictions for every batch in ``loader``.

    Inference runs in eval mode and without gradient tracking, so dropout is
    inactive and no autograd graph is built. The model's previous train/eval
    mode is restored before returning.

    Args:
        loader: Iterable of batches whose first two items are ``(X, Y)`` tensors.
        model: ``torch.nn.Module`` used to predict from ``X``.
        scaler: Accepted for interface compatibility; not used here.

    Returns:
        Tuple ``(y_true, y_pred)`` of numpy arrays, each built by stacking the
        per-batch arrays along the first axis.

    Raises:
        ValueError: From ``np.vstack`` when ``loader`` yields no batches.
    """
    # Send inputs to wherever the model's weights live instead of relying on a
    # notebook-level `device` variable; parameter-free models get X unchanged.
    first_param = next(model.parameters(), None)

    y_true = []
    y_pred = []

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for batch in loader:

                X, Y = batch[:2]
                if first_param is not None:
                    X = X.to(first_param.device)

                y_true.append(Y.detach().cpu().numpy())
                y_pred.append(model(X).detach().cpu().numpy())
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)

    y_true = np.vstack(y_true)
    y_pred = np.vstack(y_pred)

    return y_true, y_pred

In [21]:
# Rebuild the test loader with the same settings as before (ordered, batch of 8, keep last batch).
test_dataloader = DataLoader(dataset=test_dataset, batch_size = 8,shuffle=False,drop_last=False)

In [22]:
# Targets and LSTM predictions for the whole test loader.
y_test_true,  y_test_LSTM = Predict_Y(test_dataloader, model, scaler)

# Flatten to 2-D (first axis x last axis) and map back to original units
# with the currently fitted scaler.
y_test_true  = y_test_true.reshape(y_test_true.shape[0],y_test_true.shape[-1])
y_test_true = scaler.inverse_transform(y_test_true)

# Same flattening and inverse scaling for the predictions.
y_test_LSTM = y_test_LSTM.reshape(y_test_LSTM.shape[0],y_test_LSTM.shape[-1])
y_test_LSTM = scaler.inverse_transform(y_test_LSTM)

In [23]:
# Column names, used below to map a column position to a feature name.
df.columns

Index(['TempOut', 'WindSpeed', 'WindRun', 'SolarRad', 'SolarEnergy', 'HeatD-D',
       'CoolD-D', 'OutHum'],
      dtype='object')

In [None]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Position of the feature to plot; 1 is 'WindSpeed' in df.columns.
index = 1
df0 = pd.Series(y_test_true[:, index])  # ground truth for the selected feature

df_pred_1 = pd.DataFrame(y_test_LSTM)  # predictions for all features, one column each

# Two stacked subplots sharing the x axis; both traces below go into row 1.
fig = make_subplots(rows=2, cols=1,shared_xaxes=True,)

fig.add_trace(go.Scatter(x=df0.index, y=df0, mode='lines',name = 'true',line=dict(color='royalblue', dash='dot'),showlegend=True), row=1, col=1) #
fig.add_trace(go.Scatter(x=df_pred_1.index, y=df_pred_1[index], mode='lines', name='pred',line=dict(color='#EF553B')), row=1, col=1)

fig.update_layout(title='Test data VS Predicted data',width=1200, height=1000, template='plotly_white',showlegend=True, legend=dict(x=0, y=1,bordercolor="Black",borderwidth=2))

# NOTE(review): the axis styling below targets row 2, which has no traces.
fig.update_xaxes( row=2, col=1,showline=True, linewidth=1.5, linecolor='black', mirror=True,ticks='inside',nticks=10)
fig.update_yaxes(title_text="WindSpeed", row=2, col=1,showline=True, linewidth=1.5, linecolor='black', mirror=True)
# Titles for the first subplot's axes.
fig['layout']['xaxis']['title']='Number of observations'
fig['layout']['yaxis']['title']=df.columns[index] + '[m/s]'

fig.show()

# NOTE(review): this appends an HTML fragment to the file, and the write_html
# call below then writes the same path again. Confirm which of the two outputs
# is intended (multi-figure append vs. single standalone file).
with open('/content/test_pred2.html', 'a') as f:

        f.write(fig.to_html(full_html=False, include_plotlyjs='cdn', default_height = 1200, default_width = 900))

fig.write_html('/content/test_pred2.html')

In [None]:
def multi_step_plot(history, true_future, prediction): 
  """Plot a history window together with the true and predicted future points.

  Args:
      history: Array-like of past values; drawn at negative x positions.
      true_future: Array-like of true future values; drawn as blue dots from x = 0.
      prediction: Array of predicted future values; drawn as red dots only
          when it contains at least one truthy element.
  """
  plt.figure(figsize=(12, 6))
  past_steps = create_time_steps(len(history))
  horizon = len(true_future)
  print(history.shape)
  print(true_future.shape)

  # x positions shared by the true and the predicted future points.
  future_steps = np.arange(horizon)/1

  plt.plot(past_steps, np.array(history[:]), label='History')
  plt.plot(future_steps, np.array(true_future), 'bo', label='True Future')
  if prediction.any():
    plt.plot(future_steps, np.array(prediction), 'ro', label='Predicted Future')
  plt.legend(loc='upper left')
  plt.show()

def create_time_steps(length):
  """Return the integer offsets ``-length, ..., -1`` as a list."""
  return [step for step in range(-length, 0)]

In [None]:
# Load a second CSV ('model_GAIN.csv') and index it by its 'time' column.
df_filled = pd.read_csv('/content/model_GAIN.csv')
df_filled = df_filled.set_index(['time'])

In [None]:
# Rebinds `scaler` and `test`: a fresh MinMaxScaler is fitted on df_filled itself.
# NOTE(review): this min/max differs from the scaling the model was trained
# with; confirm that re-fitting here (rather than reusing the earlier scaler) is intended.
scaler = MinMaxScaler()
test = scaler.fit_transform(df_filled)

In [None]:
# Rebuild the sliding windows from the rescaled df_filled data (overwrites x_test / y_test).
x_test,y_test = create_inout_sequences(test,predict_dim)

In [None]:
# Wrap the new windows as float32 tensors (overwrites test_dataset).
test_dataset = Dataset(torch.from_numpy(x_test).float(), 
                                  torch.from_numpy(y_test).float())

In [None]:
# Loader over the new dataset with the same settings as the earlier test loader.
test_dataloader = DataLoader(dataset=test_dataset, batch_size = 8,shuffle=False,drop_last=False)

In [None]:
# Targets and predictions on the df_filled windows.
# NOTE(review): the variable is named *_Transformer, but `model` here is the
# LSTMModel instance created above.
y_test_true,  y_test_Transformer = Predict_Y(test_dataloader, model, scaler)

# Flatten to 2-D and map back to original units with the scaler fitted on df_filled.
y_test_true  = y_test_true.reshape(y_test_true.shape[0],y_test_true.shape[-1])
y_test_true = scaler.inverse_transform(y_test_true)

y_test_Transformer = y_test_Transformer.reshape(y_test_Transformer.shape[0],y_test_Transformer.shape[-1])
y_test_Transformer = scaler.inverse_transform(y_test_Transformer)

In [None]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Position of the feature to plot (column 0 of the inverse-scaled arrays).
index = 0
df0 = pd.Series(y_test_true[:, index])  # reference values for the selected feature

df_pred = pd.DataFrame(y_test_Transformer)  # predictions for all features, one column each

# Two stacked subplots sharing the x axis; both traces below go into row 1.
fig = make_subplots(rows=2, cols=1,shared_xaxes=True,)

fig.add_trace(go.Scatter(x=df0.index, y=df0, mode='lines',name = 'filled',line=dict(color='royalblue', dash='dot'),showlegend=True), row=1, col=1) #
fig.add_trace(go.Scatter(x=df_pred.index, y=df_pred[index], mode='lines', name='pred',line=dict(color='#EF553B')), row=1, col=1)

fig.update_layout(title='Test data VS Predicted data',width=1200, height=1000, template='plotly_white',showlegend=True, legend=dict(x=0, y=1,bordercolor="Black",borderwidth=2))

# NOTE(review): the axis styling targets row 2, which has no traces, and the
# y title is hard-coded to "WindSpeed" although index selects column 0.
fig.update_xaxes( row=2, col=1,showline=True, linewidth=1.5, linecolor='black', mirror=True,ticks='inside',nticks=10)
fig.update_yaxes(title_text="WindSpeed", row=2, col=1,showline=True, linewidth=1.5, linecolor='black', mirror=True)


fig.show()

# NOTE(review): this appends an HTML fragment to the file, and the write_html
# call below then writes the same path again. Confirm which of the two outputs
# is intended.
with open('/content/test_pred2.html', 'a') as f:

        f.write(fig.to_html(full_html=False, include_plotlyjs='cdn', default_height = 1200, default_width = 900))

fig.write_html('/content/test_pred2.html')

In [None]:
# Save the predictions to CSV in the current working directory.
# NOTE(review): this runs before df_pred.columns is assigned in the next cell,
# so the file is written with the default integer column labels.
df_pred.to_csv('lstm_filledweather.csv')

In [None]:
# Give the prediction columns the same names as df_filled (requires equal column counts).
df_pred.columns = df_filled.columns

In [None]:
# Display the prediction frame with its named columns.
df_pred