Put this into a function on the main script, try with more epochs (50 at least), bigger kernels when the data is bigger as well

In [2]:
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import os, sys
import pickle

In [3]:
# load data through the data preprocessor
sys.path.append(os.path.abspath('..'))  # add parent directory to sys.path
from data_cleanup import DataProcessor

In [3]:

OUTPUT_STEPS = 24
INPUT_STEPS_ARRAY = [24, 48, 72, 96, 120, 144, 168]

for INPUT_STEPS in INPUT_STEPS_ARRAY:

    print("\n-------------- It: ", INPUT_STEPS, "----------------\n")
    processor = DataProcessor(input_steps=INPUT_STEPS, output_steps=OUTPUT_STEPS, time_sampling='H') # get 24 hour sequences, and a 1 hour output
    Train, Val, Test = processor.load_and_process_data()

    X_train, y_train = Train
    X_val, y_val = Val
    X_test, y_test = Test

    class CNN(nn.Module): 
        def __init__(self):
            super(CNN, self).__init__() # Inherit from nn.Module
            # Define layers: 3 convolutions (1D) and 2 fully connected layers (very standard)
            self.conv1 = nn.Conv1d(in_channels=8, out_channels=32, kernel_size=3)  # in_channels = number of features
            self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3) # out_channels = number of features computed for the next layer to use
            self.conv3 = nn.Conv1d(in_channels=64, out_channels=20, kernel_size=3) # kernel_size = size of the filter/window
            self.pool = nn.MaxPool1d(kernel_size=2) # reusable pool layer

            # auto-detect the flat output dimension
            with torch.no_grad():
                test_input = torch.zeros(1, 8, INPUT_STEPS) # batch size of 1, 8 features, 24 time steps
                out = self.conv1(test_input)
                out = self.conv2(out)
                out = self.conv3(out)
                flat_size = out.view(1, -1).size(1)
                
                print(f"Output shape: {flat_size}")

            self.fc1 = nn.Linear(in_features=flat_size, out_features=256) # here the in_features are 20 but that's just a coincidence, ask chatgpt to compute it for you
            self.fc2 = nn.Linear(in_features=256, out_features=OUTPUT_STEPS)
        
        def forward(self, x): # define how the layers are going to interact
            x = F.relu(self.conv1(x))
            x = F.relu(self.conv2(x))
            x = F.relu(self.conv3(x))
            x = x.view(x.size(0), -1) # flatten the output of the last convolution to be given into a regular linear layer
            x = F.relu(self.fc1(x))
            x = self.fc2(x)
            return x


    # train
    epochs = 20

    net = CNN()

    # Linear ramp: small at t=0 up to 1 at t=T-1
    weights_linear = np.linspace(0.1, 1.0, OUTPUT_STEPS)
    # criterion = WeightedMSE(weights_linear)
    criterion = nn.MSELoss()

    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

    # turn data into torch tensors
    X_train_t = torch.tensor(X_train, dtype=torch.float32)
    y_train_t = torch.tensor(y_train, dtype=torch.float32)
    train_dataset = torch.utils.data.TensorDataset(X_train_t, y_train_t)

    # use DataLoader, an iterable object from pytorch, that helps with batch training
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=100, shuffle=True)

    for epoch in range(epochs):
        running_loss = 0.0
        total = 0
        for i, data in enumerate(train_loader, 0):
            # get the inputs
            inputs, labels = data
            # the inputs are given like (batch, time, features),
            # but the model expects (batch, features, time), permutate it: 
            inputs = inputs.permute(0, 2, 1)  

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            total += inputs.shape[0]
            if i % 2000 == 1999:    # print every 2000 mini-batches
                print('[%d, %5d] loss: %.3f' %
                    (epoch + 1, i + 1, running_loss / 2000))
                running_loss = 0.0

    print('Finished Training')

    # convert test data
    X_test_t = torch.tensor(X_test, dtype=torch.float32)
    y_test_t = torch.tensor(y_test, dtype=torch.float32)

    # create a test loader
    test_loader = torch.utils.data.DataLoader(
        dataset=torch.utils.data.TensorDataset(X_test_t, y_test_t),
        batch_size=4,
        shuffle=False
    )

    # compute errors
    net.eval()
    with torch.no_grad():
        train_error = []
        for inputs, labels in train_loader:
            inputs = inputs.permute(0, 2, 1)
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            train_error.append(loss.item())

        test_error = []
        for inputs, labels in test_loader:
            inputs = inputs.permute(0, 2, 1)
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            test_error.append(loss.item())

    net.train()  # go back to training mode

    print("Train Error: %.3f, Test Error: %.3f" % (np.mean(train_error), np.mean(test_error)))


-------------- It:  24 ----------------

Step 1/5: Fetching, cleaning, and engineering features...


  df = pd.read_csv(data_url)
  df = df.fillna(method='ffill')


Step 2/5: Resampling data to hourly and setting 'Global_active_power' as target...


  df_timesampled = df.resample(self.time_sampling).agg(agg_dict) # 'T' for minute-wise, 'H' for hourly
  df_timesampled = df_timesampled.fillna(method='ffill')


Step 3/5: Splitting data and applying scaler...
Step 4/5: Creating time-series windows...
Processed 10% of data...
Processed 20% of data...
Processed 30% of data...
Processed 40% of data...
Processed 50% of data...
Processed 60% of data...
Processed 70% of data...
Processed 80% of data...
Processed 90% of data...
Processed 100% of data...
Processed 100% of data...
Processed 100% of data...
Step 5/5: Data processing complete.
Output shape: 360
Finished Training
Train Error: 0.021, Test Error: 0.012

-------------- It:  48 ----------------

Step 1/5: Fetching, cleaning, and engineering features...


  df = pd.read_csv(data_url)
  df = df.fillna(method='ffill')


Step 2/5: Resampling data to hourly and setting 'Global_active_power' as target...


  df_timesampled = df.resample(self.time_sampling).agg(agg_dict) # 'T' for minute-wise, 'H' for hourly
  df_timesampled = df_timesampled.fillna(method='ffill')


Step 3/5: Splitting data and applying scaler...
Step 4/5: Creating time-series windows...
Processed 50% of data...
Processed 100% of data...
Processed 100% of data...
Processed 100% of data...
Step 5/5: Data processing complete.
Output shape: 840
Finished Training
Train Error: 0.020, Test Error: 0.012

-------------- It:  72 ----------------

Step 1/5: Fetching, cleaning, and engineering features...


  df = pd.read_csv(data_url)
  df = df.fillna(method='ffill')


Step 2/5: Resampling data to hourly and setting 'Global_active_power' as target...


  df_timesampled = df.resample(self.time_sampling).agg(agg_dict) # 'T' for minute-wise, 'H' for hourly
  df_timesampled = df_timesampled.fillna(method='ffill')


Step 3/5: Splitting data and applying scaler...
Step 4/5: Creating time-series windows...
Processed 50% of data...
Processed 100% of data...
Processed 100% of data...
Processed 100% of data...
Step 5/5: Data processing complete.
Output shape: 1320
Finished Training
Train Error: 0.020, Test Error: 0.012

-------------- It:  96 ----------------

Step 1/5: Fetching, cleaning, and engineering features...


  df = pd.read_csv(data_url)
  df = df.fillna(method='ffill')


Step 2/5: Resampling data to hourly and setting 'Global_active_power' as target...


  df_timesampled = df.resample(self.time_sampling).agg(agg_dict) # 'T' for minute-wise, 'H' for hourly
  df_timesampled = df_timesampled.fillna(method='ffill')


Step 3/5: Splitting data and applying scaler...
Step 4/5: Creating time-series windows...
Processed 50% of data...
Processed 100% of data...
Processed 20% of data...
Processed 40% of data...
Processed 60% of data...
Processed 80% of data...
Processed 100% of data...
Processed 100% of data...
Step 5/5: Data processing complete.
Output shape: 1800
Finished Training
Train Error: 0.020, Test Error: 0.012

-------------- It:  120 ----------------

Step 1/5: Fetching, cleaning, and engineering features...


  df = pd.read_csv(data_url)
  df = df.fillna(method='ffill')


Step 2/5: Resampling data to hourly and setting 'Global_active_power' as target...


  df_timesampled = df.resample(self.time_sampling).agg(agg_dict) # 'T' for minute-wise, 'H' for hourly
  df_timesampled = df_timesampled.fillna(method='ffill')


Step 3/5: Splitting data and applying scaler...
Step 4/5: Creating time-series windows...
Processed 50% of data...
Processed 100% of data...
Processed 100% of data...
Processed 20% of data...
Processed 40% of data...
Processed 60% of data...
Processed 80% of data...
Processed 100% of data...
Step 5/5: Data processing complete.
Output shape: 2280
Finished Training
Train Error: 0.020, Test Error: 0.012

-------------- It:  144 ----------------

Step 1/5: Fetching, cleaning, and engineering features...


  df = pd.read_csv(data_url)
  df = df.fillna(method='ffill')


Step 2/5: Resampling data to hourly and setting 'Global_active_power' as target...


  df_timesampled = df.resample(self.time_sampling).agg(agg_dict) # 'T' for minute-wise, 'H' for hourly
  df_timesampled = df_timesampled.fillna(method='ffill')


Step 3/5: Splitting data and applying scaler...
Step 4/5: Creating time-series windows...
Processed 10% of data...
Processed 20% of data...
Processed 30% of data...
Processed 40% of data...
Processed 50% of data...
Processed 60% of data...
Processed 70% of data...
Processed 80% of data...
Processed 90% of data...
Processed 100% of data...
Processed 100% of data...
Processed 100% of data...
Step 5/5: Data processing complete.
Output shape: 2760
Finished Training
Train Error: 0.020, Test Error: 0.012

-------------- It:  168 ----------------

Step 1/5: Fetching, cleaning, and engineering features...


  df = pd.read_csv(data_url)
  df = df.fillna(method='ffill')


Step 2/5: Resampling data to hourly and setting 'Global_active_power' as target...


  df_timesampled = df.resample(self.time_sampling).agg(agg_dict) # 'T' for minute-wise, 'H' for hourly
  df_timesampled = df_timesampled.fillna(method='ffill')


Step 3/5: Splitting data and applying scaler...
Step 4/5: Creating time-series windows...
Processed 50% of data...
Processed 100% of data...
Processed 100% of data...
Processed 100% of data...
Step 5/5: Data processing complete.
Output shape: 3240
Finished Training
Train Error: 0.019, Test Error: 0.012


Seems like the model doesen't benefit from more information from more previous steps, let's try adjusting the kernel sizes to hopefully take advantage of the incresed data. 

In [4]:
OUTPUT_STEPS = 24
INPUT_STEPS_ARRAY = [24, 48, 72, 96, 120, 144, 168]

for INPUT_STEPS in INPUT_STEPS_ARRAY:

    print("\n-------------- It: ", INPUT_STEPS, "----------------\n")
    processor = DataProcessor(input_steps=INPUT_STEPS, output_steps=OUTPUT_STEPS, time_sampling='H') # get 24 hour sequences, and a 1 hour output
    Train, Val, Test = processor.load_and_process_data()

    X_train, y_train = Train
    X_val, y_val = Val
    X_test, y_test = Test

    kernel_size = INPUT_STEPS // 8

    class CNN(nn.Module): 
        def __init__(self):
            super(CNN, self).__init__() # Inherit from nn.Module
            # Define layers: 3 convolutions (1D) and 2 fully connected layers (very standard)
            self.conv1 = nn.Conv1d(in_channels=8, out_channels=32, kernel_size=kernel_size)  # in_channels = number of features
            self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=kernel_size) # out_channels = number of features computed for the next layer to use
            self.conv3 = nn.Conv1d(in_channels=64, out_channels=20, kernel_size=kernel_size) # kernel_size = size of the filter/window
            self.pool = nn.MaxPool1d(kernel_size=2) # reusable pool layer

            # auto-detect the flat output dimension
            with torch.no_grad():
                test_input = torch.zeros(1, 8, INPUT_STEPS) # batch size of 1, 8 features, 24 time steps
                out = self.conv1(test_input)
                out = self.conv2(out)
                out = self.conv3(out)
                flat_size = out.view(1, -1).size(1)
                
                print(f"Output shape: {flat_size}")

            self.fc1 = nn.Linear(in_features=flat_size, out_features=256) # here the in_features are 20 but that's just a coincidence, ask chatgpt to compute it for you
            self.fc2 = nn.Linear(in_features=256, out_features=OUTPUT_STEPS)
        
        def forward(self, x): # define how the layers are going to interact
            x = F.relu(self.conv1(x))
            x = F.relu(self.conv2(x))
            x = F.relu(self.conv3(x))
            x = x.view(x.size(0), -1) # flatten the output of the last convolution to be given into a regular linear layer
            x = F.relu(self.fc1(x))
            x = self.fc2(x)
            return x


    # train
    epochs = 20

    net = CNN()

    # Linear ramp: small at t=0 up to 1 at t=T-1
    weights_linear = np.linspace(0.1, 1.0, OUTPUT_STEPS)
    # criterion = WeightedMSE(weights_linear)
    criterion = nn.MSELoss()

    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

    # turn data into torch tensors
    X_train_t = torch.tensor(X_train, dtype=torch.float32)
    y_train_t = torch.tensor(y_train, dtype=torch.float32)
    train_dataset = torch.utils.data.TensorDataset(X_train_t, y_train_t)

    # use DataLoader, an iterable object from pytorch, that helps with batch training
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=100, shuffle=True)

    for epoch in range(epochs):
        running_loss = 0.0
        total = 0
        for i, data in enumerate(train_loader, 0):
            # get the inputs
            inputs, labels = data
            # the inputs are given like (batch, time, features),
            # but the model expects (batch, features, time), permutate it: 
            inputs = inputs.permute(0, 2, 1)  

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            total += inputs.shape[0]
            if i % 2000 == 1999:    # print every 2000 mini-batches
                print('[%d, %5d] loss: %.3f' %
                    (epoch + 1, i + 1, running_loss / 2000))
                running_loss = 0.0

    print('Finished Training')

    # convert test data
    X_test_t = torch.tensor(X_test, dtype=torch.float32)
    y_test_t = torch.tensor(y_test, dtype=torch.float32)

    # create a test loader
    test_loader = torch.utils.data.DataLoader(
        dataset=torch.utils.data.TensorDataset(X_test_t, y_test_t),
        batch_size=4,
        shuffle=False
    )

    # compute errors
    net.eval()
    with torch.no_grad():
        train_error = []
        for inputs, labels in train_loader:
            inputs = inputs.permute(0, 2, 1)
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            train_error.append(loss.item())

        test_error = []
        for inputs, labels in test_loader:
            inputs = inputs.permute(0, 2, 1)
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            test_error.append(loss.item())

    net.train()  # go back to training mode

    print("Train Error: %.3f, Test Error: %.3f" % (np.mean(train_error), np.mean(test_error)))


-------------- It:  24 ----------------

Step 1/5: Fetching, cleaning, and engineering features...


  df = pd.read_csv(data_url)
  df = df.fillna(method='ffill')


Step 2/5: Resampling data to hourly and setting 'Global_active_power' as target...


  df_timesampled = df.resample(self.time_sampling).agg(agg_dict) # 'T' for minute-wise, 'H' for hourly
  df_timesampled = df_timesampled.fillna(method='ffill')


Step 3/5: Splitting data and applying scaler...
Step 4/5: Creating time-series windows...
Processed 10% of data...
Processed 20% of data...
Processed 30% of data...
Processed 40% of data...
Processed 50% of data...
Processed 60% of data...
Processed 70% of data...
Processed 80% of data...
Processed 90% of data...
Processed 100% of data...
Processed 100% of data...
Processed 100% of data...
Step 5/5: Data processing complete.
Output shape: 360
Finished Training
Train Error: 0.021, Test Error: 0.013

-------------- It:  48 ----------------

Step 1/5: Fetching, cleaning, and engineering features...


  df = pd.read_csv(data_url)
  df = df.fillna(method='ffill')


Step 2/5: Resampling data to hourly and setting 'Global_active_power' as target...


  df_timesampled = df.resample(self.time_sampling).agg(agg_dict) # 'T' for minute-wise, 'H' for hourly
  df_timesampled = df_timesampled.fillna(method='ffill')


Step 3/5: Splitting data and applying scaler...
Step 4/5: Creating time-series windows...
Processed 50% of data...
Processed 100% of data...
Processed 100% of data...
Processed 100% of data...
Step 5/5: Data processing complete.
Output shape: 660
Finished Training
Train Error: 0.020, Test Error: 0.012

-------------- It:  72 ----------------

Step 1/5: Fetching, cleaning, and engineering features...


  df = pd.read_csv(data_url)
  df = df.fillna(method='ffill')


Step 2/5: Resampling data to hourly and setting 'Global_active_power' as target...


  df_timesampled = df.resample(self.time_sampling).agg(agg_dict) # 'T' for minute-wise, 'H' for hourly
  df_timesampled = df_timesampled.fillna(method='ffill')


Step 3/5: Splitting data and applying scaler...
Step 4/5: Creating time-series windows...
Processed 50% of data...
Processed 100% of data...
Processed 100% of data...
Processed 100% of data...
Step 5/5: Data processing complete.
Output shape: 960
Finished Training
Train Error: 0.019, Test Error: 0.011

-------------- It:  96 ----------------

Step 1/5: Fetching, cleaning, and engineering features...


  df = pd.read_csv(data_url)
  df = df.fillna(method='ffill')


Step 2/5: Resampling data to hourly and setting 'Global_active_power' as target...


  df_timesampled = df.resample(self.time_sampling).agg(agg_dict) # 'T' for minute-wise, 'H' for hourly
  df_timesampled = df_timesampled.fillna(method='ffill')


Step 3/5: Splitting data and applying scaler...
Step 4/5: Creating time-series windows...
Processed 50% of data...
Processed 100% of data...
Processed 20% of data...
Processed 40% of data...
Processed 60% of data...
Processed 80% of data...
Processed 100% of data...
Processed 100% of data...
Step 5/5: Data processing complete.
Output shape: 1260
Finished Training
Train Error: 0.019, Test Error: 0.012

-------------- It:  120 ----------------

Step 1/5: Fetching, cleaning, and engineering features...


  df = pd.read_csv(data_url)
  df = df.fillna(method='ffill')


Step 2/5: Resampling data to hourly and setting 'Global_active_power' as target...


  df_timesampled = df.resample(self.time_sampling).agg(agg_dict) # 'T' for minute-wise, 'H' for hourly
  df_timesampled = df_timesampled.fillna(method='ffill')


Step 3/5: Splitting data and applying scaler...
Step 4/5: Creating time-series windows...
Processed 50% of data...
Processed 100% of data...
Processed 100% of data...
Processed 20% of data...
Processed 40% of data...
Processed 60% of data...
Processed 80% of data...
Processed 100% of data...
Step 5/5: Data processing complete.
Output shape: 1560
Finished Training
Train Error: 0.019, Test Error: 0.012

-------------- It:  144 ----------------

Step 1/5: Fetching, cleaning, and engineering features...


  df = pd.read_csv(data_url)
  df = df.fillna(method='ffill')


Step 2/5: Resampling data to hourly and setting 'Global_active_power' as target...


  df_timesampled = df.resample(self.time_sampling).agg(agg_dict) # 'T' for minute-wise, 'H' for hourly
  df_timesampled = df_timesampled.fillna(method='ffill')


Step 3/5: Splitting data and applying scaler...
Step 4/5: Creating time-series windows...
Processed 10% of data...
Processed 20% of data...
Processed 30% of data...
Processed 40% of data...
Processed 50% of data...
Processed 60% of data...
Processed 70% of data...
Processed 80% of data...
Processed 90% of data...
Processed 100% of data...
Processed 100% of data...
Processed 100% of data...
Step 5/5: Data processing complete.
Output shape: 1860
Finished Training
Train Error: 0.019, Test Error: 0.011

-------------- It:  168 ----------------

Step 1/5: Fetching, cleaning, and engineering features...


  df = pd.read_csv(data_url)
  df = df.fillna(method='ffill')


Step 2/5: Resampling data to hourly and setting 'Global_active_power' as target...


  df_timesampled = df.resample(self.time_sampling).agg(agg_dict) # 'T' for minute-wise, 'H' for hourly
  df_timesampled = df_timesampled.fillna(method='ffill')


Step 3/5: Splitting data and applying scaler...
Step 4/5: Creating time-series windows...
Processed 50% of data...
Processed 100% of data...
Processed 100% of data...
Processed 100% of data...
Step 5/5: Data processing complete.
Output shape: 2160
Finished Training
Train Error: 0.018, Test Error: 0.011
