# This notebook trains an NHiTS model on ten buildings only. "all-data" contains the data of all the buildings in the original dataset.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from torchmetrics import MeanSquaredError,SymmetricMeanAbsolutePercentageError
from torch.utils.data import TensorDataset, DataLoader, Dataset
from sklearn.preprocessing import MinMaxScaler
import time
from memory_profiler import memory_usage
import resource
import pickle

def get_memory_usage():
    """Return the peak resident set size of this process, in MB.

    The value comes from ``ru_maxrss``, which is a high-water mark rather
    than the memory in use right now: it never decreases, so the difference
    between two readings is how much the peak grew in between.

    :return: peak resident set size in MB (float)
    """
    import sys

    peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    # ru_maxrss is reported in kilobytes on Linux but in bytes on macOS.
    divisor = 1024 * 1024 if sys.platform == "darwin" else 1024
    return peak / divisor

class NHiTSBlock(nn.Module):
    """One fully connected block producing a backcast and a forecast.

    The input goes through a stack of ``Linear`` + ReLU layers; two separate
    linear heads then map the last hidden activation to a backcast (same
    width as the input) and a forecast (``output_size`` wide).

    :param input_size: number of input features (also the backcast width)
    :param output_size: number of forecast values
    :param num_hidden: width of every hidden layer
    :param num_layers: number of hidden layers; at least one is always built
    """

    def __init__(self, input_size, output_size, num_hidden, num_layers):
        super().__init__()
        # The first layer adapts the input width; the rest keep the hidden width.
        layers = [nn.Linear(input_size, num_hidden)]
        for _ in range(num_layers - 1):
            layers.append(nn.Linear(num_hidden, num_hidden))
        self.hidden = nn.ModuleList(layers)
        self.theta_b = nn.Linear(num_hidden, input_size)
        self.theta_f = nn.Linear(num_hidden, output_size)

    def forward(self, x):
        """Return ``(backcast, forecast)`` for a batch ``x``."""
        features = x
        for dense in self.hidden:
            features = dense(features).relu()
        return self.theta_b(features), self.theta_f(features)

class NHiTS(nn.Module):
    """Stack of ``NHiTSBlock`` modules whose forecasts are summed.

    Each block receives what is left of the input after the previous blocks'
    backcasts have been subtracted, and contributes its own forecast to the
    running total.

    :param input_size: number of input features per sample
    :param output_size: number of forecast values per sample
    :param num_blocks: how many blocks to stack (``forward`` reads the first
        block, so at least one is required)
    :param num_hidden: hidden width inside every block
    :param num_layers: hidden layers inside every block
    """

    def __init__(self, input_size, output_size, num_blocks, num_hidden, num_layers):
        super().__init__()
        stack = []
        for _ in range(num_blocks):
            stack.append(NHiTSBlock(input_size, output_size, num_hidden, num_layers))
        self.blocks = nn.ModuleList(stack)

    def forward(self, x):
        """Return the summed forecast of all blocks for a batch ``x``."""
        batch = x.size(0)
        horizon = self.blocks[0].theta_f.out_features
        total = torch.zeros((batch, horizon), device=x.device)
        residual = x
        for blk in self.blocks:
            explained, partial = blk(residual)
            # Pass on only the part of the input this block did not account for.
            residual = residual - explained
            total = total + partial
        return total

# Model hyperparameters; create_model() passes them to NHiTS.
input_size = 10  # Length of input time series
output_size = 1  # Length of output time series (forecast)
num_blocks = 80  # Number of NHiTSBlock modules stacked in the model
num_hidden = 512  # Width of every hidden Linear layer inside a block
num_layers = 20  # Number of hidden Linear layers inside a block


# Factory kept as a function so model construction can be measured on its own
def create_model():
    """Build an ``NHiTS`` network from the module-level hyperparameters."""
    return NHiTS(
        input_size,
        output_size,
        num_blocks,
        num_hidden,
        num_layers,
    )

# Build the model and report the difference between the two
# get_memory_usage() readings taken around its construction.
start_mem = get_memory_usage()
model = create_model()
end_mem = get_memory_usage()
print(f"Memory used for model creation: {end_mem - start_mem} MB")
# Pick the GPU when CUDA is available, otherwise the CPU, and move the model there
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

def sMAPE(outputs, targets):
    """
    Symmetric Mean Absolute Percentage Error (sMAPE) for evaluating the model.
    It is the sum of the absolute difference between the predicted and actual values divided by the average of
    the predicted and actual value, therefore giving a percentage measuring the amount of error :
    100/n * sum(|F_t - A_t| / ((|F_t| + |A_t|) / 2)) with t = 1 to n

    A term whose prediction and target are both zero contributes 0 (a perfect
    prediction) instead of the NaN that 0/0 would produce.

    :param outputs: predicted values
    :param targets: real values; ``len(targets)`` is used as n
    :return: sMAPE as a scalar tensor, in percent
    """
    abs_error = torch.abs(outputs - targets)
    denominator = torch.abs(outputs) + torch.abs(targets)
    # The denominator is zero only when both values are zero, in which case
    # abs_error is zero too; dividing by 1 there yields the intended 0 term.
    safe_denominator = torch.where(
        denominator == 0, torch.ones_like(denominator), denominator
    )
    return 100 / len(targets) * torch.sum(2 * abs_error / safe_denominator)

def create_sequences(data, seq_length):
    """Cut ``data`` into overlapping windows and their next-step targets.

    For every start position ``s`` with ``s + seq_length < len(data)`` the
    window is ``data[s:s + seq_length]`` and the target is
    ``data[s + seq_length]``.

    :param data: indexable sequence (e.g. a NumPy array) of observations
    :param seq_length: number of consecutive observations per window
    :return: tuple ``(windows, targets)`` of NumPy arrays
    """
    starts = range(len(data) - seq_length)
    windows = np.array([data[s:s + seq_length] for s in starts])
    targets = np.array([data[s + seq_length] for s in starts])
    return windows, targets

class Data(Dataset):
    """Dataset pairing each entry of ``x_data`` with the entry of ``y_data`` at the same index.

    :param x_data: indexable collection of inputs; its length is the dataset length
    :param y_data: indexable collection of targets, indexed in parallel with ``x_data``
    """

    def __init__(self, x_data, y_data):
        self.x_data, self.y_data = x_data, y_data

    def __len__(self):
        return len(self.x_data)

    def __getitem__(self, index):
        sample = self.x_data[index]
        target = self.y_data[index]
        return sample, target

# Load and preprocess data

file_path3 = '/kaggle/input/all-data/residential_all.pkl'
# NOTE: unpickling executes code stored in the file; only load pickles from a trusted source.
df3 = pd.read_pickle(file_path3)
print("Datra loaded")
# Keep the buildings with ID <= 1050. The explicit .copy() makes the filtered
# frame independent, so the column assignments below do not trigger pandas'
# SettingWithCopyWarning.
df3 = df3[df3["ID"] <= 1050].copy()
df3["ID"] = df3["ID"].astype("category")
df3["time_code"] = df3["time_code"].astype("uint16")
# Building 1003 is kept aside as the series used for the final plot.
df_test = df3[df3["ID"] == 1003].copy()
scaler = MinMaxScaler()
df3 = df3.set_index(["date_time","ID"])
df_test = df_test.set_index(["date_time","ID"])
# NOTE(review): the scaler is fitted on every row of df3, including the dates
# later used for testing - confirm this is intended.
df3['consumption'] = scaler.fit_transform(df3[['consumption']])
df_test['consumption'] = scaler.transform(df_test[['consumption']])
# Only the scaled consumption column is needed from here on.
df3 = df3[["consumption"]]
df_test = df_test[["consumption"]]

def resample_building_data(group):
    """Aggregate one building's rows to hourly consumption sums.

    :param group: frame indexed by a datetime level plus an ``ID`` level,
        with a ``consumption`` column
    :return: frame of hourly ``consumption`` sums, indexed by the datetime
        level and ``ID``
    """
    by_time = group.reset_index(level='ID')
    # Every row of the group is taken to share the first row's ID.
    building_id = by_time['ID'].iloc[0]
    hourly = by_time.resample('h').agg({'consumption': 'sum'})
    return hourly.assign(ID=building_id).set_index('ID', append=True)


# Group by the 'ID' index level, resample each building to hourly sums via
# resample_building_data, then sort the combined result by its index.
df3 = df3.groupby('ID', group_keys=False, observed=True).apply(resample_building_data)
df3 = df3.sort_index()
# Same treatment for the single-building frame used for plotting.
df_test = df_test.groupby('ID', group_keys=False, observed=True).apply(resample_building_data)
df_test = df_test.sort_index()

# Training configuration.
# Use the GPU when one is available and fall back to the CPU otherwise; a
# hard-coded "cuda" makes every later .to(device) fail on CPU-only machines.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
lr = 0.001  # Learning rate passed to the optimizer
n_epochs = 2  # Number of passes over the training loader
# Observations per input window; the windows are flattened before being fed
# to the model, so this must agree with the model's input_size.
window_size = 10

# Build (window, next value) pairs for the training span, the test span, and
# the single-building frame over the test span.
# NOTE(review): df3 was sorted by its (date_time, ID) index, so consecutive
# rows of .values may belong to different buildings and a window may mix
# several buildings - confirm this is intended.
# NOTE(review): both date slices name 2010-12-15 as a boundary; if the slice
# is inclusive on both ends that day is in train and test - confirm.
train_data, train_labels = create_sequences(df3["2009-07-14":"2010-12-15"].values, window_size)
test_data, test_labels = create_sequences(df3["2010-12-15":"2011-01-01"].values, window_size)
sample_data, sample_labels = create_sequences(df_test["2010-12-15":"2011-01-01"].values, window_size)

# Wrap the NumPy arrays as float tensors in the Data dataset.
train_dataset = Data(torch.FloatTensor(train_data), torch.FloatTensor(train_labels))
test_dataset = Data(torch.FloatTensor(test_data), torch.FloatTensor(test_labels))
sample_dataset = Data(torch.FloatTensor(sample_data), torch.FloatTensor(sample_labels))

# Training batches are shuffled and the last incomplete batch is dropped;
# evaluation keeps the original order. The sample loader yields one window
# at a time.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=1024, drop_last=True)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=1024)
sample_loader = DataLoader(sample_dataset, shuffle=False, batch_size=1)

def train_function(net, criterion, optimizer, train_loader, n_epochs=5, device=torch.device("cpu")):
    """Train ``net`` for ``n_epochs`` passes over ``train_loader``.

    After every epoch the mean batch loss is fed to a ``ReduceLROnPlateau``
    scheduler and ``[net, optimizer]`` is pickled to
    ``modelcheckpoint{epoch}.pickle`` in the current working directory.

    :param net: model to train; expected to already live on ``device``
    :param criterion: loss called as ``criterion(outputs, labels)``
    :param optimizer: optimizer over ``net``'s parameters
    :param train_loader: iterable yielding ``(seqs, labels)`` batches
    :param n_epochs: number of passes over ``train_loader``
    :param device: device the batches are moved to
    :return: the trained ``net`` (same object that was passed in)
    :raises ZeroDivisionError: if ``train_loader`` yields no batch
    """
    from torch.optim.lr_scheduler import ReduceLROnPlateau

    scheduler_options = {"threshold": 0.1, "patience": 3, "factor": 0.5}
    try:
        scheduler = ReduceLROnPlateau(optimizer, 'min', verbose=True, **scheduler_options)
    except TypeError:
        # `verbose` is deprecated in recent PyTorch; fall back to building the
        # scheduler without it in case the installed release rejects it.
        scheduler = ReduceLROnPlateau(optimizer, 'min', **scheduler_options)

    # Make sure train-only layers (dropout, batch norm, ...) are in training mode.
    net.train()
    for epoch in range(n_epochs):
        epoch_loss = 0
        counter = 0
        for seqs, labels in train_loader:
            counter += 1
            seqs, labels = seqs.float().to(device), labels.float().to(device)
            seqs = seqs.view(seqs.size(0), -1)  # Flatten each window to one feature vector
            outputs = net(seqs)
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Accumulate a detached copy: adding `loss` itself would keep the
            # autograd history of every batch alive until the epoch ends.
            epoch_loss += loss.detach()
            if counter % 50 == 0:
                print(f"Batch Number {counter} {loss.item()}")
        scheduler.step(float(epoch_loss / counter))
        # The value printed here is the loss of the last batch of the epoch.
        print(f"Epoch {epoch + 1}, Loss: {loss.item()}")
        with open(f'modelcheckpoint{epoch}.pickle', 'wb') as handle:
            pickle.dump([net, optimizer], handle, protocol=pickle.HIGHEST_PROTOCOL)
    return net

def test_function(net, dataloader_test, scaler, label_scaler, device=torch.device("cuda"),return_data=False):
    """Evaluate ``net`` on ``dataloader_test`` and print MSE and sMAPE.

    :param net: model to evaluate; it is switched to eval mode
    :param dataloader_test: iterable yielding ``(seqs, labels)`` batches
    :param scaler: object whose ``inverse_transform`` is applied to the
        predictions; only used when ``label_scaler`` is truthy
    :param label_scaler: object whose ``inverse_transform`` is applied to the
        labels; pass ``None`` to evaluate on the values as they are
    :param device: device used for the batches and the metrics
    :param return_data: when true, also return the predictions and targets
    :return: ``None`` unless ``return_data`` is true, in which case
        ``(outputs, targets, mse)`` where ``outputs`` and ``targets`` are 1-D
        tensors holding every prediction / target in loader order
    """
    def _flatten_batches(chunks):
        # torch.tensor(list_of_tensors) only accepts one-element tensors,
        # i.e. batch_size == 1; concatenating works for any batch size.
        if not chunks:
            return torch.tensor([], device=device)
        return torch.cat([chunk.reshape(-1) for chunk in chunks]).to(device)

    mse = MeanSquaredError().to(device)
    smape = SymmetricMeanAbsolutePercentageError().to(device)
    net.eval()
    list_outputs = []
    list_targets = []
    with torch.no_grad():  # no gradients are needed for evaluation
        for seqs, labels in dataloader_test:
            # Move data to device
            seqs, labels = seqs.float().to(device), labels.float().to(device)
            # Flatten each window to one feature vector before the forward pass
            seqs = seqs.view(seqs.size(0), -1)
            outputs = net(seqs)

            if label_scaler:
                # The scalers work on NumPy arrays, so hand them host-memory
                # copies instead of (possibly CUDA) tensors.
                outputs = torch.tensor(scaler.inverse_transform(outputs.cpu().numpy()),
                                       device=device)
                labels = torch.tensor(label_scaler.inverse_transform(labels.cpu().numpy()),
                                      device=device)

            outputs = outputs.squeeze()
            labels = labels.squeeze()

            # Update the running metrics with this batch
            mse(outputs, labels)
            smape(outputs, labels)
            list_targets.append(labels.detach())
            list_outputs.append(outputs.detach())
    test_mse = mse.compute()
    test_smape = smape.compute()
    print(f"Test MSE: {test_mse} , SMAPE {test_smape}")
    if return_data:
        return _flatten_batches(list_outputs), _flatten_batches(list_targets), test_mse

# Mean-squared-error loss and an Adam optimizer over the model parameters.
criterion = nn.MSELoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# Optional: resume from a checkpoint written by train_function.
#with open('/kaggle/input/model1/modelcheckpoint1.pickle', 'rb') as handle:
#    model, optimizer = pickle.load(handle)
print("Model Started")

# Train the model, reporting wall-clock time and the difference between the
# get_memory_usage() readings taken before and after training.
start_mem = get_memory_usage()
time1=time.time()
net = train_function(model,
                        criterion,
                        optimizer,
                        train_loader,
                        n_epochs=n_epochs,
                        device=device)
time2=time.time()
print("training time is ",time2-time1)
end_mem = get_memory_usage()
print(f"Memory used for model training: {end_mem - start_mem} MB")



# Evaluate on the GPU when one exists and on the CPU otherwise; a hard-coded
# "cuda" raises on machines without a CUDA device.
eval_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
net.to(eval_device)
# Time one full pass over the test loader.
time3=time.time()
# list_outputs, list_targets, test_mse = test_function(net, test_loader, None, None, eval_device)
test_function(net, test_loader, None, None, eval_device)
time4=time.time()
print("inference time is ",time4-time3)

# s_mape = round(sMAPE(list_outputs, list_targets).cpu().item(), 3)
# print(f"sMAPE: {s_mape}%")
# Predictions and targets for the single-building sample, one window per batch.
sample_outputs, sample_targets, sample_mse = test_function(net, sample_loader, None, None, eval_device, True)
# Visualizations
plt.plot(sample_outputs.to("cpu"), "-o", color="blue", label="N-BEATS Predictions", markersize=3)
plt.plot(sample_targets.to("cpu"), color="red", label="Actual")
plt.ylabel("Energy Consumption (MW)")
plt.title("Energy Consumption for Electricity state building number 1003")
plt.legend()
plt.show()
