In [None]:
import shutil
import os

def copy_folders(source_dir, dest_dir, folders):
    """Copy selected sub-folders of ``source_dir`` into ``dest_dir``.

    Args:
        source_dir: Directory that contains the folders to copy.
        dest_dir: Directory that receives the copies (created on demand by
            ``shutil.copytree``).
        folders: Iterable of folder names, each relative to ``source_dir``.

    A name that is missing from ``source_dir``, or that is not a directory,
    is reported and skipped. An already existing destination folder is merged
    into instead of raising ``FileExistsError``, so re-running the cell is safe.
    """
    for folder in folders:
        src_folder_path = os.path.join(source_dir, folder)
        dest_folder_path = os.path.join(dest_dir, folder)

        # copytree only accepts directories; a plain file with the same name
        # would otherwise raise NotADirectoryError.
        if os.path.isdir(src_folder_path):
            # dirs_exist_ok=True (Python 3.8+) makes the copy idempotent.
            shutil.copytree(src_folder_path, dest_folder_path, dirs_exist_ok=True)
            print(f"Copied {src_folder_path} to {dest_folder_path}")
        else:
            print(f"Source folder {src_folder_path} does not exist.")

# Source and destination dataset roots (relative paths, resolved against the
# current working directory)
source_directory = '../pretraining-forecasting/dataset'
destination_directory = './dataset'

# Names of the split folders to copy from the source root
folders_to_copy = ['train', 'val', 'test']

# Copy each listed folder from the source root into the destination root
copy_folders(source_directory, destination_directory, folders_to_copy)


In [18]:
import numpy as np
import pandas as pd

import os
import sys
sys.path.append('./model')

import torch
from torch.utils.data import Dataset, DataLoader
from model.nbeats_ttm import NBeatsNet
from torch.utils.data import ConcatDataset

from tqdm import tqdm
# from sklearn.metrics import mean_squared_error



# metrics used for evaluation
def cal_cvrmse(pred, true, eps=1e-8):
    """Coefficient of variation of the RMSE: ``RMSE(pred, true) / mean(true)``.

    Args:
        pred: Predicted values (array-like).
        true: Ground-truth values, same shape as ``pred``.
        eps: Small constant added to the mean to avoid division by zero.

    Returns:
        The CV(RMSE) as a ratio (not multiplied by 100).
    """
    pred = np.asarray(pred)
    true = np.asarray(true)
    # Normalise by the total number of elements rather than the length of the
    # first axis, so multi-dimensional inputs give the same result as their
    # flattened form (1-D inputs are unaffected).
    n = pred.size
    rmse = np.power(np.square(pred - true).sum() / n, 0.5)
    return rmse / (true.sum() / n + eps)

def cal_mae(pred, true):
    """Mean absolute error between ``pred`` and ``true`` over all elements."""
    abs_err = np.abs(np.array(pred) - np.array(true))
    return np.mean(abs_err)

def cal_nrmse(pred, true, eps=1e-8):
    """Normalised RMSE in percent: ``100 * RMSE(pred, true) / mean(true)``.

    Args:
        pred: Predicted values (array-like).
        true: Ground-truth values, same shape as ``pred``.
        eps: Small constant added to the mean to avoid division by zero.

    Returns:
        The NRMSE as a percentage.

    Raises:
        ValueError: If ``true`` is empty.
    """
    true = np.asarray(true)
    pred = np.asarray(pred)

    # Use the real number of samples. The previous ``24 * (len(true) // 24)``
    # divisor equals this only when the length is a multiple of 24; it was
    # zero (ZeroDivisionError) for fewer than 24 values.
    n = true.size
    if n == 0:
        raise ValueError("cal_nrmse requires at least one value")

    y_bar = np.mean(true)
    NRMSE = 100 * (1 / (y_bar + eps)) * np.sqrt((1 / n) * np.sum((true - pred) ** 2))
    return NRMSE








def standardize_series(series, eps=1e-8):
    """Z-score a series and return it together with its statistics.

    Returns:
        A tuple ``(standardized, mean, std)`` where ``standardized`` is
        ``(series - mean) / (std + eps)``.
    """
    center = np.mean(series)
    spread = np.std(series)
    return (series - center) / (spread + eps), center, spread

def unscale_predictions(predictions, mean, std, eps=1e-8):
    """Invert the standardization: ``predictions * (std + eps) + mean``."""
    scale = std + eps
    return predictions * scale + mean


class TimeSeriesDataset(Dataset):
    """Sliding-window dataset over one standardized time series.

    Item ``i`` is the pair ``(x, y)`` where ``x`` holds ``backcast_length``
    consecutive values starting at ``i * stride`` and ``y`` holds the
    ``forecast_length`` values that immediately follow. Both are returned as
    ``float32`` tensors.
    """

    def __init__(self, data, backcast_length, forecast_length, stride=1):
        # Standardize the time series data; mean/std are kept so callers can
        # map predictions back to the original scale.
        self.data, self.mean, self.std = standardize_series(data)
        self.backcast_length = backcast_length
        self.forecast_length = forecast_length
        self.stride = stride

    def __len__(self):
        # Number of complete (backcast + forecast) windows. A series shorter
        # than one window yields 0 instead of a negative value, which would
        # make len() raise ValueError.
        usable = len(self.data) - self.backcast_length - self.forecast_length
        if usable < 0:
            return 0
        return usable // self.stride + 1

    def __getitem__(self, index):
        n_windows = len(self)
        if index < 0:
            index += n_windows
        # Raising IndexError keeps windows full-sized and lets plain iteration
        # over the dataset terminate.
        if not 0 <= index < n_windows:
            raise IndexError("TimeSeriesDataset index out of range")

        start_index = index * self.stride
        split_index = start_index + self.backcast_length
        x = self.data[start_index:split_index]
        y = self.data[split_index:split_index + self.forecast_length]
        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)



def test(model, criterion, device, folder_path, result_path):
    """Evaluate ``model`` on every building CSV under ``folder_path``.

    Expected layout is ``folder_path/<region>/<building>.csv``; each CSV must
    have an ``energy`` column. For every region a ``result.csv`` holding the
    per-building CVRMSE, NRMSE, MAE (computed on unscaled values) and the mean
    loss is written to ``result_path/<region>/``. The per-region median NRMSE
    is written to ``result_path/median_buildings_results.csv``.

    Args:
        model: Network called as ``model(x)`` and returning
            ``(backcast, forecast)``.
        criterion: Loss called as ``criterion(forecast, y)``.
        device: Device the input tensors are moved to.
        folder_path: Root directory of the test dataset.
        result_path: Root directory for the result CSV files.

    NOTE: ``backcast_length``, ``forecast_length`` and ``stride`` are read
    from module-level globals and must be defined before this is called.
    """
    os.makedirs(result_path, exist_ok=True)
    model.eval()

    median_res = []
    for region in os.listdir(folder_path):
        region_path = os.path.join(folder_path, region)
        # Ignore stray files (e.g. OS metadata) next to the region folders.
        if not os.path.isdir(region_path):
            continue

        results_path = os.path.join(result_path, region)
        os.makedirs(results_path, exist_ok=True)

        res = []
        for building in os.listdir(region_path):
            if not building.endswith('.csv'):
                continue

            building_id = building.rsplit(".csv", 1)[0]
            building_df = pd.read_csv(os.path.join(region_path, building))
            energy_data = building_df['energy'].values

            # A series shorter than one window produces no samples; without
            # this guard np.concatenate below would fail on an empty list.
            if len(energy_data) < backcast_length + forecast_length:
                print(f"Skipping {building}: series too short for one window.")
                continue

            dataset = TimeSeriesDataset(energy_data, backcast_length, forecast_length, stride)

            val_losses = []
            y_true_test = []
            y_pred_test = []

            with torch.no_grad():
                for x_test, y_test in DataLoader(dataset, batch_size=1):
                    x_test, y_test = x_test.to(device), y_test.to(device)
                    _, forecast = model(x_test)
                    val_losses.append(criterion(forecast, y_test).item())

                    # extend() adds one 1-D array per sample in the batch
                    y_true_test.extend(y_test.cpu().numpy())
                    y_pred_test.extend(forecast.cpu().numpy())

            # Flatten all windows into one 1-D series per building
            y_true_combine = np.concatenate(y_true_test, axis=0)
            y_pred_combine = np.concatenate(y_pred_test, axis=0)
            avg_test_loss = np.mean(val_losses)

            # Metrics are computed on the original (unscaled) values
            y_pred_combine_unscaled = unscale_predictions(y_pred_combine, dataset.mean, dataset.std)
            y_true_combine_unscaled = unscale_predictions(y_true_combine, dataset.mean, dataset.std)

            cvrmse = cal_cvrmse(y_pred_combine_unscaled, y_true_combine_unscaled)
            nrmse = cal_nrmse(y_pred_combine_unscaled, y_true_combine_unscaled)
            mae = cal_mae(y_pred_combine_unscaled, y_true_combine_unscaled)

            res.append([building_id, cvrmse, nrmse, mae, avg_test_loss])

        columns = ['building_ID', 'CVRMSE', 'NRMSE', 'MAE', 'Avg_Test_Loss']
        region_df = pd.DataFrame(res, columns=columns)
        region_df.to_csv(os.path.join(results_path, 'result.csv'), index=False)

        med_nrmse = region_df['NRMSE'].median()
        median_res.append([region, med_nrmse])

    med_columns = ['Dataset', 'NRMSE']
    median_df = pd.DataFrame(median_res, columns=med_columns)
    # Honour result_path instead of a hard-coded results directory.
    median_df.to_csv(os.path.join(result_path, "median_buildings_results.csv"), index=False)




                




if __name__ == '__main__':

    # Windowing parameters. test() reads backcast_length, forecast_length and
    # stride as module-level globals, so these names must stay as they are.
    backcast_length = 168
    forecast_length = 24
    stride = 24
    batch_size = 64  # NOTE(review): unused in this cell; test() uses batch_size=1

    # Patch configuration passed to the model
    patch_size = 8
    num_patches = backcast_length // patch_size

    # Use the GPU when one is available
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Define N-BEATS model
    model = NBeatsNet(
        device=device,
        forecast_length=forecast_length,
        backcast_length=backcast_length,
        patch_size=patch_size,
        num_patches=num_patches,
        hidden_dim=256
    ).to(device)

    # map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
    # NOTE(review): torch.load unpickles the file; for checkpoints from an
    # untrusted source consider passing weights_only=True.
    model.load_state_dict(
        torch.load('./model_weights/nbeats_ttm/best_model.pth', map_location=device)
    )

    # Loss used to report the average test loss
    criterion = torch.nn.MSELoss()

    test_dataset_path = './dataset/test'
    result_path = './results/nbeats_ttm'

    # Evaluate the loaded model on the test set and write the result CSVs
    test(model=model, criterion=criterion, device=device, folder_path=test_dataset_path, result_path=result_path)


  model.load_state_dict(torch.load('./model_weights/nbeats_ttm/best_model.pth'))


In [6]:
# Scratch check: list.extend() on a 2-D array appends each row as its own
# 1-D array (the same pattern test() uses to collect batch outputs).
a = np.array([[2,3,4,5]])
b = np.array([[4,5,6,7]])
li = []
li.extend(a)
li.extend(b)
li

[array([2, 3, 4, 5]), array([4, 5, 6, 7])]

In [8]:


# Concatenate the 1-D row arrays end to end (axis=0) into one flat array
final_array = np.concatenate(li, axis=0)

print(final_array.shape)  # 1-D shape (total number of elements,), i.e. (8,) here

(8,)


In [14]:
import pandas as pd
import os

def convert(input_path, output_path):
    """Mirror a ``<split>/<place>/<building>.csv`` tree as Parquet files.

    Every CSV found at ``input_path/<split>/<place>/`` is read with pandas and
    written to ``output_path/<split>/<place>/<building>.parquet``.

    Args:
        input_path: Root directory of the CSV dataset.
        output_path: Root directory for the Parquet copy; sub-directories are
            created as needed.

    Entries that are not directories at the split/place level, and files
    without a ``.csv`` extension, are skipped.
    """
    for split in os.listdir(input_path):
        split_path = os.path.join(input_path, split)
        # Stray files at this level would make os.listdir below fail.
        if not os.path.isdir(split_path):
            continue

        for places in os.listdir(split_path):
            place_path = os.path.join(split_path, places)
            if not os.path.isdir(place_path):
                continue

            des_path = os.path.join(output_path, split, places)

            for building in os.listdir(place_path):
                # Only convert CSV files; slicing off the last four characters
                # would mangle the name of anything else.
                building_name, extension = os.path.splitext(building)
                if extension.lower() != '.csv':
                    continue

                os.makedirs(des_path, exist_ok=True)

                # Define the input and output file paths
                csv_file_path = os.path.join(place_path, building)
                parquet_file_path = os.path.join(des_path, '{}.parquet'.format(building_name))

                # Read the CSV file and write it back in Parquet format
                df = pd.read_csv(csv_file_path)
                df.to_parquet(parquet_file_path, engine='pyarrow', index=False)

                print(f"File converted successfully: {parquet_file_path}")



# Root of the CSV dataset to read and root of the Parquet copy to write
input_path = '../../mixBEATS/dataset'
output_path = './parquet_dataset'
convert(input_path, output_path)

File converted successfully: ./parquet_dataset/test/Prayas/eMARC-1H_H134.parquet
File converted successfully: ./parquet_dataset/test/Prayas/eMARC-1H_H010.parquet
File converted successfully: ./parquet_dataset/test/Prayas/eMARC-1H_H042.parquet
File converted successfully: ./parquet_dataset/test/Prayas/eMARC-1H_H057.parquet
File converted successfully: ./parquet_dataset/test/Prayas/eMARC-1H_H004.parquet
File converted successfully: ./parquet_dataset/test/Prayas/eMARC-1H_H052.parquet
File converted successfully: ./parquet_dataset/test/Prayas/eMARC-1H_H084.parquet
File converted successfully: ./parquet_dataset/test/Prayas/eMARC-1H_H122.parquet
File converted successfully: ./parquet_dataset/test/Prayas/eMARC-1H_H066.parquet
File converted successfully: ./parquet_dataset/test/Prayas/eMARC-1H_H019.parquet
File converted successfully: ./parquet_dataset/test/Prayas/eMARC-1H_H091.parquet
File converted successfully: ./parquet_dataset/test/Prayas/eMARC-1H_H069.parquet
File converted successfully:

In [13]:
import pandas as pd

# Path to the sample parquet file
parquet_file_path = './parquet_dataset/train/Bareilly/Bareilly_2020_BR03.parquet'  # replace with actual file path

# Read the Parquet file
df = pd.read_parquet(parquet_file_path, engine='pyarrow')

# Display the first few rows
print(df.head())

# Optional: View info about the dataframe.
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
df.info()


                  time  energy
0  2020-01-01 00:00:00   0.032
1  2020-01-01 01:00:00   0.032
2  2020-01-01 02:00:00   0.038
3  2020-01-01 03:00:00   0.032
4  2020-01-01 04:00:00   0.031
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8784 entries, 0 to 8783
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   time    8784 non-null   object 
 1   energy  8784 non-null   float64
dtypes: float64(1), object(1)
memory usage: 137.4+ KB
None
