In [75]:
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.utils.rnn as rnn
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

In [87]:
import inspect

In [74]:
# Run configuration for the quarterly model. Base values live in the literal;
# entries that are computed from other entries are added right after it so they
# cannot drift out of sync with the values they depend on.
q_config = {
    'prod': False,
    'device': ("cuda" if torch.cuda.is_available() else "cpu"),

    'lback': False,

    'variable': "Quarterly",
    'run': "50/45 (1,2),(4,8), LR=0.001/{10,1e-4f}, EPOCHS=15, LVP=80 40*",
    'percentile': 50,
    'training_percentile': 45,
    'dilations': ((1, 2), (4, 8)),
    'use_residual_lstm': False,
    'add_nl_layer': False,
    'initial_learning_rate': 1e-3,
    # Trailing comma matters: without it ((10, 1e-4)) collapses to the flat
    # tuple (10, 1e-4) instead of a sequence holding one pair.
    # NOTE(review): pairs look like (epoch, learning rate), matching the
    # "{10,1e-4f}" fragment of 'run' - confirm against the consumer.
    'learning_rates': ((10, 1e-4),),
    'per_series_lr_multip': 1,
    'num_of_train_epochs': 15,
    'state_hsize': 40,
    'seasonality': 4,
    'input_size': 4,
    'output_size': 8,
    'min_inp_seq_len': 0,
    'level_variability_penalty': 80,
    'batch_size': 8,

    'num_of_categories': 6,  # in data provided
    'big_loop': 3,
    'num_of_chunks': 2,
    'eps': 1e-6,
    'averaging_level': 5,
    'use_median': False,
    'middle_pos_for_avg': 2,  # if using medians
    'noise_std': 0.001,
    'freq_of_test': 1,
    'gradient_clipping': 20,
    'c_state_penalty': 0,
    'big_float': 1e38,  # numeric_limits<float>::max(),
    'print_diagn': True,
    'max_num_of_series': -1,

    'use_auto_learning_rate': False,
    'min_learning_rate': 0.0001,
    'lr_ratio': 10 ** 0.5,  # sqrt(10), written without needing an extra import
    'lr_tolerance_multip': 1.005,
    'l3_period': 2,
    'min_epochs_before_changing_lrate': 2
}

# Derived settings.
q_config['input_size_i'] = q_config['input_size']
q_config['output_size_i'] = q_config['output_size']
q_config['min_series_length'] = (
    q_config['input_size_i']
    + q_config['output_size_i']
    + q_config['min_inp_seq_len']
    + 2
)
q_config['max_series_length'] = 40 * q_config['seasonality'] + q_config['min_series_length']
# These must be assignments ("="); with ":" Python treats the line as a bare
# annotation and never stores the value in the dict.
q_config['tau'] = q_config['percentile'] / 100
q_config['training_tau'] = q_config['training_percentile'] / 100
q_config['attention_hsize'] = q_config['state_hsize']

# Outside production, shrink the run so it finishes quickly.
if not q_config['prod']:
    q_config['batch_size'] = 2
    q_config['max_num_of_series'] = 40
    

In [91]:
# Display the metadata table. `info` is assigned by the cell that follows
# (In [90]), so on a fresh kernel that cell has to be run before this one.
info

Unnamed: 0,M4id,category,Frequency,Horizon,SP,StartingDate
0,Y1,Macro,1,6,Yearly,01-01-79 12:00
1,Y2,Macro,1,6,Yearly,01-01-79 12:00
2,Y3,Macro,1,6,Yearly,01-01-79 12:00
3,Y4,Macro,1,6,Yearly,01-01-79 12:00
4,Y5,Macro,1,6,Yearly,01-01-79 12:00
5,Y6,Macro,1,6,Yearly,01-01-89 12:00
6,Y7,Macro,1,6,Yearly,01-01-89 12:00
7,Y8,Macro,1,6,Yearly,01-01-89 12:00
8,Y9,Macro,1,6,Yearly,01-01-89 12:00
9,Y10,Macro,1,6,Yearly,01-01-89 12:00


In [90]:
# Load the metadata table, then keep the 'category' values of the rows whose
# 'SP' column equals the configured variable.
info = pd.read_csv('./data/info.csv')
var_category = info.loc[info['SP'] == q_config['variable'], 'category']

In [45]:
def read_file(file_location):
    """Load the numeric series stored in a CSV file.

    The first line is treated as a header and skipped. On every other
    non-blank line, double quotes are stripped, the first field (the series
    id) is discarded, and the remaining non-empty fields are parsed as floats.

    Args:
        file_location: Path of the CSV file to read.

    Returns:
        A 1-D ``numpy`` array of dtype ``object`` holding one float array per
        series. The series may have different lengths.

    Raises:
        ValueError: If a non-empty value field cannot be parsed as a float.
    """
    series = []
    with open(file_location, 'r') as file:
        next(file, None)  # header row
        for line in file:
            line = line.strip()
            # Skipping blank lines (instead of always dropping the final
            # line) keeps the last series when the file has no trailing newline.
            if not line:
                continue
            fields = line.replace('"', '').split(',')
            series.append(np.array([float(value) for value in fields[1:] if value != ""]))

    # Fill an object array element by element: np.array() on a ragged list
    # raises on current NumPy, and would silently return a 2-D array whenever
    # all series happen to share one length.
    result = np.empty(len(series), dtype=object)
    for position, values in enumerate(series):
        result[position] = values
    return result

In [None]:
def create_val_set(train, output_size):
    """Split the last ``output_size`` points off every training series.

    ``train`` is modified in place: each entry is replaced by the same series
    without its final ``output_size`` points.

    Args:
        train: Mutable, indexable collection of sliceable series.
        output_size: Non-negative number of trailing points to hold out.

    Returns:
        ``numpy`` array built from the held-out tails, in the order of
        ``train``.

    Raises:
        ValueError: If ``output_size`` is negative.
    """
    if output_size < 0:
        raise ValueError("output_size must be non-negative, got {}".format(output_size))

    val = []
    for i in range(len(train)):
        # Use an explicit split index: with output_size == 0 the slices
        # [-0:] and [:-0] would hold out the whole series and leave the
        # training series empty.
        split = max(len(train[i]) - output_size, 0)
        val.append(train[i][split:])
        train[i] = train[i][:split]
    return np.array(val)

In [46]:
def create_datasets(train_file_location, test_file_location, output_size):
    """Read the train and test files and carve a validation split out of train.

    Args:
        train_file_location: Path of the training CSV file.
        test_file_location: Path of the test CSV file.
        output_size: Number of trailing points of each training series that
            are moved into the validation split.

    Returns:
        A ``(train, vals, test)`` tuple, where ``train`` has already had the
        validation points removed by ``create_val_set``.
    """
    train = read_file(train_file_location)
    # read_file returns one collection of series, so its result must not be
    # unpacked into two names.
    test = read_file(test_file_location)
    vals = create_val_set(train, output_size)
    return train, vals, test

In [47]:
# Build the train / validation / test splits from the quarterly CSV files.
train, vals, test = create_datasets(
    './data/M4DataSetTrain/Quarterly-train.csv',
    './data/M4DataSetTest/Quarterly-test.csv',
    q_config['output_size'],
)

In [70]:
class SeriesDataset(Dataset):
    """Dataset yielding the train, validation and test tensors of one series.

    The three inputs are indexed in parallel: item ``idx`` is assembled from
    position ``idx`` of each of them.
    """

    def __init__(self, dataTrain, dataVal, dataTest, info, device):
        """Convert every series of each split to a tensor.

        Args:
            dataTrain: Iterable of training series.
            dataVal: Iterable of validation series.
            dataTest: Iterable of test series.
            info: Accepted but not used; ``dataInfo`` is always set to ``None``.
            device: Device that items are moved to when they are fetched.
        """
        self.dataTrain = list(map(torch.tensor, dataTrain))
        self.dataVal = list(map(torch.tensor, dataVal))
        self.dataTest = list(map(torch.tensor, dataTest))
        self.dataInfo = None
        self.device = device

    def __len__(self):
        """Return the number of training series."""
        return len(self.dataTrain)

    def __getitem__(self, idx):
        """Return the ``(train, val, test)`` tensors at ``idx`` on ``self.device``."""
        splits = (self.dataTrain, self.dataVal, self.dataTest)
        return tuple(split[idx].to(self.device) for split in splits)

# recitation 7
def collate_lines(seq_list):
    """Reorder a batch so the longest training sequence comes first.

    Args:
        seq_list: Batch of ``(train, val, test)`` triples.

    Returns:
        Three lists ``(train, val, test)``, all arranged by decreasing length
        of the training sequence; entries of equal length keep their original
        relative order.
    """
    train_, val_, test_ = zip(*seq_list)
    # Sorting on the negated length is a stable descending sort.
    order = sorted(range(len(train_)), key=lambda position: -len(train_[position]))

    def reorder(sequences):
        return [sequences[position] for position in order]

    return reorder(train_), reorder(val_), reorder(test_)

# SeriesDataset.__init__ takes (dataTrain, dataVal, dataTest, info, device);
# all five have to be passed, otherwise construction raises TypeError.
dataset = SeriesDataset(train, vals, test, info, q_config['device'])
# collate_lines keeps the series as lists of tensors, ordered longest first.
dataloader = DataLoader(dataset, batch_size=q_config['batch_size'], shuffle=True, collate_fn=collate_lines)