In [2]:
from sklearn.metrics import mean_squared_error
import math
from model import *
from dataloading import *
import torch.nn.functional as F

def dataloader_fromfile(filename, batch_size=15):
    df = pd.read_csv(filename)
    inputs = df['Attribute_value']
    targets = df['Numerical_value']
    
    batch_size=15
    n_chunks = math.ceil(len(inputs)/BATCH_SIZE)
    
    inputs = prepare_data(inputs)
    targets = prepare_targets(targets)
    
    inputs = torch.chunk(inputs, n_chunks, dim=1) 
    targets = torch.chunk(targets, n_chunks, dim=1) 
    
    return inputs,targets

def dataloader(filenames, batch_size=15):
    inputs=[]
    targets=[]
    for filename in filenames:
        df = pd.read_csv(filename)
        inputs = df['Attribute_value']
        targets = df['Numerical_value']
        
        inputs+=list(inputs)
        targets+=list(targets)
    
    batch_size=15
    n_chunks = math.ceil(len(inputs)/BATCH_SIZE)
    
    inputs = prepare_data(inputs)
    targets = prepare_targets(targets)
    
    inputs = torch.chunk(inputs, n_chunks, dim=1) 
    targets = torch.chunk(targets, n_chunks, dim=1) 
    
    return inputs,targets

def interpret_output(outputs):
    outputs = F.softmax(outputs, dim=2)
    outputs = torch.argmax(outputs, dim=2)
    nums = []
    seen_period=False
    for j in range(outputs.size()[1]):
        num=''
        for i in range(1, outputs.size()[0]):
            if outputs[i][j]>=13:
                break
            dig = indexToDigit(outputs[i][j])
            if dig=='.':
                if seen_period:
                    continue
                seen_period=True
            num+=dig
            
        #print(num)
        nums.append(float(num))
    return nums


def evaluate(model, inputs, targets, optimizer, criterion, computing_device):
    model=model.to(computing_device)
    model.eval()
    
    total_mse=0.0
    
    for i in range(len(targets)):
            src = inputs[i].to(computing_device)
            trg = targets[i].to(computing_device)
            
            print(len(src))
            print(len(trg))

            optimizer.zero_grad()

            outputs = model(src, trg, teacher_forcing_ratio=0.0)
            outputs = outputs.to(computing_device)

            print('expected')
            print(trg.size())
            print(torch.argmax(trg, dim=2))
            print(torch.argmax(outputs, dim=2))

            num_labels = interpret_output(trg)

            #print(num_labels)

            num_predictions = interpret_output(outputs)

            #print(num_predictions)

            # shape = [seq_len, batch_size]
            mse = mean_squared_error(num_labels,num_predictions)
            print('MSE',mse)
            
    total_mse/=len(targets)
    print(total_mse)
    print(len(targets))
    return total_mse

In [3]:
n_digits = 10
#OUTPUT_DIM = n_digits + 2
n_chars=256
#INPUT_DIM=1
#ENC_EMB_DIM = n_chars+1
#DEC_EMB_DIM = OUTPUT_DIM
INPUT_DIM = n_chars+4
OUTPUT_DIM = n_digits + 5

HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.0
DEC_DROPOUT = 0.0


enc = Encoder(INPUT_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

PATH = "./output/typeE_9.pt"
model = Seq2Seq(enc, dec)
model.load_state_dict(torch.load(PATH))
model.eval()

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=output_pad_index)

computing_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [4]:
import os

data_dir = 'data/numerical_data_set_simple_torch'

filenames = []
filenames_by_type = {'A':[], 'B':[], 'C':[], 'D':[], 'E':[]}
for file in os.listdir(data_dir):
    filename, file_extension = os.path.splitext(file)
    
    typ = filename[-1]
    if typ in filenames_by_type:
        filenames.append(file)
        filenames_by_type[typ].append(file)
        
print(len(filenames))
print(filenames_by_type)
for key in filenames_by_type:
    print(len(filenames_by_type[key]))

27
{'A': ['labelled_gen_data1_A', 'labelled_gen_data3_A', 'labelled_gen_data2_A', 'labelled_extr_data1_A', 'labelled_extr_data3_A'], 'B': ['labelled_extr_data19_B', 'labelled_extr_data18_B', 'labelled_gen_data5_B', 'labelled_gen_data4_B', 'labelled_gen_data6_B'], 'C': ['labelled_gen_data10_C', 'labelled_gen_data11_C', 'labelled_gen_data7_C', 'labelled_gen_data8_C', 'labelled_gen_data9_C'], 'D': ['labelled_extr_data2_D', 'labelled_gen_data12_D', 'labelled_gen_data13_D', 'labelled_extr_data4_D'], 'E': ['labelled_extr_data20_E', 'labelled_gen_data16_E', 'labelled_dir_data91_E', 'labelled_gen_data15_E', 'labelled_dir_data92_E', 'labelled_gen_data14_E', 'labelled_dir_data39_E', 'labelled_dir_data49_E']}
5
5
5
4
8


In [5]:
filename = filenames_by_type['E'][-1]
q = torch.load(os.path.join(data_dir,filename))
inputs,targets = q[0],q[1]

BATCH_SIZE=1
n_chunks = math.ceil(len(inputs)/BATCH_SIZE)

inputs = torch.chunk(inputs, n_chunks, dim=1) 
targets = torch.chunk(targets, n_chunks, dim=1) 

computing_device="cpu"
evaluate(model, inputs, targets, optimizer, criterion, computing_device)

24
24
expected
torch.Size([24, 1731, 15])
tensor([[ 12,  12,  12,  ...,  12,  12,  12],
        [  3,   2,   4,  ...,   5,   8,   1],
        [  0,   3,   5,  ...,   7,   1,   9],
        ...,
        [ 14,  14,  14,  ...,  14,  14,  14],
        [ 14,  14,  14,  ...,  14,  14,  14],
        [ 14,  14,  14,  ...,  14,  14,  14]])
tensor([[  0,   0,   0,  ...,   0,   0,   0],
        [  3,   2,   4,  ...,   5,   8,   1],
        [  0,   2,   3,  ...,   7,   1,   9],
        ...,
        [ 13,   7,  13,  ...,   7,   7,  13],
        [ 13,  13,   7,  ...,  13,  13,  13],
        [ 13,  13,  13,  ...,  13,  13,   9]])
MSE 110626278279843.66
24
24
expected
torch.Size([24, 1731, 15])
tensor([[ 12,  12,  12,  ...,  12,  12,  12],
        [  2,   2,   7,  ...,   3,   3,   1],
        [  1,   7,   3,  ...,   9,   9,   2],
        ...,
        [ 14,  14,  14,  ...,  14,  14,  14],
        [ 14,  14,  14,  ...,  14,  14,  14],
        [ 14,  14,  14,  ...,  14,  14,  14]])
tensor([[  0,   0,   0,

MSE 617801328830717.6
24
24
expected
torch.Size([24, 1731, 15])
tensor([[ 12,  12,  12,  ...,  12,  12,  12],
        [  1,   4,   1,  ...,   0,   1,   5],
        [  0,   0,   0,  ...,  13,   6,   1],
        ...,
        [ 14,  14,  14,  ...,  14,  14,  14],
        [ 14,  14,  14,  ...,  14,  14,  14],
        [ 14,  14,  14,  ...,  14,  14,  14]])
tensor([[  0,   0,   0,  ...,   0,   0,   0],
        [  1,   4,   1,  ...,   0,   1,   5],
        [  0,   0,   0,  ...,  13,   6,   1],
        ...,
        [ 13,  13,  13,  ...,  13,  13,  13],
        [ 13,   7,  13,  ...,   7,   7,  13],
        [ 13,   7,   7,  ...,  13,  13,  13]])
MSE 328576350010325.5
24
24
expected
torch.Size([24, 1731, 15])
tensor([[ 12,  12,  12,  ...,  12,  12,  12],
        [  1,   2,   2,  ...,   7,   7,   2],
        [  6,   0,   4,  ...,   8,   7,   5],
        ...,
        [ 14,  14,  14,  ...,  14,  14,  14],
        [ 14,  14,  14,  ...,  14,  14,  14],
        [ 14,  14,  14,  ...,  14,  14,  14]])
te

0.0