In [3]:
import pandas as pd
import math
import time
import torch
import random

from dataloading import *
from model import *
from utils import *

In [8]:
computing_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_digits =10
n_chars=256

config = {
    'batch_size':512,
    'learning_rate':0.001,
    'weight_decay':0,
    'teacher_forcing_ratio':1.0,
    'hidden_dim':512,
    'n_layers':4, 
    'enc': {
        'hid_dropout':0.0,
        'input_dropout':0.0
    },
    'dec': {
        'hid_dropout':0.0,
        'input_dropout':0.0
    },
    
    'input_dim':n_chars+4,
    'output_dim':n_digits+5
}

def init_seq2seq(config, computing_device):
    enc = Encoder(config['input_dim'], config['hidden_dim'], config['n_layers'], config['enc']['hid_dropout'], config['enc']['input_dropout'])
    dec = Decoder(config['output_dim'], config['hidden_dim'], config['n_layers'], config['dec']['hid_dropout'], config['dec']['input_dropout'])

    model = Seq2Seq(enc, dec,computing_device)#.to(device)

    optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'],weight_decay=config['weight_decay'])
    criterion = nn.CrossEntropyLoss(ignore_index=output_pad_index)
    
    model = model.to(computing_device)
    return model

def split_data(filenames_by_type,test_type, train_frac=0.75, BATCH_SIZE=512):
    print('...loading data')
    if test_type != 'A':
        init='A'
    else:
        init='B'
    filename=filenames_by_type[init][0]
    q = torch.load(os.path.join(data_dir,filename))
    inputs,targets = q[0],q[1]

    for typ in filenames_by_type:
        if typ==test_type:
            continue
        if typ==init:
            for filename in filenames_by_type[typ][1:]:
                q = torch.load(os.path.join(data_dir,filename))
                src,trg = q[0],q[1]
                inputs=torch.cat([inputs,src],dim=1)
                targets=torch.cat([targets,trg],dim=1)
        else:
            for filename in filenames_by_type[typ]:
                q = torch.load(os.path.join(data_dir,filename))
                src,trg = q[0],q[1]
                inputs=torch.cat([inputs,src],dim=1)
                targets=torch.cat([targets,trg],dim=1)
    
    #shuffle indices
    indices = list(range(len(targets)))
    random.shuffle(indices)
    
    inputs = inputs[:,indices,:]
    targets = targets[:,indices,:]
    
    # chunk
    n_chunks = math.ceil(inputs.size()[1]/BATCH_SIZE)
    inputs = torch.chunk(inputs, n_chunks, dim=1) 
    targets = torch.chunk(targets, n_chunks, dim=1) 
    
    # split train and val
    train_inputs = inputs[:train_frac*len(train_inputs)]
    val_inputs = inputs[train_frac*len(train_inputs):]
    
    train_targets = targets[:train_frac*len(train_inputs)]
    val_targets = targets[train_frac*len(train_inputs):]
    
    return train_inputs, train_targets, val_inputs, val_targets
    
def get_test_data(filenames_by_type,test_type, BATCH_SIZE=512):
    filename=filenames_by_type[test_type][0]
    q = torch.load(os.path.join(data_dir,filename))
    inputs,targets = q[0],q[1]
    
    for filename in filenames_by_type[test_type][1:]:
                q = torch.load(os.path.join(data_dir,filename))
                src,trg = q[0],q[1]
                inputs=torch.cat([inputs,src],dim=1)
                targets=torch.cat([targets,trg],dim=1)   
    # chunk
    n_chunks = math.ceil(inputs.size()[1]/BATCH_SIZE)
    inputs = torch.chunk(inputs, n_chunks, dim=1) 
    targets = torch.chunk(targets, n_chunks, dim=1) 
    
    return inputs, targets




In [6]:
import os

data_dir = 'data/numerical_data_set_simple_torch'

filenames = []
filenames_by_type = {'A':[], 'B':[], 'C':[], 'D':[], 'E':[]}
for file in os.listdir(data_dir):
    filename, file_extension = os.path.splitext(file)
    
    typ = filename[-1]
    if typ in filenames_by_type:
        filenames.append(file)
        filenames_by_type[typ].append(file)
        
print(len(filenames))
print(filenames_by_type)
for key in filenames_by_type:
    print(len(filenames_by_type[key]))

27
{'A': ['labelled_gen_data1_A', 'labelled_gen_data3_A', 'labelled_gen_data2_A', 'labelled_extr_data1_A', 'labelled_extr_data3_A'], 'B': ['labelled_extr_data19_B', 'labelled_extr_data18_B', 'labelled_gen_data5_B', 'labelled_gen_data4_B', 'labelled_gen_data6_B'], 'C': ['labelled_gen_data10_C', 'labelled_gen_data11_C', 'labelled_gen_data7_C', 'labelled_gen_data8_C', 'labelled_gen_data9_C'], 'D': ['labelled_extr_data2_D', 'labelled_gen_data12_D', 'labelled_gen_data13_D', 'labelled_extr_data4_D'], 'E': ['labelled_extr_data20_E', 'labelled_gen_data16_E', 'labelled_dir_data91_E', 'labelled_gen_data15_E', 'labelled_dir_data92_E', 'labelled_gen_data14_E', 'labelled_dir_data39_E', 'labelled_dir_data49_E']}
5
5
5
4
8


In [None]:
computing_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_digits =10
n_chars=256

config = {
    'epochs':100,
    'batch_size':512,
    'learning_rate':0.001,
    'weight_decay':0,
    'teacher_forcing_ratio':1.0,
    'hidden_dim':512,
    'n_layers':4, 
    'enc': {
        'hid_dropout':0.0,
        'input_dropout':0.0
    },
    'dec': {
        'hid_dropout':0.0,
        'input_dropout':0.0
    },
    
    'input_dim':n_chars+4,
    'output_dim':n_digits+5
}

def train_and_validate(config,test_type, train_inputs, train_targets, val_inputs, val_targets):
    model = init_seq2seq(config, computing_device)
    optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'],weight_decay=config['weight_decay'])
    criterion = nn.CrossEntropyLoss(ignore_index=output_pad_index)

    train_inputs, train_targets, val_inputs, val_targets = split_data(filenames_by_type,test_type, BATCH_SIZE=config['batch_size'])

    # train 
    print('...training')
    for epoch in range(num_epochs):
        start=time.time()
        loss = train(model, inputs, targets, optimizer, criterion, computing_device,config)
        print('   epoch {}: train_loss:{}, time:{}'.format(epoch,loss,time.time()-start))

    #validate
    print('...validating')
    start=time.time()
    val_loss = evaluate(model, val_inputs, val_targets, optimizer, criterion, computing_device)
    print('   epoch {}: val_loss:{}, time:{}'.format(epoch,loss,time.time()-start))


test_type = 'E'
test_inputs, test_targets = get_test_data(filenames_by_type,test_type, BATCH_SIZE=config['batch_size'])  


...loading data
