In [1]:
import pandas as pd
import math
import time
import torch
import random
import os

from dataloading import *
from model import *
from utils import *

In [2]:
def init_seq2seq(config, computing_device):
    enc = Encoder(config['input_dim'], config['hidden_dim'], config['n_layers'], config['enc']['hid_dropout'], config['enc']['input_dropout'])
    dec = Decoder(config['output_dim'], config['hidden_dim'], config['n_layers'], config['dec']['hid_dropout'], config['dec']['input_dropout'])

    model = Seq2Seq(enc, dec,computing_device)#.to(device)

    optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'],weight_decay=config['weight_decay'])
    criterion = nn.CrossEntropyLoss(ignore_index=output_pad_index)
    
    model = model.to(computing_device)
    return model

def split_data(filenames_by_type,test_type, train_frac=0.75, BATCH_SIZE=512):
    print('...loading data')
    if test_type != 'A':
        init='A'
    else:
        init='B'
    filename=filenames_by_type[init][0]
    q = torch.load(os.path.join(data_dir,filename))
    inputs,targets = q[0],q[1]

    for typ in filenames_by_type:
        if typ==test_type:
            continue
        if typ==init:
            for filename in filenames_by_type[typ][1:]:
                q = torch.load(os.path.join(data_dir,filename))
                src,trg = q[0],q[1]
                inputs=torch.cat([inputs,src],dim=1)
                targets=torch.cat([targets,trg],dim=1)
        else:
            for filename in filenames_by_type[typ]:
                q = torch.load(os.path.join(data_dir,filename))
                src,trg = q[0],q[1]
                inputs=torch.cat([inputs,src],dim=1)
                targets=torch.cat([targets,trg],dim=1)
    
    print(inputs.size())
    print(targets.size())
    
    #shuffle indices
    indices = list(range(targets.size()[1]))

    random.shuffle(indices)
    
    inputs = inputs[:,indices,:]
    targets = targets[:,indices,:]
    
    print(inputs.size())
    print(targets.size())
    
    # chunk
    n_chunks = math.ceil(inputs.size()[1]/BATCH_SIZE)
    inputs = torch.chunk(inputs, n_chunks, dim=1) 
    targets = torch.chunk(targets, n_chunks, dim=1) 
    
    # split train and val
    i=int(train_frac*len(inputs))
    train_inputs = inputs[:i]
    val_inputs = inputs[i:]
    
    train_targets = targets[:i]
    val_targets = targets[i:]
    
    return train_inputs, train_targets, val_inputs, val_targets
    
def get_test_data(filenames_by_type,test_type, BATCH_SIZE=512):
    filename=filenames_by_type[test_type][0]
    q = torch.load(os.path.join(data_dir,filename))
    inputs,targets = q[0],q[1]
    
    for filename in filenames_by_type[test_type][1:]:
                q = torch.load(os.path.join(data_dir,filename))
                src,trg = q[0],q[1]
                inputs=torch.cat([inputs,src],dim=1)
                targets=torch.cat([targets,trg],dim=1)   
    # chunk
    n_chunks = math.ceil(inputs.size()[1]/BATCH_SIZE)
    inputs = torch.chunk(inputs, n_chunks, dim=1) 
    targets = torch.chunk(targets, n_chunks, dim=1) 
    
    return inputs, targets




In [3]:
def train_and_validate(config,test_type, train_inputs, train_targets, val_inputs, val_targets, N=5):
    output_dir='hd={}_nl={}'.format(config['hidden_dim'],config['n_layers':4])
    output_file = 'bs={}_lr={}_wd={}_tf={}_hd={}_id={}_fold={}'.format(config['batch_size'],config['learning_rate'],config['weight_decay'],config['teacher_forcing_ratio'],config['enc']['hid_dropout'],config['enc']['input_dropout'],test_type)
    output_filepath = os.path.join('output',output_dir,output_file+'.csv')   
    
    model = init_seq2seq(config, computing_device)
    optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'],weight_decay=config['weight_decay'])
    criterion = nn.CrossEntropyLoss(ignore_index=output_pad_index)

    verbose=config['verbose']
    #train_inputs, train_targets, val_inputs, val_targets = split_data(filenames_by_type,test_type, BATCH_SIZE=config['batch_size'])

    avg_val_loss=0.0
    min_val_loss=100
    min_epoch=0
    best_state_dict=None
    for epoch in range(config['epochs']):
        # train 
        if verbose:
            print('...training')
        start=time.time()
        loss = train(model, train_inputs, train_targets, optimizer, criterion, computing_device,config)
        if verbose:
            print('   epoch {}: train_loss:{}, time:{}'.format(epoch,loss,time.time()-start))

        #validate
        if verbose:
            print('...validating')
        start=time.time()
        val_loss = validate(model, val_inputs, val_targets, optimizer, criterion, computing_device)
        if verbose:
            print('   epoch {}: val_loss:{}, time:{}'.format(epoch,loss,time.time()-start))
    
        avg_val_loss+=val_loss
        
        if epoch%N==0:
            avg_val_loss/=N
            
            with open(output_filepath, 'a') as file: 
                file.write('{}\n'.format(avg_val_loss))
                
            # update min, state_dict
            if avg_val_loss<min_val_loss:
                min_val_loss=avg_val_loss
                min_epoch=epoch 
                best_state_dict = model.state_dict()
            # if not decreasing for a while
            elif epoch - min_epoch >= config['N_early_stop']:
                if best_state_dict:
                    PATH = "./output/{}.pt".format(output_file)
                    torch.save(best_state_dict, PATH)
                return min_val_loss
                
    if best_state_dict:
        PATH = "./output/{}.pt".format(output_file)
        torch.save(best_state_dict, PATH)
    
    return min_val_loss, min_epoch, config

#test_type = 'E'
#test_inputs, test_targets = get_test_data(filenames_by_type,test_type, BATCH_SIZE=config['batch_size'])  


In [4]:
# try to run one config: train + validate (+ test?)

### SET UP ###
data_dir = 'data/numerical_data_set_simple_torch'
filenames=[]
filenames_by_type = {'A':[], 'B':[], 'C':[], 'D':[], 'E':[]}
for file in os.listdir(data_dir):
    filename, file_extension = os.path.splitext(file)
    
    typ = filename[-1]
    if typ in filenames_by_type:
        filenames.append(file)
        filenames_by_type[typ].append(file)
        
## MODIFY JUST FOR TEST, COMMENT OUT FOR REAL RUNS ###
for typ in filenames_by_type:
    filenames_by_type[typ]=[filenames_by_type[typ][0]]
    
print(filenames_by_type)

computing_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_digits =10
n_chars=256

config = {
    'epochs':1,
    'N_early_stop':10,
    'batch_size':512,
    'learning_rate':0.001,
    'weight_decay':0,
    'teacher_forcing_ratio':1.0,
    'hidden_dim':512,
    'n_layers':4, 
    'enc': {
        'hid_dropout':0.0,
        'input_dropout':0.0
    },
    'dec': {
        'hid_dropout':0.0,
        'input_dropout':0.0
    },
    
    'input_dim':n_chars+4,
    'output_dim':n_digits+5,
    
    'verbose':True
}

 
### GET DATA ###
test_type = 'E'
train_inputs, train_targets, val_inputs, val_targets = split_data(filenames_by_type,test_type, BATCH_SIZE=config['batch_size'])

print(len(train_inputs))
print(len(train_targets))
print(len(val_inputs))
print(len(val_targets))

### TRAIN AND VALIDATE ###
min_val_loss,min_epoch,curr_config = train_and_validate(config,test_type, train_inputs, train_targets, val_inputs, val_targets, N=5)
print('min val loss: {}, min epoch: {}'.format(min_val_loss,min_epoch))

min_config=curr_config


{'A': ['labelled_gen_data1_A'], 'B': ['labelled_extr_data19_B'], 'C': ['labelled_gen_data10_C'], 'D': ['labelled_extr_data2_D'], 'E': ['labelled_extr_data20_E']}
...loading data
torch.Size([24, 236678, 260])
torch.Size([24, 236678, 15])
torch.Size([24, 236678, 260])
torch.Size([24, 236678, 15])
347
347
116
116
...training
   epoch 0: train_loss:1.737488609226018, time:232.73550868034363
...validating
   epoch 0: val_loss:1.737488609226018, time:47.52167367935181
min val loss: 1.2176886747623312, min epoch: 0


In [29]:
### TRAIN AND VALIDATE ###
min_val_loss,min_epoch,curr_config = train_and_validate(config,test_type, train_inputs, train_targets, val_inputs, val_targets, N=5)
print('min val loss: {}, min epoch: {}'.format(min_val_loss,min_epoch))

min_config=curr_config

...training
   epoch 0: train_loss:1.702537173496543, time:236.5699977874756
...validating
   epoch 0: val_loss:1.702537173496543, time:55.62280201911926


AttributeError: 'Seq2Seq' object has no attribute 'stat_dict'

In [8]:
### TEST ON BEST MODEL ###
test_inputs, test_targets = get_test_data(filenames_by_type,test_type, BATCH_SIZE=config['batch_size'])  
model = init_seq2seq(min_config, computing_device)
output_dir='hd={}_nl={}'.format(config['hidden_dim'],config['n_layers':4])
file = 'bs={}_lr={}_wd={}_tf={}_hd={}_id={}_fold={}'.format(config['batch_size'],config['learning_rate'],config['weight_decay'],config['teacher_forcing_ratio'],config['enc']['hid_dropout'],config['enc']['input_dropout'],test_type)
PATH = "./output/{}/{}.pt".format(output_dir,file)
model.load_state_dict(torch.load(PATH))
optimizer = optim.Adam(model.parameters(), lr=min_config['learning_rate'],weight_decay=min_config['weight_decay'])
criterion = nn.CrossEntropyLoss(ignore_index=output_pad_index)
test_loss = validate(model, test_inputs, test_targets, optimizer, criterion, computing_device)

if config['verbose']:
    print('min test loss: {}'.format(test_loss))

min test loss: 5.904045765216534
