In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 0

from fastai.structured import *
from fastai.column_data import *
np.set_printoptions(threshold=50, edgeitems=20)

import torchtext
from torchtext import vocab, data
from torchtext.datasets import language_modeling
import torch.distributed as dist 
from torch.multiprocessing import Process

from fastai.rnn_reg import *
from fastai.rnn_train import *
from fastai.nlp import *
from fastai.lm_rnn import *

import dill as pickle
import collections

import gc
import multiprocessing as mp
mgr = mp.Manager()
ns = mgr.Namespace()

from datetime import datetime

In [18]:
class M:
    """Notebook-wide configuration namespace (read as ``M.<name>`` everywhere).

    Groups run/parallelism switches, the role of every dataframe column, and
    vocabulary thresholds. Further attributes are attached after the class body.
    """
    # --- run / parallelism switches ---
    sample = 1.0            # fraction of the raw file kept by ProcessDataFrame.preprocess
    cores = 3               # worker count handed to run_distributed
    #cores = mp.cpu_count()-1
    no_cuda = True          # USE_GPU is derived from this flag
    format = 'csv'          # on-disk chunk format: 'csv' or 'feather'
    start_time = datetime.now()
    size = 1
    nfcore = 5
    end_train = False

    # --- column roles ---
    index = 'id'
    sort_var = 'item_description'
    target = 'price'
    dep = 'price'
    # first process item_description then name
    text_vars = ['item_description', 'name']
    # source columns concatenated (in this order) to build each text variable
    combine = {'item_description': ['name', 'item_description'], 'name': ['name']}
    split_cat_for_text = True   # split category_name on "/" when building text
    nbrands = 1000              # number of most common brands kept in M.brands
    split_cat = 0               # > 0 makes add_cols split category_name into columns

    #category_names = ['category_name_'+str(i) for i in range(split_cat)]
    category_names = ['category_name']
    cat_vars = ['item_condition_id', 'brand_name', 'shipping'] + category_names
    range_vars = ['min', 'max']
    contin_vars = ['len_' + tv for tv in text_vars]
    #contin_vars += range_vars
    features = [index, *text_vars, *cat_vars, *contin_vars]

    # minimum token frequency per text variable when building vocabularies
    min_freq = {'item_description': 75, 'name': int(45*sample+5)}

# GPU use is simply the inverse of the ``no_cuda`` switch.
USE_GPU = not M.no_cuda

# --- text encoder settings (two-element lists; presumably one entry per text variable) ---
M.enc_bptt = [50, 20]
M.enc_max_seq = [100, 30]
M.text_emb_sz = [70, 30]  # size of each embedding vector
M.enc_n_hid = [100, 50]   # number of hidden activations per layer
M.enc_n_layers = [1, 1]   # number of layers
M.opt_fn = partial(optim.Adam, betas=(0.7, 0.99))

M.text_seq_last = False
M.ignore_pad = True

# --- language-model dropouts, optimiser and regulariser ---
M.lm_dropouti = 0.05
M.lm_dropoute = 0.02
M.lm_dropouth = 0.05
M.lm_dropout = 0.05
M.lm_wdrop = 0.1
M.lm_opt_fn = partial(optim.Adam, betas=(0.7, 0.99))
M.lm_reg_fn = partial(seq2seq_reg, alpha=2, beta=1)

# --- final-model dropouts and fully connected head ---
M.dropouti = 0.065
M.dropoute = 0.01
M.dropouth = 0.02
M.wdrop = 0.03
M.cat_emb_drop = 0.04
M.fc_layers = [512, 256, 64]
M.fc_drops = [0.001, 0.01, 0.1]

# --- learning rate and batch size ---
#M.lrs=np.array([1.0e-3, 1.0e-3,1.0e-3,1.0e-3])
M.lrs = 0.01
M.bs = 256

In [3]:
def override_defs(module, fns):
    """Monkey-patch objects onto an already-imported module.

    The previous body used the string/function itself as both attribute name
    and value, re-registered the module under a module-object key and then ran
    ``from module import fn`` (a module literally called "module"), so it could
    never succeed.

    Args:
        module: dotted name of a module that is present in ``sys.modules``.
        fns: iterable of callables (patched under their ``__name__``) or of
            names (str) that are looked up in this module's globals.

    Raises:
        KeyError: if ``module`` is not imported or a given name is not defined.
    """
    target = sys.modules[module]
    for fn in fns:
        if isinstance(fn, str):
            name, obj = fn, globals()[fn]
        else:
            name, obj = fn.__name__, fn
        # setattr on the live module object is enough; every importer sees it
        setattr(target, name, obj)
    return

def show(s='.'):
    """Write ``s`` to stdout without a newline and flush it straight away."""
    out = sys.stdout
    out.write(s)
    out.flush()

def save_df(df, file=None, fidx=None, st=None, index=False):
    """Write ``df`` in the configured ``M.format`` and return the path used.

    Args:
        df: dataframe to persist.
        file: explicit target path; ignored when ``fidx`` is given.
        fidx: chunk id; when given the path is ``{SCRATCH}/{st}_{fidx}.{M.format}``.
        st: split name used in the generated path.
        index: forwarded to ``to_csv`` (not used for feather).

    Returns:
        The path that was written (``None`` if neither ``file`` nor ``fidx`` was given).
    """
    # fidx can be 0, so it is compared with None instead of tested for truthiness
    if fidx is not None:
        file = f'{SCRATCH}/{st}_{fidx}.{M.format}'
    if file is None:
        show('******ERROR**** in save_df ')
    if M.format == 'feather':
        df.to_feather(file)
    else:
        df.to_csv(file, index=index)
    return file

def read_df(file, processed=False):
    """Read a chunk written by ``save_df``.

    Args:
        file: path of a feather/CSV file (format chosen by ``M.format``).
        processed: when True, every column in ``M.text_vars`` holds the string
            form of a list and is parsed back into a Python list.

    Returns:
        The loaded dataframe.

    Raises:
        ValueError: if a text cell is not a plain Python literal.
    """
    import ast  # local import: only this function needs it

    df = pd.read_feather(file) if (M.format=='feather') else pd.read_csv(file)
    if processed:
        # SECURITY: this used to be eval(), which executes whatever expression is
        # stored in the file. The cells are str(list-of-ints), so literal_eval
        # parses them identically without running code.
        for tv in M.text_vars: df[tv] = df[tv].apply(ast.literal_eval)
    return df

def run_distributed(fn, fn_args, cores=None):
    """Run ``fn(*args)`` for every tuple in ``fn_args`` on a process pool.

    Args:
        fn: picklable callable executed in the workers.
        fn_args: iterable of argument tuples (``Pool.starmap`` semantics).
        cores: requested worker count; capped at ``cpu_count() - 1`` and
            defaulted to that cap when ``None``.

    Returns:
        List of results in the order of ``fn_args``.
    """
    # keep one core free, but never ask for zero workers on a single-core machine
    max_cores = max(1, mp.cpu_count() - 1)
    if cores is None or cores > max_cores:
        cores = max_cores
    pool = mp.Pool(processes=cores)
    try:
        res = pool.starmap(fn, fn_args)
        show(' done\n')
        pool.close()
    except BaseException:
        # do not leave worker processes behind when a task (or Ctrl-C) fails
        pool.terminate()
        raise
    finally:
        pool.join()
    return res

# DF Processing

In [4]:
from spacy.symbols import ORTH
from spacy.pipeline import SentenceSegmenter

# Regexes used by simple_tok: the '[rm]' placeholder (with optional inner and
# surrounding whitespace) and any run of whitespace.
re_rm = re.compile(r'\s*\[\s*rm\s*\]\s*')
re_sp = re.compile(r'\s+')

# spaCy English pipeline with a sentence segmenter appended.
smpl_tok = spacy.load('en')
smpl_tok.pipeline.append(SentenceSegmenter(smpl_tok.vocab))
# register '[rm]' as a tokenizer special case so it comes out as a single token
smpl_tok.tokenizer.add_special_case('[rm]', [{ORTH: '[rm]'}])

def simple_tok(x):
    """Normalise ``x`` and return the list of spaCy token strings.

    Steps: stringify and lower-case, pad every '[rm]' placeholder with single
    spaces, drop characters outside ``string.printable``, collapse whitespace
    runs to one space, then tokenize with ``smpl_tok.tokenizer``.
    """
    text = re_rm.sub(' [rm] ', str(x).lower())
    text = ''.join(ch for ch in text if ch in string.printable)
    text = re_sp.sub(r' ', text)
    return [tok.text for tok in smpl_tok.tokenizer(text)]

def fix_brand_col(df, brands):
    """Fill 'missing' brand names from brands that appear in the item name (in place).

    Brands are tried in the given order; a row takes the first brand found in
    its name. Multi-word brands are matched case-insensitively, single-word
    brands case-sensitively (unchanged from the previous behaviour). Brand
    names are matched as literal substrings, never as regular expressions.
    Finally the percentage of missing brands that were filled is shown.

    Args:
        df: dataframe with 'name' and 'brand_name' columns; 'brand_name' is
            updated in place. Index labels are assumed unique.
        brands: iterable of brand strings, in priority order.
    """
    remaining = df.index[df['brand_name'] == 'missing']
    n_missing = len(remaining)
    if n_missing == 0:
        # nothing to fix; also avoids dividing by zero in the report below
        show('0% ')
        return
    # hoisted out of the loop: stringify / lower-case the candidate names once
    names = df.loc[remaining, 'name'].astype(str)
    names_lower = names.str.lower()
    for b in brands:
        if len(remaining) == 0:
            break
        if len(b.split()) > 1:
            hit = names_lower.loc[remaining].str.contains(b.lower(), regex=False)
        else:
            hit = names.loc[remaining].str.contains(b, regex=False)
        fix_idx = hit.index[hit.values]
        df.loc[fix_idx, 'brand_name'] = b
        # rows that got a brand are not looked at again
        remaining = remaining.difference(fix_idx)
    show(f'{round((n_missing-len(remaining))*100/n_missing)}% ')
    return
    
def preprocess_df(df, fidx, st):
    """Clean one dataframe chunk in place, save it and return the saved path.

    Args:
        df: chunk of the raw data.
        fidx: chunk id used by ``save_df`` to build the file name.
        st: split name; for 'train', rows priced below 3.0 are dropped.
    """
    no_description = df['item_description'] == 'No description yet'
    df.loc[no_description, 'item_description'] = 'missing'
    for col in ('name', 'category_name', 'brand_name', 'item_description'):
        df.loc[:, col] = df[col].fillna('missing')
    fix_brand_col(df, M.brands)
    if st == 'train':
        low_priced = df[df['price'] < 3.0].index
        df.drop(low_priced, inplace=True)

    df.reset_index(drop=True, inplace=True)
    show('.')
    return save_df(df, fidx=fidx, st=st)

def numericalize(str_tok, data_fld):
    """Translate each token of ``str_tok`` to its id via ``data_fld.vocab.stoi``."""
    stoi = data_fld.vocab.stoi
    ids = []
    for token in str_tok:
        ids.append(stoi[token])
    return ids

def get_cat_col(df, col):
    """Return the category tokens of every row, prefixed with '<category_name>'.

    With ``M.split_cat_for_text`` the category string is split on "/";
    otherwise it is kept as a single token. ``col`` is accepted but not used.
    """
    categories = df['category_name']
    if M.split_cat_for_text:
        parts = categories.str.split("/")
    else:
        parts = categories.apply(lambda x: [x])
    return ['<category_name>'] + parts

def tokenize(df, combine, data_fld=None):
    """Build one token sequence per row from the category and ``combine`` columns.

    Each row becomes: category tokens, then for every column in ``combine`` a
    ``<col>`` marker followed by its ``simple_tok`` tokens, then '<eos>'.
    When ``data_fld`` is given the tokens are converted to ids with it.
    """
    tokens = get_cat_col(df, combine[-1])
    for col in combine:
        tokens += [f"<{col}>"] + df[col].apply(simple_tok)
    tokens += ['<eos>']
    if data_fld:
        to_ids = partial(numericalize, data_fld=data_fld)
        tokens = tokens.apply(to_ids)
    show('.')
    return tokens

def numericalize_text(dfile, combine_text_vars, data_fld_files):
    """Replace the text columns of one chunk file by token-id lists, in place on disk.

    Args:
        dfile: path of the chunk; it is read and then overwritten.
        combine_text_vars: mapping text variable -> columns combined to build it.
        data_fld_files: mapping text variable -> path of its pickled field.

    Returns:
        The path written by ``save_df`` (``dfile``).
    """
    df = read_df(dfile)
    # numericalize in order: a text variable's column is overwritten once it is
    # done, so variables that combine other text columns must come first
    for text_var in M.text_vars:
        # the file used to be opened without ever being closed
        # NOTE(security): unpickling runs arbitrary code; only load field files
        # written by this notebook.
        with open(data_fld_files[text_var], 'rb') as fh:
            data_fld = pickle.load(fh)
        combine = combine_text_vars[text_var]
        ret = get_cat_col(df, text_var)
        for col in combine:
            ret += [f"<{col}>"] + df[col].apply(simple_tok)
        ret += ['<eos>']
        ret = ret.apply(partial(numericalize, data_fld=data_fld))
        # convert to string as feather does not support list
        df.loc[:,text_var] = ret.apply(str)
        show('.')
    return save_df(df, dfile)

def get_examples(files, flds):
    """Load processed chunk files and return one ``data.Example`` per row.

    Args:
        files: paths of processed chunk files.
        flds: column names copied onto every example as attributes.
    """
    examples = []
    for path in files:
        show(f'{path}        ')
        df = read_df(path, processed=True)
        for row in range(len(df)):
            example = data.Example()
            for col in flds:
                setattr(example, col, df[col].iloc[row])
            examples.append(example)
    return examples
    
def split_col(x, fillna=str(np.nan)):
    """Split ``x`` on "/" into exactly five parts, padding with ``fillna`` or truncating."""
    n = 5
    parts = x.split("/")
    shortfall = n - len(parts)
    if shortfall > 0:
        return parts + [fillna] * shortfall
    return parts[:n]

def add_cols(df):
    """Add derived columns to ``df`` in place.

    When ``M.split_cat`` is positive, 'category_name' is split into the first
    five names of ``M.category_names``; a ``len_<text_var>`` column is always
    added for every text variable.
    """
    if M.split_cat > 0:
        names = M.category_names
        pieces = zip(*df['category_name'].apply(split_col))
        df[names[0]], df[names[1]], df[names[2]], df[names[3]], df[names[4]] = pieces
    for tv in M.text_vars:
        df[f'len_{tv}'] = df[tv].apply(len)

In [5]:
class ProcessDataFrame:
    '''Sequential driver that turns one raw split into processed chunk files.

    Only Sequential processes go here.
    Distributed processes need to be defined outside to keep it light.

    Attributes set in ``__init__``:
        path, split: raw file path and split name ('train' / 'test').
        preprocessed: paths of all chunk files written so far.
        current_processed: chunk paths written by the latest ``preprocess``.
        mapper, nas, df_sample, y_range, brands, data_flds: state produced on
            the reference split and reused when ``ref`` is given.
        n: number of chunks; iter: 1-based retry counter.
        missing_idx, idx: ids still unprocessed / all ids of the sample.
    '''
    def __init__(self, path, st, ref=None, n=10):
        '''Store the split description; copy fitted state from ``ref`` if given.

        Args:
            path: path of the raw tab-separated file.
            st: split name.
            ref: another ProcessDataFrame whose fitted state is reused
                (presumably the train split -- confirm with the caller).
            n: upper bound on the number of chunks.
        '''
        self.path, self.split = path, st
        self.preprocessed = []
        self.current_processed = []
        if ref is None:
            self.mapper, self.nas, self.df_sample, self.y_range, self.brands = None, None, None, None, None
            self.data_flds = {}
        else:
            self.mapper, self.nas, self.df_sample, self.y_range, self.brands = \
                                    ref.mapper, ref.nas, ref.df_sample, ref.y_range, ref.brands
            self.data_flds = ref.data_flds
        self.n, self.iter = n, 1
        self.missing_idx, self.idx = None, None

    def preprocess(self):
        '''Read the raw file, split it into ``self.n`` chunks and clean them in parallel.'''
        df = pd.read_csv(self.path, sep='\t')
        if self.iter == 1:
            df = df.sample(frac=M.sample)
        else:
            # pandas no longer accepts a set as indexer; a sorted list selects the same rows.
            # NOTE(review): missing_idx holds 'id' values while .loc selects by index
            # label -- this assumes ids coincide with the raw row labels; confirm.
            df = df.loc[sorted(self.missing_idx)]

        if self.split=='train' and self.iter==1:
            # the most common non-null brands drive fix_brand_col in the workers
            counter=collections.Counter(df['brand_name'].loc[df['brand_name'].isnull() == False])
            M.brands = [b for b,n in counter.most_common(M.nbrands)]
        df.rename(columns={f'{self.split}_id' : 'id'}, inplace=True)
        #cores = mp.cpu_count()-1
        self.n = min(self.n, int(1+len(df)/10000))
        show(f'Pre-processing {self.split} iter {self.iter} ({self.n})')
        if self.iter==1:
            self.missing_idx = set(df['id'].copy())
            self.idx = set(df['id'].copy())
        dfa = np.array_split(df,self.n)
        file_ids = [f'{self.iter}{i}' for i in range(self.n)]
        self.current_processed = run_distributed(partial(preprocess_df, st=self.split), \
                                            list(zip(dfa, file_ids)), M.cores)
        self.preprocessed += self.current_processed
        self.preprocessed = list(set(self.preprocessed))
        show(' done\n')
        gc.collect()
        return

    def build_vocab(self, text_var, combine, f=0.2, eos=[]):
        '''Build the vocabulary field for ``text_var`` from a sample of the chunks.

        Args:
            text_var: text variable; also the key into ``M.min_freq``.
            combine: columns tokenized together for this variable.
            f: approximate fraction of rows used (sqrt(f) of the files times
                sqrt(f) of their rows).
            eos: unused.
        '''
        show(f' building vocab for {text_var}... ')
        f1 = np.sqrt(f)
        # f1 sample files and f1 samples from each so f1*f1
        # always read at least one file (rounding gave 0 for small n) and never
        # ask random.sample for more files than exist
        n_files = min(len(self.preprocessed), max(1, int(round(f1*self.n))))
        df = pd.concat([read_df(file) for file in random.sample(self.preprocessed, n_files)])
        df = df.sample(frac=f1)
        dfa = np.array_split(df, self.n)
        # one concat instead of repeatedly appending to a growing Series
        text = pd.concat(run_distributed(partial(tokenize, combine=combine), \
                            list(zip(dfa)), M.cores))
        data_fld = data.Field(sequential=True, use_vocab=False)
        data_fld.build_vocab(text, min_freq=M.min_freq[text_var])
        # pad-token is numericalized, used for dataloader iterator
        data_fld.pad_token = data_fld.vocab.stoi[data_fld.pad_token]
        show(f'{text_var} tokens {len(data_fld.vocab.itos[:])}\n')
        #outf = f'{SCRATCH}/{text_var}_field.pkl'
        #pickle.dump(data_fld, open(outf,'wb'))
        self.data_flds[text_var] = data_fld
        gc.collect()
        return

    def numericalize_text(self):
        '''Pickle the text fields to scratch and numericalize the current chunks in parallel.'''
        show('Numericalizing ...')
        # star map does not handle passing of datafields
        dff = {}
        for text_var in M.text_vars:
            outf = f'{SCRATCH}/{text_var}_field.pkl'
            # close (and therefore flush) the file before any worker reads it
            with open(outf, 'wb') as fh:
                pickle.dump(self.data_flds[text_var], fh)
            dff[text_var] = outf
        run_distributed(partial(numericalize_text, combine_text_vars=M.combine, data_fld_files=dff), \
                                       list(zip(self.current_processed)), M.cores)
        gc.collect()
        return

    def postprocess_df(self):
        '''Encode categoricals, scale continuous columns, attach the per-category
        target range and rewrite the current chunk files.'''
        show('Post processing ')
        ## takes 20sec. not dist required.
        df = pd.concat([read_df(f, processed=True) for f in self.current_processed])
        if len(df) == 0 : return

        add_cols(df)
        for v in M.cat_vars: df[v] = df[v].astype('category').cat.as_ordered()
        # keep a small copy to apply cat to test later
        if self.split == 'train' and self.iter==1:
            self.df_sample = df.sample(2).copy()

        # make float continuous variables if any
        if self.split == 'test': df[M.dep] = 0
        for v in M.contin_vars+[M.dep]: df[v] = df[v].astype('float32')
        skip_flds = [c for c in df.columns if c not in M.cat_vars+M.contin_vars+[M.dep]]
        if self.split == 'train' and self.iter==1:
            self.cat_sz = [(c, len(df[c].cat.categories)+1) for c in M.cat_vars]
            self.cat_emb_szs = [(c, min(50, (c+1)//2)) for _,c in self.cat_sz]

            if len(M.contin_vars)==0:
                processed_df, y, nas = proc_df(df, M.dep, skip_flds=skip_flds, do_scale=False)
            else:
                processed_df, y, self.nas, self.mapper = proc_df(df, M.dep, skip_flds=skip_flds, do_scale=True)

            #convert to log scale
            processed_df[M.dep] = np.log(y+1)
            y_group = processed_df.groupby(M.cat_vars)[M.dep]
            y_range = pd.DataFrame({'min': y_group.agg('min'), 'max': y_group.agg('max')})
            self.y_range = save_df(y_range, f'{SCRATCH}/y_range.csv', index=True)
        else:
            apply_cats(df, self.df_sample)

            # PROCESSED DF from train
            if len(M.contin_vars)==0:
                processed_df, _, nas = proc_df(df, M.dep, skip_flds=skip_flds, do_scale=False)
            else:
                processed_df, _, nas, mapper = proc_df(df, M.dep, do_scale=True, skip_flds=skip_flds,
                                  mapper=self.mapper, na_dict=self.nas)
            processed_df[M.dep] = df[M.dep]

        y_range_df = read_df(self.y_range)
        processed_df[M.index] = df[M.index]
        processed_df[M.text_vars] = df[M.text_vars]
        processed_df = pd.merge(processed_df, y_range_df, how='left', on=M.cat_vars)
        # category combinations without a known range get the widest one:
        # global minimum and global maximum (the upper bound used to be filled
        # with the smallest maximum)
        processed_df['min'].fillna(min(y_range_df['min']), inplace=True)
        processed_df['max'].fillna(max(y_range_df['max']), inplace=True)

        df = processed_df
        self.missing_idx -= set(df[M.index])
        gc.collect()

        # shuffle and save
        dfa = np.array_split(df, self.n)
        run_distributed(save_df, list(zip(dfa, self.current_processed)), M.cores)
        gc.collect()
        return

    def get_examples(self):
        '''Create the non-text fields and load every processed chunk as examples.'''
        examples = []
        # we'll need to keep ids as torch-text shuffles the data based on text length
        self.data_flds[M.index] = data.Field(sequential=False, tensor_type=torch.LongTensor, use_vocab=False)
        # text var already built
        #for tv in M.text_vars:
        #    data_fields[tv] = pickle.load(open(self.data_flds[tv],'rb'))

        for v in M.cat_vars:
            self.data_flds[v] = data.Field(sequential=False, use_vocab=False)

        for v in M.contin_vars+M.range_vars+[M.dep]:
            self.data_flds[v] = data.Field(sequential=False, tensor_type=torch.FloatTensor, use_vocab=False)
        flds = [fld[0] for fld in self.data_flds.items()]
        for exs in run_distributed(get_examples, [[[f], flds] for f in self.preprocessed], M.cores):
            examples += exs
        return examples

    def process(self, tol=0.0):
        '''Run the pipeline, retrying ids that are still missing (at most 3 iterations).

        Args:
            tol: tolerated fraction of ids missing after an iteration.
        '''
        show(f'{self.split} iter: {self.iter}\n')
        self.preprocess()                                            # 45% time (40% in brand fix)

        if self.split=='train' and self.iter==1:
            for tv in M.text_vars: self.build_vocab(tv, M.combine[tv])   # 5%
        self.numericalize_text()                                     # 40%
        self.postprocess_df()                                        # 10%
        # an empty sample has nothing missing (and must not divide by zero)
        missing = len(self.missing_idx)/len(self.idx) if self.idx else 0.0
        if missing<=tol or self.iter>2:
            if missing>tol and self.iter>2:
                show(f'''{self.iter} iteration incomplete: {round(missing*100)}%!!
                                 Problem with the file system!''')
            return
        else:
            self.iter += 1
            return self.process(tol)
        #train.save(f'{SCRATCH}/train.obj')

    def get_df(self, idxs=None):
        '''Concatenate processed chunks.

        Args:
            idxs: positions into ``self.preprocessed``; ``None`` or empty means all.
        '''
        files = self.preprocessed if not idxs else [self.preprocessed[i] for i in idxs]
        df = pd.concat([read_df(f, processed=True) for f in files])
        return df

    def save(self, file):
        '''Pickle this object to ``file``.'''
        with open(file, 'wb') as fh:
            pickle.dump(self, fh)

# Definitions for LM and Final Model

In [6]:
def to_gpu(x, *args, **kwargs):
    """Return ``x.cuda(*args, **kwargs)`` when GPU use is enabled and CUDA is
    available; otherwise return ``x`` unchanged."""
    # USE_GPU is tested first: the cuda-available check costs time/memory and
    # is skipped entirely on CPU-only runs
    if USE_GPU and torch.cuda.is_available():
        return x.cuda(*args, **kwargs)
    return x

def T(a):
    """Convert ``a`` to a torch tensor and hand it to ``to_gpu``.

    Tensors are passed through; integer arrays become ``LongTensor`` and float
    arrays ``FloatTensor``.

    Raises:
        NotImplementedError: for any other dtype.
    """
    if torch.is_tensor(a):
        res = a
    else:
        a = np.array(np.ascontiguousarray(a))
        if a.dtype in (np.int8, np.int16, np.int32, np.int64):
            res = torch.LongTensor(a.astype(np.int64))
        elif a.dtype in (np.float32, np.float64):
            # this branch used to `return` here, so float tensors skipped to_gpu
            res = torch.FloatTensor(a.astype(np.float32))
        else:
            raise NotImplementedError(a.dtype)
    # `async` is a reserved word from Python 3.7 on, so the keyword is passed
    # through a dict; the call itself is unchanged.
    # NOTE(review): newer PyTorch names this flag `non_blocking` -- switch when upgrading.
    return to_gpu(res, **{'async': True})

# Making a Variable in torch invokes the GPU, which kills time
def create_variable(x, volatile, requires_grad=False):
    """Return ``x`` itself if it is already a Variable, else wrap ``T(x)`` in one."""
    if isinstance(x, Variable):
        return x
    return Variable(T(x), volatile=volatile, requires_grad=requires_grad)

def VV1_(x):
    """Wrap a single item as a volatile Variable."""
    return create_variable(x, volatile=True)

def V1_(x, requires_grad=False):
    """Wrap a single item as a non-volatile Variable."""
    return create_variable(x, volatile=False, requires_grad=requires_grad)
    
def V1(x, requires_grad=False):
    """Apply ``V1_`` to ``x``, recursing into (nested) lists."""
    if isinstance(x, list):
        return [V1(item, requires_grad) for item in x]
    return V1_(x, requires_grad)

def VV1(x):
    """Apply ``VV1_`` to ``x``, recursing into (nested) lists."""
    if isinstance(x, list):
        return [VV1(item) for item in x]
    return VV1_(x)

def set_lrs(opt, lrs):
    """Set the learning rate of every parameter group of ``opt``.

    Args:
        opt: optimizer exposing ``param_groups`` (a list of dicts).
        lrs: one learning rate (scalar or length-1 iterable) applied to all
            groups, or an iterable with exactly one rate per group.

    Raises:
        ValueError: if the number of rates is neither 1 nor the number of groups.
    """
    if not isinstance(lrs, Iterable): lrs = [lrs]
    # materialise as a list: `array * n` on a length-1 numpy array scales the
    # value instead of repeating it
    lrs = list(lrs)
    n_groups = len(opt.param_groups)
    if len(lrs) == 1:
        lrs = lrs * n_groups
    elif len(lrs) != n_groups:
        # the old `raise "Exception"` raised a str, which is itself a TypeError
        raise ValueError(f'expected 1 or {n_groups} learning rates, got {len(lrs)}')
    for pg, lr in zip(opt.param_groups, lrs): pg['lr'] = lr
        

class StepperDist():
    """Runs single training / evaluation steps for a model, averaging
    gradients across processes when torch.distributed is in use."""

    def __init__(self, m, opt, crit, clip=0, reg_fn=None):
        self.m = m
        self.opt = opt
        self.crit = crit
        self.clip = clip
        self.reg_fn = reg_fn
        self.reset(True)

    def reset(self, train=True):
        """Switch the model to train or eval mode and reset it if it supports that."""
        if train:
            apply_leaf(self.m, set_train_mode)
        else:
            self.m.eval()
        if hasattr(self.m, 'reset'):
            self.m.reset()

    def step(self, xs, y):
        """Do one optimisation step on a batch and return the un-regularised loss value."""
        extras = []
        output = self.m(*xs)
        if isinstance(output, (tuple, list)):
            # the first element is the prediction, the rest feeds the regulariser
            output, *extras = output
        self.opt.zero_grad()
        raw_loss = self.crit(output, y)
        loss = self.reg_fn(output, extras, raw_loss) if self.reg_fn else raw_loss
        loss.backward()
        if is_dist():
            apply_leaf(self.m, average_gradients)
        if self.clip:
            # Gradient clipping
            nn.utils.clip_grad_norm(trainable_params_(self.m), self.clip)
        self.opt.step()
        return raw_loss.data[0]

    def evaluate(self, xs, y):
        """Return ``(predictions, loss)`` for a batch without touching the optimiser."""
        preds = self.m(*xs)
        if isinstance(preds, (tuple, list)):
            preds = preds[0]
        return preds, self.crit(preds, y)


class DistLearner(Learner):
    """Learner whose ``fit_gen`` trains through ``fit_dist`` (the StepperDist-based loop)."""
    def fit_gen(self, model, data, layer_opt, n_cycle, cycle_len=None, cycle_mult=1, cycle_save_name=None,
                metrics=None, callbacks=None, use_wd_sched=False, norm_wds=False, wds_sched_mult=None, **kwargs):
        """Set up schedulers/callbacks and train ``model`` with ``fit_dist``.

        Args:
            model: module to train.
            data: object exposing ``trn_dl`` (its length is the number of batches per epoch).
            layer_opt: provides ``opt`` (the optimizer) and ``wds``; passed to the schedulers.
            n_cycle: number of cycles.
            cycle_len: epochs in the first cycle; falsy means no cosine annealing.
            cycle_mult: passed to ``CosAnneal``/``sum_geom`` as the cycle length multiplier.
            cycle_save_name: passed to ``self.get_cycle_end``.
            metrics: metric functions; defaults to ``self.metrics``.
            callbacks: list of callbacks; NOTE it is extended in place, so a list
                passed by the caller is modified.
            use_wd_sched, norm_wds, wds_sched_mult: weight-decay schedule options.
            **kwargs: forwarded to ``fit_dist``.
        """
        if callbacks is None: callbacks=[]
        if metrics is None: metrics=self.metrics

        if use_wd_sched:
            # This needs to come before CosAnneal() because we need to read the initial learning rate from
            # layer_opt.lrs - but CosAnneal() alters the layer_opt.lrs value initially (divides by 100)
            if np.sum(layer_opt.wds) == 0:
                print('fit() warning: use_wd_sched is set to True, but weight decay(s) passed are 0. Use wds to '
                      'pass weight decay values.')
            batch_per_epoch = len(data.trn_dl)
            cl = cycle_len if cycle_len else 1
            self.wd_sched = WeightDecaySchedule(layer_opt, batch_per_epoch, cl, cycle_mult, n_cycle,
                                                norm_wds, wds_sched_mult)
            callbacks += [self.wd_sched]

        if cycle_len:
            cycle_end = self.get_cycle_end(cycle_save_name)
            cycle_batches = len(data.trn_dl)*cycle_len
            self.sched = CosAnneal(layer_opt, cycle_batches, on_cycle_end=cycle_end, cycle_mult=cycle_mult)
        # without cycles an existing scheduler is reused; otherwise only the loss is recorded
        elif not self.sched: self.sched=LossRecorder(layer_opt)
        callbacks+=[self.sched]
        for cb in callbacks: cb.on_train_begin()
        # total number of epochs over all cycles
        n_epoch = sum_geom(cycle_len if cycle_len else 1, cycle_mult, n_cycle)
        fit_dist(model, data, n_epoch, layer_opt.opt, self.crit,
            metrics=metrics, callbacks=callbacks, reg_fn=self.reg_fn, clip=self.clip, **kwargs)
        
def fit_dist(model, data, epochs, opt, crit, metrics=None, callbacks=None, **kwargs):
    """ Fits a model using a ``StepperDist``.

    Arguments:
       model (model): any pytorch module
       data (ModelData): must expose ``trn_dl`` and ``val_dl``; every batch is
           unpacked as ``(*x, y)``
       epochs(int): number of epochs
       opt: optimizer. Example: opt=optim.Adam(net.parameters())
       crit: loss function to optimize. Example: F.cross_entropy
       metrics: functions ``f(preds, targets)`` evaluated during validation
       callbacks: objects with ``on_batch_end(loss)`` / ``on_epoch_end(vals)``;
           a truthy return value stops training
       **kwargs: forwarded to ``StepperDist`` (e.g. ``clip``, ``reg_fn``)

    Returns None. One line per epoch is printed: epoch, smoothed training
    loss, then the values returned by ``validate``.
    """
    stepper = StepperDist(model, opt, crit, **kwargs)
    metrics = metrics or []
    callbacks = callbacks or []
    # momentum of the exponentially weighted moving average of the batch loss
    avg_mom=0.98
    batch_num,avg_loss=0,0.

    for epoch in tnrange(epochs, desc='Epoch'):
        stepper.reset(True)
        t = tqdm(iter(data.trn_dl), leave=False, total=len(data.trn_dl))
        for (*x,y) in t:
            batch_num += 1
            loss = stepper.step(V1(x),V1(y))
            avg_loss = avg_loss * avg_mom + loss * (1-avg_mom)
            # correct the bias of the running average towards its zero start value
            debias_loss = avg_loss / (1 - avg_mom**batch_num)
            t.set_postfix(loss=debias_loss)
            stop=False
            # `or` short-circuits: once one callback asks to stop, the remaining
            # callbacks are not called for this batch
            for cb in callbacks: stop = stop or cb.on_batch_end(debias_loss)
            # stopping on a batch returns immediately, without a final validation
            if stop: return 

        vals = validate(stepper, data.val_dl, metrics)
        # NOTE: debias_loss is only defined if the training loader yielded at least one batch
        print(np.round([epoch, debias_loss] + vals, 6))
        stop=False
        for cb in callbacks: stop = stop or cb.on_epoch_end(vals)
        if stop: break

def validate(stepper, dl, metrics):
    """Evaluate the model on ``dl``.

    Predictions are passed through ``fix_range`` together with each batch's
    lower/upper range bounds before the metrics are computed.

    Args:
        stepper: object with ``reset(train)`` and ``evaluate(xs, y)``.
        dl: iterable of batches ``(idx, x_txt, x_cat, x_cont, y)``.
        metrics: functions ``f(preds, targets)``.

    Returns:
        ``[mean loss] + [mean of each metric]`` over all batches.
    """
    loss,res = [],[]
    stepper.reset(False)
    for (*x,y) in iter(dl):
        preds,l = stepper.evaluate(VV1(x), VV1(y))
        loss.append(to_np(l))
        idx, x_txt, x_cat, x_cont = x
        # the range bounds are the LAST two continuous fields, the same
        # convention MixedTextInputModel.forward uses (x_cont[-2], x_cont[-1]);
        # the fixed positions 2 and 3 only worked with exactly two other fields
        preds = fix_range(to_np(preds).ravel(), x_cont[-2], x_cont[-1])
        res.append([f(preds,to_np(y)) for f in metrics])
    return [np.mean(loss)] + list(np.mean(np.stack(res),0))

def average_gradients(model):
    """ Gradient averaging: sum every parameter gradient over all processes
    and divide it by the world size. """
    world_size = float(dist.get_world_size())
    #show(f'In avg gradient : {get_rank()}')
    for param in model.parameters():
        if not hasattr(param, 'grad'):
            continue
        grad = param.grad
        # parameters without a gradient (grad has no .data) are skipped
        if not hasattr(grad, 'data'):
            continue
        dist.all_reduce(grad.data, op=dist.reduce_op.SUM, group=0)
        grad.data /= world_size

def is_dist():
    """Return a truthy value when torch.distributed has been initialised.

    Prefers the public ``dist.is_initialized()`` and falls back to the private
    ``dist._initialized`` flag of old PyTorch releases (treated as "not
    initialised" when neither exists).
    """
    probe = getattr(dist, 'is_initialized', None)
    if probe is not None:
        return probe()
    return getattr(dist, '_initialized', False)

def get_rank():
    """Return this process' distributed rank, or 0 when not running distributed.

    Uses the public ``dist.is_initialized()`` when available and falls back to
    the private ``dist._initialized`` flag of old PyTorch releases.
    """
    probe = getattr(dist, 'is_initialized', None)
    initialised = probe() if probe is not None else getattr(dist, '_initialized', False)
    return dist.get_rank() if initialised else 0

def init_processes(rank, size, fn, backend='gloo'):
    """ Initialize the distributed environment (master at 127.0.0.1:29500),
    then run ``fn(rank, size)``. """
    os.environ.update(MASTER_ADDR='127.0.0.1', MASTER_PORT='29500')
    dist.init_process_group(backend, rank=rank, world_size=size)
    fn(rank, size)
    
def torch_distributed(fn, size=2):
    """Start ``size`` processes that each run ``init_processes(rank, size, fn)``
    and wait for all of them to finish."""
    def launch(rank):
        # each worker is started immediately after it is created
        proc = Process(target=init_processes, args=(rank, size, fn))
        proc.start()
        return proc

    workers = [launch(rank) for rank in range(size)]
    for proc in workers:
        proc.join()
        
#mod = sys.modules['fastai.core']
#mod.V, mod.V_, mod.VV, mod.VV_, mod.T = V, V_, VV, VV_, T
#mod.create_variable, mod.to_gpu = create_variable, to_gpu
#sys.modules['fastai.core'] = mod
#from fastai.core import *

In [7]:
class MixedTextDataset(torchtext.data.Dataset):
    """torchtext dataset whose examples are ordered by item-description length."""

    @staticmethod
    def sort_key(ex):
        """Sort key: number of tokens in ``ex.item_description``."""
        return len(ex.item_description)
    
# copy latest class definitions
class EmbeddingDropout(nn.Module):
    """Embedding lookup that drops whole rows of the embedding matrix."""

    def __init__(self, embed):
        super().__init__()
        self.embed = embed

    def forward(self, words, dropout=0.1, scale=None):
        """Look up ``words`` in the (optionally masked and scaled) embedding weights.

        Args:
            words: indices to look up.
            dropout: value handed to ``dropout_mask``; falsy disables masking.
            scale: optional factor multiplied into the weights.
        """
        weight = self.embed.weight
        if dropout:
            # the mask has one entry per embedding row, so a row is masked as a whole
            mask_size = (weight.size(0), 1)
            mask = Variable(dropout_mask(weight.data, mask_size, dropout))
            weight = mask * weight
        if scale:
            weight = scale * weight

        padding_idx = self.embed.padding_idx
        if padding_idx is None:
            padding_idx = -1

        return self.embed._backend.Embedding.apply(
            words, weight, padding_idx, self.embed.max_norm,
            self.embed.norm_type, self.embed.scale_grad_by_freq, self.embed.sparse)
    
class RNN_Encoder(nn.Module):
    """Embedding layer followed by a stack of single-layer LSTMs, each wrapped
    in ``WeightDrop`` when ``wdrop`` is set.

    The hidden state is kept in ``self.hidden`` between ``forward`` calls and
    is (re)created by ``reset``; ``forward`` only calls ``reset`` itself when
    the batch size changes, so ``reset`` must have been called beforehand
    otherwise.
    """

    # embedding weights are initialised uniformly in [-initrange, initrange]
    initrange=0.1

    def __init__(self, bs, ntoken, emb_sz, nhid, nlayers, pad_token, bidir=False,
                 dropouth=0.3, dropouti=0.65, dropoute=0.1, wdrop=0.5):
        """
        Args:
            bs: batch size used to size the initial hidden state.
            ntoken: number of rows of the embedding matrix.
            emb_sz: embedding size; also the output size of the LAST LSTM layer.
            nhid: output size of every LSTM layer except the last.
            nlayers: number of LSTM layers.
            pad_token: index passed to ``nn.Embedding`` as ``padding_idx``.
            bidir: use bidirectional LSTMs (per-direction size is halved).
            dropouth: dropout between LSTM layers (also passed to ``nn.LSTM``).
            dropouti: dropout applied to the embedded input.
            dropoute: embedding dropout, used only in training mode.
            wdrop: value passed to ``WeightDrop``; falsy disables the wrapper.
        """

        super().__init__()
        self.ndir = 2 if bidir else 1
        self.encoder = nn.Embedding(ntoken, emb_sz, padding_idx=pad_token)
        self.encoder_with_dropout = EmbeddingDropout(self.encoder)
        # layer l maps (emb_sz if first else nhid) -> (emb_sz if last else nhid), split over directions
        self.rnns = [nn.LSTM(emb_sz if l == 0 else nhid, (nhid if l != nlayers - 1 else emb_sz)//self.ndir,
             1, bidirectional=bidir, dropout=dropouth) for l in range(nlayers)]
        if wdrop: self.rnns = [WeightDrop(rnn, wdrop) for rnn in self.rnns]
        self.rnns = torch.nn.ModuleList(self.rnns)
        self.encoder.weight.data.uniform_(-self.initrange, self.initrange)

        self.bs,self.emb_sz,self.nhid,self.nlayers,self.dropoute = bs,emb_sz,nhid,nlayers,dropoute
        self.dropouti = LockedDropout(dropouti)
        self.dropouths = nn.ModuleList([LockedDropout(dropouth) for l in range(nlayers)])

    def forward(self, input):
        """Encode ``input`` (unpacked as ``sl, bs = input.size()``).

        Returns:
            ``(raw_outputs, outputs)``: per-layer outputs before and after the
            inter-layer dropout (the last layer's output is not dropped).
        """
        sl,bs = input.size()
        # a different batch size invalidates the stored hidden state
        if bs!=self.bs:
            self.bs=bs
            self.reset()

        emb = self.encoder_with_dropout(input, dropout=self.dropoute if self.training else 0)
        emb = self.dropouti(emb)

        raw_output = emb
        new_hidden,raw_outputs,outputs = [],[],[]
        for l, (rnn,drop) in enumerate(zip(self.rnns, self.dropouths)):
            # NOTE: current_input is assigned but never read
            current_input = raw_output
            # warnings raised while running the LSTM are suppressed
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                raw_output, new_h = rnn(raw_output, self.hidden[l])
            new_hidden.append(new_h)
            raw_outputs.append(raw_output)
            if l != self.nlayers - 1: raw_output = drop(raw_output)
            outputs.append(raw_output)

        # keep the new state for the next call (repackage_var presumably detaches it -- confirm)
        self.hidden = repackage_var(new_hidden)
        return raw_outputs, outputs

    def one_hidden(self, l):
        """Zero state of shape (ndir, bs, layer size) for layer ``l``, created
        with the same tensor type as the model weights."""
        nh = (self.nhid if l != self.nlayers - 1 else self.emb_sz)//self.ndir
        return Variable(self.weights.new(self.ndir, self.bs, nh).zero_(), volatile=not self.training)

    def reset(self):
        """Re-create an all-zero (h, c) state pair for every layer."""
        self.weights = next(self.parameters()).data
        self.hidden = [(self.one_hidden(l), self.one_hidden(l)) for l in range(self.nlayers)]


class MultiBatchRNN(RNN_Encoder):
    '''Encoder that feeds a long sequence through the RNN in ``bptt``-sized chunks.

    With ``last`` equal to False only the first chunk is encoded; otherwise
    every chunk of the first ``max_sl`` steps is encoded and the per-layer
    results are concatenated.'''
    def __init__(self, max_sl, bptt, last=False, train_text=False, *args, **kwargs):
        self.max_sl = max_sl
        self.bptt = bptt
        self.last = last
        self.train_text = train_text
        super().__init__(*args, **kwargs)

    def concat(self, arrs):
        '''Concatenate, layer by layer, the outputs collected for every chunk.'''
        n_layers = len(arrs[0])
        return [torch.cat([chunk[layer] for chunk in arrs]) for layer in range(n_layers)]

    def forward(self, input):
        sl, bs = input.size()
        # every call starts from a zero hidden state
        for layer_state in self.hidden:
            for h in layer_state:
                h.data.zero_()
        raw_chunks, out_chunks = [], []
        limit = min(self.max_sl, sl)
        for start in range(0, limit, self.bptt):
            stop = min(start + self.bptt, limit)
            raw, out = super().forward(input[start:stop])
            #if (i<self.bptt*8) or i>(max_sl-self.bptt*8):
            #if i>(max_sl-self.bptt*16):
            raw_chunks.append(raw)
            out_chunks.append(out)
            if self.last == False:
                break

        return self.concat(raw_chunks), self.concat(out_chunks)

class LinearRNNOutput(nn.Module):
    """Holds a linear decoder and a dropout; ``forward`` only applies the
    dropout to the last encoder layer's output (the decoder is left for subclasses)."""
    initrange=0.1

    def __init__(self, n_out, nhid, dropout):
        super().__init__()
        decoder = nn.Linear(nhid, n_out)
        decoder.bias.data.fill_(0)
        decoder.weight.data.uniform_(-self.initrange, self.initrange)
        self.decoder = decoder
        self.dropout = LockedDropout(dropout)

    def forward(self, input):
        """``input`` is the encoder's ``(raw_outputs, outputs)`` pair."""
        raw_outputs, outputs = input
        dropped = self.dropout(outputs[-1])
        return dropped, raw_outputs, outputs

class SequentialRNNText(SequentialRNN):
    """SequentialRNN that also returns its input next to the output."""

    def forward(self, input):
        output, raw_outputs, outputs = super().forward(input)
        paired = (input, output)
        return paired, raw_outputs, outputs
    

class LinearDecoder(LinearRNNOutput):
    """ A custom Linear layer that reads the signals from the output of the RNN_Encoder layer,
    and decodes to a output of size n_tokens.
    """
    def __init__(self, n_out, nhid, dropout, tie_encoder=None):
        super().__init__(n_out, nhid, dropout)
        if tie_encoder:
            # share the decoder weight matrix with the given embedding
            self.decoder.weight = tie_encoder.weight

    def forward(self, input):
        output, raw_outputs, outputs = super().forward(input)
        # flatten the first two dimensions so every step is decoded as one row
        n_rows = output.size(0)*output.size(1)
        decoded = self.decoder(output.view(n_rows, output.size(2)))
        return decoded.view(-1, decoded.size(1)), raw_outputs, outputs

    
def chunk_seq(x, max_seq, bptt, last=False, dim=1):
    """Keep the first ``max_seq`` rows of ``x``, then its first (or, with
    ``last``, its final) ``bptt`` rows; flatten to 1-D when ``dim`` is 1."""
    window = x[:max_seq]
    if last:
        window = window[-bptt:]
    else:
        window = window[:bptt]
    if dim == 1:
        return window.view(window.size(0)*window.size(1))
    return window

class CrossEntropyPad(nn.CrossEntropyLoss):
    """Cross entropy between decoded logits and the matching chunk of the target.

    Args:
        bptt, max_seq, last: forwarded to ``chunk_seq`` to cut the target.
        size_average: average (True) or sum (False) the loss over elements.
    """
    def __init__(self, bptt, max_seq, last=False, size_average=True):
        super().__init__()
        self.bptt, self.max_seq, self.last = bptt, max_seq, last
        # the argument used to be dropped: forward read whatever the base class
        # had set, so size_average=False never had any effect
        self.size_average = size_average

    def forward(self, input, target):
        """``input`` is an ``(input_seq, logits)`` pair; only the logits are used."""
        _assert_no_grad(target)
        input_seq, result = input
        #chunk2d = partial(chunk_seq, max_seq=self.max_seq, bptt=self.bptt, last=self.last, dim=2)
        #print("INPUT_SEQ=", chunk2d(input_seq), "\nTARGET=", chunk2d(target))
        target = chunk_seq(target, self.max_seq, self.bptt, self.last)
        ret = F.cross_entropy(result, target, size_average=self.size_average)
        return ret

class RNNLearnerEmbedded(DistLearner):
    """DistLearner with cross entropy as criterion; can save/load ``self.model[0]``."""

    def __init__(self, data, models, **kwargs):
        super().__init__(data, models, **kwargs)
        self.crit = F.cross_entropy

    def save_encoder(self, name):
        """Save ``self.model[0]`` under the model path for ``name``."""
        path = self.get_model_path(name)
        save_model(self.model[0], path)

    def load_encoder(self, name):
        """Load ``self.model[0]`` from the model path for ``name``."""
        path = self.get_model_path(name)
        load_model(self.model[0], path)
    

class MixedTextInputModel(nn.Module):
    """Regression head over categorical embeddings, continuous inputs and the
    outputs of one RNN encoder per text variable.

    Args:
        emb_szs: ``(cardinality, size)`` pair per categorical variable.
        n_cont: number of continuous inputs (excluding the two range bounds).
        emb_drop: dropout on the concatenated categorical embeddings.
        out_sz: size of the output layer.
        szs: sizes of the hidden fully connected layers.
        drops: dropout after each hidden layer.
        y_range: stored but not applied in ``forward``.
        use_bn: apply batch norm after each hidden layer.
        text_emb_sz: output size of each encoder's last layer.
        rnn_encoders: encoders, one per text variable (a single encoder is wrapped in a list).
    """
    def __init__(self, emb_szs, n_cont, emb_drop, out_sz, szs, drops,
                 y_range=None, use_bn=False, text_emb_sz=[], rnn_encoders=[]):
        super().__init__()

        self.embs = nn.ModuleList([nn.Embedding(c, s) for c,s in emb_szs])
        for emb in self.embs: emb_init(emb)
        n_emb = sum(e.embedding_dim for e in self.embs)
        self.n_emb, self.n_cont=n_emb, n_cont

        if not isinstance(rnn_encoders,list):
            rnn_encoders = [rnn_encoders]
            text_emb_sz = [text_emb_sz]

        # each encoder contributes two vectors: its last time step and a max-pool
        self.nrnn = 2
        self.rnn_bns = nn.ModuleList([nn.BatchNorm1d(self.nrnn*sz) for sz in text_emb_sz])

        self.text_emb_sz, self.rnn_encoders = text_emb_sz, nn.ModuleList(rnn_encoders)

        szs = [n_emb+n_cont+self.nrnn*sum(text_emb_sz)] + szs
        self.lins = nn.ModuleList([
            nn.Linear(szs[i], szs[i+1]) for i in range(len(szs)-1)])
        self.bns = nn.ModuleList([
            nn.BatchNorm1d(sz) for sz in szs[1:]])
        for o in self.lins: kaiming_normal(o.weight.data)
        self.outp = nn.Linear(szs[-1], out_sz)
        kaiming_normal(self.outp.weight.data)

        self.emb_drop = nn.Dropout(emb_drop)
        self.drops = nn.ModuleList([nn.Dropout(drop) for drop in drops])
        self.bn = nn.BatchNorm1d(n_cont)
        self.use_bn,self.y_range = use_bn,y_range

    def pool(self, x, bs, is_max):
        """Max- or average-pool ``x`` over its first dimension; returns ``(bs, features)``."""
        f = F.adaptive_max_pool1d if is_max else F.adaptive_avg_pool1d
        return f(x.permute(1,2,0), (1,)).view(bs,-1)

    def forward(self, idx, x_text, x_cat, x_cont):
        """Run the model on one batch.

        Args:
            idx: example ids (not used in the computation).
            x_text: one input per encoder.
            x_cat: one index tensor per categorical variable.
            x_cont: continuous tensors; the LAST two are the min / max range bounds.

        Returns:
            ``(prediction, enc_raw_outputs, enc_outputs)`` where the encoder
            outputs are those of the last encoder (``None`` without encoders).
        """
        # x collects the feature blocks; it stays None until the first one is added
        x = None
        if self.n_emb != 0:
            x = [e(x_cat[i]) for i,e in enumerate(self.embs)]
            x = torch.cat(x, 1)
            x = self.emb_drop(x)

        y_min, y_max = x_cont[-2], x_cont[-1]
        # y_max used to be built from y_min here (copy-paste slip); both are
        # currently only needed by the disabled range scaling below
        y_min, y_max = y_min.view(y_min.size(0), 1), y_max.view(y_max.size(0), 1)
        if self.n_cont != 0:
            x_cont = torch.cat([xc.view(xc.size(0), 1) for xc in x_cont[:-2]], 1)
            x2 = self.bn(x_cont)
            x = torch.cat([x, x2], 1) if self.n_emb != 0 else x2

        # defined up front so a model without encoders can still return
        enc_raw_outputs, enc_outputs = None, None
        for i,rnn_enc in enumerate(self.rnn_encoders):
            enc_raw_outputs, enc_outputs = rnn_enc(x_text[i])
            enc_last_layer = enc_outputs[-1]
            sl,bs,_ = enc_last_layer.size()
            #avgpool = self.pool(enc_last_layer, bs, False)
            mxpool = self.pool(enc_last_layer, bs, True)
            x3 = torch.cat([enc_last_layer[-1], mxpool], 1)
            #x3 = enc_last_layer[-1]
            x3 = self.rnn_bns[i](x3)
            # text features may be the first block when there are no other inputs
            x = x3 if x is None else torch.cat([x, x3], 1)

        for l,d,b in zip(self.lins, self.drops, self.bns):
            x = F.relu(l(x))
            if self.use_bn: x = b(x)
            x = d(x)
        x = self.outp(x)
        #if self.y_range: and bs will always come from rnn_encoders
        #x = F.sigmoid(x)
        #x = x*(y_max-y_min) 
        #x = x + y_min
        #self.pred = x
        # encoder outputs are needed by the regularization function
        return x, enc_raw_outputs, enc_outputs

    def reset(self, module=None):
        """Recursively call ``reset()`` on every descendant module that defines it."""
        if not module: module = self
        for c in module.children():
            if hasattr(c, 'children'): MixedTextInputModel.reset(c) 
            if hasattr(c, 'reset'): c.reset()


class MixedTextLearner(DistLearner):
    """Learner for the mixed-input model, using RMSLELoss as its criterion."""

    def __init__(self, data, models, **kwargs):
        super().__init__(data, models, **kwargs)
        # regression criterion (F.cross_entropy was tried here earlier)
        self.crit = RMSLELoss()

    def save_encoder(self, name, idx):
        """Save text encoder number `idx` to the model path derived from `name`."""
        encoder = self.model.rnn_encoders[idx]
        save_model(encoder, self.get_model_path(name))

    def load_encoder(self, name, idx):
        """Load text encoder number `idx` from the model path derived from `name`."""
        encoder = self.model.rnn_encoders[idx]
        load_model(encoder, self.get_model_path(name))
        
def _assert_no_grad(variable):
    assert not variable.requires_grad, \
        "nn criterions don't compute the gradient w.r.t. targets - please " \
        "mark these variables as volatile or not requiring gradients"
        
class RMSLELoss(nn.Module):
    """Square root of the mean-squared-error between input and target.

    No logarithm is taken here; presumably both tensors are already in log
    space (the submission code applies exp(y)-1 to predictions) -- confirm.
    """

    def __init__(self, size_average=True):
        super().__init__()
        # forwarded unchanged to F.mse_loss
        self.size_average = size_average

    def forward(self, input, target):
        # targets must not require gradients
        _assert_no_grad(target)
        mse = F.mse_loss(input, target, size_average=self.size_average)
        return mse ** 0.5

class MixedTextDataLoader():
    """Adapts an iterable of batch objects to the tuples the learners consume.

    Two modes, selected by `train_txt`:
      * None  -> yields (index, [text fields], [categorical fields],
                 [continuous fields], target) read from each batch by attribute name.
      * int i -> language-model mode for text field i: yields (x[:-1], x[1:]),
                 i.e. the field and the same field shifted by one position.

    `bptt` is only stored here; nothing in this class reads it.
    """
    def __init__(self, src, index, txt_flds, cat_flds, contin_flds, y_fld):
        self.src, self.index = src, index
        self.txt_flds, self.cat_flds, self.contin_flds = txt_flds, cat_flds, contin_flds
        self.y_fld = y_fld
        self.train_txt = None
        self.bptt = None

    # One less than the source: the final batch of `src` is never yielded.
    # NOTE(review): this applies to validation/test loaders too -- confirm that
    # dropping their last batch is intended.
    def __len__(self): return len(self.src)-1

    def __iter__(self):
        it = iter(self.src)
        # bounded by len(self) rather than by exhaustion of the source iterator
        for _ in range(len(self)):
            b = next(it)
            if self.train_txt is None:
                yield (getattr(b, self.index),
                       [getattr(b, x) for x in self.txt_flds],
                       [getattr(b, x) for x in self.cat_flds],
                       [getattr(b, x) for x in self.contin_flds],
                       getattr(b, self.y_fld))
            else:
                x_txt = getattr(b, self.txt_flds[self.train_txt])
                yield x_txt[:-1], x_txt[1:]
            

class MixedTextModel(BasicModel):
    """BasicModel wrapper that splits the mixed-input network into four layer groups."""

    def get_layer_groups(self):
        """Return the layer groups that differential learning rates are applied to.

        Order: [encoder embeddings, input layers, middle layers, output layer].
        """
        net = self.model
        encoders = net.rnn_encoders

        # group 0: token embeddings of every text encoder (plain and dropout-wrapped)
        embed_group = [enc.encoder for enc in encoders]
        embed_group += [enc.encoder_with_dropout for enc in encoders]

        # group 1: recurrent stacks and their hidden dropouts, the categorical
        # embeddings and the batch-norm of the continuous inputs
        input_group = [enc.rnns for enc in encoders]
        input_group += [enc.dropouths for enc in encoders]
        input_group += list(net.embs)
        input_group.append(net.bn)

        # group 2: encoder input dropouts, encoder batch-norms and the dense stack
        middle_group = [enc.dropouti for enc in encoders]
        middle_group.append(net.rnn_bns)
        middle_group += children(net.lins)
        middle_group += children(net.bns)

        # group 3: the output layer on its own
        return [embed_group, input_group, middle_group, net.outp]

class MixedTextData(ModelData):
    """ModelData that builds the loaders and learners for the mixed-input model.

    Attributes:
        nts: vocabulary size of each text variable, in M.text_vars order.
        pad_idxs: `pad_token` of each text field, in M.text_vars order.
        encoders: initialised empty; no method of this class fills it.
    """
    def __init__(self, path, trn_dl, val_dl, test_dl):
        self.path = path
        self.nts, self.pad_idxs, self.encoders = [], [], []
        super().__init__(path, trn_dl, val_dl, test_dl)

    def create_td(self, it):
        # NOTE(review): text_fld / label_fld are not set anywhere in this class --
        # confirm this helper is still used.
        return TextDataLoader(it, self.text_fld, self.label_fld)

    def get_text_fld(self, tv):
        """Unpickle and return the stored field object for text variable `tv`."""
        # NOTE(review): text_fld_files is not set anywhere in this class.
        # Unpickling runs arbitrary code: only load files written by this pipeline.
        with open(self.text_fld_files[tv], 'rb') as fh:
            return pickle.load(fh)

    @classmethod
    def from_splits(cls, path, splits, data_flds, bs):
        '''Build a MixedTextData from (train, val[, test]) dataset splits.

        path: working directory handed to ModelData.
        splits: tuple of datasets; two or three entries (a third one is the test set).
        data_flds: mapping from text variable name to its field object; its vocab
            size and pad_token are recorded for every variable in M.text_vars.
        bs: batch size for the bucket iterators.'''

        device = -1 if M.no_cuda else None
        iters = torchtext.data.BucketIterator.splits(splits, batch_size=bs, device=device)
        # the range columns ride along after the continuous ones in every batch
        dls = [MixedTextDataLoader(it, M.index, M.text_vars, M.cat_vars,
                                   M.contin_vars+M.range_vars, M.target) for it in iters]
        # can't have multiple dls per modeldata in fastai but can have multiple mds
        trn_dl, val_dl = dls[:2]
        test_dl = dls[2] if len(iters)==3 else None
        obj = cls.from_dls(path, trn_dl, val_dl, test_dl)

        for tv in M.text_vars:
            tf = data_flds[tv]
            obj.nts.append(len(tf.vocab))
            # NOTE(review): stored as-is and later used both as the encoder's
            # pad_token and as the loss ignore_index -- confirm it is an index.
            obj.pad_idxs.append(tf.pad_token)
        return obj

    def text_learner(self, i, callbacks=[], get_learner=False):
        '''Build (and by default fit) a language-model learner for text variable i.

        i: index into M.text_vars.
        callbacks: forwarded to learner.fit.
        get_learner: when True, return the learner without fitting it.

        Returns the learner when get_learner is True, otherwise None.
        Side effects: resets M.nbatch and M.iter and switches the train and
        validation loaders to language-model mode for field i.'''
        tie_weights=True
        # val dl size is 1 always
        # trn_dl and val_dls are referenced by fit()
        M.nbatch=1000*M.sample
        M.iter = 0

        self.val_dl.train_txt = self.trn_dl.train_txt = i
        self.val_dl.bptt = self.trn_dl.bptt = M.enc_bptt[i]
        # ******WARNING******* bs argument is DIFFERENT is versions for RNN_Encoder bidir=False, after pad_ix
        train_text=True
        rnn_enc = MultiBatchRNN(M.enc_max_seq[i], M.enc_bptt[i], M.text_seq_last, train_text, M.bs,
                                self.nts[i], M.text_emb_sz[i], M.enc_n_hid[i], M.enc_n_layers[i],
                                pad_token=self.pad_idxs[i], bidir=False, dropouth=M.lm_dropouth,
                                dropouti=M.lm_dropouti, dropoute=M.lm_dropoute, wdrop=M.lm_wdrop)
        # the decoder shares the encoder's embedding when weights are tied
        enc = rnn_enc.encoder if tie_weights else None
        text_model = SequentialRNNText(rnn_enc, LinearDecoder(self.nts[i], M.text_emb_sz[i],
                                                                    M.lm_dropout, tie_encoder=enc))
        if M.no_cuda == False: text_model = to_gpu(text_model)
        model = SingleModel(text_model)
        learner = RNNLearnerEmbedded(self, model, opt_fn=M.lm_opt_fn)
        learner.crit = CrossEntropyPad(bptt=M.enc_bptt[i], max_seq=M.enc_max_seq[i], last=M.text_seq_last)
        if M.ignore_pad: learner.crit.ignore_index = self.pad_idxs[i]
        learner.reg_fn = M.lm_reg_fn
        learner.clip=0.3

        if get_learner: return learner
        learner.fit(2.0e-2, 1, wds=1e-6, cycle_len=1, cycle_mult=2, callbacks=callbacks)
        # the encoder is not saved here; the rank-0 save_encoder call is disabled
        return

    def model_mixed(self, cat_emb_szs, y_range_file, use_bn=False, callbacks=[],
                    get_learner=False, model_text=False, **kwargs):
        '''Build (and by default fit and save) the mixed-input regression learner.

        cat_emb_szs: embedding sizes handed to MixedTextInputModel.
        y_range_file: source read with read_df; must provide 'min' and 'max'
            columns plus the M.cat_vars columns.
        use_bn: forwarded to MixedTextInputModel.
        callbacks: forwarded to learner.fit.
        get_learner: when True, return the learner without fitting it.
        model_text: when True, load the saved encoder of every text variable.
        kwargs: optional `rank` (default 0); only rank 0 saves the final model.

        Returns the learner when get_learner is True, otherwise None.'''

        M.nbatch = 5000*M.bs/128
        M.iter = 0

        # switch both loaders back to full-batch mode
        self.val_dl.train_txt = self.trn_dl.train_txt = None
        train_text = False
        # stays empty when there are no text variables (it used to be unbound)
        rnn_encs = []
        if len(M.text_vars)!=0:
            rnn_encs = [MultiBatchRNN(M.enc_max_seq[i], M.enc_bptt[i], M.text_seq_last,
                                  train_text, M.bs, self.nts[i], M.text_emb_sz[i], M.enc_n_hid[i],
                                  M.enc_n_layers[i], pad_token=self.pad_idxs[i], bidir=False, dropouth=M.dropouth,
                                  dropouti=M.dropouti, dropoute=M.dropoute, wdrop=M.wdrop) for i in range(len(M.text_vars))]

        # reconstruct y_range: widest [min, max] per combination of categorical values
        y_range = read_df(y_range_file)
        y_group = y_range.groupby(M.cat_vars)
        y_range = pd.DataFrame({'min': y_group['min'].agg('min'), 'max': y_group['max'].agg('max')})

        out_sz = 1
        m = MixedTextInputModel(cat_emb_szs, len(M.contin_vars), M.cat_emb_drop, out_sz, M.fc_layers,
                                M.fc_drops, y_range, use_bn, M.text_emb_sz, rnn_encs)

        if M.no_cuda == False: m = to_gpu(m)
        model = MixedTextModel(m)
        learner = MixedTextLearner(self, model, opt_fn=M.opt_fn)
        learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
        learner.clip=0.3

        if len(M.text_vars)!= 0 and model_text==True:
            for i,tv in enumerate(M.text_vars): learner.load_encoder(f'{tv}', i)
        if get_learner: return learner
        learner.fit(M.lrs, 1, cycle_len=3, cycle_mult=1, cycle_save_name='mercari', callbacks=callbacks)
        # `rank` was read as an undefined name here; it now comes from kwargs and
        # defaults to 0 so a single-process run saves its model.
        rank = kwargs.get('rank', 0)
        if rank==0: learner.save('mercari')
        return

In [8]:
def end_training(loss, max_seconds=3300):
    """Callback hook implementing a wall-clock training budget.

    Counts its own calls in M.iter. On every 10th call it compares the time
    elapsed since M.start_time with `max_seconds` and raises the M.end_train flag
    when the budget is exceeded.

    Args:
        loss: value passed by the training loop; not used.
        max_seconds: time budget in seconds (default 3300).

    Returns:
        True when M.end_train was already set before this call, else False
        (presumably a True return asks the training loop to stop -- confirm
        against the fastai callback contract).
    """
    M.iter += 1
    if M.end_train: return True
    if M.iter%10==0:
        # total_seconds() covers the whole interval; timedelta.seconds is only the
        # sub-day remainder and wraps to 0 every 24 hours.
        elapsed = (datetime.now()-M.start_time).total_seconds()
        if elapsed > max_seconds: M.end_train = True
    return False

def train_begin():
    """Re-arm the time-budget bookkeeping on M at the start of a training run.

    Zeroes the M.iter call counter and clears the M.end_train stop flag.
    M.start_time is deliberately left alone, so the budget keeps being measured
    from its existing value.
    """
    M.iter = 0
    M.end_train = False

# One shared callback object: train_begin re-arms the budget state, and
# end_training runs after every batch and after every epoch.
cbs = Callback()
cbs.on_train_begin = train_begin
cbs.on_batch_end = cbs.on_epoch_end = end_training


def predict_structured(m, md, split, ref, index='id', recon=True):
    """Predict one split and return a human-readable frame sorted by absolute error.

    Args:
        m: model; put in eval mode (and reset, if it defines reset) before use.
        md: object exposing the loader as attribute f'{split}_dl'.
        split: split name used to pick the loader.
        ref: object providing data_flds[tv].vocab.itos for text variables and
            df_sample[cv].cat.categories for categorical variables.
        index: name of the id column; it becomes the index of the result.
        recon: unused.

    Returns:
        DataFrame indexed by `index` with the decoded text columns (tokens joined
        by '--'), decoded categories, continuous and range columns, the target,
        the clamped prediction 'pred_<target>' and 'diff' = |target - prediction|,
        ordered by 'diff' descending.
    """
    m.eval()
    if hasattr(m, 'reset'): m.reset()

    dl = getattr(md, f'{split}_dl')
    pred = 'pred_'+M.target
    feature_cols = M.text_vars+M.cat_vars+M.contin_vars
    # flat list of labels (a nested list would be read as MultiIndex levels)
    cols = [M.target, pred]+feature_cols
    frames = []
    for *x,y in iter(dl):
        idx, xtxt, xcat, x_cont = x
        bdf = pd.DataFrame(columns=cols)
        for j,tv in enumerate(M.text_vars):
            itos = ref.data_flds[tv].vocab.itos
            # transpose so that each row holds the tokens of one sample
            xdf = pd.DataFrame(to_np(xtxt[j]).transpose())
            xdf = xdf.apply(lambda col: col.apply(lambda tok: itos[tok]))
            bdf[tv] = xdf.apply(lambda row: '--'.join(row), axis=1)
        for j,cv in enumerate(M.cat_vars):
            # code 0 is shown as 'NaN'; the remaining codes follow the categories
            cats = ['NaN']+list(ref.df_sample[cv].cat.categories)
            bdf[cv] = pd.Series(to_np(xcat[j])).apply(lambda code: cats[code])

        for i,cv in enumerate(M.contin_vars+M.range_vars):
            bdf[cv] = to_np(x_cont[i])
        bdf[index] = to_np(idx)
        # written under M.target, the column the error below is computed from
        bdf[M.target] = to_np(y)
        y_pred = to_np(get_prediction(m(*VV1(x)))).ravel()
        # the range columns are always the last two continuous inputs
        bdf[pred] = fix_range(y_pred, x_cont[-2], x_cont[-1])
        frames.append(bdf)
    # concatenate once instead of growing the frame batch by batch
    df = pd.concat(frames) if frames else pd.DataFrame(columns=cols+M.range_vars+[index])
    df[index] = df[index].astype(int)
    df['diff'] = (df[M.target]-df[pred]).abs()
    df = df.sort_values('diff', ascending=False).set_index(index)
    return df[feature_cols+M.range_vars+[M.target, pred, 'diff']]

def fix_range(y_pred, y_min, y_max):
    """Clamp y_pred element-wise into [y_min, y_max], in place, and return it.

    y_min and y_max are converted with to_np first; y_pred is indexed with
    boolean masks and therefore modified directly.
    """
    lower = to_np(y_min)
    upper = to_np(y_max)
    too_low = y_pred < lower
    y_pred[too_low] = lower[too_low]
    # the upper mask is computed after the lower bound has been applied
    too_high = y_pred > upper
    y_pred[too_high] = upper[too_high]
    return y_pred
        
def create_submission(m, dl, out_file='sample_submission.csv'):
    """Predict every batch of `dl` and write a (test_id, target) CSV.

    Args:
        m: model; put in eval mode (and reset, if it defines reset) first, the
            same way predict_structured prepares its model.
        dl: loader yielding (index, text, categorical, continuous, target)
            batches; the last two continuous entries are the clamp range.
        out_file: destination CSV path (default 'sample_submission.csv').

    Predictions are clamped with fix_range and then mapped through exp(y) - 1
    before being written, sorted by test_id.
    """
    index = 'test_id'
    cols = [index, M.target]
    m.eval()
    if hasattr(m, 'reset'): m.reset()
    frames = []
    for *x,y in iter(dl):
        idx, x_txt, x_cat, x_cont = x
        bdf = pd.DataFrame(columns=cols)
        y_pred = to_np(get_prediction(m(*VV1(x)))).ravel()
        # the range columns are always the last two continuous inputs
        y_pred = fix_range(y_pred, x_cont[-2], x_cont[-1])
        bdf[M.target] = np.exp(y_pred)-1.0
        bdf[index] = to_np(idx).astype(int)
        frames.append(bdf)
    # concatenate once instead of growing the frame batch by batch
    df = pd.concat(frames) if frames else pd.DataFrame(columns=cols)
    df = df.sort_values(index).set_index(index)
    df.to_csv(out_file)
    return
    
def final_model(mds, cat_emb_szs, y_range_file, rank=0, size=1, model_text=False,
                model_final=True, get_learner=False, use_bn=False):
    """Run the text-model stage and/or the final mixed-model stage for one rank.

    Args:
        mds: sequence of model-data objects; mds[rank].create_dls() is called first.
        cat_emb_szs, y_range_file: forwarded to model_mixed.
        rank: selects the entry of mds and is forwarded to both stages.
        size: unused.
        model_text: when True, run text_learner for every variable in M.text_vars.
        model_final: when True, run model_mixed.
        get_learner: forwarded to both stages.
        use_bn: forwarded to model_mixed (it used to be ignored and forced to False).

    Returns:
        List with the result of every stage that ran, text stages first.
    """
    mds[rank].create_dls()
    ret = []
    if model_text:
        for i, text_var in enumerate(M.text_vars):
            ret.append(text_learner(mds, i, rank=rank, callbacks=[cbs], get_learner=get_learner))
    if model_final:
        ret.append(model_mixed(mds, cat_emb_szs, y_range_file, rank, callbacks=[cbs], use_bn=use_bn,
                               get_learner=get_learner, model_text=model_text))
    return ret

In [9]:
# Directory layout used by the cells below.
PATH='data/'
IN = 'Input/'        # the raw train.tsv / test.tsv files are read from here
SCRATCH = 'tmp/'     # passed as the model-data path when the loaders are built
# Both names already end in '/', so joins such as f'{IN}/train.tsv' contain '//'.
!mkdir -p {SCRATCH}
!ls {IN}

models		       train_nospcl_val20.csv
processed	       train_nospcl_val20_text20.csv
raw		       train.tsv
test_nospcl_val20.csv  val_nospcl_val20.csv
test.tsv	       val_nospcl_val20_text20.csv
tmp


## DataFrame processing

In [10]:
%%time
# Pre-process the raw train file first; the test file is then processed with the
# train object as its reference (ref=train) and a tolerance of 0.
train = ProcessDataFrame(f'{IN}/train.tsv','train', n=10)
train.process(tol=0.01)
gc.collect()
test = ProcessDataFrame(f'{IN}/test.tsv','test', ref=train, n=10)
test.process(tol=0.0)
gc.collect()

train iter: 1
Pre-processing train iter 1 (10)30% .30% .30% .30% .31% .30% .30% .31% .30% .31% . done
 done
 building vocab for item_description... .......... done
item_description tokens 6724
 building vocab for name... .......... done
name tokens 3134
Numericalizing ....................... done
Post processing  done
test iter: 1
Pre-processing test iter 1 (10)30% .31% .30% .30% .31% .30% .30% .31% .31% .30% . done
 done
Numericalizing ....................... done
Post processing  done
CPU times: user 2min 32s, sys: 4 s, total: 2min 36s
Wall time: 21min 8s


In [11]:
%%time
# Split the processed train examples 90/10 by position into training and
# validation sets; the test set is built from all processed test examples.
train_exs = train.get_examples()
s = int(len(train_exs)*0.9)
trn_examples, val_examples = train_exs[:s], train_exs[s:]
# data is already shuffled by sample operation
trn_ds = MixedTextDataset(trn_examples, train.data_flds)
val_ds = MixedTextDataset(val_examples, train.data_flds)
# the test dataset uses the fields of the test object, not those of train
test_ds = MixedTextDataset(test.get_examples(), test.data_flds)
gc.collect()

tmp//train_13.csv        tmp//train_18.csv        tmp//train_12.csv        tmp//train_15.csv        tmp//train_14.csv        tmp//train_19.csv        tmp//train_16.csv        tmp//train_10.csv        tmp//train_11.csv        tmp//train_17.csv         done
tmp//test_14.csv        tmp//test_18.csv        tmp//test_16.csv        tmp//test_11.csv        tmp//test_13.csv        tmp//test_12.csv        tmp//test_17.csv        tmp//test_10.csv        tmp//test_15.csv        tmp//test_19.csv         done
CPU times: user 39.3 s, sys: 2.97 s, total: 42.3 s
Wall time: 3min 22s


In [19]:
# Batch size and learning rate for this run; enc_bptt replaces the [50, 20]
# set in the configuration cell.
M.bs, M.lrs= 256, 1e-2
M.enc_bptt=[40, 15]

md = MixedTextData.from_splits(SCRATCH, (trn_ds, val_ds, test_ds), train.data_flds, M.bs)
# get_learner=True returns the learner unfitted; model_text=False skips loading
# saved text encoders.
learner = md.model_mixed(train.cat_emb_szs, train.y_range, callbacks=[cbs], use_bn=False, 
                               get_learner=True, model_text=False)
# The learning rate is given literally here; it equals M.lrs set above.
learner.fit(1.0e-2, 1, wds=1e-6, cycle_len=2, cycle_mult=1, callbacks=[cbs])

A Jupyter Widget


  0%|          | 0/5208 [00:00<?, ?it/s][A
  0%|          | 0/5208 [00:01<?, ?it/s, loss=3.2][A
  0%|          | 1/5208 [00:01<2:44:44,  1.90s/it, loss=3.2][A
  0%|          | 1/5208 [00:02<3:06:12,  2.15s/it, loss=3.19][A
  0%|          | 2/5208 [00:02<1:33:08,  1.07s/it, loss=3.19][A
  0%|          | 2/5208 [00:02<1:42:11,  1.18s/it, loss=3.1] [A
  0%|          | 3/5208 [00:02<1:08:09,  1.27it/s, loss=3.1][A
  0%|          | 3/5208 [00:02<1:12:32,  1.20it/s, loss=3.09][A
  0%|          | 4/5208 [00:02<54:25,  1.59it/s, loss=3.09]  [A
  0%|          | 4/5208 [00:02<58:38,  1.48it/s, loss=3.08][A
  0%|          | 5/5208 [00:02<46:55,  1.85it/s, loss=3.08][A
  0%|          | 5/5208 [00:02<48:49,  1.78it/s, loss=3.02][A
  0%|          | 6/5208 [00:02<40:41,  2.13it/s, loss=3.02][A
  0%|          | 6/5208 [00:03<43:33,  1.99it/s, loss=3.02][A
  0%|          | 7/5208 [00:03<37:20,  2.32it/s, loss=3.02][A
  0%|          | 7/5208 [00:03<39:19,  2.20it/s, loss=3.01][A
  0%|  

  1%|          | 65/5208 [00:12<16:40,  5.14it/s, loss=1.73][A
  1%|▏         | 66/5208 [00:12<16:24,  5.22it/s, loss=1.73][A
  1%|▏         | 66/5208 [00:12<16:34,  5.17it/s, loss=1.7] [A
  1%|▏         | 67/5208 [00:12<16:19,  5.25it/s, loss=1.7][A
  1%|▏         | 67/5208 [00:12<16:29,  5.20it/s, loss=1.68][A
  1%|▏         | 68/5208 [00:12<16:14,  5.27it/s, loss=1.68][A
  1%|▏         | 68/5208 [00:13<16:29,  5.19it/s, loss=1.66][A
  1%|▏         | 69/5208 [00:13<16:14,  5.27it/s, loss=1.66][A
  1%|▏         | 69/5208 [00:13<16:29,  5.19it/s, loss=1.64][A
  1%|▏         | 70/5208 [00:13<16:15,  5.27it/s, loss=1.64][A
  1%|▏         | 70/5208 [00:13<16:27,  5.20it/s, loss=1.62][A
  1%|▏         | 71/5208 [00:13<16:13,  5.28it/s, loss=1.62][A
  1%|▏         | 71/5208 [00:13<16:27,  5.20it/s, loss=1.6] [A
  1%|▏         | 72/5208 [00:13<16:14,  5.27it/s, loss=1.6][A
  1%|▏         | 72/5208 [00:13<16:27,  5.20it/s, loss=1.58][A
  1%|▏         | 73/5208 [00:13<16:14,  5.

  3%|▎         | 131/5208 [00:23<15:15,  5.55it/s, loss=0.972][A
  3%|▎         | 131/5208 [00:23<15:23,  5.50it/s, loss=0.968][A
  3%|▎         | 132/5208 [00:23<15:16,  5.54it/s, loss=0.968][A
  3%|▎         | 132/5208 [00:23<15:21,  5.51it/s, loss=0.961][A
  3%|▎         | 133/5208 [00:23<15:14,  5.55it/s, loss=0.961][A
  3%|▎         | 133/5208 [00:24<15:17,  5.53it/s, loss=0.957][A
  3%|▎         | 133/5208 [00:24<15:25,  5.48it/s, loss=0.951][A
  3%|▎         | 135/5208 [00:24<15:11,  5.57it/s, loss=0.951][A
  3%|▎         | 135/5208 [00:24<15:16,  5.53it/s, loss=0.945][A
  3%|▎         | 136/5208 [00:24<15:09,  5.57it/s, loss=0.945][A
  3%|▎         | 136/5208 [00:24<15:17,  5.53it/s, loss=0.941][A
  3%|▎         | 137/5208 [00:24<15:10,  5.57it/s, loss=0.941][A
  3%|▎         | 137/5208 [00:24<15:18,  5.52it/s, loss=0.938][A
  3%|▎         | 138/5208 [00:24<15:11,  5.56it/s, loss=0.938][A
  3%|▎         | 138/5208 [00:24<15:17,  5.53it/s, loss=0.934][A
  3%|▎    

  4%|▎         | 195/5208 [00:33<14:33,  5.74it/s, loss=0.765][A
  4%|▍         | 196/5208 [00:33<14:28,  5.77it/s, loss=0.765][A
  4%|▍         | 196/5208 [00:34<14:33,  5.74it/s, loss=0.765][A
  4%|▍         | 197/5208 [00:34<14:29,  5.76it/s, loss=0.765][A
  4%|▍         | 197/5208 [00:34<14:34,  5.73it/s, loss=0.764][A
  4%|▍         | 198/5208 [00:34<14:29,  5.76it/s, loss=0.764][A
  4%|▍         | 198/5208 [00:34<14:35,  5.72it/s, loss=0.761][A
  4%|▍         | 199/5208 [00:34<14:30,  5.75it/s, loss=0.761][A
  4%|▍         | 199/5208 [00:34<14:36,  5.72it/s, loss=0.761][A
  4%|▍         | 200/5208 [00:34<14:31,  5.75it/s, loss=0.761][A
  4%|▍         | 200/5208 [00:34<14:36,  5.72it/s, loss=0.758][A
  4%|▍         | 201/5208 [00:34<14:31,  5.75it/s, loss=0.758][A
  4%|▍         | 201/5208 [00:35<14:35,  5.72it/s, loss=0.756][A
  4%|▍         | 202/5208 [00:35<14:31,  5.74it/s, loss=0.756][A
  4%|▍         | 202/5208 [00:35<14:34,  5.73it/s, loss=0.754][A
  4%|▍    

  5%|▍         | 258/5208 [00:44<14:19,  5.76it/s, loss=0.697][A
  5%|▍         | 258/5208 [00:44<14:22,  5.74it/s, loss=0.695][A
  5%|▍         | 259/5208 [00:44<14:19,  5.76it/s, loss=0.695][A
  5%|▍         | 259/5208 [00:45<14:21,  5.74it/s, loss=0.692][A
  5%|▍         | 260/5208 [00:45<14:18,  5.77it/s, loss=0.692][A
  5%|▍         | 260/5208 [00:45<14:21,  5.74it/s, loss=0.692][A
  5%|▌         | 261/5208 [00:45<14:18,  5.76it/s, loss=0.692][A
  5%|▌         | 261/5208 [00:45<14:22,  5.74it/s, loss=0.693][A
  5%|▌         | 262/5208 [00:45<14:18,  5.76it/s, loss=0.693][A
  5%|▌         | 262/5208 [00:45<14:22,  5.73it/s, loss=0.693][A
  5%|▌         | 263/5208 [00:45<14:19,  5.75it/s, loss=0.693][A
  5%|▌         | 263/5208 [00:45<14:21,  5.74it/s, loss=0.691][A
  5%|▌         | 263/5208 [00:45<14:24,  5.72it/s, loss=0.689][A
  5%|▌         | 265/5208 [00:45<14:17,  5.77it/s, loss=0.689][A
  5%|▌         | 265/5208 [00:46<14:20,  5.74it/s, loss=0.689][A
  5%|▌    

  6%|▌         | 320/5208 [00:56<14:15,  5.71it/s, loss=0.661][A
  6%|▌         | 321/5208 [00:56<14:12,  5.73it/s, loss=0.661][A
  6%|▌         | 321/5208 [00:56<14:14,  5.72it/s, loss=0.658][A
  6%|▌         | 322/5208 [00:56<14:11,  5.74it/s, loss=0.658][A
  6%|▌         | 322/5208 [00:56<14:14,  5.72it/s, loss=0.658][A
  6%|▌         | 323/5208 [00:56<14:11,  5.74it/s, loss=0.658][A
  6%|▌         | 323/5208 [00:56<14:13,  5.73it/s, loss=0.656][A
  6%|▌         | 324/5208 [00:56<14:10,  5.74it/s, loss=0.656][A
  6%|▌         | 324/5208 [00:56<14:13,  5.72it/s, loss=0.655][A
  6%|▌         | 325/5208 [00:56<14:10,  5.74it/s, loss=0.655][A
  6%|▌         | 325/5208 [00:56<14:13,  5.72it/s, loss=0.656][A
  6%|▋         | 326/5208 [00:56<14:11,  5.74it/s, loss=0.656][A
  6%|▋         | 326/5208 [00:56<14:13,  5.72it/s, loss=0.654][A
  6%|▋         | 327/5208 [00:56<14:10,  5.74it/s, loss=0.654][A
  6%|▋         | 327/5208 [00:57<14:13,  5.72it/s, loss=0.654][A
  6%|▋    

  7%|▋         | 382/5208 [01:07<14:09,  5.68it/s, loss=0.633][A
  7%|▋         | 383/5208 [01:07<14:06,  5.70it/s, loss=0.633][A
  7%|▋         | 383/5208 [01:07<14:10,  5.68it/s, loss=0.634][A
  7%|▋         | 384/5208 [01:07<14:07,  5.69it/s, loss=0.634][A
  7%|▋         | 384/5208 [01:07<14:09,  5.68it/s, loss=0.634][A
  7%|▋         | 385/5208 [01:07<14:07,  5.69it/s, loss=0.634][A
  7%|▋         | 385/5208 [01:07<14:10,  5.67it/s, loss=0.634][A
  7%|▋         | 386/5208 [01:07<14:08,  5.68it/s, loss=0.634][A
  7%|▋         | 386/5208 [01:08<14:10,  5.67it/s, loss=0.632][A
  7%|▋         | 387/5208 [01:08<14:08,  5.68it/s, loss=0.632][A
  7%|▋         | 387/5208 [01:08<14:11,  5.66it/s, loss=0.632][A
  7%|▋         | 388/5208 [01:08<14:08,  5.68it/s, loss=0.632][A
  7%|▋         | 388/5208 [01:08<14:11,  5.66it/s, loss=0.634][A
  7%|▋         | 389/5208 [01:08<14:09,  5.67it/s, loss=0.634][A
  7%|▋         | 389/5208 [01:08<14:12,  5.66it/s, loss=0.634][A
  7%|▋    

  9%|▊         | 445/5208 [01:18<14:00,  5.66it/s, loss=0.627][A
  9%|▊         | 446/5208 [01:18<13:58,  5.68it/s, loss=0.627][A
  9%|▊         | 446/5208 [01:18<14:00,  5.67it/s, loss=0.625][A
  9%|▊         | 447/5208 [01:18<13:58,  5.68it/s, loss=0.625][A
  9%|▊         | 447/5208 [01:18<14:00,  5.67it/s, loss=0.624][A
  9%|▊         | 448/5208 [01:18<13:58,  5.68it/s, loss=0.624][A
  9%|▊         | 448/5208 [01:19<14:00,  5.66it/s, loss=0.623][A
  9%|▊         | 449/5208 [01:19<13:58,  5.68it/s, loss=0.623][A
  9%|▊         | 449/5208 [01:19<13:59,  5.67it/s, loss=0.622][A
  9%|▊         | 450/5208 [01:19<13:57,  5.68it/s, loss=0.622][A
  9%|▊         | 450/5208 [01:19<13:59,  5.67it/s, loss=0.621][A
  9%|▊         | 451/5208 [01:19<13:57,  5.68it/s, loss=0.621][A
  9%|▊         | 451/5208 [01:19<14:00,  5.66it/s, loss=0.623][A
  9%|▊         | 452/5208 [01:19<13:58,  5.67it/s, loss=0.623][A
  9%|▊         | 452/5208 [01:19<14:00,  5.66it/s, loss=0.624][A
  9%|▊    

 10%|▉         | 507/5208 [01:29<13:50,  5.66it/s, loss=0.612][A
 10%|▉         | 508/5208 [01:29<13:48,  5.67it/s, loss=0.612][A
 10%|▉         | 508/5208 [01:29<13:50,  5.66it/s, loss=0.614][A
 10%|▉         | 509/5208 [01:29<13:48,  5.67it/s, loss=0.614][A
 10%|▉         | 509/5208 [01:29<13:49,  5.66it/s, loss=0.614][A
 10%|▉         | 510/5208 [01:29<13:48,  5.67it/s, loss=0.614][A
 10%|▉         | 510/5208 [01:29<13:48,  5.67it/s, loss=0.614][A
 10%|▉         | 510/5208 [01:30<13:50,  5.66it/s, loss=0.613][A
 10%|▉         | 512/5208 [01:30<13:46,  5.68it/s, loss=0.613][A
 10%|▉         | 512/5208 [01:30<13:48,  5.67it/s, loss=0.613][A
 10%|▉         | 513/5208 [01:30<13:46,  5.68it/s, loss=0.613][A
 10%|▉         | 513/5208 [01:30<13:48,  5.67it/s, loss=0.613][A
 10%|▉         | 514/5208 [01:30<13:46,  5.68it/s, loss=0.613][A
 10%|▉         | 514/5208 [01:30<13:48,  5.67it/s, loss=0.613][A
 10%|▉         | 515/5208 [01:30<13:46,  5.68it/s, loss=0.613][A
 10%|▉    

 11%|█         | 571/5208 [01:41<13:41,  5.65it/s, loss=0.977][A
 11%|█         | 572/5208 [01:41<13:39,  5.66it/s, loss=0.977][A
 11%|█         | 572/5208 [01:41<13:40,  5.65it/s, loss=0.973][A
 11%|█         | 573/5208 [01:41<13:39,  5.66it/s, loss=0.973][A
 11%|█         | 573/5208 [01:41<13:40,  5.65it/s, loss=0.966][A
 11%|█         | 574/5208 [01:41<13:38,  5.66it/s, loss=0.966][A
 11%|█         | 574/5208 [01:41<13:39,  5.65it/s, loss=0.959][A
 11%|█         | 575/5208 [01:41<13:38,  5.66it/s, loss=0.959][A
 11%|█         | 575/5208 [01:41<13:39,  5.65it/s, loss=0.958][A
 11%|█         | 576/5208 [01:41<13:38,  5.66it/s, loss=0.958][A
 11%|█         | 576/5208 [01:41<13:39,  5.65it/s, loss=0.954][A
 11%|█         | 577/5208 [01:41<13:37,  5.66it/s, loss=0.954][A
 11%|█         | 577/5208 [01:42<13:38,  5.66it/s, loss=0.947][A
 11%|█         | 578/5208 [01:42<13:37,  5.67it/s, loss=0.947][A
 11%|█         | 578/5208 [01:42<13:38,  5.66it/s, loss=0.94] [A
 11%|█    

 12%|█▏        | 634/5208 [01:52<13:30,  5.65it/s, loss=0.795][A
 12%|█▏        | 634/5208 [01:52<13:31,  5.64it/s, loss=0.793][A
 12%|█▏        | 635/5208 [01:52<13:30,  5.64it/s, loss=0.793][A
 12%|█▏        | 635/5208 [01:52<13:31,  5.64it/s, loss=0.792][A
 12%|█▏        | 636/5208 [01:52<13:30,  5.64it/s, loss=0.792][A
 12%|█▏        | 636/5208 [01:52<13:31,  5.63it/s, loss=0.789][A
 12%|█▏        | 637/5208 [01:52<13:30,  5.64it/s, loss=0.789][A
 12%|█▏        | 637/5208 [01:53<13:31,  5.63it/s, loss=0.785][A
 12%|█▏        | 638/5208 [01:53<13:30,  5.64it/s, loss=0.785][A
 12%|█▏        | 638/5208 [01:53<13:31,  5.63it/s, loss=0.783][A
 12%|█▏        | 639/5208 [01:53<13:29,  5.64it/s, loss=0.783][A
 12%|█▏        | 639/5208 [01:53<13:31,  5.63it/s, loss=0.786][A
 12%|█▏        | 640/5208 [01:53<13:29,  5.64it/s, loss=0.786][A
 12%|█▏        | 640/5208 [01:53<13:30,  5.63it/s, loss=0.785][A
 12%|█▏        | 641/5208 [01:53<13:29,  5.64it/s, loss=0.785][A
 12%|█▏   

 13%|█▎        | 696/5208 [02:04<13:25,  5.60it/s, loss=0.699][A
 13%|█▎        | 696/5208 [02:04<13:27,  5.59it/s, loss=0.701][A
 13%|█▎        | 697/5208 [02:04<13:25,  5.60it/s, loss=0.701][A
 13%|█▎        | 697/5208 [02:04<13:26,  5.59it/s, loss=0.703][A
 13%|█▎        | 698/5208 [02:04<13:25,  5.60it/s, loss=0.703][A
 13%|█▎        | 698/5208 [02:04<13:26,  5.59it/s, loss=0.701][A
 13%|█▎        | 699/5208 [02:04<13:24,  5.60it/s, loss=0.701][A
 13%|█▎        | 699/5208 [02:04<13:25,  5.60it/s, loss=0.702][A
 13%|█▎        | 700/5208 [02:04<13:24,  5.60it/s, loss=0.702][A
 13%|█▎        | 700/5208 [02:05<13:25,  5.60it/s, loss=0.704][A
 13%|█▎        | 701/5208 [02:05<13:24,  5.60it/s, loss=0.704][A
 13%|█▎        | 701/5208 [02:05<13:25,  5.59it/s, loss=0.702][A
 13%|█▎        | 702/5208 [02:05<13:24,  5.60it/s, loss=0.702][A
 13%|█▎        | 702/5208 [02:05<13:25,  5.59it/s, loss=0.701][A
 13%|█▎        | 703/5208 [02:05<13:24,  5.60it/s, loss=0.701][A
 13%|█▎   

 15%|█▍        | 758/5208 [02:15<13:14,  5.60it/s, loss=0.656][A
 15%|█▍        | 759/5208 [02:15<13:13,  5.61it/s, loss=0.656][A
 15%|█▍        | 759/5208 [02:15<13:14,  5.60it/s, loss=0.657][A
 15%|█▍        | 760/5208 [02:15<13:13,  5.61it/s, loss=0.657][A
 15%|█▍        | 760/5208 [02:15<13:14,  5.60it/s, loss=0.656][A
 15%|█▍        | 761/5208 [02:15<13:13,  5.61it/s, loss=0.656][A
 15%|█▍        | 761/5208 [02:15<13:14,  5.60it/s, loss=0.654][A
 15%|█▍        | 762/5208 [02:15<13:12,  5.61it/s, loss=0.654][A
 15%|█▍        | 762/5208 [02:16<13:14,  5.60it/s, loss=0.654][A
 15%|█▍        | 763/5208 [02:16<13:12,  5.61it/s, loss=0.654][A
 15%|█▍        | 763/5208 [02:16<13:14,  5.60it/s, loss=0.654][A
 15%|█▍        | 764/5208 [02:16<13:12,  5.60it/s, loss=0.654][A
 15%|█▍        | 764/5208 [02:16<13:13,  5.60it/s, loss=0.652][A
 15%|█▍        | 765/5208 [02:16<13:12,  5.61it/s, loss=0.652][A
 15%|█▍        | 765/5208 [02:16<13:13,  5.60it/s, loss=0.651][A
 15%|█▍   

 16%|█▌        | 821/5208 [02:26<13:04,  5.59it/s, loss=0.615][A
 16%|█▌        | 822/5208 [02:26<13:03,  5.60it/s, loss=0.615][A
 16%|█▌        | 822/5208 [02:27<13:04,  5.59it/s, loss=0.615][A
 16%|█▌        | 823/5208 [02:27<13:03,  5.60it/s, loss=0.615][A
 16%|█▌        | 823/5208 [02:27<13:04,  5.59it/s, loss=0.613][A
 16%|█▌        | 824/5208 [02:27<13:03,  5.60it/s, loss=0.613][A
 16%|█▌        | 824/5208 [02:27<13:04,  5.59it/s, loss=0.612][A
 16%|█▌        | 825/5208 [02:27<13:03,  5.60it/s, loss=0.612][A
 16%|█▌        | 825/5208 [02:27<13:04,  5.59it/s, loss=0.611][A
 16%|█▌        | 826/5208 [02:27<13:03,  5.60it/s, loss=0.611][A
 16%|█▌        | 826/5208 [02:27<13:04,  5.59it/s, loss=0.611][A
 16%|█▌        | 827/5208 [02:27<13:03,  5.59it/s, loss=0.611][A
 16%|█▌        | 827/5208 [02:28<13:04,  5.59it/s, loss=0.612][A
 16%|█▌        | 828/5208 [02:28<13:03,  5.59it/s, loss=0.612][A
 16%|█▌        | 828/5208 [02:28<13:03,  5.59it/s, loss=0.61] [A
 16%|█▌   

 17%|█▋        | 883/5208 [02:39<13:00,  5.54it/s, loss=0.605][A
 17%|█▋        | 884/5208 [02:39<12:59,  5.55it/s, loss=0.605][A
 17%|█▋        | 884/5208 [02:39<13:00,  5.54it/s, loss=0.603][A
 17%|█▋        | 885/5208 [02:39<12:59,  5.55it/s, loss=0.603][A
 17%|█▋        | 885/5208 [02:39<13:00,  5.54it/s, loss=0.603][A
 17%|█▋        | 886/5208 [02:39<12:59,  5.55it/s, loss=0.603][A
 17%|█▋        | 886/5208 [02:39<13:00,  5.54it/s, loss=0.604][A
 17%|█▋        | 887/5208 [02:39<12:59,  5.54it/s, loss=0.604][A
 17%|█▋        | 887/5208 [02:40<13:00,  5.54it/s, loss=0.606][A
 17%|█▋        | 888/5208 [02:40<12:59,  5.54it/s, loss=0.606][A
 17%|█▋        | 888/5208 [02:40<13:00,  5.54it/s, loss=0.609][A
 17%|█▋        | 889/5208 [02:40<12:59,  5.54it/s, loss=0.609][A
 17%|█▋        | 889/5208 [02:40<13:00,  5.53it/s, loss=0.608][A
 17%|█▋        | 890/5208 [02:40<12:59,  5.54it/s, loss=0.608][A
 17%|█▋        | 890/5208 [02:40<12:59,  5.54it/s, loss=0.607][A
 17%|█▋   

 18%|█▊        | 945/5208 [02:53<13:01,  5.46it/s, loss=0.583][A
 18%|█▊        | 946/5208 [02:53<13:00,  5.46it/s, loss=0.583][A
 18%|█▊        | 946/5208 [02:53<13:01,  5.46it/s, loss=0.582][A
 18%|█▊        | 947/5208 [02:53<13:00,  5.46it/s, loss=0.582][A
 18%|█▊        | 947/5208 [02:53<13:00,  5.46it/s, loss=0.582][A
 18%|█▊        | 948/5208 [02:53<12:59,  5.46it/s, loss=0.582][A
 18%|█▊        | 948/5208 [02:53<13:00,  5.46it/s, loss=0.582][A
 18%|█▊        | 949/5208 [02:53<12:59,  5.46it/s, loss=0.582][A
 18%|█▊        | 949/5208 [02:53<13:00,  5.45it/s, loss=0.581][A
 18%|█▊        | 950/5208 [02:53<12:59,  5.46it/s, loss=0.581][A
 18%|█▊        | 950/5208 [02:54<13:00,  5.46it/s, loss=0.58] [A
 18%|█▊        | 951/5208 [02:54<12:59,  5.46it/s, loss=0.58][A
 18%|█▊        | 951/5208 [02:54<13:00,  5.46it/s, loss=0.58][A
 18%|█▊        | 952/5208 [02:54<12:59,  5.46it/s, loss=0.58][A
 18%|█▊        | 952/5208 [02:54<13:00,  5.46it/s, loss=0.58][A
 18%|█▊       

 19%|█▉        | 1007/5208 [03:05<12:52,  5.44it/s, loss=0.57][A
 19%|█▉        | 1008/5208 [03:05<12:51,  5.44it/s, loss=0.57][A
 19%|█▉        | 1008/5208 [03:05<12:52,  5.44it/s, loss=0.57][A
 19%|█▉        | 1009/5208 [03:05<12:51,  5.44it/s, loss=0.57][A
 19%|█▉        | 1009/5208 [03:05<12:51,  5.44it/s, loss=0.571][A
 19%|█▉        | 1010/5208 [03:05<12:51,  5.44it/s, loss=0.571][A
 19%|█▉        | 1010/5208 [03:05<12:51,  5.44it/s, loss=0.571][A
 19%|█▉        | 1011/5208 [03:05<12:51,  5.44it/s, loss=0.571][A
 19%|█▉        | 1011/5208 [03:05<12:51,  5.44it/s, loss=0.569][A
 19%|█▉        | 1012/5208 [03:05<12:50,  5.44it/s, loss=0.569][A
 19%|█▉        | 1012/5208 [03:06<12:51,  5.44it/s, loss=0.569][A
 19%|█▉        | 1013/5208 [03:06<12:50,  5.44it/s, loss=0.569][A
 19%|█▉        | 1013/5208 [03:06<12:51,  5.44it/s, loss=0.57] [A
 19%|█▉        | 1014/5208 [03:06<12:50,  5.44it/s, loss=0.57][A
 19%|█▉        | 1014/5208 [03:06<12:51,  5.43it/s, loss=0.571][A


 21%|██        | 1068/5208 [03:16<12:43,  5.42it/s, loss=0.566][A
 21%|██        | 1069/5208 [03:16<12:42,  5.43it/s, loss=0.566][A
 21%|██        | 1069/5208 [03:17<12:43,  5.42it/s, loss=0.57] [A
 21%|██        | 1070/5208 [03:17<12:42,  5.43it/s, loss=0.57][A
 21%|██        | 1070/5208 [03:17<12:43,  5.42it/s, loss=0.571][A
 21%|██        | 1071/5208 [03:17<12:42,  5.43it/s, loss=0.571][A
 21%|██        | 1071/5208 [03:17<12:42,  5.42it/s, loss=0.571][A
 21%|██        | 1072/5208 [03:17<12:42,  5.43it/s, loss=0.571][A
 21%|██        | 1072/5208 [03:17<12:42,  5.42it/s, loss=0.573][A
 21%|██        | 1073/5208 [03:17<12:41,  5.43it/s, loss=0.573][A
 21%|██        | 1073/5208 [03:17<12:42,  5.42it/s, loss=0.574][A
 21%|██        | 1074/5208 [03:17<12:42,  5.42it/s, loss=0.574][A
 21%|██        | 1074/5208 [03:18<12:42,  5.42it/s, loss=0.573][A
 21%|██        | 1075/5208 [03:18<12:41,  5.43it/s, loss=0.573][A
 21%|██        | 1075/5208 [03:18<12:42,  5.42it/s, loss=0.573]

 22%|██▏       | 1129/5208 [03:29<12:36,  5.39it/s, loss=0.557][A
 22%|██▏       | 1130/5208 [03:29<12:35,  5.40it/s, loss=0.557][A
 22%|██▏       | 1130/5208 [03:29<12:36,  5.39it/s, loss=0.557][A
 22%|██▏       | 1131/5208 [03:29<12:35,  5.39it/s, loss=0.557][A
 22%|██▏       | 1131/5208 [03:29<12:36,  5.39it/s, loss=0.557][A
 22%|██▏       | 1132/5208 [03:29<12:36,  5.39it/s, loss=0.557][A
 22%|██▏       | 1132/5208 [03:30<12:36,  5.39it/s, loss=0.556][A
 22%|██▏       | 1133/5208 [03:30<12:35,  5.39it/s, loss=0.556][A
 22%|██▏       | 1133/5208 [03:30<12:36,  5.39it/s, loss=0.556][A
 22%|██▏       | 1134/5208 [03:30<12:35,  5.39it/s, loss=0.556][A
 22%|██▏       | 1134/5208 [03:30<12:35,  5.39it/s, loss=0.555][A
 22%|██▏       | 1135/5208 [03:30<12:35,  5.39it/s, loss=0.555][A
 22%|██▏       | 1135/5208 [03:30<12:35,  5.39it/s, loss=0.555][A
 22%|██▏       | 1136/5208 [03:30<12:35,  5.39it/s, loss=0.555][A
 22%|██▏       | 1136/5208 [03:30<12:35,  5.39it/s, loss=0.555

 23%|██▎       | 1190/5208 [03:43<12:33,  5.33it/s, loss=0.546][A
 23%|██▎       | 1191/5208 [03:43<12:32,  5.34it/s, loss=0.546][A
 23%|██▎       | 1191/5208 [03:43<12:33,  5.33it/s, loss=0.548][A
 23%|██▎       | 1192/5208 [03:43<12:32,  5.34it/s, loss=0.548][A
 23%|██▎       | 1192/5208 [03:43<12:32,  5.33it/s, loss=0.548][A
 23%|██▎       | 1193/5208 [03:43<12:32,  5.34it/s, loss=0.548][A
 23%|██▎       | 1193/5208 [03:43<12:32,  5.33it/s, loss=0.548][A
 23%|██▎       | 1194/5208 [03:43<12:32,  5.34it/s, loss=0.548][A
 23%|██▎       | 1194/5208 [03:43<12:32,  5.33it/s, loss=0.549][A
 23%|██▎       | 1195/5208 [03:43<12:31,  5.34it/s, loss=0.549][A
 23%|██▎       | 1195/5208 [03:44<12:32,  5.33it/s, loss=0.548][A
 23%|██▎       | 1196/5208 [03:44<12:31,  5.34it/s, loss=0.548][A
 23%|██▎       | 1196/5208 [03:44<12:32,  5.33it/s, loss=0.548][A
 23%|██▎       | 1197/5208 [03:44<12:31,  5.34it/s, loss=0.548][A
 23%|██▎       | 1197/5208 [03:44<12:32,  5.33it/s, loss=0.548

 24%|██▍       | 1251/5208 [03:55<12:24,  5.32it/s, loss=0.545][A
 24%|██▍       | 1252/5208 [03:55<12:23,  5.32it/s, loss=0.545][A
 24%|██▍       | 1252/5208 [03:55<12:24,  5.32it/s, loss=0.545][A
 24%|██▍       | 1253/5208 [03:55<12:23,  5.32it/s, loss=0.545][A
 24%|██▍       | 1253/5208 [03:55<12:23,  5.32it/s, loss=0.544][A
 24%|██▍       | 1254/5208 [03:55<12:23,  5.32it/s, loss=0.544][A
 24%|██▍       | 1254/5208 [03:55<12:23,  5.32it/s, loss=0.543][A
 24%|██▍       | 1255/5208 [03:55<12:23,  5.32it/s, loss=0.543][A
 24%|██▍       | 1255/5208 [03:56<12:23,  5.32it/s, loss=0.543][A
 24%|██▍       | 1256/5208 [03:56<12:22,  5.32it/s, loss=0.543][A
 24%|██▍       | 1256/5208 [03:56<12:23,  5.32it/s, loss=0.543][A
 24%|██▍       | 1257/5208 [03:56<12:22,  5.32it/s, loss=0.543][A
 24%|██▍       | 1257/5208 [03:56<12:23,  5.32it/s, loss=0.543][A
 24%|██▍       | 1258/5208 [03:56<12:22,  5.32it/s, loss=0.543][A
 24%|██▍       | 1258/5208 [03:56<12:22,  5.32it/s, loss=0.543

 25%|██▌       | 1312/5208 [04:07<12:15,  5.30it/s, loss=0.535][A
 25%|██▌       | 1313/5208 [04:07<12:14,  5.30it/s, loss=0.535][A
 25%|██▌       | 1313/5208 [04:07<12:15,  5.30it/s, loss=0.535][A
 25%|██▌       | 1314/5208 [04:07<12:14,  5.30it/s, loss=0.535][A
 25%|██▌       | 1314/5208 [04:08<12:15,  5.30it/s, loss=0.536][A
 25%|██▌       | 1315/5208 [04:08<12:14,  5.30it/s, loss=0.536][A
 25%|██▌       | 1315/5208 [04:08<12:15,  5.30it/s, loss=0.536][A
 25%|██▌       | 1316/5208 [04:08<12:14,  5.30it/s, loss=0.536][A
 25%|██▌       | 1316/5208 [04:08<12:15,  5.29it/s, loss=0.538][A
 25%|██▌       | 1317/5208 [04:08<12:14,  5.30it/s, loss=0.538][A
 25%|██▌       | 1317/5208 [04:08<12:14,  5.30it/s, loss=0.54] [A
 25%|██▌       | 1318/5208 [04:08<12:13,  5.30it/s, loss=0.54][A
 25%|██▌       | 1318/5208 [04:08<12:14,  5.30it/s, loss=0.539][A
 25%|██▌       | 1319/5208 [04:08<12:13,  5.30it/s, loss=0.539][A
 25%|██▌       | 1319/5208 [04:08<12:14,  5.30it/s, loss=0.538]

 26%|██▋       | 1373/5208 [04:19<12:04,  5.29it/s, loss=0.532][A
 26%|██▋       | 1374/5208 [04:19<12:04,  5.30it/s, loss=0.532][A
 26%|██▋       | 1374/5208 [04:19<12:04,  5.29it/s, loss=0.532][A
 26%|██▋       | 1375/5208 [04:19<12:03,  5.29it/s, loss=0.532][A
 26%|██▋       | 1375/5208 [04:19<12:04,  5.29it/s, loss=0.533][A
 26%|██▋       | 1376/5208 [04:19<12:03,  5.29it/s, loss=0.533][A
 26%|██▋       | 1376/5208 [04:20<12:04,  5.29it/s, loss=0.532][A
 26%|██▋       | 1377/5208 [04:20<12:03,  5.29it/s, loss=0.532][A
 26%|██▋       | 1377/5208 [04:20<12:04,  5.29it/s, loss=0.531][A
 26%|██▋       | 1378/5208 [04:20<12:03,  5.29it/s, loss=0.531][A
 26%|██▋       | 1378/5208 [04:20<12:04,  5.29it/s, loss=0.532][A
 26%|██▋       | 1379/5208 [04:20<12:03,  5.29it/s, loss=0.532][A
 26%|██▋       | 1379/5208 [04:20<12:04,  5.29it/s, loss=0.533][A
 26%|██▋       | 1380/5208 [04:20<12:03,  5.29it/s, loss=0.533][A
 26%|██▋       | 1380/5208 [04:21<12:04,  5.29it/s, loss=0.534

 28%|██▊       | 1434/5208 [04:32<11:56,  5.27it/s, loss=0.529][A
 28%|██▊       | 1435/5208 [04:32<11:56,  5.27it/s, loss=0.529][A
 28%|██▊       | 1435/5208 [04:32<11:56,  5.26it/s, loss=0.528][A
 28%|██▊       | 1436/5208 [04:32<11:55,  5.27it/s, loss=0.528][A
 28%|██▊       | 1436/5208 [04:32<11:56,  5.26it/s, loss=0.529][A
 28%|██▊       | 1437/5208 [04:32<11:55,  5.27it/s, loss=0.529][A
 28%|██▊       | 1437/5208 [04:33<11:56,  5.26it/s, loss=0.529][A
 28%|██▊       | 1438/5208 [04:33<11:55,  5.27it/s, loss=0.529][A
 28%|██▊       | 1438/5208 [04:33<11:56,  5.26it/s, loss=0.53] [A
 28%|██▊       | 1439/5208 [04:33<11:55,  5.27it/s, loss=0.53][A
 28%|██▊       | 1439/5208 [04:33<11:55,  5.26it/s, loss=0.528][A
 28%|██▊       | 1440/5208 [04:33<11:55,  5.27it/s, loss=0.528][A
 28%|██▊       | 1440/5208 [04:33<11:55,  5.27it/s, loss=0.528][A
 28%|██▊       | 1441/5208 [04:33<11:54,  5.27it/s, loss=0.528][A
 28%|██▊       | 1441/5208 [04:33<11:55,  5.27it/s, loss=0.526]

 29%|██▊       | 1495/5208 [04:44<11:46,  5.25it/s, loss=0.524][A
 29%|██▊       | 1496/5208 [04:44<11:46,  5.26it/s, loss=0.524][A
 29%|██▊       | 1496/5208 [04:44<11:46,  5.25it/s, loss=0.526][A
 29%|██▊       | 1497/5208 [04:44<11:45,  5.26it/s, loss=0.526][A
 29%|██▊       | 1497/5208 [04:44<11:46,  5.25it/s, loss=0.525][A
 29%|██▉       | 1498/5208 [04:44<11:45,  5.26it/s, loss=0.525][A
 29%|██▉       | 1498/5208 [04:45<11:46,  5.25it/s, loss=0.526][A
 29%|██▉       | 1499/5208 [04:45<11:45,  5.26it/s, loss=0.526][A
 29%|██▉       | 1499/5208 [04:45<11:46,  5.25it/s, loss=0.526][A
 29%|██▉       | 1500/5208 [04:45<11:45,  5.26it/s, loss=0.526][A
 29%|██▉       | 1500/5208 [04:45<11:45,  5.25it/s, loss=0.527][A
 29%|██▉       | 1501/5208 [04:45<11:45,  5.26it/s, loss=0.527][A
 29%|██▉       | 1501/5208 [04:45<11:45,  5.25it/s, loss=0.529][A
 29%|██▉       | 1502/5208 [04:45<11:45,  5.26it/s, loss=0.529][A
 29%|██▉       | 1502/5208 [04:46<11:45,  5.25it/s, loss=0.529

 30%|██▉       | 1556/5208 [04:58<11:39,  5.22it/s, loss=0.524][A
 30%|██▉       | 1557/5208 [04:58<11:39,  5.22it/s, loss=0.524][A
 30%|██▉       | 1557/5208 [04:58<11:39,  5.22it/s, loss=0.524][A
 30%|██▉       | 1558/5208 [04:58<11:38,  5.22it/s, loss=0.524][A
 30%|██▉       | 1558/5208 [04:58<11:39,  5.22it/s, loss=0.524][A
 30%|██▉       | 1559/5208 [04:58<11:38,  5.22it/s, loss=0.524][A
 30%|██▉       | 1559/5208 [04:58<11:39,  5.22it/s, loss=0.524][A
 30%|██▉       | 1560/5208 [04:58<11:38,  5.22it/s, loss=0.524][A
 30%|██▉       | 1560/5208 [04:59<11:39,  5.22it/s, loss=0.525][A
 30%|██▉       | 1561/5208 [04:59<11:38,  5.22it/s, loss=0.525][A
 30%|██▉       | 1561/5208 [04:59<11:39,  5.22it/s, loss=0.524][A
 30%|██▉       | 1562/5208 [04:59<11:38,  5.22it/s, loss=0.524][A
 30%|██▉       | 1562/5208 [04:59<11:39,  5.21it/s, loss=0.524][A
 30%|███       | 1563/5208 [04:59<11:38,  5.22it/s, loss=0.524][A
 30%|███       | 1563/5208 [04:59<11:38,  5.22it/s, loss=0.524

 31%|███       | 1617/5208 [05:10<11:29,  5.21it/s, loss=0.52][A
 31%|███       | 1618/5208 [05:10<11:29,  5.21it/s, loss=0.52][A
 31%|███       | 1618/5208 [05:10<11:29,  5.21it/s, loss=0.519][A
 31%|███       | 1619/5208 [05:10<11:29,  5.21it/s, loss=0.519][A
 31%|███       | 1619/5208 [05:11<11:29,  5.20it/s, loss=0.519][A
 31%|███       | 1620/5208 [05:11<11:29,  5.21it/s, loss=0.519][A
 31%|███       | 1620/5208 [05:11<11:29,  5.20it/s, loss=0.518][A
 31%|███       | 1621/5208 [05:11<11:28,  5.21it/s, loss=0.518][A
 31%|███       | 1621/5208 [05:11<11:29,  5.20it/s, loss=0.52] [A
 31%|███       | 1622/5208 [05:11<11:28,  5.21it/s, loss=0.52][A
 31%|███       | 1622/5208 [05:11<11:29,  5.20it/s, loss=0.52][A
 31%|███       | 1623/5208 [05:11<11:28,  5.21it/s, loss=0.52][A
 31%|███       | 1623/5208 [05:12<11:29,  5.20it/s, loss=0.519][A
 31%|███       | 1624/5208 [05:12<11:28,  5.20it/s, loss=0.519][A
 31%|███       | 1624/5208 [05:12<11:28,  5.20it/s, loss=0.518][A


 32%|███▏      | 1678/5208 [05:23<11:20,  5.19it/s, loss=0.514][A
 32%|███▏      | 1679/5208 [05:23<11:19,  5.19it/s, loss=0.514][A
 32%|███▏      | 1679/5208 [05:23<11:20,  5.19it/s, loss=0.515][A
 32%|███▏      | 1680/5208 [05:23<11:19,  5.19it/s, loss=0.515][A
 32%|███▏      | 1680/5208 [05:24<11:20,  5.18it/s, loss=0.515][A
 32%|███▏      | 1681/5208 [05:24<11:19,  5.19it/s, loss=0.515][A
 32%|███▏      | 1681/5208 [05:24<11:20,  5.18it/s, loss=0.517][A
 32%|███▏      | 1682/5208 [05:24<11:19,  5.19it/s, loss=0.517][A
 32%|███▏      | 1682/5208 [05:24<11:20,  5.18it/s, loss=0.518][A
 32%|███▏      | 1683/5208 [05:24<11:19,  5.19it/s, loss=0.518][A
 32%|███▏      | 1683/5208 [05:24<11:20,  5.18it/s, loss=0.518][A
 32%|███▏      | 1684/5208 [05:24<11:19,  5.18it/s, loss=0.518][A
 32%|███▏      | 1684/5208 [05:25<11:20,  5.18it/s, loss=0.518][A
 32%|███▏      | 1685/5208 [05:25<11:19,  5.18it/s, loss=0.518][A
 32%|███▏      | 1685/5208 [05:25<11:20,  5.18it/s, loss=0.519

 33%|███▎      | 1739/5208 [05:37<11:12,  5.16it/s, loss=0.524][A
 33%|███▎      | 1740/5208 [05:37<11:11,  5.16it/s, loss=0.524][A
 33%|███▎      | 1740/5208 [05:37<11:12,  5.16it/s, loss=0.523][A
 33%|███▎      | 1741/5208 [05:37<11:11,  5.16it/s, loss=0.523][A
 33%|███▎      | 1741/5208 [05:37<11:11,  5.16it/s, loss=0.523][A
 33%|███▎      | 1742/5208 [05:37<11:11,  5.16it/s, loss=0.523][A
 33%|███▎      | 1742/5208 [05:37<11:11,  5.16it/s, loss=0.523][A
 33%|███▎      | 1743/5208 [05:37<11:11,  5.16it/s, loss=0.523][A
 33%|███▎      | 1743/5208 [05:37<11:11,  5.16it/s, loss=0.522][A
 33%|███▎      | 1744/5208 [05:37<11:10,  5.16it/s, loss=0.522][A
 33%|███▎      | 1744/5208 [05:37<11:11,  5.16it/s, loss=0.521][A
 34%|███▎      | 1745/5208 [05:37<11:10,  5.16it/s, loss=0.521][A
 34%|███▎      | 1745/5208 [05:38<11:11,  5.16it/s, loss=0.522][A
 34%|███▎      | 1746/5208 [05:38<11:10,  5.16it/s, loss=0.522][A
 34%|███▎      | 1746/5208 [05:38<11:10,  5.16it/s, loss=0.521

 35%|███▍      | 1800/5208 [05:50<11:03,  5.14it/s, loss=0.516][A
 35%|███▍      | 1801/5208 [05:50<11:03,  5.14it/s, loss=0.516][A
 35%|███▍      | 1801/5208 [05:50<11:03,  5.14it/s, loss=0.516][A
 35%|███▍      | 1802/5208 [05:50<11:02,  5.14it/s, loss=0.516][A
 35%|███▍      | 1802/5208 [05:50<11:03,  5.13it/s, loss=0.515][A
 35%|███▍      | 1803/5208 [05:50<11:02,  5.14it/s, loss=0.515][A
 35%|███▍      | 1803/5208 [05:51<11:03,  5.13it/s, loss=0.515][A
 35%|███▍      | 1804/5208 [05:51<11:02,  5.14it/s, loss=0.515][A
 35%|███▍      | 1804/5208 [05:51<11:03,  5.13it/s, loss=0.515][A
 35%|███▍      | 1805/5208 [05:51<11:02,  5.13it/s, loss=0.515][A
 35%|███▍      | 1805/5208 [05:51<11:03,  5.13it/s, loss=0.514][A
 35%|███▍      | 1806/5208 [05:51<11:02,  5.13it/s, loss=0.514][A
 35%|███▍      | 1806/5208 [05:52<11:03,  5.13it/s, loss=0.513][A
 35%|███▍      | 1807/5208 [05:52<11:02,  5.13it/s, loss=0.513][A
 35%|███▍      | 1807/5208 [05:52<11:03,  5.13it/s, loss=0.514

 36%|███▌      | 1861/5208 [06:04<10:55,  5.11it/s, loss=0.511][A
 36%|███▌      | 1862/5208 [06:04<10:54,  5.11it/s, loss=0.511][A
 36%|███▌      | 1862/5208 [06:04<10:55,  5.11it/s, loss=0.51] [A
 36%|███▌      | 1863/5208 [06:04<10:54,  5.11it/s, loss=0.51][A
 36%|███▌      | 1863/5208 [06:04<10:54,  5.11it/s, loss=0.511][A
 36%|███▌      | 1864/5208 [06:04<10:54,  5.11it/s, loss=0.511][A
 36%|███▌      | 1864/5208 [06:04<10:54,  5.11it/s, loss=0.511][A
 36%|███▌      | 1865/5208 [06:04<10:54,  5.11it/s, loss=0.511][A
 36%|███▌      | 1865/5208 [06:05<10:54,  5.11it/s, loss=0.511][A
 36%|███▌      | 1866/5208 [06:05<10:54,  5.11it/s, loss=0.511][A
 36%|███▌      | 1866/5208 [06:05<10:54,  5.11it/s, loss=0.511][A
 36%|███▌      | 1867/5208 [06:05<10:53,  5.11it/s, loss=0.511][A
 36%|███▌      | 1867/5208 [06:05<10:54,  5.11it/s, loss=0.511][A
 36%|███▌      | 1868/5208 [06:05<10:53,  5.11it/s, loss=0.511][A
 36%|███▌      | 1868/5208 [06:05<10:54,  5.11it/s, loss=0.51] 

 37%|███▋      | 1922/5208 [06:18<10:47,  5.08it/s, loss=0.512][A
 37%|███▋      | 1923/5208 [06:18<10:46,  5.08it/s, loss=0.512][A
 37%|███▋      | 1923/5208 [06:18<10:46,  5.08it/s, loss=0.512][A
 37%|███▋      | 1924/5208 [06:18<10:46,  5.08it/s, loss=0.512][A
 37%|███▋      | 1924/5208 [06:18<10:46,  5.08it/s, loss=0.513][A
 37%|███▋      | 1925/5208 [06:18<10:46,  5.08it/s, loss=0.513][A
 37%|███▋      | 1925/5208 [06:19<10:46,  5.08it/s, loss=0.513][A
 37%|███▋      | 1926/5208 [06:19<10:46,  5.08it/s, loss=0.513][A
 37%|███▋      | 1926/5208 [06:19<10:46,  5.08it/s, loss=0.512][A
 37%|███▋      | 1927/5208 [06:19<10:45,  5.08it/s, loss=0.512][A
 37%|███▋      | 1927/5208 [06:19<10:46,  5.08it/s, loss=0.513][A
 37%|███▋      | 1928/5208 [06:19<10:45,  5.08it/s, loss=0.513][A
 37%|███▋      | 1928/5208 [06:19<10:46,  5.08it/s, loss=0.511][A
 37%|███▋      | 1929/5208 [06:19<10:45,  5.08it/s, loss=0.511][A
 37%|███▋      | 1929/5208 [06:19<10:45,  5.08it/s, loss=0.511

 38%|███▊      | 1983/5208 [06:32<10:37,  5.06it/s, loss=0.509][A
 38%|███▊      | 1984/5208 [06:32<10:37,  5.06it/s, loss=0.509][A
 38%|███▊      | 1984/5208 [06:32<10:37,  5.06it/s, loss=0.509][A
 38%|███▊      | 1985/5208 [06:32<10:37,  5.06it/s, loss=0.509][A
 38%|███▊      | 1985/5208 [06:32<10:37,  5.05it/s, loss=0.509][A
 38%|███▊      | 1986/5208 [06:32<10:37,  5.06it/s, loss=0.509][A
 38%|███▊      | 1986/5208 [06:32<10:37,  5.05it/s, loss=0.508][A
 38%|███▊      | 1987/5208 [06:32<10:36,  5.06it/s, loss=0.508][A
 38%|███▊      | 1987/5208 [06:33<10:37,  5.05it/s, loss=0.509][A
 38%|███▊      | 1988/5208 [06:33<10:36,  5.06it/s, loss=0.509][A
 38%|███▊      | 1988/5208 [06:33<10:37,  5.05it/s, loss=0.508][A
 38%|███▊      | 1989/5208 [06:33<10:36,  5.06it/s, loss=0.508][A
 38%|███▊      | 1989/5208 [06:33<10:37,  5.05it/s, loss=0.508][A
 38%|███▊      | 1990/5208 [06:33<10:36,  5.06it/s, loss=0.508][A
 38%|███▊      | 1990/5208 [06:33<10:36,  5.05it/s, loss=0.508

 39%|███▉      | 2044/5208 [06:46<10:28,  5.03it/s, loss=0.513][A
 39%|███▉      | 2045/5208 [06:46<10:28,  5.03it/s, loss=0.513][A
 39%|███▉      | 2045/5208 [06:46<10:28,  5.03it/s, loss=0.514][A
 39%|███▉      | 2046/5208 [06:46<10:28,  5.03it/s, loss=0.514][A
 39%|███▉      | 2046/5208 [06:46<10:28,  5.03it/s, loss=0.515][A
 39%|███▉      | 2047/5208 [06:46<10:28,  5.03it/s, loss=0.515][A
 39%|███▉      | 2047/5208 [06:46<10:28,  5.03it/s, loss=0.516][A
 39%|███▉      | 2048/5208 [06:46<10:27,  5.03it/s, loss=0.516][A
 39%|███▉      | 2048/5208 [06:47<10:28,  5.03it/s, loss=0.516][A
 39%|███▉      | 2049/5208 [06:47<10:27,  5.03it/s, loss=0.516][A
 39%|███▉      | 2049/5208 [06:47<10:28,  5.03it/s, loss=0.517][A
 39%|███▉      | 2050/5208 [06:47<10:27,  5.03it/s, loss=0.517][A
 39%|███▉      | 2050/5208 [06:47<10:28,  5.03it/s, loss=0.517][A
 39%|███▉      | 2051/5208 [06:47<10:27,  5.03it/s, loss=0.517][A
 39%|███▉      | 2051/5208 [06:48<10:28,  5.03it/s, loss=0.515

 40%|████      | 2105/5208 [07:00<10:19,  5.01it/s, loss=0.505][A
 40%|████      | 2106/5208 [07:00<10:19,  5.01it/s, loss=0.505][A
 40%|████      | 2106/5208 [07:00<10:19,  5.01it/s, loss=0.505][A
 40%|████      | 2107/5208 [07:00<10:19,  5.01it/s, loss=0.505][A
 40%|████      | 2107/5208 [07:00<10:19,  5.01it/s, loss=0.504][A
 40%|████      | 2108/5208 [07:00<10:18,  5.01it/s, loss=0.504][A
 40%|████      | 2108/5208 [07:01<10:19,  5.01it/s, loss=0.505][A
 40%|████      | 2109/5208 [07:01<10:18,  5.01it/s, loss=0.505][A
 40%|████      | 2109/5208 [07:01<10:19,  5.00it/s, loss=0.505][A
 41%|████      | 2110/5208 [07:01<10:18,  5.01it/s, loss=0.505][A
 41%|████      | 2110/5208 [07:01<10:19,  5.00it/s, loss=0.505][A
 41%|████      | 2111/5208 [07:01<10:18,  5.01it/s, loss=0.505][A
 41%|████      | 2111/5208 [07:01<10:18,  5.00it/s, loss=0.505][A
 41%|████      | 2112/5208 [07:01<10:18,  5.01it/s, loss=0.505][A
 41%|████      | 2112/5208 [07:02<10:18,  5.00it/s, loss=0.504

 42%|████▏     | 2166/5208 [07:14<10:09,  4.99it/s, loss=0.505][A
 42%|████▏     | 2167/5208 [07:14<10:09,  4.99it/s, loss=0.505][A
 42%|████▏     | 2167/5208 [07:14<10:09,  4.99it/s, loss=0.504][A
 42%|████▏     | 2168/5208 [07:14<10:09,  4.99it/s, loss=0.504][A
 42%|████▏     | 2168/5208 [07:14<10:09,  4.99it/s, loss=0.505][A
 42%|████▏     | 2169/5208 [07:14<10:09,  4.99it/s, loss=0.505][A
 42%|████▏     | 2169/5208 [07:14<10:09,  4.99it/s, loss=0.504][A
 42%|████▏     | 2170/5208 [07:14<10:08,  4.99it/s, loss=0.504][A
 42%|████▏     | 2170/5208 [07:15<10:09,  4.99it/s, loss=0.505][A
 42%|████▏     | 2171/5208 [07:15<10:08,  4.99it/s, loss=0.505][A
 42%|████▏     | 2171/5208 [07:15<10:09,  4.99it/s, loss=0.505][A
 42%|████▏     | 2172/5208 [07:15<10:08,  4.99it/s, loss=0.505][A
 42%|████▏     | 2172/5208 [07:15<10:08,  4.99it/s, loss=0.506][A
 42%|████▏     | 2173/5208 [07:15<10:08,  4.99it/s, loss=0.506][A
 42%|████▏     | 2173/5208 [07:15<10:08,  4.98it/s, loss=0.506

 43%|████▎     | 2227/5208 [07:28<10:00,  4.97it/s, loss=0.503][A
 43%|████▎     | 2228/5208 [07:28<09:59,  4.97it/s, loss=0.503][A
 43%|████▎     | 2228/5208 [07:28<09:59,  4.97it/s, loss=0.503][A
 43%|████▎     | 2229/5208 [07:28<09:59,  4.97it/s, loss=0.503][A
 43%|████▎     | 2229/5208 [07:28<09:59,  4.97it/s, loss=0.503][A
 43%|████▎     | 2230/5208 [07:28<09:59,  4.97it/s, loss=0.503][A
 43%|████▎     | 2230/5208 [07:28<09:59,  4.97it/s, loss=0.503][A
 43%|████▎     | 2231/5208 [07:28<09:59,  4.97it/s, loss=0.503][A
 43%|████▎     | 2231/5208 [07:29<09:59,  4.97it/s, loss=0.502][A
 43%|████▎     | 2232/5208 [07:29<09:59,  4.97it/s, loss=0.502][A
 43%|████▎     | 2232/5208 [07:29<09:59,  4.96it/s, loss=0.503][A
 43%|████▎     | 2233/5208 [07:29<09:58,  4.97it/s, loss=0.503][A
 43%|████▎     | 2233/5208 [07:29<09:59,  4.96it/s, loss=0.504][A
 43%|████▎     | 2234/5208 [07:29<09:58,  4.97it/s, loss=0.504][A
 43%|████▎     | 2234/5208 [07:30<09:59,  4.96it/s, loss=0.504

 44%|████▍     | 2288/5208 [07:43<09:51,  4.94it/s, loss=0.5][A
 44%|████▍     | 2289/5208 [07:43<09:50,  4.94it/s, loss=0.5][A
 44%|████▍     | 2289/5208 [07:43<09:51,  4.94it/s, loss=0.499][A
 44%|████▍     | 2290/5208 [07:43<09:50,  4.94it/s, loss=0.499][A
 44%|████▍     | 2290/5208 [07:43<09:51,  4.94it/s, loss=0.499][A
 44%|████▍     | 2291/5208 [07:43<09:50,  4.94it/s, loss=0.499][A
 44%|████▍     | 2291/5208 [07:44<09:51,  4.94it/s, loss=0.497][A
 44%|████▍     | 2292/5208 [07:44<09:50,  4.94it/s, loss=0.497][A
 44%|████▍     | 2292/5208 [07:44<09:50,  4.94it/s, loss=0.498][A
 44%|████▍     | 2293/5208 [07:44<09:50,  4.94it/s, loss=0.498][A
 44%|████▍     | 2293/5208 [07:44<09:50,  4.94it/s, loss=0.498][A
 44%|████▍     | 2294/5208 [07:44<09:50,  4.94it/s, loss=0.498][A
 44%|████▍     | 2294/5208 [07:44<09:50,  4.93it/s, loss=0.497][A
 44%|████▍     | 2295/5208 [07:44<09:50,  4.94it/s, loss=0.497][A
 44%|████▍     | 2295/5208 [07:45<09:50,  4.93it/s, loss=0.496][A

 45%|████▌     | 2349/5208 [08:01<09:46,  4.87it/s, loss=0.505][A
 45%|████▌     | 2350/5208 [08:01<09:46,  4.88it/s, loss=0.505][A
 45%|████▌     | 2350/5208 [08:02<09:46,  4.87it/s, loss=0.504][A
 45%|████▌     | 2351/5208 [08:02<09:45,  4.88it/s, loss=0.504][A
 45%|████▌     | 2351/5208 [08:02<09:46,  4.87it/s, loss=0.503][A
 45%|████▌     | 2352/5208 [08:02<09:45,  4.88it/s, loss=0.503][A
 45%|████▌     | 2352/5208 [08:02<09:45,  4.87it/s, loss=0.503][A
 45%|████▌     | 2353/5208 [08:02<09:45,  4.88it/s, loss=0.503][A
 45%|████▌     | 2353/5208 [08:02<09:45,  4.87it/s, loss=0.503][A
 45%|████▌     | 2354/5208 [08:02<09:45,  4.88it/s, loss=0.503][A
 45%|████▌     | 2354/5208 [08:03<09:45,  4.87it/s, loss=0.503][A
 45%|████▌     | 2355/5208 [08:03<09:45,  4.88it/s, loss=0.503][A
 45%|████▌     | 2355/5208 [08:03<09:45,  4.87it/s, loss=0.503][A
 45%|████▌     | 2356/5208 [08:03<09:45,  4.87it/s, loss=0.503][A
 45%|████▌     | 2356/5208 [08:03<09:45,  4.87it/s, loss=0.501

 46%|████▋     | 2410/5208 [08:17<09:37,  4.85it/s, loss=0.503][A
 46%|████▋     | 2411/5208 [08:17<09:36,  4.85it/s, loss=0.503][A
 46%|████▋     | 2411/5208 [08:17<09:37,  4.85it/s, loss=0.502][A
 46%|████▋     | 2412/5208 [08:17<09:36,  4.85it/s, loss=0.502][A
 46%|████▋     | 2412/5208 [08:17<09:37,  4.85it/s, loss=0.501][A
 46%|████▋     | 2413/5208 [08:17<09:36,  4.85it/s, loss=0.501][A
 46%|████▋     | 2413/5208 [08:18<09:36,  4.84it/s, loss=0.5]  [A
 46%|████▋     | 2414/5208 [08:18<09:36,  4.85it/s, loss=0.5][A
 46%|████▋     | 2414/5208 [08:18<09:36,  4.84it/s, loss=0.5][A
 46%|████▋     | 2415/5208 [08:18<09:36,  4.85it/s, loss=0.5][A
 46%|████▋     | 2415/5208 [08:18<09:36,  4.84it/s, loss=0.499][A
 46%|████▋     | 2416/5208 [08:18<09:36,  4.85it/s, loss=0.499][A
 46%|████▋     | 2416/5208 [08:18<09:36,  4.84it/s, loss=0.499][A
 46%|████▋     | 2417/5208 [08:18<09:35,  4.85it/s, loss=0.499][A
 46%|████▋     | 2417/5208 [08:19<09:36,  4.84it/s, loss=0.498][A
 

 47%|████▋     | 2471/5208 [08:32<09:27,  4.82it/s, loss=0.497][A
 47%|████▋     | 2472/5208 [08:32<09:26,  4.83it/s, loss=0.497][A
 47%|████▋     | 2472/5208 [08:32<09:27,  4.82it/s, loss=0.497][A
 47%|████▋     | 2473/5208 [08:32<09:26,  4.83it/s, loss=0.497][A
 47%|████▋     | 2473/5208 [08:32<09:27,  4.82it/s, loss=0.497][A
 48%|████▊     | 2474/5208 [08:32<09:26,  4.83it/s, loss=0.497][A
 48%|████▊     | 2474/5208 [08:32<09:26,  4.82it/s, loss=0.497][A
 48%|████▊     | 2475/5208 [08:32<09:26,  4.83it/s, loss=0.497][A
 48%|████▊     | 2475/5208 [08:33<09:26,  4.82it/s, loss=0.496][A
 48%|████▊     | 2476/5208 [08:33<09:26,  4.83it/s, loss=0.496][A
 48%|████▊     | 2476/5208 [08:33<09:26,  4.82it/s, loss=0.496][A
 48%|████▊     | 2477/5208 [08:33<09:25,  4.83it/s, loss=0.496][A
 48%|████▊     | 2477/5208 [08:33<09:26,  4.82it/s, loss=0.497][A
 48%|████▊     | 2478/5208 [08:33<09:25,  4.83it/s, loss=0.497][A
 48%|████▊     | 2478/5208 [08:33<09:25,  4.82it/s, loss=0.498

 49%|████▊     | 2532/5208 [08:46<09:16,  4.81it/s, loss=0.496][A
 49%|████▊     | 2533/5208 [08:46<09:16,  4.81it/s, loss=0.496][A
 49%|████▊     | 2533/5208 [08:47<09:16,  4.81it/s, loss=0.497][A
 49%|████▊     | 2534/5208 [08:47<09:16,  4.81it/s, loss=0.497][A
 49%|████▊     | 2534/5208 [08:47<09:16,  4.80it/s, loss=0.497][A
 49%|████▊     | 2535/5208 [08:47<09:16,  4.81it/s, loss=0.497][A
 49%|████▊     | 2535/5208 [08:47<09:16,  4.81it/s, loss=0.499][A
 49%|████▊     | 2536/5208 [08:47<09:15,  4.81it/s, loss=0.499][A
 49%|████▊     | 2536/5208 [08:47<09:16,  4.80it/s, loss=0.498][A
 49%|████▊     | 2537/5208 [08:47<09:15,  4.81it/s, loss=0.498][A
 49%|████▊     | 2537/5208 [08:48<09:15,  4.80it/s, loss=0.498][A
 49%|████▊     | 2538/5208 [08:48<09:15,  4.81it/s, loss=0.498][A
 49%|████▊     | 2538/5208 [08:48<09:15,  4.80it/s, loss=0.498][A
 49%|████▉     | 2539/5208 [08:48<09:15,  4.81it/s, loss=0.498][A
 49%|████▉     | 2539/5208 [08:48<09:15,  4.80it/s, loss=0.497

 50%|████▉     | 2594/5208 [09:01<09:06,  4.79it/s, loss=0.5][A
 50%|████▉     | 2594/5208 [09:02<09:06,  4.79it/s, loss=0.499][A
 50%|████▉     | 2595/5208 [09:02<09:05,  4.79it/s, loss=0.499][A
 50%|████▉     | 2595/5208 [09:02<09:06,  4.78it/s, loss=0.498][A
 50%|████▉     | 2596/5208 [09:02<09:05,  4.79it/s, loss=0.498][A
 50%|████▉     | 2596/5208 [09:02<09:05,  4.79it/s, loss=0.497][A
 50%|████▉     | 2597/5208 [09:02<09:05,  4.79it/s, loss=0.497][A
 50%|████▉     | 2597/5208 [09:02<09:05,  4.78it/s, loss=0.496][A
 50%|████▉     | 2598/5208 [09:02<09:05,  4.79it/s, loss=0.496][A
 50%|████▉     | 2598/5208 [09:02<09:05,  4.78it/s, loss=0.497][A
 50%|████▉     | 2599/5208 [09:02<09:05,  4.79it/s, loss=0.497][A
 50%|████▉     | 2599/5208 [09:03<09:05,  4.78it/s, loss=0.496][A
 50%|████▉     | 2600/5208 [09:03<09:04,  4.79it/s, loss=0.496][A
 50%|████▉     | 2600/5208 [09:03<09:05,  4.78it/s, loss=0.496][A
 50%|████▉     | 2601/5208 [09:03<09:04,  4.78it/s, loss=0.496]

 51%|█████     | 2655/5208 [09:16<08:54,  4.77it/s, loss=0.498][A
 51%|█████     | 2655/5208 [09:16<08:55,  4.77it/s, loss=0.498][A
 51%|█████     | 2656/5208 [09:16<08:54,  4.77it/s, loss=0.498][A
 51%|█████     | 2656/5208 [09:16<08:55,  4.77it/s, loss=0.498][A
 51%|█████     | 2657/5208 [09:16<08:54,  4.77it/s, loss=0.498][A
 51%|█████     | 2657/5208 [09:17<08:54,  4.77it/s, loss=0.499][A
 51%|█████     | 2658/5208 [09:17<08:54,  4.77it/s, loss=0.499][A
 51%|█████     | 2658/5208 [09:17<08:54,  4.77it/s, loss=0.499][A
 51%|█████     | 2659/5208 [09:17<08:54,  4.77it/s, loss=0.499][A
 51%|█████     | 2659/5208 [09:17<08:54,  4.77it/s, loss=0.5]  [A
 51%|█████     | 2660/5208 [09:17<08:53,  4.77it/s, loss=0.5][A
 51%|█████     | 2660/5208 [09:17<08:54,  4.77it/s, loss=0.499][A
 51%|█████     | 2661/5208 [09:17<08:53,  4.77it/s, loss=0.499][A
 51%|█████     | 2661/5208 [09:17<08:53,  4.77it/s, loss=0.5]  [A
 51%|█████     | 2662/5208 [09:17<08:53,  4.77it/s, loss=0.5][A

 52%|█████▏    | 2716/5208 [09:30<08:43,  4.76it/s, loss=0.497][A
 52%|█████▏    | 2716/5208 [09:30<08:43,  4.76it/s, loss=0.497][A
 52%|█████▏    | 2717/5208 [09:30<08:43,  4.76it/s, loss=0.497][A
 52%|█████▏    | 2717/5208 [09:31<08:43,  4.76it/s, loss=0.497][A
 52%|█████▏    | 2718/5208 [09:31<08:43,  4.76it/s, loss=0.497][A
 52%|█████▏    | 2718/5208 [09:31<08:43,  4.76it/s, loss=0.496][A
 52%|█████▏    | 2719/5208 [09:31<08:43,  4.76it/s, loss=0.496][A
 52%|█████▏    | 2719/5208 [09:31<08:43,  4.76it/s, loss=0.496][A
 52%|█████▏    | 2720/5208 [09:31<08:42,  4.76it/s, loss=0.496][A
 52%|█████▏    | 2720/5208 [09:31<08:43,  4.76it/s, loss=0.497][A
 52%|█████▏    | 2721/5208 [09:31<08:42,  4.76it/s, loss=0.497][A
 52%|█████▏    | 2721/5208 [09:32<08:42,  4.76it/s, loss=0.497][A
 52%|█████▏    | 2722/5208 [09:32<08:42,  4.76it/s, loss=0.497][A
 52%|█████▏    | 2722/5208 [09:32<08:42,  4.76it/s, loss=0.497][A
 52%|█████▏    | 2723/5208 [09:32<08:42,  4.76it/s, loss=0.497

 53%|█████▎    | 2777/5208 [09:44<08:32,  4.75it/s, loss=0.489][A
 53%|█████▎    | 2777/5208 [09:45<08:32,  4.75it/s, loss=0.489][A
 53%|█████▎    | 2778/5208 [09:45<08:31,  4.75it/s, loss=0.489][A
 53%|█████▎    | 2778/5208 [09:45<08:32,  4.75it/s, loss=0.488][A
 53%|█████▎    | 2779/5208 [09:45<08:31,  4.75it/s, loss=0.488][A
 53%|█████▎    | 2779/5208 [09:45<08:31,  4.75it/s, loss=0.488][A
 53%|█████▎    | 2780/5208 [09:45<08:31,  4.75it/s, loss=0.488][A
 53%|█████▎    | 2780/5208 [09:45<08:31,  4.75it/s, loss=0.489][A
 53%|█████▎    | 2781/5208 [09:45<08:31,  4.75it/s, loss=0.489][A
 53%|█████▎    | 2781/5208 [09:46<08:31,  4.75it/s, loss=0.489][A
 53%|█████▎    | 2782/5208 [09:46<08:31,  4.75it/s, loss=0.489][A
 53%|█████▎    | 2782/5208 [09:46<08:31,  4.75it/s, loss=0.489][A
 53%|█████▎    | 2783/5208 [09:46<08:30,  4.75it/s, loss=0.489][A
 53%|█████▎    | 2783/5208 [09:46<08:31,  4.75it/s, loss=0.489][A
 53%|█████▎    | 2784/5208 [09:46<08:30,  4.75it/s, loss=0.489

 54%|█████▍    | 2838/5208 [09:59<08:20,  4.73it/s, loss=0.49][A
 54%|█████▍    | 2838/5208 [09:59<08:20,  4.73it/s, loss=0.491][A
 55%|█████▍    | 2839/5208 [09:59<08:20,  4.74it/s, loss=0.491][A
 55%|█████▍    | 2839/5208 [09:59<08:20,  4.73it/s, loss=0.492][A
 55%|█████▍    | 2840/5208 [09:59<08:20,  4.73it/s, loss=0.492][A
 55%|█████▍    | 2840/5208 [09:59<08:20,  4.73it/s, loss=0.493][A
 55%|█████▍    | 2841/5208 [09:59<08:19,  4.74it/s, loss=0.493][A
 55%|█████▍    | 2841/5208 [10:00<08:20,  4.73it/s, loss=0.492][A
 55%|█████▍    | 2842/5208 [10:00<08:19,  4.73it/s, loss=0.492][A
 55%|█████▍    | 2842/5208 [10:00<08:19,  4.73it/s, loss=0.493][A
 55%|█████▍    | 2843/5208 [10:00<08:19,  4.73it/s, loss=0.493][A
 55%|█████▍    | 2843/5208 [10:00<08:19,  4.73it/s, loss=0.494][A
 55%|█████▍    | 2844/5208 [10:00<08:19,  4.74it/s, loss=0.494][A
 55%|█████▍    | 2844/5208 [10:00<08:19,  4.73it/s, loss=0.494][A
 55%|█████▍    | 2845/5208 [10:00<08:19,  4.73it/s, loss=0.494]

 56%|█████▌    | 2899/5208 [10:13<08:08,  4.72it/s, loss=0.492][A
 56%|█████▌    | 2899/5208 [10:14<08:09,  4.72it/s, loss=0.493][A
 56%|█████▌    | 2900/5208 [10:14<08:08,  4.72it/s, loss=0.493][A
 56%|█████▌    | 2900/5208 [10:14<08:08,  4.72it/s, loss=0.492][A
 56%|█████▌    | 2901/5208 [10:14<08:08,  4.72it/s, loss=0.492][A
 56%|█████▌    | 2901/5208 [10:14<08:08,  4.72it/s, loss=0.493][A
 56%|█████▌    | 2902/5208 [10:14<08:08,  4.72it/s, loss=0.493][A
 56%|█████▌    | 2902/5208 [10:14<08:08,  4.72it/s, loss=0.493][A
 56%|█████▌    | 2903/5208 [10:14<08:08,  4.72it/s, loss=0.493][A
 56%|█████▌    | 2903/5208 [10:15<08:08,  4.72it/s, loss=0.493][A
 56%|█████▌    | 2904/5208 [10:15<08:07,  4.72it/s, loss=0.493][A
 56%|█████▌    | 2904/5208 [10:15<08:08,  4.72it/s, loss=0.492][A
 56%|█████▌    | 2905/5208 [10:15<08:07,  4.72it/s, loss=0.492][A
 56%|█████▌    | 2905/5208 [10:15<08:08,  4.72it/s, loss=0.493][A
 56%|█████▌    | 2906/5208 [10:15<08:07,  4.72it/s, loss=0.493

 57%|█████▋    | 2960/5208 [10:28<07:57,  4.71it/s, loss=0.491][A
 57%|█████▋    | 2960/5208 [10:28<07:57,  4.71it/s, loss=0.491][A
 57%|█████▋    | 2961/5208 [10:28<07:56,  4.71it/s, loss=0.491][A
 57%|█████▋    | 2961/5208 [10:28<07:57,  4.71it/s, loss=0.492][A
 57%|█████▋    | 2962/5208 [10:28<07:56,  4.71it/s, loss=0.492][A
 57%|█████▋    | 2962/5208 [10:28<07:56,  4.71it/s, loss=0.492][A
 57%|█████▋    | 2963/5208 [10:28<07:56,  4.71it/s, loss=0.492][A
 57%|█████▋    | 2963/5208 [10:29<07:56,  4.71it/s, loss=0.493][A
 57%|█████▋    | 2964/5208 [10:29<07:56,  4.71it/s, loss=0.493][A
 57%|█████▋    | 2964/5208 [10:29<07:56,  4.71it/s, loss=0.492][A
 57%|█████▋    | 2965/5208 [10:29<07:56,  4.71it/s, loss=0.492][A
 57%|█████▋    | 2965/5208 [10:29<07:56,  4.71it/s, loss=0.491][A
 57%|█████▋    | 2966/5208 [10:29<07:55,  4.71it/s, loss=0.491][A
 57%|█████▋    | 2966/5208 [10:29<07:56,  4.71it/s, loss=0.491][A
 57%|█████▋    | 2967/5208 [10:29<07:55,  4.71it/s, loss=0.491

 58%|█████▊    | 3021/5208 [10:41<07:44,  4.71it/s, loss=0.496][A
 58%|█████▊    | 3021/5208 [10:42<07:44,  4.70it/s, loss=0.496][A
 58%|█████▊    | 3022/5208 [10:42<07:44,  4.71it/s, loss=0.496][A
 58%|█████▊    | 3022/5208 [10:42<07:44,  4.70it/s, loss=0.495][A
 58%|█████▊    | 3023/5208 [10:42<07:44,  4.71it/s, loss=0.495][A
 58%|█████▊    | 3023/5208 [10:42<07:44,  4.70it/s, loss=0.495][A
 58%|█████▊    | 3024/5208 [10:42<07:44,  4.71it/s, loss=0.495][A
 58%|█████▊    | 3024/5208 [10:42<07:44,  4.70it/s, loss=0.495][A
 58%|█████▊    | 3025/5208 [10:42<07:43,  4.71it/s, loss=0.495][A
 58%|█████▊    | 3025/5208 [10:43<07:44,  4.70it/s, loss=0.494][A
 58%|█████▊    | 3026/5208 [10:43<07:43,  4.71it/s, loss=0.494][A
 58%|█████▊    | 3026/5208 [10:43<07:43,  4.70it/s, loss=0.495][A
 58%|█████▊    | 3027/5208 [10:43<07:43,  4.71it/s, loss=0.495][A
 58%|█████▊    | 3027/5208 [10:43<07:43,  4.70it/s, loss=0.494][A
 58%|█████▊    | 3028/5208 [10:43<07:43,  4.71it/s, loss=0.494

 59%|█████▉    | 3082/5208 [10:55<07:32,  4.70it/s, loss=0.49][A
 59%|█████▉    | 3082/5208 [10:55<07:32,  4.70it/s, loss=0.489][A
 59%|█████▉    | 3083/5208 [10:55<07:31,  4.70it/s, loss=0.489][A
 59%|█████▉    | 3083/5208 [10:55<07:32,  4.70it/s, loss=0.489][A
 59%|█████▉    | 3084/5208 [10:55<07:31,  4.70it/s, loss=0.489][A
 59%|█████▉    | 3084/5208 [10:56<07:31,  4.70it/s, loss=0.488][A
 59%|█████▉    | 3085/5208 [10:56<07:31,  4.70it/s, loss=0.488][A
 59%|█████▉    | 3085/5208 [10:56<07:31,  4.70it/s, loss=0.489][A
 59%|█████▉    | 3086/5208 [10:56<07:31,  4.70it/s, loss=0.489][A
 59%|█████▉    | 3086/5208 [10:56<07:31,  4.70it/s, loss=0.489][A
 59%|█████▉    | 3087/5208 [10:56<07:31,  4.70it/s, loss=0.489][A
 59%|█████▉    | 3087/5208 [10:56<07:31,  4.70it/s, loss=0.492][A
 59%|█████▉    | 3088/5208 [10:56<07:30,  4.70it/s, loss=0.492][A
 59%|█████▉    | 3088/5208 [10:57<07:31,  4.70it/s, loss=0.491][A
 59%|█████▉    | 3089/5208 [10:57<07:30,  4.70it/s, loss=0.491]

 60%|██████    | 3143/5208 [11:09<07:19,  4.70it/s, loss=0.494][A
 60%|██████    | 3143/5208 [11:09<07:19,  4.69it/s, loss=0.495][A
 60%|██████    | 3144/5208 [11:09<07:19,  4.70it/s, loss=0.495][A
 60%|██████    | 3144/5208 [11:09<07:19,  4.69it/s, loss=0.496][A
 60%|██████    | 3145/5208 [11:09<07:19,  4.70it/s, loss=0.496][A
 60%|██████    | 3145/5208 [11:10<07:19,  4.69it/s, loss=0.496][A
 60%|██████    | 3146/5208 [11:10<07:19,  4.70it/s, loss=0.496][A
 60%|██████    | 3146/5208 [11:10<07:19,  4.69it/s, loss=0.497][A
 60%|██████    | 3147/5208 [11:10<07:18,  4.69it/s, loss=0.497][A
 60%|██████    | 3147/5208 [11:10<07:19,  4.69it/s, loss=0.496][A
 60%|██████    | 3148/5208 [11:10<07:18,  4.69it/s, loss=0.496][A
 60%|██████    | 3148/5208 [11:10<07:18,  4.69it/s, loss=0.494][A
 60%|██████    | 3149/5208 [11:10<07:18,  4.70it/s, loss=0.494][A
 60%|██████    | 3149/5208 [11:10<07:18,  4.69it/s, loss=0.494][A
 60%|██████    | 3150/5208 [11:10<07:18,  4.69it/s, loss=0.494

 62%|██████▏   | 3204/5208 [11:22<07:07,  4.69it/s, loss=0.488][A
 62%|██████▏   | 3204/5208 [11:22<07:07,  4.69it/s, loss=0.489][A
 62%|██████▏   | 3205/5208 [11:22<07:06,  4.69it/s, loss=0.489][A
 62%|██████▏   | 3205/5208 [11:22<07:06,  4.69it/s, loss=0.489][A
 62%|██████▏   | 3206/5208 [11:22<07:06,  4.69it/s, loss=0.489][A
 62%|██████▏   | 3206/5208 [11:23<07:06,  4.69it/s, loss=0.489][A
 62%|██████▏   | 3207/5208 [11:23<07:06,  4.69it/s, loss=0.489][A
 62%|██████▏   | 3207/5208 [11:23<07:06,  4.69it/s, loss=0.488][A
 62%|██████▏   | 3208/5208 [11:23<07:06,  4.69it/s, loss=0.488][A
 62%|██████▏   | 3208/5208 [11:23<07:06,  4.69it/s, loss=0.489][A
 62%|██████▏   | 3209/5208 [11:23<07:05,  4.70it/s, loss=0.489][A
 62%|██████▏   | 3209/5208 [11:23<07:05,  4.69it/s, loss=0.49] [A
 62%|██████▏   | 3210/5208 [11:23<07:05,  4.70it/s, loss=0.49][A
 62%|██████▏   | 3210/5208 [11:23<07:05,  4.69it/s, loss=0.489][A
 62%|██████▏   | 3211/5208 [11:23<07:05,  4.70it/s, loss=0.489]

 63%|██████▎   | 3265/5208 [11:35<06:54,  4.69it/s, loss=0.484][A
 63%|██████▎   | 3265/5208 [11:36<06:54,  4.69it/s, loss=0.485][A
 63%|██████▎   | 3266/5208 [11:36<06:53,  4.69it/s, loss=0.485][A
 63%|██████▎   | 3266/5208 [11:36<06:54,  4.69it/s, loss=0.484][A
 63%|██████▎   | 3267/5208 [11:36<06:53,  4.69it/s, loss=0.484][A
 63%|██████▎   | 3267/5208 [11:36<06:53,  4.69it/s, loss=0.484][A
 63%|██████▎   | 3268/5208 [11:36<06:53,  4.69it/s, loss=0.484][A
 63%|██████▎   | 3268/5208 [11:36<06:53,  4.69it/s, loss=0.484][A
 63%|██████▎   | 3269/5208 [11:36<06:53,  4.69it/s, loss=0.484][A
 63%|██████▎   | 3269/5208 [11:36<06:53,  4.69it/s, loss=0.484][A
 63%|██████▎   | 3270/5208 [11:36<06:53,  4.69it/s, loss=0.484][A
 63%|██████▎   | 3270/5208 [11:37<06:53,  4.69it/s, loss=0.483][A
 63%|██████▎   | 3271/5208 [11:37<06:52,  4.69it/s, loss=0.483][A
 63%|██████▎   | 3271/5208 [11:37<06:52,  4.69it/s, loss=0.482][A
 63%|██████▎   | 3272/5208 [11:37<06:52,  4.69it/s, loss=0.482

 64%|██████▍   | 3326/5208 [11:48<06:41,  4.69it/s, loss=0.489][A
 64%|██████▍   | 3326/5208 [11:49<06:41,  4.69it/s, loss=0.489][A
 64%|██████▍   | 3327/5208 [11:49<06:40,  4.69it/s, loss=0.489][A
 64%|██████▍   | 3327/5208 [11:49<06:41,  4.69it/s, loss=0.488][A
 64%|██████▍   | 3328/5208 [11:49<06:40,  4.69it/s, loss=0.488][A
 64%|██████▍   | 3328/5208 [11:49<06:40,  4.69it/s, loss=0.489][A
 64%|██████▍   | 3329/5208 [11:49<06:40,  4.69it/s, loss=0.489][A
 64%|██████▍   | 3329/5208 [11:49<06:40,  4.69it/s, loss=0.489][A
 64%|██████▍   | 3330/5208 [11:49<06:40,  4.69it/s, loss=0.489][A
 64%|██████▍   | 3330/5208 [11:50<06:40,  4.69it/s, loss=0.489][A
 64%|██████▍   | 3331/5208 [11:50<06:40,  4.69it/s, loss=0.489][A
 64%|██████▍   | 3331/5208 [11:50<06:40,  4.69it/s, loss=0.489][A
 64%|██████▍   | 3332/5208 [11:50<06:39,  4.69it/s, loss=0.489][A
 64%|██████▍   | 3332/5208 [11:50<06:39,  4.69it/s, loss=0.488][A
 64%|██████▍   | 3333/5208 [11:50<06:39,  4.69it/s, loss=0.488

 65%|██████▌   | 3387/5208 [12:02<06:28,  4.69it/s, loss=0.483][A
 65%|██████▌   | 3387/5208 [12:02<06:28,  4.69it/s, loss=0.483][A
 65%|██████▌   | 3388/5208 [12:02<06:27,  4.69it/s, loss=0.483][A
 65%|██████▌   | 3388/5208 [12:02<06:28,  4.69it/s, loss=0.484][A
 65%|██████▌   | 3389/5208 [12:02<06:27,  4.69it/s, loss=0.484][A
 65%|██████▌   | 3389/5208 [12:02<06:27,  4.69it/s, loss=0.484][A
 65%|██████▌   | 3390/5208 [12:02<06:27,  4.69it/s, loss=0.484][A
 65%|██████▌   | 3390/5208 [12:02<06:27,  4.69it/s, loss=0.485][A
 65%|██████▌   | 3391/5208 [12:02<06:27,  4.69it/s, loss=0.485][A
 65%|██████▌   | 3391/5208 [12:03<06:27,  4.69it/s, loss=0.487][A
 65%|██████▌   | 3392/5208 [12:03<06:27,  4.69it/s, loss=0.487][A
 65%|██████▌   | 3392/5208 [12:03<06:27,  4.69it/s, loss=0.486][A
 65%|██████▌   | 3393/5208 [12:03<06:26,  4.69it/s, loss=0.486][A
 65%|██████▌   | 3393/5208 [12:03<06:27,  4.69it/s, loss=0.487][A
 65%|██████▌   | 3394/5208 [12:03<06:26,  4.69it/s, loss=0.487

 66%|██████▌   | 3448/5208 [12:15<06:15,  4.69it/s, loss=0.49][A
 66%|██████▌   | 3448/5208 [12:15<06:15,  4.69it/s, loss=0.49][A
 66%|██████▌   | 3449/5208 [12:15<06:15,  4.69it/s, loss=0.49][A
 66%|██████▌   | 3449/5208 [12:16<06:15,  4.69it/s, loss=0.49][A
 66%|██████▌   | 3450/5208 [12:16<06:15,  4.69it/s, loss=0.49][A
 66%|██████▌   | 3450/5208 [12:16<06:15,  4.69it/s, loss=0.491][A
 66%|██████▋   | 3451/5208 [12:16<06:14,  4.69it/s, loss=0.491][A
 66%|██████▋   | 3451/5208 [12:16<06:14,  4.69it/s, loss=0.49] [A
 66%|██████▋   | 3452/5208 [12:16<06:14,  4.69it/s, loss=0.49][A
 66%|██████▋   | 3452/5208 [12:16<06:14,  4.69it/s, loss=0.49][A
 66%|██████▋   | 3453/5208 [12:16<06:14,  4.69it/s, loss=0.49][A
 66%|██████▋   | 3453/5208 [12:16<06:14,  4.69it/s, loss=0.491][A
 66%|██████▋   | 3454/5208 [12:16<06:14,  4.69it/s, loss=0.491][A
 66%|██████▋   | 3454/5208 [12:17<06:14,  4.69it/s, loss=0.49] [A
 66%|██████▋   | 3455/5208 [12:17<06:14,  4.69it/s, loss=0.49][A
 66%

 67%|██████▋   | 3509/5208 [12:29<06:02,  4.68it/s, loss=0.489][A
 67%|██████▋   | 3509/5208 [12:29<06:02,  4.68it/s, loss=0.489][A
 67%|██████▋   | 3510/5208 [12:29<06:02,  4.68it/s, loss=0.489][A
 67%|██████▋   | 3510/5208 [12:29<06:02,  4.68it/s, loss=0.489][A
 67%|██████▋   | 3511/5208 [12:29<06:02,  4.68it/s, loss=0.489][A
 67%|██████▋   | 3511/5208 [12:29<06:02,  4.68it/s, loss=0.488][A
 67%|██████▋   | 3512/5208 [12:29<06:02,  4.68it/s, loss=0.488][A
 67%|██████▋   | 3512/5208 [12:30<06:02,  4.68it/s, loss=0.489][A
 67%|██████▋   | 3513/5208 [12:30<06:01,  4.68it/s, loss=0.489][A
 67%|██████▋   | 3513/5208 [12:30<06:02,  4.68it/s, loss=0.488][A
 67%|██████▋   | 3514/5208 [12:30<06:01,  4.68it/s, loss=0.488][A
 67%|██████▋   | 3514/5208 [12:30<06:01,  4.68it/s, loss=0.488][A
 67%|██████▋   | 3515/5208 [12:30<06:01,  4.68it/s, loss=0.488][A
 67%|██████▋   | 3515/5208 [12:30<06:01,  4.68it/s, loss=0.489][A
 68%|██████▊   | 3516/5208 [12:30<06:01,  4.68it/s, loss=0.489

 69%|██████▊   | 3570/5208 [12:42<05:49,  4.68it/s, loss=0.489][A
 69%|██████▊   | 3570/5208 [12:42<05:50,  4.68it/s, loss=0.489][A
 69%|██████▊   | 3571/5208 [12:42<05:49,  4.68it/s, loss=0.489][A
 69%|██████▊   | 3571/5208 [12:43<05:49,  4.68it/s, loss=0.489][A
 69%|██████▊   | 3572/5208 [12:43<05:49,  4.68it/s, loss=0.489][A
 69%|██████▊   | 3572/5208 [12:43<05:49,  4.68it/s, loss=0.488][A
 69%|██████▊   | 3573/5208 [12:43<05:49,  4.68it/s, loss=0.488][A
 69%|██████▊   | 3573/5208 [12:43<05:49,  4.68it/s, loss=0.488][A
 69%|██████▊   | 3574/5208 [12:43<05:49,  4.68it/s, loss=0.488][A
 69%|██████▊   | 3574/5208 [12:43<05:49,  4.68it/s, loss=0.488][A
 69%|██████▊   | 3575/5208 [12:43<05:48,  4.68it/s, loss=0.488][A
 69%|██████▊   | 3575/5208 [12:44<05:48,  4.68it/s, loss=0.488][A
 69%|██████▊   | 3576/5208 [12:44<05:48,  4.68it/s, loss=0.488][A
 69%|██████▊   | 3576/5208 [12:44<05:48,  4.68it/s, loss=0.489][A
 69%|██████▊   | 3577/5208 [12:44<05:48,  4.68it/s, loss=0.489

 70%|██████▉   | 3631/5208 [12:56<05:37,  4.68it/s, loss=0.486][A
 70%|██████▉   | 3631/5208 [12:56<05:37,  4.68it/s, loss=0.486][A
 70%|██████▉   | 3632/5208 [12:56<05:36,  4.68it/s, loss=0.486][A
 70%|██████▉   | 3632/5208 [12:56<05:36,  4.68it/s, loss=0.486][A
 70%|██████▉   | 3633/5208 [12:56<05:36,  4.68it/s, loss=0.486][A
 70%|██████▉   | 3633/5208 [12:56<05:36,  4.68it/s, loss=0.486][A
 70%|██████▉   | 3634/5208 [12:56<05:36,  4.68it/s, loss=0.486][A
 70%|██████▉   | 3634/5208 [12:56<05:36,  4.68it/s, loss=0.487][A
 70%|██████▉   | 3635/5208 [12:56<05:36,  4.68it/s, loss=0.487][A
 70%|██████▉   | 3635/5208 [12:57<05:36,  4.68it/s, loss=0.486][A
 70%|██████▉   | 3636/5208 [12:57<05:35,  4.68it/s, loss=0.486][A
 70%|██████▉   | 3636/5208 [12:57<05:36,  4.68it/s, loss=0.485][A
 70%|██████▉   | 3637/5208 [12:57<05:35,  4.68it/s, loss=0.485][A
 70%|██████▉   | 3637/5208 [12:57<05:35,  4.68it/s, loss=0.485][A
 70%|██████▉   | 3638/5208 [12:57<05:35,  4.68it/s, loss=0.485

 71%|███████   | 3692/5208 [13:09<05:24,  4.68it/s, loss=0.49][A
 71%|███████   | 3692/5208 [13:09<05:24,  4.68it/s, loss=0.49][A
 71%|███████   | 3693/5208 [13:09<05:23,  4.68it/s, loss=0.49][A
 71%|███████   | 3693/5208 [13:09<05:23,  4.68it/s, loss=0.489][A
 71%|███████   | 3694/5208 [13:09<05:23,  4.68it/s, loss=0.489][A
 71%|███████   | 3694/5208 [13:09<05:23,  4.68it/s, loss=0.491][A
 71%|███████   | 3695/5208 [13:09<05:23,  4.68it/s, loss=0.491][A
 71%|███████   | 3695/5208 [13:10<05:23,  4.68it/s, loss=0.489][A
 71%|███████   | 3696/5208 [13:10<05:23,  4.68it/s, loss=0.489][A
 71%|███████   | 3696/5208 [13:10<05:23,  4.68it/s, loss=0.488][A
 71%|███████   | 3697/5208 [13:10<05:22,  4.68it/s, loss=0.488][A
 71%|███████   | 3697/5208 [13:10<05:23,  4.68it/s, loss=0.488][A
 71%|███████   | 3698/5208 [13:10<05:22,  4.68it/s, loss=0.488][A
 71%|███████   | 3698/5208 [13:10<05:22,  4.68it/s, loss=0.488][A
 71%|███████   | 3699/5208 [13:10<05:22,  4.68it/s, loss=0.488][

 72%|███████▏  | 3753/5208 [13:23<05:11,  4.67it/s, loss=0.488][A
 72%|███████▏  | 3753/5208 [13:23<05:11,  4.67it/s, loss=0.487][A
 72%|███████▏  | 3754/5208 [13:23<05:11,  4.67it/s, loss=0.487][A
 72%|███████▏  | 3754/5208 [13:23<05:11,  4.67it/s, loss=0.487][A
 72%|███████▏  | 3755/5208 [13:23<05:10,  4.67it/s, loss=0.487][A
 72%|███████▏  | 3755/5208 [13:23<05:11,  4.67it/s, loss=0.486][A
 72%|███████▏  | 3756/5208 [13:23<05:10,  4.67it/s, loss=0.486][A
 72%|███████▏  | 3756/5208 [13:24<05:10,  4.67it/s, loss=0.485][A
 72%|███████▏  | 3757/5208 [13:24<05:10,  4.67it/s, loss=0.485][A
 72%|███████▏  | 3757/5208 [13:24<05:10,  4.67it/s, loss=0.486][A
 72%|███████▏  | 3758/5208 [13:24<05:10,  4.67it/s, loss=0.486][A
 72%|███████▏  | 3758/5208 [13:24<05:10,  4.67it/s, loss=0.486][A
 72%|███████▏  | 3759/5208 [13:24<05:10,  4.67it/s, loss=0.486][A
 72%|███████▏  | 3759/5208 [13:24<05:10,  4.67it/s, loss=0.485][A
 72%|███████▏  | 3760/5208 [13:24<05:09,  4.67it/s, loss=0.485

 73%|███████▎  | 3814/5208 [13:36<04:58,  4.67it/s, loss=0.479][A
 73%|███████▎  | 3814/5208 [13:36<04:58,  4.67it/s, loss=0.48] [A
 73%|███████▎  | 3815/5208 [13:36<04:58,  4.67it/s, loss=0.48][A
 73%|███████▎  | 3815/5208 [13:36<04:58,  4.67it/s, loss=0.481][A
 73%|███████▎  | 3816/5208 [13:36<04:57,  4.67it/s, loss=0.481][A
 73%|███████▎  | 3816/5208 [13:36<04:57,  4.67it/s, loss=0.48] [A
 73%|███████▎  | 3817/5208 [13:36<04:57,  4.67it/s, loss=0.48][A
 73%|███████▎  | 3817/5208 [13:37<04:57,  4.67it/s, loss=0.481][A
 73%|███████▎  | 3818/5208 [13:37<04:57,  4.67it/s, loss=0.481][A
 73%|███████▎  | 3818/5208 [13:37<04:57,  4.67it/s, loss=0.48] [A
 73%|███████▎  | 3819/5208 [13:37<04:57,  4.67it/s, loss=0.48][A
 73%|███████▎  | 3819/5208 [13:37<04:57,  4.67it/s, loss=0.48][A
 73%|███████▎  | 3820/5208 [13:37<04:57,  4.67it/s, loss=0.48][A
 73%|███████▎  | 3820/5208 [13:37<04:57,  4.67it/s, loss=0.479][A
 73%|███████▎  | 3821/5208 [13:37<04:56,  4.67it/s, loss=0.479][A


 74%|███████▍  | 3875/5208 [13:49<04:45,  4.67it/s, loss=0.48][A
 74%|███████▍  | 3875/5208 [13:49<04:45,  4.67it/s, loss=0.481][A
 74%|███████▍  | 3876/5208 [13:49<04:45,  4.67it/s, loss=0.481][A
 74%|███████▍  | 3876/5208 [13:49<04:45,  4.67it/s, loss=0.479][A
 74%|███████▍  | 3877/5208 [13:49<04:44,  4.67it/s, loss=0.479][A
 74%|███████▍  | 3877/5208 [13:50<04:44,  4.67it/s, loss=0.48] [A
 74%|███████▍  | 3878/5208 [13:50<04:44,  4.67it/s, loss=0.48][A
 74%|███████▍  | 3878/5208 [13:50<04:44,  4.67it/s, loss=0.48][A
 74%|███████▍  | 3879/5208 [13:50<04:44,  4.67it/s, loss=0.48][A
 74%|███████▍  | 3879/5208 [13:50<04:44,  4.67it/s, loss=0.48][A
 75%|███████▍  | 3880/5208 [13:50<04:44,  4.67it/s, loss=0.48][A
 75%|███████▍  | 3880/5208 [13:50<04:44,  4.67it/s, loss=0.48][A
 75%|███████▍  | 3881/5208 [13:50<04:44,  4.67it/s, loss=0.48][A
 75%|███████▍  | 3881/5208 [13:51<04:44,  4.67it/s, loss=0.481][A
 75%|███████▍  | 3882/5208 [13:51<04:43,  4.67it/s, loss=0.481][A
 75

 76%|███████▌  | 3936/5208 [14:03<04:32,  4.67it/s, loss=0.483][A
 76%|███████▌  | 3936/5208 [14:03<04:32,  4.67it/s, loss=0.483][A
 76%|███████▌  | 3937/5208 [14:03<04:32,  4.67it/s, loss=0.483][A
 76%|███████▌  | 3937/5208 [14:03<04:32,  4.67it/s, loss=0.483][A
 76%|███████▌  | 3938/5208 [14:03<04:32,  4.67it/s, loss=0.483][A
 76%|███████▌  | 3938/5208 [14:03<04:32,  4.67it/s, loss=0.482][A
 76%|███████▌  | 3939/5208 [14:03<04:31,  4.67it/s, loss=0.482][A
 76%|███████▌  | 3939/5208 [14:03<04:31,  4.67it/s, loss=0.482][A
 76%|███████▌  | 3940/5208 [14:03<04:31,  4.67it/s, loss=0.482][A
 76%|███████▌  | 3940/5208 [14:04<04:31,  4.67it/s, loss=0.482][A
 76%|███████▌  | 3941/5208 [14:04<04:31,  4.67it/s, loss=0.482][A
 76%|███████▌  | 3941/5208 [14:04<04:31,  4.67it/s, loss=0.481][A
 76%|███████▌  | 3942/5208 [14:04<04:31,  4.67it/s, loss=0.481][A
 76%|███████▌  | 3942/5208 [14:04<04:31,  4.67it/s, loss=0.482][A
 76%|███████▌  | 3943/5208 [14:04<04:30,  4.67it/s, loss=0.482

 77%|███████▋  | 3997/5208 [14:16<04:19,  4.67it/s, loss=0.484][A
 77%|███████▋  | 3997/5208 [14:16<04:19,  4.66it/s, loss=0.483][A
 77%|███████▋  | 3998/5208 [14:16<04:19,  4.67it/s, loss=0.483][A
 77%|███████▋  | 3998/5208 [14:17<04:19,  4.66it/s, loss=0.485][A
 77%|███████▋  | 3999/5208 [14:17<04:19,  4.67it/s, loss=0.485][A
 77%|███████▋  | 3999/5208 [14:17<04:19,  4.66it/s, loss=0.484][A
 77%|███████▋  | 4000/5208 [14:17<04:18,  4.67it/s, loss=0.484][A
 77%|███████▋  | 4000/5208 [14:17<04:19,  4.66it/s, loss=0.484][A
 77%|███████▋  | 4001/5208 [14:17<04:18,  4.67it/s, loss=0.484][A
 77%|███████▋  | 4001/5208 [14:17<04:18,  4.66it/s, loss=0.485][A
 77%|███████▋  | 4002/5208 [14:17<04:18,  4.67it/s, loss=0.485][A
 77%|███████▋  | 4002/5208 [14:17<04:18,  4.66it/s, loss=0.485][A
 77%|███████▋  | 4003/5208 [14:17<04:18,  4.67it/s, loss=0.485][A
 77%|███████▋  | 4003/5208 [14:18<04:18,  4.66it/s, loss=0.485][A
 77%|███████▋  | 4004/5208 [14:18<04:18,  4.67it/s, loss=0.485

 78%|███████▊  | 4058/5208 [14:30<04:06,  4.66it/s, loss=0.483][A
 78%|███████▊  | 4058/5208 [14:30<04:06,  4.66it/s, loss=0.484][A
 78%|███████▊  | 4059/5208 [14:30<04:06,  4.66it/s, loss=0.484][A
 78%|███████▊  | 4059/5208 [14:30<04:06,  4.66it/s, loss=0.484][A
 78%|███████▊  | 4060/5208 [14:30<04:06,  4.66it/s, loss=0.484][A
 78%|███████▊  | 4060/5208 [14:30<04:06,  4.66it/s, loss=0.484][A
 78%|███████▊  | 4061/5208 [14:30<04:05,  4.66it/s, loss=0.484][A
 78%|███████▊  | 4061/5208 [14:31<04:06,  4.66it/s, loss=0.483][A
 78%|███████▊  | 4062/5208 [14:31<04:05,  4.66it/s, loss=0.483][A
 78%|███████▊  | 4062/5208 [14:31<04:05,  4.66it/s, loss=0.484][A
 78%|███████▊  | 4063/5208 [14:31<04:05,  4.66it/s, loss=0.484][A
 78%|███████▊  | 4063/5208 [14:31<04:05,  4.66it/s, loss=0.483][A
 78%|███████▊  | 4064/5208 [14:31<04:05,  4.66it/s, loss=0.483][A
 78%|███████▊  | 4064/5208 [14:31<04:05,  4.66it/s, loss=0.483][A
 78%|███████▊  | 4065/5208 [14:31<04:05,  4.66it/s, loss=0.483

 79%|███████▉  | 4119/5208 [14:43<03:53,  4.66it/s, loss=0.484][A
 79%|███████▉  | 4119/5208 [14:44<03:53,  4.66it/s, loss=0.483][A
 79%|███████▉  | 4120/5208 [14:44<03:53,  4.66it/s, loss=0.483][A
 79%|███████▉  | 4120/5208 [14:44<03:53,  4.66it/s, loss=0.482][A
 79%|███████▉  | 4121/5208 [14:44<03:53,  4.66it/s, loss=0.482][A
 79%|███████▉  | 4121/5208 [14:44<03:53,  4.66it/s, loss=0.483][A
 79%|███████▉  | 4122/5208 [14:44<03:53,  4.66it/s, loss=0.483][A
 79%|███████▉  | 4122/5208 [14:44<03:53,  4.66it/s, loss=0.482][A
 79%|███████▉  | 4123/5208 [14:44<03:52,  4.66it/s, loss=0.482][A
 79%|███████▉  | 4123/5208 [14:44<03:52,  4.66it/s, loss=0.48] [A
 79%|███████▉  | 4124/5208 [14:44<03:52,  4.66it/s, loss=0.48][A
 79%|███████▉  | 4124/5208 [14:45<03:52,  4.66it/s, loss=0.482][A
 79%|███████▉  | 4125/5208 [14:45<03:52,  4.66it/s, loss=0.482][A
 79%|███████▉  | 4125/5208 [14:45<03:52,  4.66it/s, loss=0.481][A
 79%|███████▉  | 4126/5208 [14:45<03:52,  4.66it/s, loss=0.481]

KeyboardInterrupt: 

In [None]:
# Plot the learning rate tracked by the learner's scheduler object (`learner.sched`).
# NOTE(review): the training output above ends in KeyboardInterrupt, so this
# presumably only covers the iterations that ran before the interrupt — confirm.
learner.sched.plot_lr()

In [None]:
# Pass the learner's model and the test dataloader (`md.test_dl`) to create_submission.
# NOTE(review): create_submission is defined outside this section; presumably it
# predicts on the test set and writes the submission file — confirm.
create_submission(learner.model, md.test_dl)

In [None]:
# Elapsed wall-clock time, in whole seconds, since M.start_time was recorded.
# total_seconds() is used rather than the .seconds attribute: .seconds is only the
# seconds *component* of the timedelta (0-86399) and silently drops whole days,
# so it under-reports any run longer than 24 hours. int() keeps the displayed
# value an integer, matching what .seconds showed for runs shorter than a day.
int((datetime.now() - M.start_time).total_seconds())

### Validate

In [None]:
# Run predict_structured for the 'val' split and keep the result in df_val for
# inspection in the next cell.
# NOTE(review): predict_structured is defined outside this section; presumably it
# returns a DataFrame that joins predictions back to `train` on the 'id' column — confirm.
df_val = predict_structured(learner.model, md, 'val', train, index='id')

In [None]:
# Raise pandas' per-column display width to 1000 characters (a global setting that
# stays in effect for later cells), then preview the first five rows of df_val.
pd.set_option('display.max_colwidth', 1000)
df_val.head(n=5)