In [None]:
import math
import time

from torch import nn, optim
from torch.optim import Adam

from data import *
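# `data` provides the datasets, mainly by building DataLoaders for them.
# `data` also imports `conf`, so the configuration values (e.g. `device`)
# are made available through it as well.
# The dataset splits are:
# train
# valid
# test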

from model.LMVCAT import LMVCATModel
from loss.Loss import Loss
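# An __init__.py turns a folder into a package, which keeps the project
# clearly organised and easy to maintain.
# The model code lives in the `model` package.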

from tokens import get_tokens


def epoch_time(start_time, end_time):
    """Split the span between two timestamps into whole minutes and seconds.

    Args:
        start_time: Start timestamp, in seconds.
        end_time: End timestamp, in seconds.

    Returns:
        A ``(minutes, seconds)`` tuple of ints. Both parts are truncated
        toward zero, so fractional seconds are dropped.
    """
    total_seconds = end_time - start_time
    whole_minutes = int(total_seconds / 60)
    # Take the whole minutes out first, then truncate what is left over.
    leftover_seconds = int(total_seconds - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

def count_parameters(model):
    """Count the trainable scalar parameters of a model.

    Args:
        model: Object whose ``parameters()`` yields items exposing
            ``numel()`` and ``requires_grad``.

    Returns:
        The sum of ``numel()`` over every parameter whose ``requires_grad``
        is truthy; ``0`` when there are none.
    """
    total = 0
    for param in model.parameters():
        # Frozen parameters are not trained, so they are left out.
        if param.requires_grad:
            total += param.numel()
    return total

def initialize_weights(m):
    """Re-initialize a module's weight with Kaiming-uniform values, in place.

    Written to be passed to ``nn.Module.apply``, which calls it once per
    submodule. Objects without a ``weight`` attribute, with ``weight`` set to
    ``None``, or with a weight of one dimension or fewer are left untouched.

    Args:
        m: Module-like object; only its ``weight`` attribute is inspected.
    """
    weight = getattr(m, 'weight', None)
    # A module can expose ``weight`` and still leave it as None (for example a
    # normalization layer built without affine parameters); calling ``.dim()``
    # on it would raise AttributeError, so skip it.
    if weight is None or weight.dim() <= 1:
        return
    nn.init.kaiming_uniform_(weight.data)

# Report whether CUDA is usable before anything is built.
print("gpu:", torch.cuda.is_available())
cls_tokens = get_tokens()

# Every hyperparameter below is a module-level name that is not defined in
# this file. NOTE(review): presumably they arrive via `from data import *` —
# confirm.
model = LMVCATModel(
    n_view=n_view,
    d_vec=d_vec,
    mlp_out=mlp_out,
    d_model=d_model,
    mlp_hidden=mlp_hidden,
    drop_prob=drop_prob,
    vf_hidden=vf_hidden,
    vf_head=vf_head,
    vf_layers=vf_layers,
    awf_gamma=awf_gamma,
    cf_hidden=cf_hidden,
    cf_head=cf_head,
    cf_layers=cf_layers,
    cls_tokens=cls_tokens,
    s_mask=s_mask,
    l_mask=l_mask,
    n_cls=n_cls,
)

n_trainable = count_parameters(model)
print(f'The model has {n_trainable:,} trainable parameters')

# Move to the configured device first, then overwrite every weight with more
# than one dimension using Kaiming-uniform values.
model.to(device)
model.apply(initialize_weights)

# Adam over all model parameters; the hyperparameters are module-level names
# that are not defined in this file.
optimizer = Adam(
    model.parameters(),
    lr=init_lr,
    eps=adam_eps,
    weight_decay=weight_decay,
)

# Plateau scheduler; run() feeds it the validation loss after each epoch once
# the warmup period is over.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    factor=factor,
    patience=patience,
    verbose=True,
)

# Note that the loss receives `mlp_out` as its `d_model`.
criterion = Loss(
    d_model=mlp_out,
    n_cls=n_cls,
    alpha=alpha,
    beta=beta,
    s_mask=s_mask,
    l_mask=l_mask,
)


def train(model, iterator, optimizer, criterion, clip=None):
    """Train the model for one epoch and return the mean per-batch loss.

    Args:
        model: Module called as ``model(src)``; it must return a
            ``(v, z, p)`` triple.
        iterator: Sized iterable of batches, where ``batch[0]`` is the input
            and ``batch[1]`` the target. The original author's notes give
            their shapes as ``[batch_size, n_view, d_vec]`` and
            ``[batch_size, n_cls]``.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Callable ``criterion(v, z, p, trg)`` returning a scalar
            loss tensor.
        clip: Optional maximum gradient norm. When it is not ``None`` the
            gradients are clipped to this norm before each optimizer step;
            ``None`` (the default) disables clipping.

    Returns:
        The sum of the batch losses divided by ``len(iterator)``, covering
        this single epoch only.

    Raises:
        ZeroDivisionError: If ``iterator`` is empty.
    """
    model.train()

    # Send batches to wherever the model's parameters live rather than
    # hard-coding CUDA, so this also works on CPU or a non-default GPU.
    first_param = next(model.parameters(), None)
    target_device = None if first_param is None else first_param.device

    epoch_loss = 0
    for batch in iterator:
        src, trg = batch[0], batch[1]
        if target_device is not None:
            src = src.to(target_device, non_blocking=True)
            trg = trg.to(target_device, non_blocking=True)

        optimizer.zero_grad()
        v, z, p = model(src)

        loss = criterion(v, z, p, trg)
        loss.backward()
        if clip is not None:
            # Bound the global gradient norm before the parameters are updated.
            nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

def evaluate(model, iterator, criterion):
    """Compute the mean per-batch loss over a dataset without updating the model.

    The model is switched to evaluation mode and gradient tracking is
    disabled for the whole pass.

    Args:
        model: Module called as ``model(src)``; it must return a
            ``(v, z, p)`` triple.
        iterator: Sized iterable of batches, where ``batch[0]`` is the input
            and ``batch[1]`` the target.
        criterion: Callable ``criterion(v, z, p, trg)`` returning a scalar
            loss tensor.

    Returns:
        The sum of the batch losses divided by ``len(iterator)``.

    Raises:
        ZeroDivisionError: If ``iterator`` is empty.
    """
    model.eval()

    # Send batches to wherever the model's parameters live rather than
    # hard-coding CUDA, so this also works on CPU or a non-default GPU.
    first_param = next(model.parameters(), None)
    target_device = None if first_param is None else first_param.device

    epoch_loss = 0
    with torch.no_grad():
        for batch in iterator:
            src, trg = batch[0], batch[1]
            if target_device is not None:
                src = src.to(target_device, non_blocking=True)
                trg = trg.to(target_device, non_blocking=True)
            v, z, p = model(src)

            loss = criterion(v, z, p, trg)
            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

def run(total_epoch, best_loss):
    """Drive the full training loop: train, validate, checkpoint and log.

    Each epoch runs ``train`` and ``evaluate`` on the module-level ``model``,
    ``optimizer`` and ``criterion``, then rewrites the loss histories under
    ``result/``. Whenever the validation loss beats the best value so far,
    the model's state dict is saved under ``saved/``.

    Args:
        total_epoch: Number of epochs to run.
        best_loss: Starting threshold; a checkpoint is written only when the
            validation loss drops below the best value seen so far.

    Returns:
        None.
    """
    import os  # local import: only needed here to make the output folders

    # Create the output folders up front so a missing directory fails before
    # training starts rather than after a whole epoch has been spent.
    os.makedirs('saved', exist_ok=True)
    os.makedirs('result', exist_ok=True)

    train_iter, valid_iter = get_data_loader()

    # `test_losses` actually holds the per-epoch validation losses.
    train_losses, test_losses = [], []
    for step in range(total_epoch):
        start_time = time.time()
        train_loss = train(model, train_iter, optimizer, criterion)
        valid_loss = evaluate(model, valid_iter, criterion)
        end_time = time.time()

        # Leave the learning rate alone during warmup: the plateau scheduler
        # is only stepped once `step` exceeds `warmup`.
        if step > warmup:
            scheduler.step(valid_loss)

        train_losses.append(train_loss)
        test_losses.append(valid_loss)
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        # Track the best validation loss and checkpoint each time it improves.
        if valid_loss < best_loss:
            best_loss = valid_loss
            torch.save(model.state_dict(), 'saved/model-{0}.pt'.format(valid_loss))

        # Rewrite the complete histories every epoch; `with` closes the file
        # even if the write raises.
        with open('result/train_loss.txt', 'w') as f:
            f.write(str(train_losses))

        with open('result/test_loss.txt', 'w') as f:
            f.write(str(test_losses))

        # The "PPL" column is simply exp(loss).
        print(f'Epoch: {step + 1} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
        print(f'\tVal Loss: {valid_loss:.3f} |  Val PPL: {math.exp(valid_loss):7.3f}')

# Script entry point: run the training loop for `epoch` epochs.
# NOTE(review): `epoch` and `inf` are not defined in this file — presumably
# they come in through `from data import *`; confirm. If `inf` is
# floating-point infinity, any finite first-epoch validation loss is treated
# as an improvement and checkpointed.
if __name__ == '__main__':
    run(total_epoch=epoch, best_loss=inf)

gpu: True
The model has 315,802,126 trainable parameters
Epoch: 1 | Time: 1m 19s
	Train Loss: 0.623 | Train PPL:   1.864
	Val Loss: 0.398 |  Val PPL:   1.489
Epoch: 2 | Time: 1m 18s
	Train Loss: 0.371 | Train PPL:   1.449
	Val Loss: 0.376 |  Val PPL:   1.456
Epoch: 3 | Time: 1m 18s
	Train Loss: 0.359 | Train PPL:   1.432
	Val Loss: 0.375 |  Val PPL:   1.455
Epoch: 4 | Time: 1m 18s
	Train Loss: 0.357 | Train PPL:   1.430
	Val Loss: 0.376 |  Val PPL:   1.456
Epoch: 5 | Time: 1m 18s
	Train Loss: 0.356 | Train PPL:   1.428
	Val Loss: 0.375 |  Val PPL:   1.455
Epoch: 6 | Time: 1m 18s
	Train Loss: 0.355 | Train PPL:   1.426
	Val Loss: 0.374 |  Val PPL:   1.453
Epoch: 7 | Time: 1m 18s
	Train Loss: 0.354 | Train PPL:   1.425
	Val Loss: 0.374 |  Val PPL:   1.454
Epoch: 8 | Time: 1m 18s
	Train Loss: 0.354 | Train PPL:   1.425
	Val Loss: 0.374 |  Val PPL:   1.454
Epoch: 9 | Time: 1m 18s
	Train Loss: 0.354 | Train PPL:   1.424
	Val Loss: 0.374 |  Val PPL:   1.453
Epoch: 10 | Time: 1m 18s
	Train Lo