In [30]:
import torch
from dataLoader import VoicePassingDataloader
from model import VoicePassingModel
from transformers import DistilBertTokenizer, AdamW
from tqdm import tqdm

In [31]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [59]:
trainer_config = {
    'model' : VoicePassingModel(),
}

class VoicePassingTrainer():

    def __init__(self, model):

        self.device = "cuda" if torch.cuda.is_available() else 'cpu'
        self.model = model.to(self.device)
        self.tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
        self.train_loss_history = []
        self.train_acc_history = []
        self.valid_loss_history = []
        self.valid_acc_history = []

    def set_model(self, model):
        self.model = model.to(self.device)

    def get_history(self):

        history = {
            "train_loss" : self.train_loss_history,
            "train_accuracy" : self.train_acc_history,
            "valid_loss" : self.valid_loss_history,
            "valid_accuracy" : self.valid_acc_history
        }

        return history

    def train(self, num_epochs, train_loader, criterion, lr = 3e-5, valid_loader = None, reset_history = False, verbose = True):

        if reset_history:
            self.train_loss_history = []
            self.train_acc_history = []
            self.valid_loss_history = []
            self.valid_acc_history = []

        self.criterion = criterion
        self.optimizer = AdamW(params = self.model.parameters(), lr = lr, correct_bias=False)
        self.model.train()

        for epoch_idx in range(num_epochs):
            train_loss, train_acc = self.train_one_epoch(epoch_idx, train_loader, verbose = verbose)

            self.train_loss_history.append(train_loss)
            self.train_acc_history.append(train_acc)

            if valid_loader:
                valid_loss, valid_acc = self.validate(epoch_idx, valid_loader, verbose = verbose)

                # 모델 저장
                if not epoch_idx: # 저장된 게 없다면
                    torch.save(self.model.state_dict(), r"./result/best.pt")
                    print("the initial model saved")

                else:
                    if valid_loss < self.valid_loss_history[-1]:
                        torch.save(self.model.state_dict(), fr"./result/best.pt")
                        print("the best model saved")

                self.valid_loss_history.append(valid_loss)
                self.valid_acc_history.append(valid_acc)

        torch.save(self.model.state_dict(), fr"./result/last.pt")
        print("the last model saved")

    def train_one_epoch(self, index, train_loader, verbose = True):

        train_loss = 0
        train_correct = 0
        train_n_probs = 0

        for X, y in tqdm(train_loader, desc="batch", leave= True):

            X = self.tokenizer(
                text = X,
                add_special_tokens = True,
                max_length = 512,
                padding = "max_length",
                truncation = True,
                return_tensors = "pt"
            ).to(self.device)

            y = y.squeeze().to(self.device)

            pred = self.model(X)
        
            loss = self.criterion(pred, y)
            train_loss += loss.item()

            loss.backward()
            self.optimizer.step()

            pred_labels = pred.argmax(axis = 1)
            n_correct = len(torch.where(pred_labels == y)[0])

            train_correct += n_correct
            train_n_probs += len(y)

        train_acc = (train_correct / train_n_probs) * 100

        if verbose:
            print(f"EPOCH {index+1} TRAIN / Loss : {train_loss : .4f}, Acc : {train_acc : .4f}")
            print(pred[-4:])
            print(y[-4:])

        return train_loss, train_acc

    def test_a_sentence(self, text):

        X = self.tokenizer(
                text = text,
                add_special_tokens = True,
                max_length = 512,
                padding = "max_length",
                truncation = True,
                return_tensors = "pt"
            ).to(self.device)
        
        pred = self.model(X)
        return pred
    
    def validate(self, index, valid_loader, verbose = True):

        valid_correct = 0
        valid_n_probs = 0
        
        self.model.eval()

        for X, y in tqdm(valid_loader, desc="batch", leave= True):

            X = self.tokenizer(
                text = X,
                add_special_tokens = True,
                max_length = 512,
                padding = "max_length",
                truncation = True,
                return_tensors = "pt"
            ).to(self.device)

            y = y.squeeze().to(self.device)

            pred = self.model(X)        
            valid_loss = self.criterion(pred, y)

            pred_labels = pred.argmax(axis = 1)
            n_correct = len(torch.where(pred_labels == y)[0])

            valid_correct += n_correct
            valid_n_probs += len(y)

        valid_acc = (valid_correct / valid_n_probs) * 100

        if verbose:
            print(f"EPOCH {index+1} VALID / Loss : {valid_loss : .4f}, Acc : {valid_acc : .4f}")
            print(pred[-4:])
            print(y[-4:])
        
        return valid_loss, valid_acc
        

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [60]:
trainer = VoicePassingTrainer(model = VoicePassingModel())

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [61]:
criterion = torch.nn.CrossEntropyLoss()

In [62]:
trainer.train(10, VoicePassingDataloader(small = True), criterion = criterion, valid_loader=VoicePassingDataloader(test = True, small= True))

batch: 100%|██████████| 4/4 [00:01<00:00,  2.20it/s]


EPOCH 1 TRAIN / Loss :  5.3532, Acc :  33.0000
tensor([[ 0.1320,  0.1297, -0.0846, -0.2382],
        [-0.0037,  0.1082, -0.1101, -0.2471],
        [-0.0243,  0.0892, -0.2200, -0.3167],
        [ 0.0628,  0.1843, -0.1001, -0.3303]], device='cuda:0',
       grad_fn=<SliceBackward0>)
tensor([1, 1, 1, 0], device='cuda:0')


batch: 100%|██████████| 4/4 [00:01<00:00,  2.36it/s]


EPOCH 1 VALID / Loss :  1.1504, Acc :  66.0000
tensor([[ 0.1725,  0.1283, -0.2461, -0.2035],
        [ 0.2789,  0.1716, -0.1987, -0.4033],
        [ 0.2379,  0.2562, -0.2289, -0.4677],
        [ 0.2129,  0.1559, -0.1788, -0.3683]], device='cuda:0',
       grad_fn=<SliceBackward0>)
tensor([0, 0, 0, 0], device='cuda:0')
the initial model saved
the last model saved


batch: 100%|██████████| 4/4 [00:01<00:00,  2.32it/s]


EPOCH 2 TRAIN / Loss :  4.8712, Acc :  60.0000
tensor([[ 0.4310,  0.5601, -0.2421, -0.8580],
        [ 0.4449,  0.5607, -0.2596, -0.8575],
        [ 0.4547,  0.5757, -0.2674, -0.8576],
        [ 0.5859,  0.4499, -0.3377, -0.8678]], device='cuda:0',
       grad_fn=<SliceBackward0>)
tensor([1, 1, 1, 0], device='cuda:0')


batch: 100%|██████████| 4/4 [00:01<00:00,  2.34it/s]


EPOCH 2 VALID / Loss :  0.9095, Acc :  68.0000
tensor([[ 0.5023,  0.3073, -0.3905, -0.5416],
        [ 0.7375,  0.4752, -0.4240, -0.9210],
        [ 0.6432,  0.6677, -0.4294, -1.0479],
        [ 0.5780,  0.4784, -0.3534, -0.8592]], device='cuda:0',
       grad_fn=<SliceBackward0>)
tensor([0, 0, 0, 0], device='cuda:0')
the best model saved
the last model saved


batch: 100%|██████████| 4/4 [00:01<00:00,  2.33it/s]


EPOCH 3 TRAIN / Loss :  4.7637, Acc :  59.0000
tensor([[ 0.7102,  1.0363, -0.4755, -1.3269],
        [ 0.7389,  1.0294, -0.4955, -1.3316],
        [ 0.7493,  1.0570, -0.5313, -1.3415],
        [ 1.0129,  0.8391, -0.6218, -1.3427]], device='cuda:0',
       grad_fn=<SliceBackward0>)
tensor([1, 1, 1, 0], device='cuda:0')


batch: 100%|██████████| 4/4 [00:01<00:00,  2.32it/s]


EPOCH 3 VALID / Loss :  0.7823, Acc :  65.0000
tensor([[ 0.8451,  0.5240, -0.5993, -0.8573],
        [ 1.1642,  0.8216, -0.7198, -1.3455],
        [ 0.9942,  1.1480, -0.7206, -1.5266],
        [ 0.9173,  0.8431, -0.6012, -1.2561]], device='cuda:0',
       grad_fn=<SliceBackward0>)
tensor([0, 0, 0, 0], device='cuda:0')
the best model saved
the last model saved


batch: 100%|██████████| 4/4 [00:01<00:00,  2.33it/s]


EPOCH 4 TRAIN / Loss :  4.9237, Acc :  54.0000
tensor([[ 0.8208,  1.4438, -0.6471, -1.6079],
        [ 0.8651,  1.4324, -0.6772, -1.6138],
        [ 0.8901,  1.4708, -0.7354, -1.6254],
        [ 1.3122,  1.1515, -0.8623, -1.6251]], device='cuda:0',
       grad_fn=<SliceBackward0>)
tensor([1, 1, 1, 0], device='cuda:0')


batch: 100%|██████████| 4/4 [00:01<00:00,  2.33it/s]


EPOCH 4 VALID / Loss :  0.7500, Acc :  61.0000
tensor([[ 1.1094,  0.6729, -0.7710, -1.0441],
        [ 1.4632,  1.0549, -0.9454, -1.5619],
        [ 1.1565,  1.5142, -0.9293, -1.7578],
        [ 1.1029,  1.1016, -0.7674, -1.4597]], device='cuda:0',
       grad_fn=<SliceBackward0>)
tensor([0, 0, 0, 0], device='cuda:0')
the best model saved
the last model saved


batch: 100%|██████████| 4/4 [00:01<00:00,  2.31it/s]


EPOCH 5 TRAIN / Loss :  5.0013, Acc :  53.0000
tensor([[ 0.7314,  1.5915, -0.6235, -1.6259],
        [ 0.7913,  1.5741, -0.6650, -1.6322],
        [ 0.8329,  1.6192, -0.7419, -1.6403],
        [ 1.4504,  1.1787, -0.9197, -1.6421]], device='cuda:0',
       grad_fn=<SliceBackward0>)
tensor([1, 1, 1, 0], device='cuda:0')


batch: 100%|██████████| 4/4 [00:01<00:00,  2.25it/s]


EPOCH 5 VALID / Loss :  0.7129, Acc :  62.0000
tensor([[ 1.2555,  0.6192, -0.8050, -1.0561],
        [ 1.6198,  0.9718, -0.9703, -1.5229],
        [ 1.1225,  1.5560, -0.9063, -1.7064],
        [ 1.1214,  1.0852, -0.7400, -1.4254]], device='cuda:0',
       grad_fn=<SliceBackward0>)
tensor([0, 0, 0, 0], device='cuda:0')
the best model saved
the last model saved


batch: 100%|██████████| 4/4 [00:01<00:00,  2.23it/s]


EPOCH 6 TRAIN / Loss :  4.7645, Acc :  55.0000
tensor([[ 0.5149,  1.4283, -0.4066, -1.4175],
        [ 0.5899,  1.4036, -0.4547, -1.4231],
        [ 0.6486,  1.4507, -0.5433, -1.4271],
        [ 1.4827,  0.8752, -0.7820, -1.4330]], device='cuda:0',
       grad_fn=<SliceBackward0>)
tensor([1, 1, 1, 0], device='cuda:0')


batch: 100%|██████████| 4/4 [00:01<00:00,  2.30it/s]


EPOCH 6 VALID / Loss :  0.6437, Acc :  66.0000
tensor([[ 1.3183,  0.3702, -0.7095, -0.9272],
        [ 1.6863,  0.5970, -0.8243, -1.2927],
        [ 0.9745,  1.2702, -0.6898, -1.4333],
        [ 1.0448,  0.8100, -0.5621, -1.2037]], device='cuda:0',
       grad_fn=<SliceBackward0>)
tensor([0, 0, 0, 0], device='cuda:0')
the best model saved
the last model saved


batch: 100%|██████████| 4/4 [00:01<00:00,  2.25it/s]


EPOCH 7 TRAIN / Loss :  4.3860, Acc :  59.0000
tensor([[ 0.2522,  1.0674, -0.0970, -1.0869],
        [ 0.3429,  1.0322, -0.1526, -1.0867],
        [ 0.4100,  1.0753, -0.2511, -1.0864],
        [ 1.4559,  0.3733, -0.5435, -1.1001]], device='cuda:0',
       grad_fn=<SliceBackward0>)
tensor([1, 1, 1, 0], device='cuda:0')


batch: 100%|██████████| 4/4 [00:01<00:00,  2.27it/s]


EPOCH 7 VALID / Loss :  0.5942, Acc :  69.0000
tensor([[ 1.3382,  0.0325, -0.5600, -0.7226],
        [ 1.7076,  0.0803, -0.6005, -0.9700],
        [ 0.7763,  0.8031, -0.3904, -1.0523],
        [ 0.9281,  0.4007, -0.3131, -0.9005]], device='cuda:0',
       grad_fn=<SliceBackward0>)
tensor([0, 0, 0, 0], device='cuda:0')
the best model saved
the last model saved


batch: 100%|██████████| 4/4 [00:01<00:00,  2.26it/s]


EPOCH 8 TRAIN / Loss :  4.1786, Acc :  60.0000
tensor([[-0.0236,  0.6564,  0.2229, -0.7095],
        [ 0.0772,  0.6096,  0.1629, -0.7068],
        [ 0.1527,  0.6446,  0.0570, -0.7011],
        [ 1.3863, -0.1633, -0.2795, -0.7324]], device='cuda:0',
       grad_fn=<SliceBackward0>)
tensor([1, 1, 1, 0], device='cuda:0')


batch: 100%|██████████| 4/4 [00:01<00:00,  2.27it/s]


EPOCH 8 VALID / Loss :  0.6112, Acc :  69.0000
tensor([[ 1.3318, -0.3036, -0.4029, -0.4850],
        [ 1.6899, -0.4472, -0.3744, -0.6261],
        [ 0.5515,  0.3150, -0.0874, -0.6346],
        [ 0.7860, -0.0211, -0.0630, -0.5654]], device='cuda:0',
       grad_fn=<SliceBackward0>)
tensor([0, 0, 0, 0], device='cuda:0')
the last model saved


batch: 100%|██████████| 4/4 [00:01<00:00,  2.37it/s]


EPOCH 9 TRAIN / Loss :  4.2678, Acc :  58.0000
tensor([[-0.3140,  0.3181,  0.4926, -0.3173],
        [-0.2049,  0.2624,  0.4254, -0.3105],
        [-0.1220,  0.2938,  0.3140, -0.2987],
        [ 1.2952, -0.6270, -0.0630, -0.3446]], device='cuda:0',
       grad_fn=<SliceBackward0>)
tensor([1, 1, 1, 0], device='cuda:0')


batch: 100%|██████████| 4/4 [00:01<00:00,  2.28it/s]


EPOCH 9 VALID / Loss :  0.6963, Acc :  56.0000
tensor([[ 1.3091, -0.5882, -0.2857, -0.2399],
        [ 1.6467, -0.8838, -0.2116, -0.2572],
        [ 0.3119, -0.0555,  0.1473, -0.2136],
        [ 0.6371, -0.3572,  0.1195, -0.2275]], device='cuda:0',
       grad_fn=<SliceBackward0>)
tensor([0, 0, 0, 0], device='cuda:0')
the last model saved


batch: 100%|██████████| 4/4 [00:01<00:00,  2.29it/s]


EPOCH 10 TRAIN / Loss :  4.5092, Acc :  52.0000
tensor([[-0.6079,  0.1439,  0.6449,  0.0373],
        [-0.4867,  0.0796,  0.5777,  0.0469],
        [-0.3991,  0.1092,  0.4638,  0.0647],
        [ 1.2057, -0.9383,  0.0224,  0.0389]], device='cuda:0',
       grad_fn=<SliceBackward0>)
tensor([1, 1, 1, 0], device='cuda:0')


batch: 100%|██████████| 4/4 [00:01<00:00,  2.30it/s]


EPOCH 10 VALID / Loss :  0.8101, Acc :  49.0000
tensor([[ 1.2925, -0.7846, -0.2641, -0.0117],
        [ 1.6141, -1.1644, -0.1879,  0.0967],
        [ 0.0769, -0.2408,  0.2363,  0.1720],
        [ 0.4971, -0.5475,  0.1816,  0.0898]], device='cuda:0',
       grad_fn=<SliceBackward0>)
tensor([0, 0, 0, 0], device='cuda:0')
the last model saved
