In [1]:
!pip install Sentencepiece
!pip install transformers

Collecting Sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 4.3 MB/s 
[?25hInstalling collected packages: Sentencepiece
Successfully installed Sentencepiece-0.1.96
Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 4.2 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 65.6 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 77.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |█████████████████████████████

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import BertTokenizer, BigBirdTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig, BigBirdForSequenceClassification, GPT2Tokenizer, GPT2ForSequenceClassification
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import nltk

from models import *

In [3]:
# from google.colab import drive
# drive._mount('/content/drive')

Mounted at /content/drive


In [4]:
# import os
# os.chdir('drive/MyDrive/machine_learning')

In [13]:
# # load dataset

data_train = pd.read_csv('dataset_binary_train.csv')[0:128]
data_test = pd.read_csv('dataset_binary_test.csv')[0:128]

X_train, y_train = data_train.data.tolist(), data_train.label.tolist()
X_test, y_test = data_test.data.tolist(), data_test.label.tolist()

print('Train dataset length: {}'.format(len(X_train)))
print('Test dataset length: {}'.format(len(X_test)))

Train dataset length: 128
Test dataset length: 128


In [14]:
def train_val(train_dataset, test_dataset, transformer_name, transformer, classifier_name=None, classifier=None, lr_transformer=3e-5, lr_classifier=1e-3, batch_size=64, max_epoch=5):
    # create dataloader for tensor dataset
    train_dataloader = DataLoader(train_dataset, sampler = RandomSampler(train_dataset), batch_size = batch_size)
    val_dataloader = DataLoader(test_dataset, sampler = SequentialSampler(test_dataset), batch_size = batch_size)
    
    # define device
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    # use cuda for transformer
    transformer = transformer.model.to(device)
    
    # define models
    if classifier is not None:
        classifier = classifier.to(device)
        optimizer = torch.optim.Adam([{"params": classifier.parameters(), 'lr': lr_classifier}])
        for p in transformer.parameters(): # freeze the layers of transformer
            p.requires_grad = False
    else:
        optimizer = torch.optim.Adam(transformer.parameters(), lr = lr_transformer) # the learning rate is suggested by the authors

        for p in transformer.parameters():
            p.requires_grad = True

    
    # Hyper-parameters
    max_epoch = 5
    n_batch = int(len(train_dataset)/batch_size)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = len(train_dataloader) * max_epoch)
    criterion = F.cross_entropy
    
    # clean memory in GPU
    torch.cuda.empty_cache()
    
    # a list to record the state of training
    training_stats = []


            
    print('Training start!')
    for e in range(max_epoch):
        
        # train model
        #model.train()
        if classifier is not None:
            classifier.train()
        else:
            transformer.train()
        
        epoch_loss = 0
        train_acc = 0
        
        for b, (x_id, x_mask, y) in enumerate(train_dataloader):
            x_id, x_mask, y = x_id.to(device), x_mask.to(device), y.to(device)
            
            optimizer.zero_grad()
            
            if classifier is not None:
                with torch.no_grad():
                    word_embedding = transformer(x_id, token_type_ids=None, attention_mask=x_mask, labels=y)['hidden_states'][-1]   
                logits = classifier(word_embedding)
                loss = criterion(logits, y)
            else:
                outputs = transformer(x_id, token_type_ids=None, attention_mask=x_mask, labels=y)
                loss, logits = outputs['loss'], outputs['logits']


            epoch_loss += loss
            train_acc += (logits.max(1)[1] == y).float().mean().item()
            
            loss.backward()
            
            #clip gradient
            if classifier is None:
                torch.nn.utils.clip_grad_norm_(transformer.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            print("\rEpoch: {:d} batch: {:d} / {} loss: {:.4f} | {:.2%}".format(e + 1, b, n_batch, loss, b*1.0/n_batch), end='', flush=True)
        print("\n----- Epoch {} ------\nTraining loss: {}".format(e+1, epoch_loss / len(train_dataloader)))
        print("Training accuracy: {}".format(train_acc / len(train_dataloader)))

        
        # evaluate model
        if classifier is not None:
            classifier.eval()
        transformer.eval()
        
        eval_acc = 0
        eval_loss = 0
        nb_eval_steps = 0
        
        for b, (x_id, x_mask, y) in enumerate(val_dataloader):
            x_id, x_mask, y = x_id.to(device), x_mask.to(device), y.to(device)
            
            with torch.no_grad():
                if classifier is not None:
                    word_embedding = transformer(x_id, token_type_ids=None, attention_mask=x_mask, labels=y)['hidden_states'][-1]   
                    logits = classifier(word_embedding)
                    loss = criterion(logits, y)
                else:
                    outputs = transformer(x_id, token_type_ids=None, attention_mask=x_mask, labels=y)
                    loss, logits = outputs['loss'], outputs['logits']
            
            eval_loss += loss
            eval_acc += (logits.max(1)[1] == y).float().mean().item()

        print("Validation loss: {}".format(eval_loss / len(val_dataloader)))
        print("Validation accuracy: {}".format(eval_acc / len(val_dataloader)))
        print("\n")
        
        training_stats.append(
            {
                'epoch': e+1,
                'train_loss': epoch_loss / len(train_dataloader),
                'train_acc': train_acc / len(train_dataloader),
                'val_loss': eval_loss / len(val_dataloader),
                'val_acc': eval_acc / len(val_dataloader),
            }
        )

        # save models
        if classifier is not None:
            torch.save(classifier, '{}-{}.pkl'.format(transformer_name, classifier_name))
        else:
            torch.save(transformer, '{}.pkl'.format(transformer_name))
        
        # save states of training
        np.save('{}-{}-train_stats_Epoch{}.npy'.format(transformer_name, classifier_name, e+1), training_stats) 

    print('Training complete!')

**Fine-tune BERT**

In [16]:
transformer = Transformer('BERT')

train_dataset, test_dataset = transformer.preprocess_data(X_train, X_test, y_train, y_test)
train_val(train_dataset, test_dataset, transformer_name='BERT', transformer=transformer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

----- Processing 128/128 test samples

Training start!
Epoch: 1 batch: 1 / 2 loss: 0.6671 | 50.00%
----- Epoch 1 ------
Training loss: 0.6954854726791382
Training accuracy: 0.5078125
Validation loss: 0.7005167603492737
Validation accuracy: 0.4921875


Epoch: 2 batch: 1 / 2 loss: 0.6293 | 50.00%
----- Epoch 2 ------
Training loss: 0.6589019298553467
Training accuracy: 0.6015625
Validation loss: 0.6863597631454468
Validation accuracy: 0.53125


Epoch: 3 batch: 1 / 2 loss: 0.6014 | 50.00%
----- Epoch 3 ------
Training loss: 0.6126751899719238
Training accuracy: 0.7421875
Validation loss: 0.6732439398765564
Validation accuracy: 0.59375


Epoch: 4 batch: 1 / 2 loss: 0.5674 | 50.00%
----- Epoch 4 ------
Training loss: 0.5802326202392578
Training accuracy: 0.78125
Validation loss: 0.6695328950881958
Validation accuracy: 0.6015625


Epoch: 5 batch: 1 / 2 loss: 0.5474 | 50.00%
----- Epoch 5 ------
Training loss: 0.5640774965286255
Training accuracy: 0.84375
Validation loss: 0.6684213876724243
V

**Fine-tune GPT2**

In [17]:
transformer = Transformer('GPT2')

train_dataset, test_dataset = transformer.preprocess_data(X_train, X_test, y_train, y_test)
train_val(train_dataset, test_dataset, transformer_name='GPT2', transformer=transformer)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


----- Processing 128/128 test samples

Training start!
Epoch: 1 batch: 1 / 2 loss: 1.8211 | 50.00%
----- Epoch 1 ------
Training loss: 2.349409341812134
Training accuracy: 0.4140625
Validation loss: 1.583845853805542
Validation accuracy: 0.5390625


Epoch: 2 batch: 1 / 2 loss: 1.5382 | 50.00%
----- Epoch 2 ------
Training loss: 1.7181981801986694
Training accuracy: 0.4140625
Validation loss: 1.2930728197097778
Validation accuracy: 0.546875


Epoch: 3 batch: 1 / 2 loss: 1.0474 | 50.00%
----- Epoch 3 ------
Training loss: 1.2786409854888916
Training accuracy: 0.390625
Validation loss: 1.1193398237228394
Validation accuracy: 0.546875


Epoch: 4 batch: 1 / 2 loss: 1.0663 | 50.00%
----- Epoch 4 ------
Training loss: 1.095575213432312
Training accuracy: 0.453125
Validation loss: 1.035834550857544
Validation accuracy: 0.5546875


Epoch: 5 batch: 1 / 2 loss: 0.8082 | 50.00%
----- Epoch 5 ------
Training loss: 0.9123766422271729
Training accuracy: 0.4921875
Validation loss: 1.0074186325073242
V

**Fine-tune BIGBIRD**

In [18]:
transformer = Transformer('BIGBIRD')

train_dataset, test_dataset = transformer.preprocess_data(X_train, X_test, y_train, y_test)
train_val(train_dataset, test_dataset, transformer_name='BIGBIRD', transformer=transformer)

Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdForSequenceClassifica

----- Processing 128/128 test samples



Attention type 'block_sparse' is not possible if sequence_length: 100 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


Training start!
Epoch: 1 batch: 1 / 2 loss: 0.6867 | 50.00%
----- Epoch 1 ------
Training loss: 0.6903771162033081
Training accuracy: 0.546875
Validation loss: 0.7005292177200317
Validation accuracy: 0.4375


Epoch: 2 batch: 1 / 2 loss: 0.6880 | 50.00%
----- Epoch 2 ------
Training loss: 0.6862771511077881
Training accuracy: 0.5703125
Validation loss: 0.7018076181411743
Validation accuracy: 0.4609375


Epoch: 3 batch: 1 / 2 loss: 0.6864 | 50.00%
----- Epoch 3 ------
Training loss: 0.6820776462554932
Training accuracy: 0.5859375
Validation loss: 0.7052717804908752
Validation accuracy: 0.4609375


Epoch: 4 batch: 1 / 2 loss: 0.6924 | 50.00%
----- Epoch 4 ------
Training loss: 0.6825997233390808
Training accuracy: 0.5859375
Validation loss: 0.7088981866836548
Validation accuracy: 0.4609375


Epoch: 5 batch: 1 / 2 loss: 0.6889 | 50.00%
----- Epoch 5 ------
Training loss: 0.6784468293190002
Training accuracy: 0.5859375
Validation loss: 0.71048504114151
Validation accuracy: 0.4609375


Train

**BERT + BiLSTM**

In [19]:
lstm = LSTM_attention()
transformer = Transformer('BERT')

train_dataset, test_dataset = transformer.preprocess_data(X_train, X_test, y_train, y_test)
train_val(train_dataset, test_dataset, transformer_name='BERT', transformer=transformer, classifier_name='BiLSTM', classifier=lstm)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

----- Processing 128/128 test samples

Training start!
Epoch: 1 batch: 1 / 2 loss: 0.7493 | 50.00%
----- Epoch 1 ------
Training loss: 0.7136397361755371
Training accuracy: 0.578125
Validation loss: 0.718366265296936
Validation accuracy: 0.4609375


Epoch: 2 batch: 1 / 2 loss: 0.6011 | 50.00%
----- Epoch 2 ------
Training loss: 0.6217240691184998
Training accuracy: 0.65625
Validation loss: 0.650848388671875
Validation accuracy: 0.7109375


Epoch: 3 batch: 1 / 2 loss: 0.5636 | 50.00%
----- Epoch 3 ------
Training loss: 0.5741910934448242
Training accuracy: 0.859375
Validation loss: 0.6382578611373901
Validation accuracy: 0.71875


Epoch: 4 batch: 1 / 2 loss: 0.5374 | 50.00%
----- Epoch 4 ------
Training loss: 0.5199998021125793
Training accuracy: 0.890625
Validation loss: 0.6255455017089844
Validation accuracy: 0.6796875


Epoch: 5 batch: 1 / 2 loss: 0.4614 | 50.00%
----- Epoch 5 ------
Training loss: 0.47346341609954834
Training accuracy: 0.8984375
Validation loss: 0.6181678771972656
V

**BERT + TextCNN**

In [20]:
textcnn = textCNN()
transformer = Transformer('BERT')

train_dataset, test_dataset = transformer.preprocess_data(X_train, X_test, y_train, y_test)
train_val(train_dataset, test_dataset, transformer_name='BERT', transformer=transformer, classifier_name='TextCNN', classifier=textcnn)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

----- Processing 128/128 test samples

Training start!
Epoch: 1 batch: 1 / 2 loss: 0.6952 | 50.00%
----- Epoch 1 ------
Training loss: 0.7091507911682129
Training accuracy: 0.5078125
Validation loss: 0.739465594291687
Validation accuracy: 0.4609375


Epoch: 2 batch: 1 / 2 loss: 0.7029 | 50.00%
----- Epoch 2 ------
Training loss: 0.6657478213310242
Training accuracy: 0.5859375
Validation loss: 0.7763028740882874
Validation accuracy: 0.4609375


Epoch: 3 batch: 1 / 2 loss: 0.6428 | 50.00%
----- Epoch 3 ------
Training loss: 0.6248608231544495
Training accuracy: 0.5859375
Validation loss: 0.739249587059021
Validation accuracy: 0.4609375


Epoch: 4 batch: 1 / 2 loss: 0.5904 | 50.00%
----- Epoch 4 ------
Training loss: 0.5903469920158386
Training accuracy: 0.609375
Validation loss: 0.7120996713638306
Validation accuracy: 0.484375


Epoch: 5 batch: 1 / 2 loss: 0.6077 | 50.00%
----- Epoch 5 ------
Training loss: 0.5723909139633179
Training accuracy: 0.6875
Validation loss: 0.7042070031166077


**Fine-tune BERT with large dataset**

In [21]:
# # load large dataset (~5 million sentences)

# data_large_train = pd.read_csv('dataset_binary_train_large.csv')[0:128]
# data_large_test = pd.read_csv('dataset_binary_test_large.csv')[0:128]

data_large_train = pd.read_csv('dataset_binary_train.csv')[0:128]
data_large_test = pd.read_csv('dataset_binary_test.csv')[0:128]

X_large_train, y_large_train = data_large_train.data.tolist(), data_large_train.label.tolist()
X_large_test, y_large_test = data_large_test.data.tolist(), data_large_test.label.tolist()

print('Train dataset length: {}'.format(len(X_large_train)))
print('Test dataset length: {}'.format(len(X_large_test)))

Train dataset length: 128
Test dataset length: 128


In [22]:
transformer = Transformer('BERT')

train_dataset, test_dataset = transformer.preprocess_data(X_train, X_test, y_train, y_test)
train_val(train_dataset, test_dataset, transformer_name='BERT', transformer=transformer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

----- Processing 128/128 test samples

Training start!
Epoch: 1 batch: 1 / 2 loss: 0.7575 | 50.00%
----- Epoch 1 ------
Training loss: 0.8150010108947754
Training accuracy: 0.4296875
Validation loss: 0.6841989159584045
Validation accuracy: 0.5390625


Epoch: 2 batch: 1 / 2 loss: 0.5919 | 50.00%
----- Epoch 2 ------
Training loss: 0.6073437929153442
Training accuracy: 0.734375
Validation loss: 0.7567552328109741
Validation accuracy: 0.46875


Epoch: 3 batch: 1 / 2 loss: 0.5796 | 50.00%
----- Epoch 3 ------
Training loss: 0.6150369644165039
Training accuracy: 0.6328125
Validation loss: 0.7781224250793457
Validation accuracy: 0.46875


Epoch: 4 batch: 1 / 2 loss: 0.5377 | 50.00%
----- Epoch 4 ------
Training loss: 0.5784770846366882
Training accuracy: 0.609375
Validation loss: 0.738916277885437
Validation accuracy: 0.5


Epoch: 5 batch: 1 / 2 loss: 0.5396 | 50.00%
----- Epoch 5 ------
Training loss: 0.534643292427063
Training accuracy: 0.6796875
Validation loss: 0.7135176062583923
Validat

**Fine-tune BERT with multi-label data**

In [23]:
# # load multi-label dataset

data_multi_train = pd.read_csv('dataset_multi_num_train.csv')[0:128]
data_multi_test = pd.read_csv('dataset_multi_num_test.csv')[0:128]

X_multi_train, y_multi_train = data_multi_train.data.tolist(), data_multi_train.label.tolist()
X_multi_test, y_multi_test = data_multi_test.data.tolist(), data_multi_test.label.tolist()

print('Train dataset length: {}'.format(len(X_multi_train)))
print('Test dataset length: {}'.format(len(X_multi_test)))

Train dataset length: 128
Test dataset length: 128


In [24]:
transformer = Transformer('BERT', num_labels=5)

train_dataset, test_dataset = transformer.preprocess_data(X_train, X_test, y_train, y_test)
train_val(train_dataset, test_dataset, transformer_name='BERT', transformer=transformer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

----- Processing 128/128 test samples

Training start!
Epoch: 1 batch: 1 / 2 loss: 1.4157 | 50.00%
----- Epoch 1 ------
Training loss: 1.4839255809783936
Training accuracy: 0.265625
Validation loss: 1.3349766731262207
Validation accuracy: 0.4296875


Epoch: 2 batch: 1 / 2 loss: 1.2211 | 50.00%
----- Epoch 2 ------
Training loss: 1.2721401453018188
Training accuracy: 0.5390625
Validation loss: 1.1931195259094238
Validation accuracy: 0.4609375


Epoch: 3 batch: 1 / 2 loss: 1.0939 | 50.00%
----- Epoch 3 ------
Training loss: 1.1145927906036377
Training accuracy: 0.6015625
Validation loss: 1.0886192321777344
Validation accuracy: 0.4609375


Epoch: 4 batch: 1 / 2 loss: 1.0018 | 50.00%
----- Epoch 4 ------
Training loss: 1.0111143589019775
Training accuracy: 0.59375
Validation loss: 1.0257381200790405
Validation accuracy: 0.4609375


Epoch: 5 batch: 1 / 2 loss: 0.9933 | 50.00%
----- Epoch 5 ------
Training loss: 0.979364812374115
Training accuracy: 0.6015625
Validation loss: 1.00249338150024