In [None]:
%%capture
# !pip install http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl
!pip install torch==1.7.1 torchtext==0.8.0
# !pip install torchtext==0.10.0
!pip install spacy==2.2.4

In [None]:
import numpy as np
import torch
import sys
import torchtext
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time
from copy import deepcopy
from torch.autograd import Variable
import matplotlib.pyplot as plt
import seaborn
from torchtext import data
import spacy
import pandas as pd
from sklearn.metrics import accuracy_score
seaborn.set_context(context="talk")
!python -m spacy download en
%matplotlib inline

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 402 kB/s 
Building wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-sm: filename=en_core_web_sm-2.2.5-py3-none-any.whl size=12011737 sha256=9918b9ecb33a24cab0845bea79e916a4df146bbe5312a9774246574edee5b4b1
  Stored in directory: /tmp/pip-ephem-wheel-cache-jfw5jm6c/wheels/51/19/da/a3885266a3c241aff0ad2eb674ae058fd34a4870fef1c0a5a0
Successfully built en-core-web-sm
Installing collected packages: en-core-web-sm
  Attempting uninstall: en-core-web-sm
    Found existing installation: en-core-web-sm 3.4.0
    Uninstalling en-core-web-sm-3.4.0:
      Successfully uninstalled en-core-web-s

In [None]:
# sublayer.py

class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable gain and bias.

    Args:
        features: size of the trailing dimension; one gain and one bias value
            are learned per feature.
        eps: constant added to the standard deviation before dividing.
    """

    def __init__(self, features, eps=1e-6):
        super().__init__()
        # a_2 is the multiplicative gain (initialised to 1),
        # b_2 the additive bias (initialised to 0).
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        """Normalize ``x`` along its last axis, then apply gain and bias."""
        centered = x - x.mean(-1, keepdim=True)
        # eps is added to the standard deviation itself (not to the variance).
        spread = x.std(-1, keepdim=True) + self.eps
        return self.a_2 * centered / spread + self.b_2
    
class SublayerOutput(nn.Module):
    '''
    Residual wrapper around an arbitrary sublayer.

    The input is layer-normalized, passed through the sublayer, run through
    dropout, and the result is added back onto the un-normalized input.
    '''
    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Return x + dropout(sublayer(norm(x))); sublayer must keep the shape of x."
        normalized = self.norm(x)
        branch = self.dropout(sublayer(normalized))
        return x + branch

In [None]:
# feed_forward.py

class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward block: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: input and output feature size.
        d_ff: size of the hidden layer between the two linear maps.
        dropout: dropout probability applied to the hidden activations.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Expand to d_ff, apply ReLU and dropout, then project back to d_model."""
        hidden = self.w_1(x)
        hidden = F.relu(hidden)
        hidden = self.dropout(hidden)
        return self.w_2(hidden)

In [None]:
# utils.py

class Dataset(object):
    '''
    Loads labelled text files and exposes torchtext iterators over them.

    Attributes:
        config: configuration object; load_data reads `max_sen_len` and
            `batch_size` from it.
        train_iterator, val_iterator, test_iterator: None until load_data has
            run; test_iterator stays None when no test file is supplied.
        vocab: vocabulary built from the training split (empty list before
            load_data).
        word_embeddings: empty dict; not populated by any method of this class.
    '''
    def __init__(self, config):
        self.config = config
        self.train_iterator = None
        self.test_iterator = None
        self.val_iterator = None
        self.vocab = []
        self.word_embeddings = {}

    def parse_label(self, label):
        '''
        Get the integer label from a label string
        Input:
            label (string) : label of the form '__label__2'
        Returns:
            label (int) : integer formed by ALL trailing digits of the string,
                          so '__label__12' yields 12 rather than 2
        Raises:
            ValueError : if the string does not end in a digit
        '''
        stripped = label.strip()
        # Everything left after removing the non-digit prefix is the trailing number.
        trailing_digits = stripped[len(stripped.rstrip('0123456789')):]
        return int(trailing_digits)

    def get_pandas_df(self, filename):
        '''
        Load the data into Pandas.DataFrame object
        This will be used to convert data to torchtext object

        Each non-blank line must look like '<label>,<text>'. Only the first
        comma separates the fields, so commas inside the text are preserved.
        Blank lines are skipped.

        Returns:
            pd.DataFrame with columns "text" (str) and "label" (int)
        '''
        with open(filename, 'r') as datafile:
            rows = [line.strip().split(',', maxsplit=1)
                    for line in datafile if line.strip()]

        data_text = [row[1] for row in rows]
        data_label = [self.parse_label(row[0]) for row in rows]
        return pd.DataFrame({"text": data_text, "label": data_label})

    def _build_split(self, filename, datafields):
        "Read one labelled file and wrap its rows in a torchtext Dataset."
        frame = self.get_pandas_df(filename)
        examples = [data.Example.fromlist(row, datafields) for row in frame.values.tolist()]
        return data.Dataset(examples, datafields)

    def load_data(self, train_file, test_file=None, val_file=None):
        '''
        Loads the data from files
        Sets up iterators for training, validation and (optionally) test data
        Also creates the vocabulary from the training split

        Inputs:
            train_file (String): path to training file
            test_file (String): path to test file; when omitted no test data
                is loaded and self.test_iterator is left as None
            val_file (String): path to validation file; when omitted the
                training data is split with split_ratio=0.8 and the remainder
                becomes the validation set
        '''

        NLP = spacy.load('en')
        tokenizer = lambda sent: [x.text for x in NLP.tokenizer(sent) if x.text != " "]

        # Creating Field for data
        TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True, fix_length=self.config.max_sen_len)
        LABEL = data.Field(sequential=False, use_vocab=False)
        datafields = [("text",TEXT),("label",LABEL)]

        # Load data from pd.DataFrame into torchtext.data.Dataset
        train_data = self._build_split(train_file, datafields)
        test_data = self._build_split(test_file, datafields) if test_file else None

        # If validation file exists, load it. Otherwise get validation data from training data
        if val_file:
            val_data = self._build_split(val_file, datafields)
        else:
            train_data, val_data = train_data.split(split_ratio=0.8)

        # The vocabulary is built from the (possibly reduced) training split only.
        TEXT.build_vocab(train_data)
        self.vocab = TEXT.vocab

        self.train_iterator = data.BucketIterator(
            (train_data),
            batch_size=self.config.batch_size,
            sort_key=lambda x: len(x.text),
            repeat=False,
            shuffle=True)

        # Same call as before when a test set exists; with no test set only
        # the validation iterator is created.
        eval_sets = (val_data,) if test_data is None else (val_data, test_data)
        eval_iterators = data.BucketIterator.splits(
            eval_sets,
            batch_size=self.config.batch_size,
            sort_key=lambda x: len(x.text),
            repeat=False,
            shuffle=False)
        self.val_iterator = eval_iterators[0]
        self.test_iterator = eval_iterators[1] if test_data is not None else None

        print ("Loaded {} training examples".format(len(train_data)))
        print ("Loaded {} test examples".format(len(test_data) if test_data is not None else 0))
        print ("Loaded {} validation examples".format(len(val_data)))

def evaluate_model(model, iterator):
    '''
    Compute the classification accuracy of `model` over every batch of `iterator`.

    Inputs:
        model: callable mapping `batch.text` to a 2-D score tensor whose
            second dimension indexes the classes. If it is an nn.Module it is
            switched to eval mode for the duration of the call (so dropout is
            disabled) and its previous train/eval mode is restored afterwards.
        iterator: iterable of batches exposing `.text` and `.label` tensors.
    Returns:
        accuracy as computed by sklearn's accuracy_score. Predictions are
        argmax + 1, i.e. labels are expected to start at 1.
    '''
    all_preds = []
    all_y = []

    is_module = isinstance(model, torch.nn.Module)
    was_training = model.training if is_module else False
    if is_module:
        model.eval()
    try:
        # No gradients are needed for scoring; skip building autograd graphs.
        with torch.no_grad():
            for batch in iterator:
                x = batch.text.cuda() if torch.cuda.is_available() else batch.text
                y_pred = model(x)
                # Index of the best class, shifted to the 1-based label space.
                predicted = torch.max(y_pred.detach().cpu(), 1)[1] + 1
                all_preds.extend(predicted.numpy())
                # .cpu() keeps this working when labels live on another device.
                all_y.extend(batch.label.cpu().numpy())
    finally:
        if is_module:
            model.train(was_training)

    score = accuracy_score(all_y, np.array(all_preds).flatten())
    return score

In [None]:
# train_utils.py

def clones(module, N):
    "Return an nn.ModuleList holding N independent deep copies of `module`."
    replicas = nn.ModuleList()
    for _ in range(N):
        replicas.append(copy.deepcopy(module))
    return replicas

class Embeddings(nn.Module):
    '''
    Token embedding lookup whose output is scaled by sqrt(d_model).

    Args:
        d_model: embedding dimension.
        vocab: number of rows in the embedding table.
    '''
    def __init__(self, d_model, vocab):
        super().__init__()
        self.d_model = d_model
        self.lut = nn.Embedding(vocab, d_model)

    def forward(self, x):
        "Look up the embeddings for index tensor x and multiply by sqrt(d_model)."
        scale = math.sqrt(self.d_model)
        looked_up = self.lut(x)
        return looked_up * scale
    
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout.

    Even feature columns hold sin(position * div_term) and odd columns hold
    cos(position * div_term). The table is computed once in the constructor
    and stored as the (non-trainable) buffer ``pe`` of shape
    (1, max_len, d_model).

    Args:
        d_model: feature size of the inputs; may be even or odd.
        dropout: dropout probability applied after adding the encodings.
        max_len: number of positions pre-computed in the table.
    """
    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))

        # One shared angle table (float64) used by both sin and cos;
        # values are cast to float32 when written into `pe`.
        angles = position.double() * div_term.double().unsqueeze(0)

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns,
        # so drop the surplus angle column instead of failing on the shapes.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encodings for the first x.size(1) positions to x and apply dropout."""
        # `pe` is a buffer, so it takes no part in gradient computation.
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

In [None]:
# attention.py

def attention(query, key, value, mask=None, dropout=None):
    """Scaled dot-product attention.

    Scores are query @ key^T divided by sqrt of the query's last dimension.
    Where `mask` equals 0 the score is replaced by -1e9 before the softmax
    over the last axis. `dropout`, when given, is applied to the attention
    weights.

    Returns:
        (weights @ value, weights)
    """
    scale = math.sqrt(query.size(-1))
    logits = torch.matmul(query, key.transpose(-2, -1)) / scale
    if mask is not None:
        logits = logits.masked_fill(mask == 0, -1e9)

    weights = F.softmax(logits, dim=-1)
    if dropout is not None:
        weights = dropout(weights)

    context = torch.matmul(weights, value)
    return context, weights

class MultiHeadedAttention(nn.Module):
    """Multi-head attention built on the module-level `attention` function.

    Holds four d_model x d_model linear maps: the first three project query,
    key and value, the last one projects the concatenated heads. The most
    recent attention weights are kept in `self.attn`.
    """
    def __init__(self, h, d_model, dropout=0.1):
        "h is the number of heads; d_model must be divisible by h."
        super().__init__()
        assert d_model % h == 0
        # Per-head width; the value width is taken to equal the key width.
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        "Project the inputs, attend per head, merge the heads and project again."
        nbatches = query.size(0)
        if mask is not None:
            # Extra axis so one mask broadcasts over all h heads.
            mask = mask.unsqueeze(1)

        def split_heads(projection, tensor):
            # (batch, seq, d_model) -> (batch, h, seq, d_k)
            projected = projection(tensor)
            return projected.view(nbatches, -1, self.h, self.d_k).transpose(1, 2)

        query = split_heads(self.linears[0], query)
        key = split_heads(self.linears[1], key)
        value = split_heads(self.linears[2], value)

        x, self.attn = attention(query, key, value, mask=mask,
                                 dropout=self.dropout)

        # (batch, h, seq, d_k) -> (batch, seq, h * d_k)
        merged = x.transpose(1, 2).contiguous()
        merged = merged.view(nbatches, -1, self.h * self.d_k)
        return self.linears[-1](merged)

In [None]:
# encoder.py

class Encoder(nn.Module):
    '''
    Transformer Encoder

    N copies of the given layer applied one after another, followed by a
    final LayerNorm sized by the layer's `size` attribute.
    '''
    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask=None):
        "Feed x (and the same mask) through every layer in order, then normalize."
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)
    
class EncoderLayer(nn.Module):
    '''
    A single encoder layer

    Self-attention followed by a feed-forward network; each of the two is
    wrapped in its own SublayerOutput (held in `sublayer_output`).
    `size` is stored so that Encoder can size its final norm.
    '''
    def __init__(self, size, self_attn, feed_forward, dropout):
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer_output = clones(SublayerOutput(size, dropout), 2)
        self.size = size

    def forward(self, x, mask=None):
        "Run the self-attention sublayer, then the feed-forward sublayer."
        def self_attend(states):
            # Query, key and value are all the same tensor.
            return self.self_attn(states, states, states, mask)

        attended = self.sublayer_output[0](x, self_attend)
        return self.sublayer_output[1](attended, self.feed_forward)

In [None]:
# Model.py

class Transformer(nn.Module):
    '''
    Encoder-only Transformer text classifier.

    Tokens are embedded and position-encoded, run through the encoder stack,
    and the encoder output at the last sequence position is mapped by a
    linear layer to `config.output_size` class scores. forward() returns
    LOG-probabilities, which is what nn.NLLLoss expects as input.

    Inputs:
        config: object providing h, N, dropout, d_model, d_ff, output_size
            and max_epochs.
        src_vocab (int): number of entries in the embedding table.
    '''
    def __init__(self, config, src_vocab):
        super(Transformer, self).__init__()
        self.config = config

        h, N, dropout = self.config.h, self.config.N, self.config.dropout
        d_model, d_ff = self.config.d_model, self.config.d_ff

        # The configured dropout is used by every sub-module, attention included.
        attn = MultiHeadedAttention(h, d_model, dropout)
        ff = PositionwiseFeedForward(d_model, d_ff, dropout)
        position = PositionalEncoding(d_model, dropout)

        self.encoder = Encoder(EncoderLayer(d_model, deepcopy(attn), deepcopy(ff), dropout), N)
        self.src_embed = nn.Sequential(Embeddings(d_model, src_vocab), deepcopy(position)) #Embeddings followed by PE

        # Fully-Connected Layer
        self.fc = nn.Linear(
            d_model,
            self.config.output_size
        )

        # Log-softmax over the class dimension. The attribute keeps its
        # historical name; plain probabilities fed to NLLLoss give a
        # negative, mis-scaled loss.
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        '''
        Inputs:
            x: token indices; permuted from (sen_len, batch_size) to
               (batch_size, sen_len) before embedding.
        Returns:
            log-probabilities of shape (batch_size, output_size)
        '''
        embedded_sents = self.src_embed(x.permute(1,0)) # shape = (batch_size, sen_len, d_model)
        encoded_sents = self.encoder(embedded_sents)

        # Convert input to (batch_size, d_model) for linear layer
        # NOTE(review): only the last sequence position is used; with
        # fixed-length padded inputs this may be a padding position - confirm.
        final_feature_map = encoded_sents[:,-1,:]
        final_out = self.fc(final_feature_map)
        return self.softmax(final_out)

    def add_optimizer(self, optimizer):
        "Store the optimizer used by run_epoch and reduce_lr."
        self.optimizer = optimizer

    def add_loss_op(self, loss_op):
        "Store the loss callable; it is called as loss_op(log_probs, targets)."
        self.loss_op = loss_op

    def reduce_lr(self):
        "Halve the learning rate of every parameter group of the optimizer."
        print("Reducing LR")
        for g in self.optimizer.param_groups:
            g['lr'] = g['lr'] / 2

    def run_epoch(self, train_iterator, val_iterator, epoch):
        '''
        Train for one pass over train_iterator.

        Every 100 iterations (starting with the first) the mean loss since
        the previous report is printed and recorded, and the validation
        accuracy is measured in eval mode, printed and recorded.

        Inputs:
            train_iterator: batches with `.text` and 1-based `.label`
            val_iterator: batches passed to evaluate_model
            epoch (int): zero-based epoch index; the learning rate is halved
                at one third and at two thirds of config.max_epochs
        Returns:
            (train_losses, val_accuracies): two lists with one entry per report
        '''
        train_losses = []
        val_accuracies = []
        losses = []

        # Reduce learning rate as number of epochs increase
        if (epoch == int(self.config.max_epochs/3)) or (epoch == int(2*self.config.max_epochs/3)):
            self.reduce_lr()

        for i, batch in enumerate(train_iterator):
            self.optimizer.zero_grad()
            # Labels in the data start at 1; the loss needs 0-based class indices.
            if torch.cuda.is_available():
                x = batch.text.cuda()
                y = (batch.label - 1).type(torch.cuda.LongTensor)
            else:
                x = batch.text
                y = (batch.label - 1).type(torch.LongTensor)
            y_pred = self(x)
            loss = self.loss_op(y_pred, y)
            loss.backward()
            losses.append(loss.item())
            self.optimizer.step()

            if i % 100 == 0:
                print("Iter: {}".format(i+1))
                avg_train_loss = np.mean(losses)
                train_losses.append(avg_train_loss)
                print("\tAverage training loss: {:.5f}".format(avg_train_loss))
                losses = []

                # Evaluate accuracy on validation set with dropout disabled
                # and without tracking gradients, then resume training mode.
                self.eval()
                with torch.no_grad():
                    val_accuracy = evaluate_model(self, val_iterator)
                self.train()
                val_accuracies.append(val_accuracy)
                print("\tVal Accuracy: {:.4f}".format(val_accuracy))

        return train_losses, val_accuracies

In [None]:
# config.py

class Config(object):
    """Hyper-parameters read by the data loader, the model and the training loop."""

    # Model architecture (Transformer-paper values given for comparison)
    N = 1              # number of encoder layers; 6 in the paper
    d_model = 256      # model / embedding width; 512 in the paper
    d_ff = 512         # feed-forward hidden width; 2048 in the paper
    h = 8              # number of attention heads
    dropout = 0.1
    output_size = 4    # width of the final linear layer (one score per class)

    # Optimisation
    lr = 3e-4
    max_epochs = 35
    batch_size = 128

    # Data
    max_sen_len = 60   # fixed token length used for every example

In [None]:
# train.py

config = Config()
train_file = '/content/drive/MyDrive/Projects/Classification using Vanilla Transformer/data/ag_news.train'
test_file = '/content/drive/MyDrive/Projects/Classification using Vanilla Transformer/data/ag_news.test'

# No validation file is passed, so load_data derives the validation split
# from the training file.
dataset = Dataset(config)
dataset.load_data(train_file, test_file)

# Create Model with specified optimizer and loss function
##############################################################
model = Transformer(config, len(dataset.vocab))
if torch.cuda.is_available():
    model.cuda()
model.train()
optimizer = optim.Adam(model.parameters(), lr=config.lr)
NLLLoss = nn.NLLLoss()
model.add_optimizer(optimizer)
model.add_loss_op(NLLLoss)
##############################################################

# One entry per epoch; each entry is the list returned by run_epoch.
train_losses = []
val_accuracies = []

for i in range(config.max_epochs):
    print ("Epoch: {}".format(i))
    train_loss,val_accuracy = model.run_epoch(dataset.train_iterator, dataset.val_iterator, i)
    train_losses.append(train_loss)
    val_accuracies.append(val_accuracy)

# Final scores are measured in eval mode (dropout off) and without autograd
# bookkeeping; the model is left in eval mode afterwards, ready for inference.
model.eval()
with torch.no_grad():
    train_acc = evaluate_model(model, dataset.train_iterator)
    val_acc = evaluate_model(model, dataset.val_iterator)
    test_acc = evaluate_model(model, dataset.test_iterator)

print ('Final Training Accuracy: {:.4f}'.format(train_acc))
print ('Final Validation Accuracy: {:.4f}'.format(val_acc))
print ('Final Test Accuracy: {:.4f}'.format(test_acc))



Loaded 96000 training examples
Loaded 7600 test examples
Loaded 24000 validation examples
Epoch: 0
Iter: 1
	Average training loss: -0.24567




	Val Accuracy: 0.2515




Iter: 101
	Average training loss: -0.27229
	Val Accuracy: 0.3810




Iter: 201
	Average training loss: -0.41188
	Val Accuracy: 0.4562




Iter: 301
	Average training loss: -0.48223
	Val Accuracy: 0.5159




Iter: 401
	Average training loss: -0.53741
	Val Accuracy: 0.5848




Iter: 501
	Average training loss: -0.58972
	Val Accuracy: 0.6220




Iter: 601
	Average training loss: -0.62871
	Val Accuracy: 0.6459




Iter: 701
	Average training loss: -0.65471
	Val Accuracy: 0.6763




Epoch: 1
Iter: 1
	Average training loss: -0.70899
	Val Accuracy: 0.6938




Iter: 101
	Average training loss: -0.69837
	Val Accuracy: 0.7164




Iter: 201
	Average training loss: -0.71721
	Val Accuracy: 0.7329




Iter: 301
	Average training loss: -0.73882
	Val Accuracy: 0.7443




Iter: 401
	Average training loss: -0.74836
	Val Accuracy: 0.7552




Iter: 501
	Average training loss: -0.75130
	Val Accuracy: 0.7642




Iter: 601
	Average training loss: -0.76286
	Val Accuracy: 0.7707




Iter: 701
	Average training loss: -0.76657
	Val Accuracy: 0.7758




Epoch: 2
Iter: 1
	Average training loss: -0.81537
	Val Accuracy: 0.7770




Iter: 101
	Average training loss: -0.78749
	Val Accuracy: 0.7847




Iter: 201
	Average training loss: -0.78730
	Val Accuracy: 0.7849




Iter: 301
	Average training loss: -0.79607
	Val Accuracy: 0.7938




Iter: 401
	Average training loss: -0.80389
	Val Accuracy: 0.7982




Iter: 501
	Average training loss: -0.80542
	Val Accuracy: 0.8010




Iter: 601
	Average training loss: -0.80693
	Val Accuracy: 0.8035




Iter: 701
	Average training loss: -0.80598
	Val Accuracy: 0.8105




Epoch: 3
Iter: 1
	Average training loss: -0.77061
	Val Accuracy: 0.8087




Iter: 101
	Average training loss: -0.82177
	Val Accuracy: 0.8096




Iter: 201
	Average training loss: -0.81978
	Val Accuracy: 0.8168




Iter: 301
	Average training loss: -0.81947
	Val Accuracy: 0.8164




Iter: 401
	Average training loss: -0.82224
	Val Accuracy: 0.8113




Iter: 501
	Average training loss: -0.82496
	Val Accuracy: 0.8200




Iter: 601
	Average training loss: -0.82958
	Val Accuracy: 0.8220




Iter: 701
	Average training loss: -0.83132
	Val Accuracy: 0.8211




Epoch: 4
Iter: 1
	Average training loss: -0.89266
	Val Accuracy: 0.8286




Iter: 101
	Average training loss: -0.83992
	Val Accuracy: 0.8287




Iter: 201
	Average training loss: -0.84615
	Val Accuracy: 0.8298




Iter: 301
	Average training loss: -0.84135
	Val Accuracy: 0.8309




Iter: 401
	Average training loss: -0.85148
	Val Accuracy: 0.8360




Iter: 501
	Average training loss: -0.84223
	Val Accuracy: 0.8333




Iter: 601
	Average training loss: -0.84082
	Val Accuracy: 0.8397




Iter: 701
	Average training loss: -0.84466
	Val Accuracy: 0.8370




Epoch: 5
Iter: 1
	Average training loss: -0.87319
	Val Accuracy: 0.8389




Iter: 101
	Average training loss: -0.84632
	Val Accuracy: 0.8409




Iter: 201
	Average training loss: -0.85692
	Val Accuracy: 0.8425




Iter: 301
	Average training loss: -0.85235
	Val Accuracy: 0.8418




Iter: 401
	Average training loss: -0.85774
	Val Accuracy: 0.8422




Iter: 501
	Average training loss: -0.85375
	Val Accuracy: 0.8465




Iter: 601
	Average training loss: -0.85576
	Val Accuracy: 0.8439




Iter: 701
	Average training loss: -0.85401
	Val Accuracy: 0.8485




Epoch: 6
Iter: 1
	Average training loss: -0.86676
	Val Accuracy: 0.8483




Iter: 101
	Average training loss: -0.86936
	Val Accuracy: 0.8503




Iter: 201
	Average training loss: -0.86110
	Val Accuracy: 0.8446




Iter: 301
	Average training loss: -0.86510
	Val Accuracy: 0.8500




Iter: 401
	Average training loss: -0.85855
	Val Accuracy: 0.8490




Iter: 501
	Average training loss: -0.86449
	Val Accuracy: 0.8504




Iter: 601
	Average training loss: -0.85595
	Val Accuracy: 0.8525




Iter: 701
	Average training loss: -0.86190
	Val Accuracy: 0.8481




Epoch: 7
Iter: 1
	Average training loss: -0.90619
	Val Accuracy: 0.8520




Iter: 101
	Average training loss: -0.86959
	Val Accuracy: 0.8494




Iter: 201
	Average training loss: -0.86355
	Val Accuracy: 0.8511




Iter: 301
	Average training loss: -0.86913
	Val Accuracy: 0.8556




Iter: 401
	Average training loss: -0.86371
	Val Accuracy: 0.8564




Iter: 501
	Average training loss: -0.86761
	Val Accuracy: 0.8600




Iter: 601
	Average training loss: -0.86997
	Val Accuracy: 0.8584




Iter: 701
	Average training loss: -0.87011
	Val Accuracy: 0.8546




Epoch: 8
Iter: 1
	Average training loss: -0.88058
	Val Accuracy: 0.8575




Iter: 101
	Average training loss: -0.87184
	Val Accuracy: 0.8583




Iter: 201
	Average training loss: -0.87384
	Val Accuracy: 0.8569




Iter: 301
	Average training loss: -0.86777
	Val Accuracy: 0.8585




Iter: 401
	Average training loss: -0.87270
	Val Accuracy: 0.8618




Iter: 501
	Average training loss: -0.87499
	Val Accuracy: 0.8621




Iter: 601
	Average training loss: -0.87491
	Val Accuracy: 0.8620




Iter: 701
	Average training loss: -0.87412
	Val Accuracy: 0.8607




Epoch: 9
Iter: 1
	Average training loss: -0.86335
	Val Accuracy: 0.8608




Iter: 101
	Average training loss: -0.87373
	Val Accuracy: 0.8619




Iter: 201
	Average training loss: -0.87787
	Val Accuracy: 0.8592




Iter: 301
	Average training loss: -0.87760
	Val Accuracy: 0.8628




Iter: 401
	Average training loss: -0.87637
	Val Accuracy: 0.8629




Iter: 501
	Average training loss: -0.87642
	Val Accuracy: 0.8628




Iter: 601
	Average training loss: -0.87821
	Val Accuracy: 0.8638




Iter: 701
	Average training loss: -0.88004
	Val Accuracy: 0.8623




Epoch: 10
Iter: 1
	Average training loss: -0.84656
	Val Accuracy: 0.8651




Iter: 101
	Average training loss: -0.87969
	Val Accuracy: 0.8654




Iter: 201
	Average training loss: -0.87679
	Val Accuracy: 0.8631




Iter: 301
	Average training loss: -0.88181
	Val Accuracy: 0.8646




Iter: 401
	Average training loss: -0.88222
	Val Accuracy: 0.8661




Iter: 501
	Average training loss: -0.88920
	Val Accuracy: 0.8620




Iter: 601
	Average training loss: -0.87745
	Val Accuracy: 0.8671




Iter: 701
	Average training loss: -0.88463
	Val Accuracy: 0.8679




Epoch: 11
Reducing LR
Iter: 1
	Average training loss: -0.89353
	Val Accuracy: 0.8691




Iter: 101
	Average training loss: -0.88805
	Val Accuracy: 0.8663




Iter: 201
	Average training loss: -0.88830
	Val Accuracy: 0.8680




Iter: 301
	Average training loss: -0.88853
	Val Accuracy: 0.8689




Iter: 401
	Average training loss: -0.88968
	Val Accuracy: 0.8681




Iter: 501
	Average training loss: -0.89064
	Val Accuracy: 0.8687




Iter: 601
	Average training loss: -0.89133
	Val Accuracy: 0.8714




Iter: 701
	Average training loss: -0.88882
	Val Accuracy: 0.8729




Epoch: 12
Iter: 1
	Average training loss: -0.88836
	Val Accuracy: 0.8722




Iter: 101
	Average training loss: -0.89560
	Val Accuracy: 0.8722




Iter: 201
	Average training loss: -0.89102
	Val Accuracy: 0.8699




Iter: 301
	Average training loss: -0.89210
	Val Accuracy: 0.8724




Iter: 401
	Average training loss: -0.89206
	Val Accuracy: 0.8707




Iter: 501
	Average training loss: -0.88337
	Val Accuracy: 0.8729




Iter: 601
	Average training loss: -0.89176
	Val Accuracy: 0.8753




Iter: 701
	Average training loss: -0.89702
	Val Accuracy: 0.8752




Epoch: 13
Iter: 1
	Average training loss: -0.92147
	Val Accuracy: 0.8741




Iter: 101
	Average training loss: -0.89631
	Val Accuracy: 0.8770




Iter: 201
	Average training loss: -0.89555
	Val Accuracy: 0.8758




Iter: 301
	Average training loss: -0.89424
	Val Accuracy: 0.8750




Iter: 401
	Average training loss: -0.89491
	Val Accuracy: 0.8743




Iter: 501
	Average training loss: -0.89403
	Val Accuracy: 0.8732




Iter: 601
	Average training loss: -0.89519
	Val Accuracy: 0.8755




Iter: 701
	Average training loss: -0.89436
	Val Accuracy: 0.8752




Epoch: 14
Iter: 1
	Average training loss: -0.86943
	Val Accuracy: 0.8739




Iter: 101
	Average training loss: -0.89806
	Val Accuracy: 0.8724




Iter: 201
	Average training loss: -0.89373
	Val Accuracy: 0.8764




Iter: 301
	Average training loss: -0.89753
	Val Accuracy: 0.8762




Iter: 401
	Average training loss: -0.89208
	Val Accuracy: 0.8764




Iter: 501
	Average training loss: -0.89507
	Val Accuracy: 0.8758




Iter: 601
	Average training loss: -0.89792
	Val Accuracy: 0.8735




Iter: 701
	Average training loss: -0.89398
	Val Accuracy: 0.8758




Epoch: 15
Iter: 1
	Average training loss: -0.90910
	Val Accuracy: 0.8768




Iter: 101
	Average training loss: -0.89617
	Val Accuracy: 0.8779




Iter: 201
	Average training loss: -0.89716
	Val Accuracy: 0.8748




Iter: 301
	Average training loss: -0.89398
	Val Accuracy: 0.8760




Iter: 401
	Average training loss: -0.90428
	Val Accuracy: 0.8749




Iter: 501
	Average training loss: -0.89561
	Val Accuracy: 0.8753




Iter: 601
	Average training loss: -0.89871
	Val Accuracy: 0.8762




Iter: 701
	Average training loss: -0.89940
	Val Accuracy: 0.8738




Epoch: 16
Iter: 1
	Average training loss: -0.91035
	Val Accuracy: 0.8753




Iter: 101
	Average training loss: -0.90091
	Val Accuracy: 0.8780




Iter: 201
	Average training loss: -0.89848
	Val Accuracy: 0.8754




Iter: 301
	Average training loss: -0.89659
	Val Accuracy: 0.8771




Iter: 401
	Average training loss: -0.90198
	Val Accuracy: 0.8770




Iter: 501
	Average training loss: -0.89436
	Val Accuracy: 0.8754




Iter: 601
	Average training loss: -0.89772
	Val Accuracy: 0.8770




Iter: 701
	Average training loss: -0.90012
	Val Accuracy: 0.8766




Epoch: 17
Iter: 1
	Average training loss: -0.90269
	Val Accuracy: 0.8782




Iter: 101
	Average training loss: -0.89768
	Val Accuracy: 0.8777




Iter: 201
	Average training loss: -0.89750
	Val Accuracy: 0.8774




Iter: 301
	Average training loss: -0.90320
	Val Accuracy: 0.8770




Iter: 401
	Average training loss: -0.90194
	Val Accuracy: 0.8760




Iter: 501
	Average training loss: -0.89806
	Val Accuracy: 0.8793




Iter: 601
	Average training loss: -0.90254
	Val Accuracy: 0.8778




Iter: 701
	Average training loss: -0.90090
	Val Accuracy: 0.8776




Epoch: 18
Iter: 1
	Average training loss: -0.90860
	Val Accuracy: 0.8765




Iter: 101
	Average training loss: -0.90122
	Val Accuracy: 0.8768




Iter: 201
	Average training loss: -0.90012
	Val Accuracy: 0.8802




Iter: 301
	Average training loss: -0.90735
	Val Accuracy: 0.8785




Iter: 401
	Average training loss: -0.90080
	Val Accuracy: 0.8807




Iter: 501
	Average training loss: -0.90297
	Val Accuracy: 0.8772




Iter: 601
	Average training loss: -0.90282
	Val Accuracy: 0.8782




Iter: 701
	Average training loss: -0.90334
	Val Accuracy: 0.8788




Epoch: 19
Iter: 1
	Average training loss: -0.90024
	Val Accuracy: 0.8798




Iter: 101
	Average training loss: -0.90163
	Val Accuracy: 0.8795




Iter: 201
	Average training loss: -0.90518
	Val Accuracy: 0.8797




Iter: 301
	Average training loss: -0.90119
	Val Accuracy: 0.8796




Iter: 401
	Average training loss: -0.90272
	Val Accuracy: 0.8797




Iter: 501
	Average training loss: -0.90022
	Val Accuracy: 0.8779




Iter: 601
	Average training loss: -0.90273
	Val Accuracy: 0.8810




Iter: 701
	Average training loss: -0.90633
	Val Accuracy: 0.8807




Epoch: 20
Iter: 1
	Average training loss: -0.89443
	Val Accuracy: 0.8812




Iter: 101
	Average training loss: -0.90223
	Val Accuracy: 0.8784




Iter: 201
	Average training loss: -0.90613
	Val Accuracy: 0.8831




Iter: 301
	Average training loss: -0.90570
	Val Accuracy: 0.8819




Iter: 401
	Average training loss: -0.90469
	Val Accuracy: 0.8774




Iter: 501
	Average training loss: -0.90146
	Val Accuracy: 0.8810




Iter: 601
	Average training loss: -0.90449
	Val Accuracy: 0.8802




Iter: 701
	Average training loss: -0.90537
	Val Accuracy: 0.8831




Epoch: 21
Iter: 1
	Average training loss: -0.86976
	Val Accuracy: 0.8847




Iter: 101
	Average training loss: -0.90829
	Val Accuracy: 0.8826




Iter: 201
	Average training loss: -0.90627
	Val Accuracy: 0.8818




Iter: 301
	Average training loss: -0.90334
	Val Accuracy: 0.8812




Iter: 401
	Average training loss: -0.90609
	Val Accuracy: 0.8824




Iter: 501
	Average training loss: -0.90367
	Val Accuracy: 0.8825




Iter: 601
	Average training loss: -0.90571
	Val Accuracy: 0.8811




Iter: 701
	Average training loss: -0.90502
	Val Accuracy: 0.8827




Epoch: 22
Iter: 1
	Average training loss: -0.91783
	Val Accuracy: 0.8795




Iter: 101
	Average training loss: -0.90725
	Val Accuracy: 0.8818




Iter: 201
	Average training loss: -0.90728
	Val Accuracy: 0.8829




Iter: 301
	Average training loss: -0.90632
	Val Accuracy: 0.8831




Iter: 401
	Average training loss: -0.90539
	Val Accuracy: 0.8831




Iter: 501
	Average training loss: -0.90552
	Val Accuracy: 0.8830




Iter: 601
	Average training loss: -0.91151
	Val Accuracy: 0.8821




Iter: 701
	Average training loss: -0.90213
	Val Accuracy: 0.8825




Epoch: 23
Reducing LR
Iter: 1
	Average training loss: -0.94544
	Val Accuracy: 0.8828




Iter: 101
	Average training loss: -0.90723
	Val Accuracy: 0.8840




Iter: 201
	Average training loss: -0.90902
	Val Accuracy: 0.8817




Iter: 301
	Average training loss: -0.90803
	Val Accuracy: 0.8832




Iter: 401
	Average training loss: -0.91183
	Val Accuracy: 0.8859




Iter: 501
	Average training loss: -0.90916
	Val Accuracy: 0.8825




Iter: 601
	Average training loss: -0.91375
	Val Accuracy: 0.8848




Iter: 701
	Average training loss: -0.90972
	Val Accuracy: 0.8855




Epoch: 24
Iter: 1
	Average training loss: -0.91699
	Val Accuracy: 0.8839




Iter: 101
	Average training loss: -0.91556
	Val Accuracy: 0.8851




Iter: 201
	Average training loss: -0.91067
	Val Accuracy: 0.8845




Iter: 301
	Average training loss: -0.91268
	Val Accuracy: 0.8860




Iter: 401
	Average training loss: -0.90646
	Val Accuracy: 0.8879




Iter: 501
	Average training loss: -0.90999
	Val Accuracy: 0.8857




Iter: 601
	Average training loss: -0.90973
	Val Accuracy: 0.8845




Iter: 701
	Average training loss: -0.91096
	Val Accuracy: 0.8866




Epoch: 25
Iter: 1
	Average training loss: -0.88476
	Val Accuracy: 0.8857




Iter: 101
	Average training loss: -0.91234
	Val Accuracy: 0.8852




Iter: 201
	Average training loss: -0.90800
	Val Accuracy: 0.8867




Iter: 301
	Average training loss: -0.91100
	Val Accuracy: 0.8843




Iter: 401
	Average training loss: -0.91179
	Val Accuracy: 0.8858




Iter: 501
	Average training loss: -0.91210
	Val Accuracy: 0.8867




Iter: 601
	Average training loss: -0.90968
	Val Accuracy: 0.8862




Iter: 701
	Average training loss: -0.91254
	Val Accuracy: 0.8875




Epoch: 26
Iter: 1
	Average training loss: -0.91721
	Val Accuracy: 0.8848




Iter: 101
	Average training loss: -0.91316
	Val Accuracy: 0.8854




Iter: 201
	Average training loss: -0.90645
	Val Accuracy: 0.8843




Iter: 301
	Average training loss: -0.91312
	Val Accuracy: 0.8876




Iter: 401
	Average training loss: -0.91448
	Val Accuracy: 0.8880




Iter: 501
	Average training loss: -0.91361
	Val Accuracy: 0.8860




Iter: 601
	Average training loss: -0.91177
	Val Accuracy: 0.8886




Iter: 701
	Average training loss: -0.91222
	Val Accuracy: 0.8879




Epoch: 27
Iter: 1
	Average training loss: -0.94001
	Val Accuracy: 0.8881




Iter: 101
	Average training loss: -0.91163
	Val Accuracy: 0.8876




Iter: 201
	Average training loss: -0.91233
	Val Accuracy: 0.8860




Iter: 301
	Average training loss: -0.91391
	Val Accuracy: 0.8875




Iter: 401
	Average training loss: -0.91138
	Val Accuracy: 0.8866




Iter: 501
	Average training loss: -0.91579
	Val Accuracy: 0.8865




Iter: 601
	Average training loss: -0.91235
	Val Accuracy: 0.8898




Iter: 701
	Average training loss: -0.91092
	Val Accuracy: 0.8871




Epoch: 28
Iter: 1
	Average training loss: -0.89444
	Val Accuracy: 0.8879




Iter: 101
	Average training loss: -0.90737
	Val Accuracy: 0.8853




Iter: 201
	Average training loss: -0.91469
	Val Accuracy: 0.8868




Iter: 301
	Average training loss: -0.91510
	Val Accuracy: 0.8886




Iter: 401
	Average training loss: -0.91296
	Val Accuracy: 0.8872




Iter: 501
	Average training loss: -0.91511
	Val Accuracy: 0.8850




Iter: 601
	Average training loss: -0.91294
	Val Accuracy: 0.8869




Iter: 701
	Average training loss: -0.91395
	Val Accuracy: 0.8865




Epoch: 29
Iter: 1
	Average training loss: -0.92268
	Val Accuracy: 0.8870




Iter: 101
	Average training loss: -0.91853
	Val Accuracy: 0.8880




Iter: 201
	Average training loss: -0.91661
	Val Accuracy: 0.8887




Iter: 301
	Average training loss: -0.91825
	Val Accuracy: 0.8862




Iter: 401
	Average training loss: -0.91256
	Val Accuracy: 0.8876




Iter: 501
	Average training loss: -0.91068
	Val Accuracy: 0.8865




Iter: 601
	Average training loss: -0.91580
	Val Accuracy: 0.8895




Iter: 701
	Average training loss: -0.91169
	Val Accuracy: 0.8877




Epoch: 30
Iter: 1
	Average training loss: -0.88292
	Val Accuracy: 0.8864




Iter: 101
	Average training loss: -0.91902
	Val Accuracy: 0.8882




Iter: 201
	Average training loss: -0.91512
	Val Accuracy: 0.8874




Iter: 301
	Average training loss: -0.91166
	Val Accuracy: 0.8890




Iter: 401
	Average training loss: -0.91546
	Val Accuracy: 0.8871




Iter: 501
	Average training loss: -0.91551
	Val Accuracy: 0.8874




Iter: 601
	Average training loss: -0.91197
	Val Accuracy: 0.8873




Iter: 701
	Average training loss: -0.91564
	Val Accuracy: 0.8871




Epoch: 31
Iter: 1
	Average training loss: -0.88077
	Val Accuracy: 0.8878




Iter: 101
	Average training loss: -0.91536
	Val Accuracy: 0.8903




Iter: 201
	Average training loss: -0.91422
	Val Accuracy: 0.8889




Iter: 301
	Average training loss: -0.91373
	Val Accuracy: 0.8872




Iter: 401
	Average training loss: -0.91077
	Val Accuracy: 0.8878




Iter: 501
	Average training loss: -0.91608
	Val Accuracy: 0.8870




Iter: 601
	Average training loss: -0.91956
	Val Accuracy: 0.8896




Iter: 701
	Average training loss: -0.91420
	Val Accuracy: 0.8857




Epoch: 32
Iter: 1
	Average training loss: -0.90967
	Val Accuracy: 0.8876




Iter: 101
	Average training loss: -0.91499
	Val Accuracy: 0.8890




Iter: 201
	Average training loss: -0.91532
	Val Accuracy: 0.8881




Iter: 301
	Average training loss: -0.91466
	Val Accuracy: 0.8867




Iter: 401
	Average training loss: -0.91575
	Val Accuracy: 0.8881




Iter: 501
	Average training loss: -0.91614
	Val Accuracy: 0.8876




Iter: 601
	Average training loss: -0.91632
	Val Accuracy: 0.8876




Iter: 701
	Average training loss: -0.91418
	Val Accuracy: 0.8883




Epoch: 33
Iter: 1
	Average training loss: -0.92334
	Val Accuracy: 0.8896




Iter: 101
	Average training loss: -0.91651
	Val Accuracy: 0.8900




Iter: 201
	Average training loss: -0.91866
	Val Accuracy: 0.8892




Iter: 301
	Average training loss: -0.91376
	Val Accuracy: 0.8888




Iter: 401
	Average training loss: -0.91377
	Val Accuracy: 0.8888




Iter: 501
	Average training loss: -0.91870
	Val Accuracy: 0.8892




Iter: 601
	Average training loss: -0.91677
	Val Accuracy: 0.8900




Iter: 701
	Average training loss: -0.91435
	Val Accuracy: 0.8890




Epoch: 34
Iter: 1
	Average training loss: -0.93003
	Val Accuracy: 0.8886




Iter: 101
	Average training loss: -0.91445
	Val Accuracy: 0.8890




Iter: 201
	Average training loss: -0.91818
	Val Accuracy: 0.8902




Iter: 301
	Average training loss: -0.92057
	Val Accuracy: 0.8896




Iter: 401
	Average training loss: -0.91685
	Val Accuracy: 0.8903




Iter: 501
	Average training loss: -0.91199
	Val Accuracy: 0.8896




Iter: 601
	Average training loss: -0.91860
	Val Accuracy: 0.8892




Iter: 701
	Average training loss: -0.91569
	Val Accuracy: 0.8886




Final Training Accuracy: 0.9180
Final Validation Accuracy: 0.8906
Final Test Accuracy: 0.8883
