# BERT

In [1]:
# from google.colab import files
# uploaded = files.upload()


#IMPORT FILES FROM DRIVE INTO GOOGLE-COLAB:

#STEP-1: Import Libraries

# Code to read csv file into colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

#STEP-2: Autheticate E-Mail ID

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

#STEP-3: Get File from Drive using file-ID

#2.1 Get the file
downloaded = drive.CreateFile({'id':'1hHnirG9DtR-MpqMnchFb0wY-VCEVEoQG'}) # replace the id with id of file you want to linkable link and delete from 'https....id='
downloaded.GetContentFile('ibm_0k-60k.csv')  # file name to be imported to colab

## 1. Check Device

In [2]:
import tensorflow as tf

device_name = tf.test.gpu_device_name()
print(f'Device name: {device_name}')

Device name: /device:GPU:0


## 2. Install pytorch interface for bert model
At the moment, the Hugging Face library seems to be the most widely accepted and powerful pytorch interface for working with BERT.

In [3]:
! pip install pytorch-pretrained-bert pytorch-nlp

Collecting pytorch-pretrained-bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |██▋                             | 10kB 19.8MB/s eta 0:00:01[K     |█████▎                          | 20kB 26.5MB/s eta 0:00:01[K     |████████                        | 30kB 14.0MB/s eta 0:00:01[K     |██████████▋                     | 40kB 11.9MB/s eta 0:00:01[K     |█████████████▎                  | 51kB 10.7MB/s eta 0:00:01[K     |███████████████▉                | 61kB 10.4MB/s eta 0:00:01[K     |██████████████████▌             | 71kB 9.5MB/s eta 0:00:01[K     |█████████████████████▏          | 81kB 9.9MB/s eta 0:00:01[K     |███████████████████████▉        | 92kB 10.0MB/s eta 0:00:01[K     |██████████████████████████▌     | 102kB 10.1MB/s eta 0:00:01[K     |█████████████████████████████▏  | 112kB 10.1MB/s eta 0:00:01[K     |██████████████

In [4]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import numpy as np
import io
import matplotlib.pyplot as plt

% matplotlib inline

Using TensorFlow backend.


In order for torch to use the GPU, we need to identify and specify the GPU as the device. Later, in our training loop, we will load data onto the device.

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'Tesla T4'

## 3. Load Data

We'll use The Corpus of Linguistic Acceptability (CoLA) dataset for single sentence classification. It's a set of sentences labeled as grammatically correct or incorrect. The data is as follows:

**Column 1**:	the code representing the source of the sentence.

**Column 2**:	the acceptability judgment label (0=unacceptable, 1=acceptable).

**Column 3**:	the acceptability judgment as originally notated by the author.

**Column 4**:	the sentence.

### Upload file


In [6]:
# df = pd.read_csv("in_domain_train.tsv", delimiter='\t', header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])
df = pd.read_csv("ibm_0k-60k.csv")


  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
df.sample(10)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Place,Query,Datetime,Text,retweets,favourites,hashtags,Subjectivity,Polarity,sentiment,ibm_sent,Unnamed: 0.1.1.1.1
59218,9217,9218,9218.0,9218.0,Meghalaya,coronavirus,2020-03-30 23:53:04+00:00,Coronavirus update Second death confirmed Aust...,2.0,0.0,,0.333333,0.133333,neutral,Sadness,
49545,9545,9545,59545.0,17227.0,Madhya Pradesh,covid19,2020-04-16 04:01:32+00:00,cpplusglobal Plastic CP-MED INFRARED THERMOMET...,0.0,2.0,#usmansecuritysolution #infraredthermometer #h...,0.0,0.0,neutral,Neutral,
18415,8415,28415,28415.0,28415.0,Punjab,coronavirus,2020-03-14 10:10:07+00:00,Called airtelindia number times network team s...,0.0,0.0,,0.0,0.0,neutral,Sadness,
41085,1085,1085,51085.0,8767.0,Haryana,covid19,2020-04-01 18:51:47+00:00,The coronavirus pandemic highlights need addre...,0.0,0.0,#coronavirus #COVID19 #UNEP #ClimateChange #na...,0.0,0.0,neutral,Fear,
48184,8184,8184,58184.0,15866.0,Karnataka,covid19,2020-04-22 01:35:03+00:00,Most number COVID19 cases world still n't get ...,0.0,1.0,#COVID19,0.5,0.5,surprise,Neutral,
42053,2053,2053,52053.0,9735.0,Himachal Pradesh,covid19,2020-04-17 01:31:41+00:00,I might I want yet I get closer everyday 26 da...,0.0,0.0,#iambalki #lockdown2020 #covi #airwalk #challe...,0.5,-0.05,surprise,Neutral,
45615,5615,5615,55615.0,13297.0,Karnataka,covid19,2020-04-15 17:12:14+00:00,Karnataka govt starts critical care support un...,5.0,10.0,#COVID19,0.8,0.0,joy,Sadness,
27098,7098,37098,37098.0,37098.0,Uttar Pradesh,coronavirus,2020-03-22 14:40:37+00:00,Believe Coronavirus name Roman charioteer 2017...,2.0,11.0,#asterix #CoronavirusPandemic #trivia,0.5,0.375,neutral,Neutral,
4644,4644,14644,14644.0,14644.0,Andhra Pradesh,covid19,2020-03-30 23:43:02+00:00,A picture gratitude fortitude relief one hope ...,0.0,0.0,#USA #ChineseVirus #COVID19 #NeverForget,0.454545,0.136364,neutral,Joy,
20693,693,30693,30693.0,30693.0,Telangana,covid19,2020-03-30 03:28:49+00:00,Distributed 200 Milk packets Migrant workers P...,0.0,8.0,#COVID19 #IndiaFightsCorona,0.033333,0.016667,neutral,Neutral,


In [8]:
df.Text = df.Text.astype(str)
sentences = df.Text.values
# adding [cls] and [sep] tokens to the tweets....[cls] token are for classification
# problem and [sep] token  means end of tweet
sentences =["[CLS] "+sent+" [SEP]" for sent in sentences]
labels = df.ibm_sent.values

In [9]:
# importing BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)
tokenized_text = [tokenizer.tokenize(sent) for sent in sentences]
print(tokenized_text[0])

100%|██████████| 231508/231508 [00:00<00:00, 272305.10B/s]


['[CLS]', 'fr', 'he', 'stay', 'ignoring', 'question', 'come', 'africa', 'corona', 'virus', '’', '4', '##g', '5', '##g', '[SEP]']


BERT requires specifically formatted inputs. For each tokenized input sentence, we need to create:

- **input ids**: a sequence of integers identifying each input token to its index number in the BERT tokenizer vocabulary
- **segment mask**: (optional) a sequence of 1s and 0s used to identify whether the input is one sentence or two sentences long. For one sentence inputs, this is simply a sequence of 0s. For two sentence inputs, there is a 0 for each token of the first sentence, followed by a 1 for each token of the second sentence
- **attention mask**: (optional) a sequence of 1s and 0s, with 1s for all input tokens and 0s for all padding tokens (we'll detail this in the next paragraph)
- **labels**: a single value of 1 or 0. In our task 1 means "grammatical" and 0 means "ungrammatical"

In [10]:
# use bert tokenizer to convert tokens to their index number or in simple
# words representing words in numbers

input_ids = [tokenizer.convert_tokens_to_ids(token_sent) for token_sent in tokenized_text]
input_ids[0]

[101,
 10424,
 2002,
 2994,
 9217,
 3160,
 2272,
 3088,
 21887,
 7865,
 1521,
 1018,
 2290,
 1019,
 2290,
 102]

In [11]:
# Set the maximum sequence length. The longest sequence in our training set is 96, but we'll leave room on the end anyway. 
# In the original paper, the authors used a length of 512.
# to provide inputs of same size we use pad_sequence
MAX_LEN = 128
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, padding="post", truncating="post")
print(input_ids[0])

[  101 10424  2002  2994  9217  3160  2272  3088 21887  7865  1521  1018
  2290  1019  2290   102     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0]


In [None]:
# attention_mask is used to notify the model to which tokens to attend and which not to
# as pad_seq is of no use so the model should not attend those pad tokens(respresented by '0')

attention_masks = []

for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask)

attention_masks[0]

In [13]:
df.ibm_sent = df.ibm_sent.astype(str)
df["ibm_sent"]= df["ibm_sent"].replace("nan", "Neutral") 
t = df.ibm_sent.unique()
print(t)

def la(sent):
  if sent == "Sadness":
    return 0
  if sent == "Confident":
    return 1
  if sent == "Neutral":
    return 2
  if sent == "Joy":
    return 3
  if sent == "Analytical":
    return 4
  if sent == "Anger":
    return 5
  if sent == "Fear":
    return 6

df['ibm_sent_cat'] = df['ibm_sent'].apply(la)
labels = df['ibm_sent_cat'].values

['Sadness' 'Confident' 'Neutral' 'Joy' 'Analytical' 'Anger' 'Fear']


In [14]:
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, 
                                                            random_state=2018, test_size=0.1)
train_mask, validation_mask, _, _ = train_test_split(attention_masks, input_ids, random_state=2020, test_size=0.1)


In [15]:
len(train_inputs), len(validation_inputs)

(54000, 6000)

In [16]:
len(train_mask), len(validation_mask)

(54000, 6000)

In [17]:
# we cannot just provide numpy arrays to the model, model only takes tensors as inputs

train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_mask = torch.tensor(train_mask)
validation_mask = torch.tensor(validation_mask)

In [18]:
# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 32

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_mask, train_labels)
validation_data = TensorDataset(validation_inputs, validation_mask, validation_labels)
train_sampler = RandomSampler(train_data) # sampler (Sampler, optional) – defines the strategy to draw samples from the dataset
validation_sampler = RandomSampler(validation_data)
train_dataloader = DataLoader(train_data, sampler = train_sampler, batch_size=batch_size)
validation_dataloader = DataLoader(validation_data, sampler = validation_sampler, batch_size=batch_size)

# train_dataloader wont have 60,000 tweets instead it would have (60000/ batch_size) batches


## Train Model

Now that our input data is properly formatted, it's time to fine tune the BERT model. 

For this task, we first want to modify the pre-trained BERT model to give outputs for classification, and then we want to continue training the model on our dataset until that the entire model, end-to-end, is well-suited for our task. Thankfully, the huggingface pytorch implementation includes a set of interfaces designed for a variety of NLP tasks. Though these interfaces are all built on top of a trained BERT model, each has different top layers and output types designed to accomodate their specific NLP task.  

We'll load [BertForSequenceClassification](https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py#L1129). This is the normal BERT model with an added single linear layer on top for classification that we will use as a sentence classifier. As we feed input data, the entire pre-trained BERT model and the additional untrained classification layer is trained on our specific task. 

### Structure of Fine-Tuning Model

As we've showed beforehand, the first token of every sequence is the special classification token ([CLS]). Unlike the hidden state vector corresponding to a normal word token, the hidden state corresponding to this special token is designated by the authors of BERT as an aggregate representation of the whole sentence used for classification tasks. As such, when we feed in an input sentence to our model during training, the output is the length 768 hidden state vector corresponding to this token. The additional layer that we've added on top consists of untrained linear neurons of size [hidden_state, number_of_labels], so [768,7], meaning that the output of BERT plus our classification layer is a vector of two numbers representing the "score" for "grammatical/non-grammatical" that are then fed into cross-entropy loss.



### The Fine-Tuning Process

Because the pre-trained BERT layers already encode a lot of information about the language, training the classifier is relatively inexpensive. Rather than training every layer in a large model from scratch, it's as if we have already trained the bottom layers 95% of where they need to be, and only really need to train the top layer, with a bit of tweaking going on in the lower levels to accomodate our task.

Sometimes practicioners will opt to "freeze" certain layers when fine-tuning, or to apply different learning rates, apply diminishing learning rates, etc. all in an effort to preserve the good quality weights in the network and speed up training (often considerably). In fact, recent research on BERT specifically has demonstrated that freezing the majority of the weights results in only minimal accuracy declines, but there are exceptions and broader rules of transfer learning that should also be considered. For example, if your task and fine-tuning dataset is very different from the dataset used to train the transfer learning model, freezing the weights may not be a good idea. We'll cover the broader scope of transfer learning in NLP in a future post.  



In [19]:
# importing model from the library
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=7)
model.cuda()

100%|██████████| 407873900/407873900 [00:37<00:00, 10740601.89B/s]


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
   

Now that we have our model loaded we need to grab the training hyperparameters from within the stored model.

For the purposes of fine-tuning, the authors recommend the following hyperparameter ranges:
- Batch size: 16, 32
- Learning rate (Adam): 5e-5, 3e-5, 2e-5
- Number of epochs: 2, 3, 4

In [20]:
# param_optimizer = list(model.named_parameters())
# no_decay = ["gamma", "beta", "bias"]


In [21]:
# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (30522, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

In [22]:
optimizer = BertAdam(model.parameters(),
                      lr=2e-5,
                      warmup=.1)

t_total value of -1 results in schedule not being applied


Below is our training loop. There's a lot going on, but fundamentally for each pass in our loop we have a trianing phase and a validation phase. At each pass we need to:

Training loop:
- Tell the model to compute gradients by setting the model in train mode
- Unpack our data inputs and labels
- Load data onto the GPU for acceleration
- Clear out the gradients calculated in the previous pass. In pytorch the gradients accumulate by default (useful for things like RNNs) unless you explicitly clear them out
- Forward pass (feed input data through the network)
- Backward pass (backpropagation)
- Tell the network to update parameters with optimizer.step()
- Track variables for monitoring progress

Evalution loop:
- Tell the model not to compute gradients by setting th emodel in evaluation mode
- Unpack our data inputs and labels
- Load data onto the GPU for acceleration
- Forward pass (feed input data through the network)
- Compute loss on our validation data and track variables for monitoring progress

So please read carefully through the comments to get an understanding of what's happening. If you're unfamiliar with pytorch a quick look at some of their [beginner tutorials](https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html#sphx-glr-beginner-blitz-cifar10-tutorial-py) will help show you that training loops really involve only a few simple steps; the rest is usually just decoration and logging.  

In [23]:
def acc(preds, labels):
  pred_flat = np.argmax(preds, axis=1) # np.argmax - Returns the indices of the maximum values along an axis.
  labels_flat = labels.flatten() # Flatten - Return a copy of the array collapsed into one dimension.
  return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [24]:
# from transformers import get_linear_schedule_with_warmup

# epoches = 4
# total_steps = len(train_dataloader)* epoches

# scheduler = get_linear_scheduler_with_warmup(optimizer,
#                                              num_warmup_steps = 0,
#                                              num_training_steps = total_steps)

In [None]:
import random
seed_val = 42

# # Set the seed value all over the place to make this reproducible.

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# store the average loss value after each epoch so that we can later use it for plotting 
loss_values = []
epoches = 4


for epoch_i in range(0, epoches):

    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epoches))
    print('Training...')

    # Reset the total loss for this epoch.
    total_loss = 0

    # Put the model into training mode. Don't be mislead--the call to 
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    model.train()

    # for each batch of training dataset
    for step, batch in enumerate(train_dataloader):
      if step%40 == 0 and not step == 0:
        print("   Batch      {:}   of   {:}".format(step, len(train_dataloader)))

      # Unpack this training batch from our dataloader.
      # batch[0] = input_ids, batch[1] = attention_mask, batch[2] = labels
      batch[0] = torch.tensor(batch[0]).to(torch.int64)
      batch[1] = torch.tensor(batch[1]).to(torch.int64)
      # batch[2] = torch.tensor(batch[2]).to(torch.int64)
      b_input_ids = batch[0].to(device)
      b_attention_masks = batch[1].to(device)
      b_labels = batch[2].to(device)
      
      # Always clear any previously calculated gradients before performing a
      # backward pass. PyTorch doesn't do this automatically because 
      # accumulating the gradients is "convenient while training RNNs".
      model.zero_grad()

      # Perform a forward pass (evaluate the model on this training batch).
      # This will return the loss (rather than the model output) because we
      # have provided the `labels`.
      outputs = model(b_input_ids,
                      attention_mask = b_attention_masks,
                      labels=b_labels)
      if step == 0:
        print(outputs, outputs.size())
      
      loss = outputs

      total_loss += loss.item()

      # Perform a backward pass to calculate the gradients.
      loss.backward()

      # Clip the norm of the gradients to 1.0. 
      # This is to help prevent the "exploding gradients" problem.
      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      # Update parameters and take a step using the computed gradient.
      optimizer.step()

      # Update the learning rate.
      # scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    loss_values.append(avg_train_loss)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.

    model.eval()

    eval_loss, eval_acc = 0, 0
    nb_eval_steps, nb_eval_example = 0, 0

    for batch in validation_dataloader:
      # add batch
      batch[0] = torch.tensor(batch[0]).to(torch.int64)
      batch[1] = torch.tensor(batch[1]).to(torch.int64)
            
      b_input_ids = batch[0].to(device)
      b_attention_masks = batch[1].to(device)
      b_labels = batch[2].to(device)

      # Telling the model not to compute or store gradients, saving memory and
      # speeding up validation      
      with torch.no_grad():
        logits = model(b_input_ids, attention_mask = b_attention_masks)


        
      # Get the "logits" output by the model. The "logits" are the output
      # values prior to applying an activation function like the softmax.
      # logits = outputs

      # Move logits and labels to CPU
      logits = logits.detach().cpu().numpy()
      labels = b_labels.to('cpu').numpy()

      # Calculate the accuracy for this batch of test sentences.
      eval_acc += acc(logits, labels)

      nb_eval_steps += 1

    print("  Val_Accuracy: {0:.2f}".format(eval_acc/nb_eval_steps)) 

print("")
print("Training complete!")




Training...




tensor(2.0207, device='cuda:0', grad_fn=<NllLossBackward>) torch.Size([])


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha)


   Batch      40   of   1688
   Batch      80   of   1688
   Batch      120   of   1688
   Batch      160   of   1688
   Batch      200   of   1688
   Batch      240   of   1688
   Batch      280   of   1688
   Batch      320   of   1688
   Batch      360   of   1688
   Batch      400   of   1688
   Batch      440   of   1688
   Batch      480   of   1688
   Batch      520   of   1688
   Batch      560   of   1688
   Batch      600   of   1688
   Batch      640   of   1688
   Batch      680   of   1688
   Batch      720   of   1688
   Batch      760   of   1688
   Batch      800   of   1688
   Batch      840   of   1688
   Batch      880   of   1688
   Batch      920   of   1688
   Batch      960   of   1688
   Batch      1000   of   1688
   Batch      1040   of   1688
   Batch      1080   of   1688
   Batch      1120   of   1688
   Batch      1160   of   1688
   Batch      1200   of   1688
   Batch      1240   of   1688
   Batch      1280   of   1688
   Batch      1320   of   1688
   



  Val_Accuracy: 0.66

Training...
tensor(0.8672, device='cuda:0', grad_fn=<NllLossBackward>) torch.Size([])
   Batch      40   of   1688
   Batch      80   of   1688
   Batch      120   of   1688
   Batch      160   of   1688
   Batch      200   of   1688
   Batch      240   of   1688
   Batch      280   of   1688
   Batch      320   of   1688
   Batch      360   of   1688
   Batch      400   of   1688
   Batch      440   of   1688
   Batch      480   of   1688
   Batch      520   of   1688
   Batch      560   of   1688
   Batch      600   of   1688
   Batch      640   of   1688
   Batch      680   of   1688
   Batch      720   of   1688
   Batch      760   of   1688
   Batch      800   of   1688
   Batch      840   of   1688
   Batch      880   of   1688
   Batch      920   of   1688
   Batch      960   of   1688
   Batch      1000   of   1688
   Batch      1040   of   1688
   Batch      1080   of   1688
   Batch      1120   of   1688
   Batch      1160   of   1688
   Batch      1200 

In [None]:
torch.cuda.empty_cache()