In [None]:
! pip install torch
! pip install torchtext==0.6.0
! pip install scapy

In [1]:
import os
import sys
import errno
import glob
import random
import numpy as np
from argparse import ArgumentParser
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torchtext import data
from torchtext import datasets
from classifier import NLIModel
import spacy

import pandas as pd
from torchtext.vocab import build_vocab_from_iterator
from torchtext.vocab import GloVe, Vocab
import torch

In [2]:
print(torch.__version__)

import torchtext
print(torchtext.__version__)
print(spacy.__version__)

2.2.1+cpu
0.6.0
3.7.4


In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


# Deifine Field For Tokenise and Embedding

In [4]:
spacy_en = spacy.load('en_core_web_sm')

# Define how to preprocess the text data
TEXT = data.Field(lower=True, sequential=True, tokenize=lambda text: [token.text for token in spacy_en.tokenizer(text)])
# Define how to process the labels
LABEL = data.Field(sequential=False, use_vocab=False, unk_token=None)

fields = [('premise', TEXT), ('hypothesis', TEXT), ('label', LABEL)]

In [5]:
train_data, validation_data = data.TabularDataset.splits(
        path='./',  # Directory of your CSV files
        train='train.csv', validation='dev.csv',
        format='csv',
        fields=fields,
        skip_header=True  # If your CSV has a header
)

# Build the vocabulary only for the TEXT field from the training set
TEXT.build_vocab(train_data, vectors="glove.840B.300d")
LABEL.build_vocab(train_data)

print(vars(train_data.examples[0]))  # Print the first training example
print(vars(validation_data.examples[0]))  # Print the first validation example

# print("Premise:", ' '.join(train_data.premise))
# print("Hypothesis:", ' '.join(train_data.hypothesis))
# print("Label:", train_data.label)

<torchtext.data.example.Example object at 0x00000234D25BBD10>
<torchtext.data.example.Example object at 0x00000234DB34A8D0>
{'premise': ['however', ',', 'fort', 'charles', 'was', 'rebuilt', 'as', 'a', 'military', 'and', 'naval', 'garrison', ',', 'and', 'it', 'protected', 'jamaica', 'and', 'much', 'of', 'the', 'english', 'caribbean', 'for', '250', 'years', 'until', 'the', 'advent', 'of', 'steamships', 'and', 'yet', 'another', 'earthquake', 'in', '1907', 'saw', 'its', 'decline', '.'], 'hypothesis': ['fort', 'charles', 'was', 'rebuilt', 'as', 'an', 'amusement', 'park', 'for', 'the', 'locals', '.'], 'label': '0'}
{'premise': ['mon', 'dieu', '!'], 'hypothesis': ['this', 'person', 'is', 'speaking', 'english', '.'], 'label': '0'}


In [6]:
print(type(TEXT.vocab.vectors))  # Expected: <class 'torch.Tensor'>
print(TEXT.vocab.vectors.size())  # Expected output: torch.Size([vocab_size, embedding_dim]) --> torch.Size([35538, 300])

<class 'torch.Tensor'>
torch.Size([35538, 300])


# Define Model, Loss Function, and Optimiser

In [7]:
from argparse import ArgumentParser

original_argv = sys.argv
sys.argv = ['']

parser = ArgumentParser(description='Helsinki NLI')

config = parser.parse_args()
config.dropout =  0.5
config.activation = 'leakyrelu'
config.hidden_dim = 600
config.fc_dim = 600
config.out_dim = len(LABEL.vocab)
config.embed_size = len(TEXT.vocab)
config.embed_dim = TEXT.vocab.vectors.size(1)
config.encoder_type = 'HBMP'
config.layers = 1
config.cells = config.layers * 2
config.word_embedding = 'glove.840B.300d'
config.epochs = 20
config.batch_size = 32
config.optimizer = 'adam'
config.learning_rate = 0.0005
config.lr_patience = 1
# config.lr_decay = 0.99
config.lr_reduction_factor = 0.2
config.weight_decay = 0
config.early_stopping_patience = 3
config.save_path = 'results'
config.gpu = 'cpu'
# config.seed = 1234

# Restore the original sys.argv
sys.argv = original_argv
print(config)

Namespace(dropout=0.5, activation='leakyrelu', hidden_dim=600, fc_dim=600, out_dim=2, embed_size=35538, embed_dim=300, encoder_type='HBMP', layers=1, cells=2, word_embedding='glove.840B.300d', epochs=20, batch_size=32, optimizer='adam', learning_rate=0.0005, lr_patience=1, lr_decay=0.99, lr_reduction_factor=0.2, weight_decay=0, early_stopping_patience=3, save_path='results', gpu='cpu')


In [8]:
train_iter, dev_iter = data.BucketIterator.splits(
        (train_data, validation_data),
        batch_size=config.batch_size,
        sort_within_batch=True,
        sort_key=lambda x: len(x.premise),
        device=device
    )


> Example Usage

In [9]:
# Getting a single batch from the iterator
for batch in train_iter:
    # Assuming 'premise' and 'hypothesis' are your input fields and 'label' is your target field
    premises = batch.premise
    hypotheses = batch.hypothesis

    for i in range(premises.shape[1]):  # Loop over batch size
        # Convert indices back to strings (for textual data fields)
        # Note: This step assumes that you have the TEXT field build vocab
        premise = ' '.join([TEXT.vocab.itos[ind] for ind in premises[:, i].tolist()])
        hypothesis = ' '.join([TEXT.vocab.itos[ind] for ind in hypotheses[:, i].tolist()])
        # print(batch.label)
        label = batch.label[i].item()

        print("Premise:", premise)
        print("Hypothesis:", hypothesis)
        print("Label:", label)
        
        # Break after the first batch to only see one batch
        break
    
    break

Premise: from the ticket office you enter the temple complex through a colossal pylon , one of the most recent structures at the site and the largest constructed anywhere in egypt during the ptolemaic period .
Hypothesis: the large pylon at the entrance of the temple complex was erected in honor of isis .   <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Label: 1


In [10]:
model = NLIModel(config).to(device)

# if config.word_embedding:
#     model.sentence_embedding.word_embedding.weight.data = TEXT.vocab.vectors
#       model.cuda(device=config.gpu)

# Print the model
print('Model:\n')
print(model)
print('\n')
params = sum([p.numel() for p in model.parameters()])
print('Parameters: {}'.format(params))

Model:

NLIModel(
  (sentence_embedding): SentenceEmbedding(
    (word_embedding): Embedding(35538, 300)
    (encoder): HBMP(
      (max_pool): AdaptiveMaxPool1d(output_size=1)
      (rnn1): LSTM(300, 600, dropout=0.5, bidirectional=True)
      (rnn2): LSTM(300, 600, dropout=0.5, bidirectional=True)
      (rnn3): LSTM(300, 600, dropout=0.5, bidirectional=True)
    )
  )
  (classifier): FCClassifier(
    (activation): LeakyReLU(negative_slope=0.01)
    (mlp): Sequential(
      (0): Dropout(p=0.5, inplace=False)
      (1): Linear(in_features=14400, out_features=600, bias=True)
      (2): LeakyReLU(negative_slope=0.01)
      (3): Dropout(p=0.5, inplace=False)
      (4): Linear(in_features=600, out_features=600, bias=True)
      (5): LeakyReLU(negative_slope=0.01)
      (6): Linear(in_features=600, out_features=2, bias=True)
    )
  )
)


Parameters: 32652602




In [11]:
def make_dirs(name):
    try:
        os.makedirs(name)
    except OSError as ex:
        if ex.errno == errno.EEXIST and os.path.isdir(name):
            # ignore existing directory
            pass
        else:
            # a different error happened
            raise

make_dirs(config.save_path)

In [12]:
# Loss and Optimizer
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=config.learning_rate,)
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                            'min',
                                            factor=config.lr_reduction_factor,
                                            patience=config.lr_patience,
                                            min_lr=1e-5)



best_dev_acc = -1
dev_accuracies = []
best_dev_loss = 1

early_stopping = 0
stop_training = False
train_iter.repeat = False


print('\nTraining started...\n')
# Training and Evaluation
best_acc = 0.0
for epoch in range(config.epochs):  # Adjust the number of epochs

    n_correct = 0
    n_total = 0
    train_accuracies = []
    all_losses = []

    print('\nEpoch: {:>02.0f}/{:<02.0f}'.format(epoch+1, config.epochs), end='\t')
    print('(Learning rate: {})'.format(optimizer.param_groups[0]['lr']))

    for batch_idx, batch in enumerate(train_iter):
        
        model.train()
        optimizer.zero_grad()
        
        # Move batch data to the correct device
        batch.premise, batch.hypothesis, batch.label = batch.premise.to(device), batch.hypothesis.to(device), batch.label.to(device)
        predictions = model(batch)
        
        # Calculate accuracy
        n_correct += (torch.max(predictions, 1)[1].view(batch.label.size()).data == batch.label.data).sum()
        n_total += batch.batch_size
        train_acc = 100. * n_correct/n_total
        train_accuracies.append(train_acc.item())
        
        # Calculate loss
        loss = criterion(predictions, batch.label)
        all_losses.append(loss.item())

        # Backpropagate and update the learning rate
        loss.backward()
        optimizer.step()

        # For accuracy calculation
        preds = torch.argmax(predictions, dim=1)

        print('Progress: {:3.0f}% - Batch: {:>4.0f}/{:>4.0f} - Loss: {:6.2f}% - Accuracy: {:6.2f}%'.format(
            100. * (1+batch_idx) / len(train_iter),
            1+batch_idx, len(train_iter),
            round(100. * np.mean(all_losses), 2),
            round(np.mean(train_accuracies), 2)), end='\r')

    
    print('\nEvaluation started...\n')
    # Evaluation
    model.eval()

    # Calculate Accuracy
    n_dev_correct = 0
    dev_loss = 0
    dev_losses = []

    with torch.no_grad():
        for dev_batch_idx, dev_batch in enumerate(dev_iter):
            answer = model(dev_batch)

            n_dev_correct += (torch.max(answer, 1)[1].view(dev_batch.label.size()).data == \
                dev_batch.label.data).sum()
            
            dev_loss = criterion(answer, dev_batch.label)
            dev_losses.append(dev_loss.item())
        
        dev_acc = 100. * n_dev_correct / len(validation_data)
        dev_acc=dev_acc.item()
        dev_accuracies.append(dev_acc)

        print('\nDev loss: {}% - Dev accuracy: {}%'.format(round(100.*np.mean(dev_losses), 2), round(dev_acc, 2)))

        if dev_acc > best_dev_acc:

            best_dev_acc = dev_acc
            best_dev_epoch = 1+epoch
            snapshot_prefix = os.path.join(config.save_path, 'best')
            dev_snapshot_path = snapshot_prefix + \
                '_{}_{}D_devacc_{}_epoch_{}.pt'.format(config.encoder_type, config.hidden_dim, round(dev_acc, 2), 1+epoch)
        
            # save model, delete previous snapshot
            torch.save(model, dev_snapshot_path)
            for f in glob.glob(snapshot_prefix + '*'):
                if f != dev_snapshot_path:
                    os.remove(f)


        # Check for early stopping
        if np.mean(dev_losses) < best_dev_loss:
            best_dev_loss = np.mean(dev_losses)
        else:
            early_stopping += 1

        if early_stopping > config.early_stopping_patience and config.optimizer != 'sgd':
            stop_training = True
            print('\nEarly stopping')

        if config.optimizer == 'sgd' and optimizer.param_groups[0]['lr'] < 1e-5:
            stop_training = True
            print('\nEarly stopping')
            
        # Update learning rate
        scheduler.step(round(np.mean(dev_losses), 2))
        dev_losses = []


    # If training has completed, calculate the test scores
    if stop_training == True or (1+epoch == config.epochs and 1+batch_idx == len(train_iter)):
        print('\nTraining completed after {} epocs.\n'.format(1+epoch))


        #Save the final model
        final_snapshot_prefix = os.path.join(config.save_path, 'final')
        final_snapshot_path = final_snapshot_prefix + \
        '_{}_{}D.pt'.format(config.encoder_type, config.hidden_dim)
        torch.save(model, final_snapshot_path)
        for f in glob.glob(final_snapshot_prefix + '*'):
            if f != final_snapshot_path:
                os.remove(f)
        
        break


Training started...


Epoch: 01/20	(Learning rate: 0.0005)
Progress: 100% - Batch:  842/ 842 - Loss:  61.99% - Accuracy:  59.33%
Evaluation started...


Dev loss: 55.94% - Dev accuracy: 70.39%

Epoch: 02/20	(Learning rate: 0.0005)
Progress: 100% - Batch:  842/ 842 - Loss:  54.20% - Accuracy:  71.40%
Evaluation started...


Dev loss: 53.69% - Dev accuracy: 71.38%

Epoch: 03/20	(Learning rate: 0.0005)
Progress: 100% - Batch:  842/ 842 - Loss:  46.76% - Accuracy:  77.23%
Evaluation started...


Dev loss: 56.67% - Dev accuracy: 69.44%

Epoch: 04/20	(Learning rate: 0.0005)
Progress: 100% - Batch:  842/ 842 - Loss:  37.45% - Accuracy:  83.41%
Evaluation started...


Dev loss: 62.71% - Dev accuracy: 70.91%

Epoch: 05/20	(Learning rate: 0.0001)
Progress: 100% - Batch:  842/ 842 - Loss:  17.66% - Accuracy:  92.86%
Evaluation started...


Dev loss: 85.76% - Dev accuracy: 70.77%

Epoch: 06/20	(Learning rate: 0.0001)
Progress: 100% - Batch:  842/ 842 - Loss:  10.25% - Accuracy:  96.15%
Evaluation

KeyboardInterrupt: 

# UNWANTED

In [None]:
# # Loss
# criterion = nn.CrossEntropyLoss()

# # Optimizer
# if config.optimizer == 'adadelta':
#     optim_algorithm = optim.Adadelta
# elif config.optimizer == 'adagrad':
#     optim_algorithm = optim.Adagrad
# elif config.optimizer == 'adam':
#     optim_algorithm = optim.Adam
# elif config.optimizer == 'adamax':
#     optim_algorithm = optim.Adamax
# elif config.optimizer == 'asgd':
#     optim_algorithm = optim.ASGD
# elif config.optimizer == 'rmsprop':
#     optim_algorithm = optim.RMSprop
# elif config.optimizer == 'rprop':
#     optim_algorithm = optim.Rprop
# elif config.optimizer == 'sgd':
#     optim_algorithm = optim.SGD
# else:
#     raise Exception('Unknown optimization optimizer: "%s"' % config.optimizer)

# optimizer = optim_algorithm(model.parameters(),
#                             lr=config.learning_rate,
#                             weight_decay=config.weight_decay)

# scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
#                                             'min',
#                                             factor=config.lr_reduction_factor,
#                                             patience=config.lr_patience,
#                                             min_lr=1e-5)

# iterations = 0
# best_dev_acc = -1
# dev_accuracies = []
# best_dev_loss = 1
# early_stopping = 0
# stop_training = False
# train_iter.repeat = False
# make_dirs(config.save_path)

# # Print parameters and config
# print('\nConfig: {}\n'.format(sys.argv[1:]))
# print(config)

# # Print the model
# print('Model:\n')
# print(model)
# print('\n')
# params = sum([p.numel() for p in model.parameters()])
# print('Parameters: {}'.format(params))



Config: ['--f=c:\\Users\\user\\AppData\\Roaming\\jupyter\\runtime\\kernel-v2-12832L9wxdsH5Cnc8.json']

Namespace(dropout=0.1, activation='leakyrelu', hidden_dim=600, fc_dim=600, out_dim=2, embed_size=35538, embed_dim=300, encoder_type='HBMP', layers=2, cells=4, word_embedding='glove.840B.300d', epochs=1, batch_size=64, optimizer='adam', learning_rate=0.0005, lr_patience=1, lr_decay=0.99, lr_reduction_factor=0.2, weight_decay=0, early_stopping_patience=3, save_path='results', gpu='cpu')
Model:

NLIModel(
  (sentence_embedding): SentenceEmbedding(
    (word_embedding): Embedding(35538, 300)
    (encoder): HBMP(
      (max_pool): AdaptiveMaxPool1d(output_size=1)
      (rnn1): LSTM(300, 600, num_layers=2, dropout=0.1, bidirectional=True)
      (rnn2): LSTM(300, 600, num_layers=2, dropout=0.1, bidirectional=True)
      (rnn3): LSTM(300, 600, num_layers=2, dropout=0.1, bidirectional=True)
    )
  )
  (classifier): FCClassifier(
    (activation): LeakyReLU(negative_slope=0.01)
    (mlp): Seq

In [None]:
# print('\nTraining started...\n')

# # Train for the number of epochs specified
# for epoch in range(config.epochs):
#     if stop_training == True:
#         break

#     train_iter.init_epoch()
#     n_correct = 0
#     n_total = 0
#     all_losses = []
#     train_accuracies = []
#     all_losses = []

#     optimizer.param_groups[0]['lr'] = optimizer.param_groups[0]['lr'] * config.lr_decay if epoch>0\
#     and config.optimizer == 'sgd' else optimizer.param_groups[0]['lr']
#     print('\nEpoch: {:>02.0f}/{:>02.0f}'.format(epoch+1, config.epochs), end='\t')
#     print('(Learning rate: {})'.format(optimizer.param_groups[0]['lr']))

#     for batch_idx, batch in enumerate(train_iter):

#         model.train()
#         optimizer.zero_grad()
#         iterations += 1
#         answer = model(batch)
#         # sys.exit()
#         # Calculate accuracy

#         n_correct += (torch.max(answer, 1)[1].view(batch.label.size()).data == batch.label.data).sum()
#         n_total += batch.batch_size
#         train_acc = 100. * n_correct/n_total
#         train_accuracies.append(train_acc.item())

#         # Calculate loss
#         loss = criterion(answer, batch.label)
#         all_losses.append(loss.item())

#         # Backpropagate and update the learning rate
#         loss.backward()
#         optimizer.step()

#         print('Progress: {:3.0f}% - Batch: {:>4.0f}/{:<4.0f} - Loss: {:6.2f}% - Accuracy: {:6.2f}%'.format(
#             100. * (1+batch_idx) / len(train_iter),
#             1+batch_idx, len(train_iter),
#             round(100. * np.mean(all_losses), 2),
#             round(np.mean(train_accuracies), 2)), end='\r')
        
        
#         # Evaluate performance
#         # if iterations % config.dev_every == 0:
#         if 1+batch_idx == len(train_iter):
#             # Switch model to evaluation mode
#             model.eval()
#             dev_iter.init_epoch()

#             # Calculate Accuracy
#             n_dev_correct = 0
#             dev_loss = 0
#             dev_losses = []

#             for dev_batch_idx, dev_batch in enumerate(dev_iter):
#                 answer = model(dev_batch)
#                 n_dev_correct += (torch.max(answer, 1)[1].view(dev_batch.label.size()).data == \
#                     dev_batch.label.data).sum()
#                 dev_loss = criterion(answer, dev_batch.label)
#                 dev_losses.append(dev_loss.item())

#             dev_acc = 100. * n_dev_correct / len(dev)
#             dev_acc=dev_acc.item()
#             dev_accuracies.append(dev_acc)

#             print('\nDev loss: {}% - Dev accuracy: {}%'.format(round(100.*np.mean(dev_losses), 2), round(dev_acc, 2)))

#             # Update validation best accuracy if it is better than
#             # already stored
#             if dev_acc > best_dev_acc:

#                 best_dev_acc = dev_acc
#                 best_dev_epoch = 1+epoch
#                 snapshot_prefix = os.path.join(config.save_path, 'best')
#                 dev_snapshot_path = snapshot_prefix + \
#                     '_{}_{}D_devacc_{}_epoch_{}.pt'.format(config.encoder_type, config.hidden_dim, round(dev_acc, 2), 1+epoch)

#                 # save model, delete previous snapshot
#                 torch.save(model, dev_snapshot_path)
#                 for f in glob.glob(snapshot_prefix + '*'):
#                     if f != dev_snapshot_path:
#                         os.remove(f)

#             # Check for early stopping
#             if np.mean(dev_losses) < best_dev_loss:
#                 best_dev_loss = np.mean(dev_losses)
#             else:
#                 early_stopping += 1

#             if early_stopping > config.early_stopping_patience and config.optimizer != 'sgd':
#                 stop_training = True
#                 print('\nEarly stopping')

#             if config.optimizer == 'sgd' and optimizer.param_groups[0]['lr'] < 1e-5:
#                 stop_training = True
#                 print('\nEarly stopping')

#             # Update learning rate
#             scheduler.step(round(np.mean(dev_losses), 2))
#             dev_losses = []


#         # If training has completed, calculate the test scores
#         if stop_training == True or (1+epoch == config.epochs and 1+batch_idx == len(train_iter)):
#             print('\nTraining completed after {} epocs.\n'.format(1+epoch))


#             #Save the final model
#             final_snapshot_prefix = os.path.join(config.save_path, 'final')
#             final_snapshot_path = final_snapshot_prefix + \
#             '_{}_{}D.pt'.format(config.encoder_type, config.hidden_dim)
#             torch.save(model, final_snapshot_path)
#             for f in glob.glob(final_snapshot_prefix + '*'):
#                 if f != final_snapshot_path:
#                     os.remove(f)


Training started...


Epoch: 01/10	(Learning rate: 0.0005)
Progress:   5% - Batch:   21/421  - Loss:  69.51% - Accuracy:  48.94%

KeyboardInterrupt: 