In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 7.7MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 36.5MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 56.4MB/s 
Installing collected packages: tokenizers, sacremoses, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1


In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import pickle


# torchtext libraries
from torchtext.legacy.data import Field, TabularDataset, BucketIterator, Iterator, Dataset

# huggingface libraries
from transformers import BertTokenizer, BertForSequenceClassification


In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




### Data Loading

In [5]:
MAX_SEQ_LEN = 128
PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)
label_field = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.long)
text_field = Field(use_vocab=False, tokenize=tokenizer.encode, lower=False, include_lengths=False, batch_first=True,
                   fix_length=MAX_SEQ_LEN, pad_token=PAD_INDEX, unk_token=UNK_INDEX)
fields = [('text', text_field), ('label', label_field)]
train, valid, test = TabularDataset.splits(path='preprocessed_data/', train='trn_title.csv', validation='val_title.csv',
                                           test='tst_title.csv', format='CSV', fields=fields, skip_header=True)

In [6]:
train_iter = BucketIterator(train, batch_size=8, sort_key=lambda x: len(x.text),
                            device=device, train=True, sort=True, sort_within_batch=True)
valid_iter = BucketIterator(valid, batch_size=8, sort_key=lambda x: len(x.text),
                            device=device, train=True, sort=True, sort_within_batch=True)
test_iter = Iterator(test, batch_size=8, device=device, train=False, shuffle=False, sort=False)

## **BERT MODEL**

In [7]:
class BERT(nn.Module):

    def __init__(self):
        super(BERT, self).__init__()

        self.encoder = BertForSequenceClassification.from_pretrained('bert-base-uncased')

    def forward(self, text, label):
        loss, pred = self.encoder(text, labels=label)[:2]
        pred = torch.argmax(pred, dim=1)
        return loss, pred

In [8]:
def train(model,
          optimizer,
          criterion = nn.BCELoss(),
          train_loader = train_iter,
          valid_loader = valid_iter,
          num_epochs = 10,):
    
    # initialize values
    running_loss = 0.0
    total_acc = 0
    # training loop
    # model.train()
    total_acc = 0
    best_val_acc = 0
    for epoch in range(num_epochs):
        epoch_loss = 0.0
        test_acc = 0
        val_acc = 0
        test_seen = 0
        val_seen = 0
        model.train()
        for batch in train_loader: 
            optimizer.zero_grad()  
            text = batch.text
            labels = batch.label
            labels = labels.to(device)
            text = text.to(device)
            output = model(text, labels)
            loss, pred = output
            test_acc += torch.eq(pred, labels).sum().item()
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            epoch_loss += loss.item()
            test_seen += len(batch)
        test_acc = test_acc/test_seen
        print('The training accuracy for epoch {epoch} is {test_acc}'.format(epoch=epoch+1, test_acc=test_acc))
        print('The cumulative loss for epoch {epoch} is {epoch_loss}'.format(epoch=epoch+1, epoch_loss=epoch_loss))
        # validation 
        model.eval()
        for batch in valid_loader:   
            text = batch.text
            labels = batch.label
            labels = labels.to(device)
            text = text.to(device)
            output = model(text, labels)
            loss, pred = output
            val_acc += torch.eq(pred, labels).sum().item()
            val_seen += len(batch)
        val_acc = val_acc/val_seen
        print('The validation accuracy for epoch {epoch} is {val_acc}'.format(epoch=epoch+1, val_acc=val_acc))
        if(val_acc > best_val_acc):
          best_val_acc = val_acc
          torch.save(model.state_dict(), 'weights.pt')

In [9]:
model = BERT()
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.BCELoss()
train(model, optimizer=optimizer, criterion=criterion, num_epochs=20)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

The training accuracy for epoch 1 is 0.9582055833467807
The cumulative loss for epoch 1 is 439.3697469012841
The validation accuracy for epoch 1 is 0.7391549932218707
The training accuracy for epoch 2 is 0.9826690334032596
The cumulative loss for epoch 2 is 192.85473932541936
The validation accuracy for epoch 2 is 0.7086534116583822
The training accuracy for epoch 3 is 0.9912861061804099
The cumulative loss for epoch 3 is 107.16719352501241
The validation accuracy for epoch 3 is 0.8120198825124265
The training accuracy for epoch 4 is 0.9935775375181539
The cumulative loss for epoch 4 is 79.31527341130277
The validation accuracy for epoch 4 is 0.8404880253050158
The training accuracy for epoch 5 is 0.9951589478780055
The cumulative loss for epoch 5 is 60.94451177493829
The validation accuracy for epoch 5 is 0.8431992769995481
The training accuracy for epoch 6 is 0.9968049055994836
The cumulative loss for epoch 6 is 42.339954667319034
The validation accuracy for epoch 6 is 0.913804789877

In [10]:
model = BERT()
model = model.to(device)
model.load_state_dict(torch.load('weights.pt'))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

<All keys matched successfully>

In [11]:
def predict(text, label, model):
  model.eval()
  text = tokenizer.encode(text)
  text=torch.tensor(text)
  text = text.unsqueeze(0)
  label = torch.tensor(label)
  label = label.unsqueeze(0)
  text = text.to(device)
  label = label.to(device)
  model=model.to(device)
  loss, pred = model(text, label)
  return pred.item()
  

In [12]:
def test(test_loader = test_iter):
  model.eval()
  tst_acc = 0
  tst_seen = 0
  for batch in test_loader:   
    text = batch.text
    labels = batch.label
    labels = labels.to(device)
    text = text.to(device)
    output = model(text, labels)
    loss, pred = output
    tst_acc += torch.eq(pred, labels).sum().item()
    tst_seen += len(batch)
  tst_acc = tst_acc/tst_seen
  print('The test accuracy is {tst_acc}'.format(tst_acc=tst_acc))

In [13]:
test()

The test accuracy is 0.9484978540772532


In [14]:
text = 'Maine GOP Governor’s Statement On Drug Overdoses Proves ‘Pro-Life Republicans’ Don’t Exist'
label = 0
output = predict(text, label, model)
print(output)

0
