##imports/dataloading/installations

In [15]:
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

###Install Huggingface library

In [16]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


###Set up Device

In [17]:
import torch

if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are/is {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")


There are/is 1 GPU(s) available.
Device name: Tesla T4


### Change Folder, get Data

In [18]:
# mounting drive
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
# changing to folder:
%cd drive/MyDrive/ml_series4/Series4_Datasets
!ls

[Errno 2] No such file or directory: 'drive/MyDrive/ml_series4/Series4_Datasets'
/content/drive/MyDrive/ml_series4/Series4_Datasets
bbc-text_test.csv  bbc-text_train.csv  BERT_classifier_news_data.pt


In [20]:
data = pd.read_csv("bbc-text_train.csv")
data_test= pd.read_csv("bbc-text_test.csv")
data_test.sample(5)

Unnamed: 0,category,text
205,business,umbro profits lifted by euro 2004 uk sportswea...
117,business,lse sets date for takeover deal the london s...
418,business,car giant hit by mercedes slump a slump in pro...
94,politics,election could be terror target terrorists m...
5,business,jobs go at oracle after takeover oracle has an...


###Label Encoder

In [21]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(data['category'])
print(le.classes_)
data['category'] = le.transform(data['category'])
data_test['category'] = le.transform(data_test['category'])
data_test.head()

['business' 'entertainment' 'politics' 'sport' 'tech']


Unnamed: 0,category,text
0,2,brown and blair face new rift claims for the u...
1,0,small firms hit by rising costs rising fuel ...
2,1,spirit awards hail sideways the comedy sideway...
3,4,microsoft releases patches microsoft has warne...
4,3,arsenal through on penalties arsenal win 4-2 o...


##Tokenization and Formatting

###Preprocessing

In [22]:
def text_preprocessing(text):
    """
    - Remove entity mentions (eg. '@united')
    - Correct errors (eg. '&amp;' to '&')
    @param    text (str): a string to be processed.
    @return   text (Str): the processed string.
    """
    # Remove '@name'
    text = re.sub(r'(@.*?)[\s]', ' ', text)

    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)

    # Remove trailing whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

###Tokenization

In [23]:
from transformers import BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Create a function to tokenize a set of texts
def preprocessing_for_bert(data):
    """Perform required preprocessing steps for pretrained BERT.
    @param    data (np.array): Array of texts to be processed.
    @return   input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
    @return   attention_masks (torch.Tensor): Tensor of indices specifying which
                  tokens should be attended to by the model.
    """
    # Create empty lists to store outputs
    input_ids = []
    attention_masks = []

    # For every sentence...
    for sent in data:
        # cut too long sentences, only take the end
        

        # `encode_plus` will:
        #    (1) Tokenize the sentence
        #    (2) Add the `[CLS]` and `[SEP]` token to the start and end
        #    (3) Truncate/Pad sentence to max length
        #    (4) Map tokens to their IDs
        #    (5) Create attention mask
        #    (6) Return a dictionary of outputs
        encoded_sent = tokenizer.encode_plus(
            text=text_preprocessing(sent),  # Preprocess sentence
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length=512,                  # Max length to truncate/pad
            pad_to_max_length=True,         # Pad sentence to max length

            # !!! not used in newest version, but it did not make a difference anyway
            # truncation_side="left" ,        # Cut longer documents at the end
            
            #return_tensors='pt',           # Return PyTorch tensor
            return_attention_mask=True      # Return attention mask
            )
        
        # Add the outputs to the lists
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks

##Getting the Model

###Define Model architecture

In [24]:
%%time
import torch
import torch.nn as nn
from transformers import BertModel, BertConfig

# Create the BertClassfier class
class BertClassifier(nn.Module):
    """Bert Model for Classification Tasks.
    """
    def __init__(self, freeze_bert=False, num_labels=5):
        """
        @param    bert: a BertModel object
        @param    classifier: a torch.nn.Module classifier
        @param    freeze_bert (bool): Set `False` to fine-tune the BERT model
        """
        super(BertClassifier, self).__init__()
        # Specify hidden size of BERT, hidden size of our classifier, and number of labels
        D_in, H, D_out = 768, 50, num_labels

        # Instantiate BERT model
        # here we do not need to download the pretrained version, since we are using our fine-tuned model
        bert_config = BertConfig() # Configuration (i.e. some hyperparameters) for bert-base-uncased
        self.bert = BertModel(bert_config)

        # one-layer feed-forward classifier
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            #nn.Dropout(0.5),
            nn.Linear(H, D_out)
        )

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """
        Feed input to BERT and the classifier to compute logits.
        @param    input_ids (torch.Tensor): an input tensor with shape (batch_size,
                      max_length)
        @param    attention_mask (torch.Tensor): a tensor that hold attention mask
                      information with shape (batch_size, max_length)
        @return   logits (torch.Tensor): an output tensor with shape (batch_size,
                      num_labels)
        """
        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)

        # Extract the last hidden state of the token `[CLS]` for classification task
        last_hidden_state_cls = outputs[0][:, 0, :]

        # Feed input to classifier to compute logits
        logits = self.classifier(last_hidden_state_cls)

        return logits

CPU times: user 45 µs, sys: 0 ns, total: 45 µs
Wall time: 48.2 µs


###Loading the Model

In [25]:
# loading the model
model = BertClassifier()
model.load_state_dict(torch.load("BERT_classifier_news_data.pt"))
model.to(device)
# note to myself: map_location=device is important if using cpu, the default is cuda
# the device varialbe is defined above in Set up GPU for training.

BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

##Make a prediction on the test set

####get random datapoint

In [26]:
# get a random datapoint
data_test_sample = data_test.sample(1)
data_test_sample

Unnamed: 0,category,text
155,0,call to overhaul uk state pension the uk pensi...


####make a dataloader for efficiency

In [27]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
# Run `preprocessing_for_bert` on the test set
print('Tokenizing data...')
test_inputs, test_masks = preprocessing_for_bert(data_test.text)

# Create the DataLoader for our test set
test_dataset = TensorDataset(test_inputs, test_masks)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=16)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Tokenizing data...




###Prediction

In [28]:
import torch.nn.functional as F

def bert_predict(model, test_dataloader):
    """Perform a forward pass on the trained BERT model to predict probabilities
    on the test set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    all_logits = []

    # For each batch in our test set...
    for batch in test_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask = tuple(t.to(device) for t in batch)[:2]

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
        all_logits.append(logits)
    
    # Concatenate logits from each batch
    all_logits = torch.cat(all_logits, dim=0)

    # Apply softmax to calculate probabilities
    probs = F.softmax(all_logits, dim=1).cpu().numpy()

    return probs

In [29]:
!pip install torchmetrics

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchmetrics
  Downloading torchmetrics-0.9.1-py3-none-any.whl (419 kB)
[K     |████████████████████████████████| 419 kB 24.5 MB/s 
Installing collected packages: torchmetrics
Successfully installed torchmetrics-0.9.1


In [30]:
from torchmetrics import F1Score


# Compute predicted probabilities on the test set
probs = bert_predict(model, test_dataloader)

# Get predictions from the probabilities
preds = [np.argmax(prob) for prob in probs]

#make two tensors to compare
preds = torch.tensor(preds)
y_true = torch.tensor(data_test.category)

# get micro F1 score
f1_micro = F1Score(num_classes=5)
f1_micro(preds, y_true)


tensor(0.9730)