In [1]:
import transformers
import pandas as pd
from tqdm.notebook import tqdm
import numpy as np
import torch
#BERT data prep:

from transformers import AutoTokenizer, AutoModelForSequenceClassification, BertTokenizer, BertModel, BertForSequenceClassification


from sklearn.model_selection import train_test_split


In [2]:

# Select the compute device once; later cells move the model and batches onto it.
if not torch.cuda.is_available():
    # CPU fallback when CUDA is not usable.
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')
else:
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

No GPU available, using the CPU instead.


In [3]:
# Load the pretrained `bert-base-uncased` tokenizer with lower-casing enabled.
# This module-level `tokenizer` is the one `preprocessing_for_bert` uses to encode texts.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [4]:
# Fixed sequence length passed as `max_length` when encoding texts in `preprocessing_for_bert`.
# NOTE(review): the loaded model has 512 position embeddings, and `max_length` presumably
# already counts the `[CLS]`/`[SEP]` tokens — confirm whether 510 (rather than 512) is intended.
MAX_LEN = 510

In [5]:
# Function for preprocessing based on the steps provided: 

def preprocessing_for_bert(data):
    """
    Perform the preprocessing steps required by pretrained BERT.

    Every text is encoded with the module-level `tokenizer` and truncated or
    padded to exactly `MAX_LEN` token ids (special tokens included).

    Input:
        - Param: data: Iterable of texts (e.g. a list or a pandas Series of str).

    Returns:
        - input_ids: torch.Tensor (int64) of shape (number of texts, MAX_LEN).
        - attention_masks: torch.Tensor (int64) of the same shape; 1 marks a real
          token and 0 marks padding, so the model knows which positions to attend to.
          For empty input both tensors have shape (0, MAX_LEN).
    """
    # Per-text outputs, collected as plain Python lists of ints.
    input_ids = []
    attention_masks = []

    for sent in tqdm(data):
        # `encode_plus` tokenizes the text, adds `[CLS]`/`[SEP]`, maps tokens to ids,
        # truncates/pads to `max_length` and builds the attention mask.
        encoded_sent = tokenizer.encode_plus(
            text=sent,
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length=MAX_LEN,             # Target length for truncation and padding
            truncation=True,                # Explicitly cut texts longer than MAX_LEN
            padding='max_length',           # Replaces the deprecated `pad_to_max_length=True`
            return_attention_mask=True,     # Return attention mask
        )

        input_ids.append(encoded_sent['input_ids'])
        attention_masks.append(encoded_sent['attention_mask'])

    # Every row has exactly MAX_LEN entries, so the reshape is a no-op for
    # non-empty input; it only pins the shape (0, MAX_LEN) when `data` is empty.
    n_texts = len(input_ids)
    input_ids = torch.tensor(input_ids, dtype=torch.long).reshape(n_texts, MAX_LEN)
    attention_masks = torch.tensor(attention_masks, dtype=torch.long).reshape(n_texts, MAX_LEN)

    return input_ids, attention_masks

In [6]:
import torch
import torch.nn as nn
from transformers import BertModel

# Create the BertClassfier class
class BertClassifier(nn.Module):
    """Pretrained BERT encoder topped with a small feed-forward classification head.
    """

    def __init__(self, freeze_bert=False):
        """
        Build the encoder and the classification head.
        @param    freeze_bert (bool): when True, gradients are switched off for every
                    BERT parameter so only the head is trained; the default `False`
                    leaves BERT trainable (fine-tuning).
        """
        super().__init__()

        # Sizes: BERT hidden state -> head hidden layer -> number of labels
        bert_hidden_size = 768
        head_hidden_size = 50
        num_labels = 2

        # Pretrained encoder
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Head: Linear -> ReLU -> Dropout -> Linear
        head_layers = [
            nn.Linear(bert_hidden_size, head_hidden_size),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(head_hidden_size, num_labels),
        ]
        self.classifier = nn.Sequential(*head_layers)

        # Nothing more to do when BERT stays trainable
        if not freeze_bert:
            return

        for weight in self.bert.parameters():
            weight.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """
        Run BERT and the classification head to produce logits.
        @param    input_ids (torch.Tensor): token ids with shape (batch_size,
                    max_length)
        @param    attention_mask (torch.Tensor): attention mask with shape
                    (batch_size, max_length)
        @return   logits (torch.Tensor): scores with shape (batch_size,
                    num_labels)
        """
        bert_outputs = self.bert(input_ids=input_ids,
                                 attention_mask=attention_mask)

        # First element of the BERT output holds the per-token hidden states;
        # position 0 along the sequence axis is the `[CLS]` token.
        token_states = bert_outputs[0]
        cls_state = token_states[:, 0, :]

        return self.classifier(cls_state)

In [7]:
from transformers import AdamW, get_linear_schedule_with_warmup

def initialize_model(epochs=4):
    """Initialize the Bert Classifier and its optimizer.

    @param    epochs (int): kept for backward compatibility with existing callers;
                it is not used, because no learning-rate scheduler is built here.
    @return   (bert_classifier, optimizer): the `BertClassifier` moved to `device`
                and an AdamW optimizer over all of its parameters.
    """
    # Instantiate Bert Classifier with a trainable (non-frozen) encoder
    bert_classifier = BertClassifier(freeze_bert=False)

    # Move the model to the selected device (GPU when available, else CPU)
    bert_classifier.to(device)

    # `transformers.AdamW` is deprecated in favour of the PyTorch implementation.
    # `weight_decay=0.0` keeps the value the transformers optimizer used by default
    # (torch.optim.AdamW would otherwise apply 0.01).
    optimizer = torch.optim.AdamW(
        bert_classifier.parameters(),
        lr=3e-5,
        eps=1e-8,
        weight_decay=0.0,
    )

    return bert_classifier, optimizer

In [14]:
# Build the classifier (already moved to `device` by `initialize_model`) and its optimizer.
bert_classifier, optimizer = initialize_model(epochs=2)
# Alternative kept for reference: construct the classifier directly, without an optimizer.
# model = BertClassifier(freeze_bert=False)
# `model` is an alias of `bert_classifier`: both names refer to the same object.
model = bert_classifier



In [15]:
# model.load_state_dict(torch.load('bert_news_sentiment.pth', map_location=torch.device('cpu')))
# model.eval()  # Set the model to evaluation mode


RuntimeError: Error(s) in loading state_dict for BertClassifier:
	Unexpected key(s) in state_dict: "bert.embeddings.position_ids". 

In [16]:
# Restore the fine-tuned weights from the epoch-1 checkpoint.
# `torch.load` unpickles the file; `weights_only=True` limits unpickling to tensors
# and plain containers, so a tampered checkpoint cannot run arbitrary code.
state_dict = torch.load(
    'bert_news_sentiment_epoch1.pth',
    map_location=torch.device('cpu'),
    weights_only=True,
)
model.load_state_dict(state_dict)
model.eval()  # Set the model to evaluation mode


BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

In [25]:
import torch.nn.functional as F

def bert_predict(model, test_dataloader):
    """Run the trained model over a dataloader and return class probabilities.

    The model is switched to evaluation mode, each batch is passed through it
    without gradient tracking, and the concatenated logits are converted to
    probabilities with a softmax over dimension 1.

    @param    model: module called as `model(input_ids, attention_mask)`.
    @param    test_dataloader: iterable of batches; the first two tensors of each
                batch are used as input ids and attention mask.
    @return   numpy array of softmax probabilities, one row per input row.
    """
    # Evaluation mode (set before any forward pass)
    model.eval()

    batch_logits = []
    with torch.no_grad():
        for batch in test_dataloader:
            # Move the batch tensors to the selected device, then keep ids and mask
            on_device = [tensor.to(device) for tensor in batch]
            ids, mask = on_device[:2]
            batch_logits.append(model(ids, mask))

    # Stack the per-batch logits along the batch axis
    stacked = torch.cat(batch_logits, dim=0)

    return F.softmax(stacked, dim=1).cpu().numpy()

In [30]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

def predict_sentiment_local(path, threshold=0.9, batch_size=32):
    """Score the texts of a local CSV file with the loaded `bert_classifier`.

    @param    path: CSV file to read; it must contain a `text` column.
    @param    threshold (float): a row is labelled 1 when its class-1 probability
                is strictly greater than this value, otherwise 0.
    @param    batch_size (int): number of rows per forward pass.
    @return   (sentiment, probs): `sentiment` is the 0/1 label of the FIRST row
                only; `probs` is the numpy array of softmax probabilities for
                every row.
    @raise    ValueError: if the CSV has no `text` column or contains no rows.
    """
    df = pd.read_csv(path)
    if 'text' not in df.columns:
        raise ValueError(f"{path!r} has no 'text' column")
    if df.empty:
        # Without this check the failure surfaces later, when no logits can be concatenated.
        raise ValueError(f"{path!r} contains no rows to score")
    text = df['text']

    # Preprocess: texts -> fixed-length id / mask tensors
    test_inputs, test_masks = preprocessing_for_bert(text)

    # Sequential (unshuffled) batches so that row order is preserved in `probs`
    test_dataset = TensorDataset(test_inputs, test_masks)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Compute predicted probabilities on the test set
    probs = bert_predict(bert_classifier, test_dataloader)

    # Threshold the class-1 probability to get one 0/1 label per row
    preds = np.where(probs[:, 1] > threshold, 1, 0)

    # Only the first row's label is returned as `sentiment`
    sentiment = preds[0]

    return sentiment, probs

In [31]:
# CSV file to score; `predict_sentiment_local` reads it and uses its `text` column.
path = "sample5_1.csv"

In [32]:
# `sentiment` is the 0/1 label of the first CSV row only;
# `probs` holds the softmax probabilities for every row.
sentiment, probs = predict_sentiment_local(path)
print(sentiment)
print(probs)

  0%|          | 0/1 [00:00<?, ?it/s]



1
[[0.07640223 0.92359775]]


In [35]:
# Encode a single ad-hoc sentence directly with the tokenizer, truncating/padding
# it to a fixed length and returning PyTorch tensors.
# NOTE(review): this cell uses max_length=512 while `preprocessing_for_bert` uses
# MAX_LEN = 510 — confirm which length the model should be fed.
raw_text = "Imagine your life B"
inputs = tokenizer.encode_plus(
    raw_text,
    add_special_tokens=True,
    max_length=512,  # Max length for BERT
    truncation=True,
    padding='max_length',
    return_tensors='pt'  # Return PyTorch tensors
)
