Based on https://skimai.com/fine-tuning-bert-for-sentiment-analysis/

In [1]:
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
import torch

if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [37]:
from transformers import BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Create a function to tokenize a set of texts
def preprocessing_for_bert(data):
    """Perform required preprocessing steps for pretrained BERT.
    @param    data (np.array): Array of texts to be processed.
    @return   input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
    @return   attention_masks (torch.Tensor): Tensor of indices specifying which
                  tokens should be attended to by the model.
    """
    # Create empty lists to store outputs
    input_ids = []
    attention_masks = []

    # For every sentence...
    for sent in data:
        # `encode_plus` will:
        #    (1) Tokenize the sentence
        #    (2) Add the `[CLS]` and `[SEP]` token to the start and end
        #    (3) Truncate/Pad sentence to max length
        #    (4) Map tokens to their IDs
        #    (5) Create attention mask
        #    (6) Return a dictionary of outputs
        encoded_sent = tokenizer.encode_plus(
            text=str(sent),  # Preprocess sentence
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length=MAX_LEN,                  # Max length to truncate/pad
            padding='max_length',         # Pad sentence to max length
            #return_tensors='pt',           # Return PyTorch tensor
            return_attention_mask=True      # Return attention mask
            )
        
        # Add the outputs to the lists
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks

In [41]:
train_data = pd.read_csv('./data/downloaded/twitter-train-clean.tsv', sep='\t', names=['label', 'text'])
test_data = pd.read_csv('./data/downloaded/twitter-test-clean.tsv', sep='\t', names=['label', 'text'])
train_data.dropna(inplace=True)
test_data.dropna(inplace=True)

train_data.head()

Unnamed: 0,label,text
0,neutral,Won the match getin . Plus tomorrow is a very ...
1,neutral,Some areas of New England could see the first ...
2,negative,worst QB. DEFINITELY Tony Romo. The man who li...
3,neutral,Thailand Washington US President Barack Obama ...
4,neutral,Did yall hear what Tony Romo dressed up as for...


In [42]:


#  Encode our concatenated data
encoded_tweets = [tokenizer.encode(str(sent), add_special_tokens=True) for sent in np.concatenate([train_data.text.values, test_data.text.values])]

# Find the maximum length
max_len = max([len(sent) for sent in encoded_tweets])
print('Max length: ', max_len)

MAX_LEN = 64

print('Before removing sequences', 'Train:', len(train_data), 'Test:', len(test_data))
indices = train_data['text'].apply(lambda x: len(tokenizer.encode(str(x), add_special_tokens=True)) < MAX_LEN)
train_data = train_data[indices]
indices = test_data['text'].apply(lambda x: len(tokenizer.encode(str(x), add_special_tokens=True)) < MAX_LEN)
test_data = test_data[indices]
print('After removing sequences', 'Train:', len(train_data), 'Test:', len(test_data))

Token indices sequence length is longer than the specified maximum sequence length for this model (842 > 512). Running this sequence through the model will result in indexing errors


Max length:  1391
Before removing sequences Train: 40297 Test: 9834
After removing sequences Train: 40283 Test: 9827


In [43]:
from sklearn.model_selection import train_test_split
# Split data

x_train, x_val, y_train, y_val = train_test_split(train_data.text.values, train_data.label.values, test_size=0.1, random_state=2020)

In [44]:
print('Tokenizing data...')
train_inputs, train_masks = preprocessing_for_bert(x_train)
val_inputs, val_masks = preprocessing_for_bert(x_val)
print('Done!')

Tokenizing data...
Done!


In [45]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

def create_data_loaders():
    # Convert other data types to torch.Tensor
    train_labels = torch.tensor(y_train)
    val_labels = torch.tensor(y_val)

    # For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
    batch_size = 32

    # Create the DataLoader for our training set
    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    # Create the DataLoader for our validation set
    val_data = TensorDataset(val_inputs, val_masks, val_labels)
    val_sampler = SequentialSampler(val_data)
    val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

    return train_dataloader, val_dataloader

train_dataloader, val_dataloader = create_data_loaders()

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.