<a href="https://colab.research.google.com/github/CarloCHEN/Tweet-Sentiment-Extraction_Kaggle/blob/master/Sentiment_prediction_by_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# import tools
import os
from nltk.tokenize import sent_tokenize

This notebook introduces a way to predict the sentiment of a tweet by using the pre-trained BERT model.
Original tweets and their sentiment are used to predict if a new tweet is negative, neutral or positive.

The model being used: pretrained_bert_base_uncased

In [5]:
# load the model
pip install pytorch_pretrained_bert


Collecting pytorch_pretrained_bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |██▋                             | 10kB 21.1MB/s eta 0:00:01[K     |█████▎                          | 20kB 1.8MB/s eta 0:00:01[K     |████████                        | 30kB 2.6MB/s eta 0:00:01[K     |██████████▋                     | 40kB 1.7MB/s eta 0:00:01[K     |█████████████▎                  | 51kB 2.1MB/s eta 0:00:01[K     |███████████████▉                | 61kB 2.5MB/s eta 0:00:01[K     |██████████████████▌             | 71kB 2.9MB/s eta 0:00:01[K     |█████████████████████▏          | 81kB 2.2MB/s eta 0:00:01[K     |███████████████████████▉        | 92kB 2.5MB/s eta 0:00:01[K     |██████████████████████████▌     | 102kB 2.8MB/s eta 0:00:01[K     |█████████████████████████████▏  | 112kB 2.8MB/s eta 0:00:01[K     |██████████████████████

In [6]:
# import libs, tools and models
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn.utils.rnn import pad_sequence
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertForTokenClassification, BertAdam

Using TensorFlow backend.


In [0]:
# Code to read csv file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
# load dataset from google drive
link = 'https://drive.google.com/open?id=1P4z6napWZDIZS7hFhnAL3GUYnxpz1aPS'
fluff, id = link.split('=')
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('train_1.csv')  
train = pd.read_csv('train_1.csv')

In [9]:
# use the tokenizer from the model "bert_base_uncased"
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

100%|██████████| 231508/231508 [00:00<00:00, 5810528.99B/s]


In [0]:
# use CUDA instead of cpu to accelerate the computation
# gpu is required as the runtime
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [11]:
# check out device type (cpu or CUDA)
device

device(type='cuda')

In [0]:
# 'text' which is the original tweets would be the independent variable
sentences = train['text']

In [0]:
# tokenizing and put tokens in a list
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

In [14]:
# check out the tokens
tokenized_texts[1]

['oh', '!', 'good', 'idea', 'about', 'putting', 'them', 'on', 'ice', 'cream']

In [15]:
# check what is the max length of the tokenized tweet
# this parameter 'max_len' is gonna be used in the following sequencing function
max_len = 0
for i in range(len(tokenized_texts)):
    if (len(tokenized_texts[i]) > max_len):
        max_len = len(tokenized_texts[i])
    
max_len

108

In [0]:
# Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the vocabulary.
# pad_sequencing the ids
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=108, dtype="long", truncating="post", padding="post")

In [17]:
# check out the ids
input_ids

array([[ 2985,  1996,  2972, ...,     0,     0,     0],
       [ 2821,   999,  2204, ...,     0,     0,     0],
       [ 2758,  2204,  1006, ...,     0,     0,     0],
       ...,
       [ 2652, 19219, 21940, ...,     0,     0,     0],
       [ 2156,  1057,  9061, ...,     0,     0,     0],
       [ 5292,  5292,  1010, ...,     0,     0,     0]])

In [0]:
# use the sentiment info in the dataset 
# please note that the original code for neg, neu, pos (-1, 0 ,1) is not suitable for bert
# as labels in bert start from 0
# so the sentiment codes are transformed to (0, 1, 2) by incrementing 1
sentiment = train['sentiment'] + 1
sentiment = sentiment.values
sentiment = sentiment.reshape((sentiment.shape[0],1))

In [0]:
# reshape the sentiment array 
# so that it has the same shape with that of the input_ids'
segment = sentiment
for i in range(0,107):
    sentiment=np.concatenate((sentiment,segment),axis=1)

In [20]:
# check out the shape
sentiment.shape

(27485, 108)

In [0]:
# create attention masks for the input_ids
attention_masks = [[float(i>0) for i in ii] for ii in input_ids]

In [0]:
# train test split the data
tr_inputs, val_inputs, tr_sentiment, val_sentiment = train_test_split(input_ids, sentiment, 
                                                            random_state=2018, test_size=0.1)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)

In [0]:
# transforming codes into tensors
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_sentiment = torch.tensor(tr_sentiment)
val_sentiment = torch.tensor(val_sentiment)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

In [24]:
# check out the shape
tr_sentiment.shape

torch.Size([24736, 108])

In [0]:
# transforming the data
train_data = TensorDataset(tr_inputs, tr_masks, tr_sentiment)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

valid_data = TensorDataset(val_inputs, val_masks, val_sentiment)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=32)

In [26]:
# initialize the model
# number of labels is 3 (0, 1, 2)
model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=3)
model = model.cuda()

100%|██████████| 407873900/407873900 [00:05<00:00, 76372578.37B/s]


In [0]:
# setting up the optimizer
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

In [28]:
# use seqeval
%pip install seqeval

Collecting seqeval
  Downloading https://files.pythonhosted.org/packages/34/91/068aca8d60ce56dd9ba4506850e876aba5e66a6f2f29aa223224b50df0de/seqeval-0.0.12.tar.gz
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-0.0.12-cp36-none-any.whl size=7424 sha256=e663ed4120d387718b62ade8b1337280592686b0fddb6dc34db9ddffdf00feb5
  Stored in directory: /root/.cache/pip/wheels/4f/32/0a/df3b340a82583566975377d65e724895b3fad101a3fb729f68
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-0.0.12


In [0]:
# create a method for accuracy scoring
from seqeval.metrics import f1_score

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [0]:
# create a list with label description for modeling interpretability purpose
# '0' for 'negative', '1' for 'neutral', '2' for 'positive'
tags_vals = ['negative', 'neutral', 'positive']

In [31]:
# model training and validation with 4 epochs
epochs = 4
max_grad_norm = 1.0

for _ in trange(epochs, desc="Epoch"):
    # TRAIN loop
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # forward pass
        loss = model(b_input_ids, token_type_ids=None,
                     attention_mask=b_input_mask, labels=b_labels)
        # backward pass
        loss.backward()
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        model.zero_grad()
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))
    # VALIDATION on validation set
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []

    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():
            tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                                  attention_mask=b_input_mask, labels=b_labels)
            logits = model(b_input_ids, token_type_ids=None,
                           attention_mask=b_input_mask)
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.append(label_ids)
        
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy
        
        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
    eval_loss = eval_loss/nb_eval_steps
    print("Validation loss: {}".format(eval_loss))
    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    pred_tags = [tags_vals[p_i] for p in predictions for p_i in p]
    valid_tags = [tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
    print("F1-Score: {}".format(f1_score(pred_tags, valid_tags)))

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Train loss: 0.6889187627282828
Validation loss: 0.6485453397035599
Validation Accuracy: 0.7762002648950074


Epoch:  25%|██▌       | 1/4 [05:28<16:26, 328.69s/it]

F1-Score: 0.418236301369863
Train loss: 0.4769166450065598
Validation loss: 0.624666781792807
Validation Accuracy: 0.7912843834461968


Epoch:  50%|█████     | 2/4 [10:57<10:57, 328.69s/it]

F1-Score: 0.4570392201314895
Train loss: 0.320456727876546
Validation loss: 0.7894930423692216
Validation Accuracy: 0.7796017154132883


Epoch:  75%|███████▌  | 3/4 [16:26<05:28, 328.75s/it]

F1-Score: 0.44946868641193755
Train loss: 0.20976736780766875
Validation loss: 0.7938755804369616
Validation Accuracy: 0.7838572991847098


Epoch: 100%|██████████| 4/4 [21:55<00:00, 328.85s/it]

F1-Score: 0.5074251195570099





In [32]:
# model evaluation and validation
model.eval()
predictions = []
true_labels = []
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
for batch in valid_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
        tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                              attention_mask=b_input_mask, labels=b_labels)
        logits = model(b_input_ids, token_type_ids=None,
                       attention_mask=b_input_mask)
        
    logits = logits.detach().cpu().numpy()
    predictions.extend([list(p) for p in np.argmax(logits, axis=2)])

    label_ids = b_labels.to('cpu').numpy()
    true_labels.append(label_ids)
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)

    eval_loss += tmp_eval_loss.mean().item()
    eval_accuracy += tmp_eval_accuracy

    nb_eval_examples += b_input_ids.size(0)
    nb_eval_steps += 1

pred_tags = [[tags_vals[p_i] for p_i in p] for p in predictions]
valid_tags = [[tags_vals[l_ii] for l_ii in l_i] for l in true_labels for l_i in l ]
print("Validation loss: {}".format(eval_loss/nb_eval_steps))
print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
print("Validation F1-Score: {}".format(f1_score(pred_tags, valid_tags)))


Validation loss: 0.7938755804369616
Validation Accuracy: 0.7838572991847098
Validation F1-Score: 0.7268093781855249


In [0]:
# a method for predicting the sentiment, given a sentence of tweet
# print the coefficients for prediction 
def sentiment_predict(sentence):
    text = sentence
    tkns = tokenizer.tokenize(text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tkns)
    segments_ids = [0] * len(tkns)
    tokens_tensor = torch.tensor([indexed_tokens]).to(device)
    segments_tensors = torch.tensor([segments_ids]).to(device)
    print(tokens_tensor)
    print(segments_tensors)
    model.eval()
    prediction = []
    logit = model(tokens_tensor, token_type_ids=None,
                                  attention_mask=segments_tensors)
    logit = logit.detach().cpu().numpy()
    print(logit)
    prediction.extend([list(p) for p in np.argmax(logit, axis=2)])
    for k, j in enumerate(prediction[0]):
        if j==1 or j==0:
            print(tokenizer.convert_ids_to_tokens(tokens_tensor[0].to('cpu').numpy())[k], j)


In [42]:
# test with a sentence
# '1' stands for 'neutral'
sentiment_predict("ops")

tensor([[23092]], device='cuda:0')
tensor([[0]], device='cuda:0')
[[[-1.9385886   2.6365848  -0.35850373]]]
ops 1
