In [None]:
!pip install transformers

In [None]:
import time
import datetime

import torch 
import numpy as np
import pandas as pd
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from google.colab import files
import torch.nn as nn 

from transformers import BertModel
from transformers import BertTokenizer


# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Upload the Tweets csv through the Colab file picker.
# The data-preparation cell reads it back as "Tweets.csv" from the working directory.
uploaded = files.upload()

In [None]:
np.random.seed(42)

# Read the tweets and keep only the two columns used downstream.
df = pd.read_csv("Tweets.csv")[["text", "airline_sentiment"]]

# Encode the sentiment strings as integers: positive=0, negative=1, neutral=2.
labels = ["positive", "negative", "neutral"]
d = dict(zip(labels, range(3)))
df["airline_sentiment"] = df["airline_sentiment"].apply(lambda name: d[name])

# Shuffle the row positions, then cut them 60% / 20% / remainder
# into train, validation and test index arrays.
idxs = np.arange(df.shape[0])
np.random.shuffle(idxs)

train_size = int(len(df) * 0.6)
val_size = int(len(df) * 0.2)

train_idxs = idxs[:train_size]
val_idxs = idxs[train_size:(train_size + val_size)]
test_idxs = idxs[(train_size + val_size):]

# Features (text column only) and targets for the training split.
train_rows = df.iloc[train_idxs]
y_train = train_rows["airline_sentiment"]
X_train = train_rows.drop(columns=["airline_sentiment"])

# Same for the validation split.
val_rows = df.iloc[val_idxs]
y_val = val_rows["airline_sentiment"]
X_val = val_rows.drop(columns=["airline_sentiment"])

# Raw training sentences and their integer labels as NumPy arrays.
# Note: `labels` is rebound here from the list of class names to the label array.
sentences = X_train["text"].values
labels = y_train.values


df.head(5)

In [None]:
# Fetch the pretrained tokenizer that matches the uncased BERT-base checkpoint.
print("Loading BERT tokenizer...")
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased",
    do_lower_case=True,
)

In [None]:
def tokenize(sentences, labels, max_len=70):
    """Encode sentences as fixed-length BERT inputs.

    All sentences are encoded in one call to the module-level `tokenizer`.
    Special tokens are added, and every sequence is explicitly truncated and
    padded to `max_len`, so each row has the same width.

    Args:
        sentences: iterable of raw text strings.
        labels: integer class ids, one per sentence; converted with
            `torch.tensor`.
        max_len: fixed sequence length. Defaults to 70, the value that was
            previously hard-coded.

    Returns:
        Tuple `(input_ids, attention_masks, labels)` of tensors: one row per
        sentence for the first two, plus the label tensor.
    """
    encoded = tokenizer(
        list(sentences),
        add_special_tokens=True,
        max_length=max_len,
        # `padding=` / `truncation=` replace the deprecated `pad_to_max_length`
        # flag. Truncation is requested explicitly instead of relying on the
        # implicit fallback triggered by passing `max_length` alone.
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )

    input_ids = encoded["input_ids"]
    attention_masks = encoded["attention_mask"]
    labels = torch.tensor(labels)

    return input_ids, attention_masks, labels

# Encode the training split. `labels` is rebound from a NumPy array to a tensor here.
input_ids, attention_masks, labels = tokenize(sentences=sentences, labels=labels)

# Sanity check: show the first training sentence next to its encoded token ids.
first_sentence, first_ids = sentences[0], input_ids[0]
print(f"Original: {first_sentence}")
print(f"Token IDs: {first_ids}")

In [None]:
# Bundle the encoded training tensors so each item is
# (input_ids, attention_mask, label).
train_dataset = TensorDataset(input_ids, attention_masks, labels)

BATCH_SIZE = 32

# Draw the training samples in random order, BATCH_SIZE at a time.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(
    train_dataset,
    sampler=train_sampler,
    batch_size=BATCH_SIZE,
)

In [None]:
# Define a custom classifier: BERT followed by a small feed-forward head

class BertClassifier(nn.Module):
    """BERT encoder followed by a small feed-forward classification head.

    The head maps the 768-wide vector taken at sequence position 0 through a
    50-unit ReLU layer to 3 output logits.
    """

    def __init__(self, freeze_bert=False):
        """Build the encoder and the head.

        Args:
            freeze_bert: when True, every BERT parameter has `requires_grad`
                set to False, so only the classifier head is trained.
        """
        super().__init__()

        # Layer widths: encoder output -> hidden layer -> number of classes.
        embed_dim, hidden_dim, num_classes = 768, 50, 3

        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.classifier = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_classes),
        )

        # Optionally exclude the encoder weights from gradient updates.
        if freeze_bert:
            for weight in self.bert.parameters():
                weight.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """Return the classifier logits for a batch of encoded sentences.

        Args:
            input_ids: token id tensor passed to BERT as `input_ids`.
            attention_mask: mask tensor passed to BERT as `attention_mask`.

        Returns:
            Output of the classification head (unnormalized logits).
        """
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # First element of the BERT output, sliced at sequence position 0
        # (the leading special token) for every item in the batch.
        cls_vector = encoder_out[0][:, 0, :]

        return self.classifier(cls_vector)


# Instantiate the classifier and move it to the selected device.
# `.to(device)` rather than `.cuda()`: the notebook computes a CPU fallback in
# `device`, and `.cuda()` would raise on a runtime without a GPU. This also
# matches how the batches are placed with `.to(device)` in the training loop.
model = BertClassifier()
model.to(device)

In [None]:
from transformers import get_linear_schedule_with_warmup

# Number of full passes over the training set.
epochs = 3

# NOTE(review): lr=1e-3 is far above the 2e-5 to 5e-5 range usually used when
# fine-tuning all BERT weights -- confirm this value is intentional.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

# The scheduler is stepped once per batch, so the schedule spans
# [number of epochs] x [number of batches per epoch] steps.
total_steps = epochs * len(train_dataloader)

# Linear learning-rate schedule with no warmup steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,  # Default value in run_glue.py
    num_training_steps=total_steps,
)

In [None]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Fraction of rows whose highest-scoring class equals the true label.

    Args:
        preds: 2-D array of per-class scores, one row per sample.
        labels: NumPy array of integer class ids (any shape; it is flattened).

    Returns:
        Number of matches divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == true_classes)
    return n_correct / len(true_classes)

In [None]:
import time
import datetime

def format_time(elapsed):
    '''
    Format a duration given in seconds as an h:mm:ss string.

    The value is rounded to the nearest whole second first, then rendered
    via `datetime.timedelta` (e.g. 65.4 -> "0:01:05").
    '''
    duration = datetime.timedelta(seconds=int(round(elapsed)))
    return str(duration)



In [None]:
import random
import numpy as np

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Seed every random number generator in play (Python, NumPy, PyTorch CPU and
# all CUDA devices) with the same value to make the run reproducible.
seed_val = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)

# Per-epoch records (training loss, timing) are appended here by the training loop.
training_stats = list()

# Wall-clock start of the whole run, used for the final total-time report.
total_t0 = time.time()

# Loss function: cross-entropy computed on the classifier logits.
loss_fn = nn.CrossEntropyLoss()

# Training loop: one full pass over `train_dataloader` per epoch.
# Only training is performed here; no validation pass is run inside the loop.
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    
    t0 = time.time() # Start time of this epoch.
    
    total_train_loss = 0 # Reset the accumulated loss for this epoch.
    model.train() # Put the model into training mode.

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches (skipping step 0).
        if step % 40 == 0 and not step == 0:
            
            elapsed = format_time(time.time() - t0) # Time since the epoch started, formatted by format_time.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed)) # Report progress.

        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we also copy each tensor to the selected
        # device using the `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()  # Clear gradients left over from the previous step.

        # Forward pass: compute the logits for this batch, then the loss against the labels.
        logits = model(b_input_ids, attention_mask=b_input_mask)
        loss = loss_fn(logits, b_labels)

        total_train_loss += loss.item() # Accumulate the batch loss so the epoch average can be computed at the end.

        loss.backward()  # Backward pass: compute the gradients.

        # Clip the norm of the gradients to 1.0. This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step() # Update the parameters using the computed gradients.

        
        scheduler.step() # Update the learning rate (stepped once per batch).

    
    avg_train_loss = total_train_loss / len(train_dataloader) # Average loss over all batches of this epoch.
    training_time = format_time(time.time() - t0) # How long this epoch took.

    # NOTE(review): "epcoh" in the message below is a typo for "epoch"; it is
    # left untouched here because it is runtime output.
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
        
   
    # Record the statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Training Time': training_time,
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

In [None]:
# Score every tweet in `df` (all rows, not only the training split) with the
# trained classifier and collect the logits in a DataFrame.
sentences = df["text"].values
labels = df["airline_sentiment"].values
BATCH_SIZE = 1  # The collection below works for any batch size; raise it to speed this up.

# The labels are not needed for inference, so the third return value is dropped.
input_ids, attention_masks, _ = tokenize(sentences=sentences, labels=labels)
dataset = TensorDataset(input_ids, attention_masks)

# A separate name is used so the training DataLoader is not overwritten and
# the training cell can still be re-run afterwards.
inference_dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)

# Switch to evaluation mode so dropout is disabled; otherwise the exported
# logits are noisy and differ from run to run.
model.eval()

outputs = list()

# No gradients are needed for inference; this avoids building an autograd
# graph for every forward pass.
with torch.no_grad():
    for step, batch in enumerate(inference_dataloader):

        if step % 80 == 0:
            print(f"Step:{step}/{len(inference_dataloader)}")

        # Batch-local names keep the full `input_ids` / `attention_masks`
        # tensors intact instead of rebinding them to the last batch.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)

        logits = model(b_input_ids, b_input_mask)

        # Keep every row of the batch (not just row 0) so no sample is lost
        # when BATCH_SIZE is larger than 1.
        outputs.append(logits.cpu().numpy())

# Stack the per-batch arrays into one (n_samples, 3) array.
outputs = np.concatenate(outputs, axis=0)

# Column order follows the label encoding: positive=0, negative=1, neutral=2.
output_df = pd.DataFrame(outputs, columns = ["positive", "negative", "neutral"])
print(output_df.head(5))
print(output_df.shape)

In [None]:
# Save the BERT output: write the logits DataFrame to a CSV file in the
# working directory, then download it through the browser via Colab.
from google.colab import files

filename = "bert_out.csv"
# to_csv is called with default options here, so the DataFrame index is
# written as well.
output_df.to_csv(filename) 
files.download(filename)