<a href="https://colab.research.google.com/github/ErcongLuo/GU_ANLY580_FinalProject/blob/main/Project580Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BERT Emotion Classification using COVID-19 Twitter Data

## Import Dataset

In [None]:
import pandas as pd

# Load the dataset into a pandas dataframe.
# engine="python" selects pandas' pure-Python CSV parser instead of the default C one.
df = pd.read_csv("TwitterSentimentDataset2.csv", engine="python")

# Report the number of sentences.
print('Number of training sentences: {:,}\n'.format(df.shape[0]))

# Display 10 random rows from the data (the cell's last expression is rendered).
# The stray trailing backslash was removed: a line continuation with nothing
# after it is a syntax error when it is the last line of the cell.
df.sample(10)

In [None]:
# pd.factorize returns (codes, uniques). The integer codes become the training
# label; the uniques are kept so a code can be turned back into its emotion
# name with emote_labels[code]. Codes follow order of first appearance.
# NOTE: rows whose emotion is missing receive code -1, which is not a valid
# class id for the classifier.
emote_codes, emote_labels = pd.factorize(df.emotion)
df['emote'] = emote_codes

In [None]:
# Class balance by emotion name.
df["emotion"].value_counts()

In [None]:
# Class balance by integer label code (should mirror the counts by name).
df["emote"].value_counts()

In [None]:
# Pull the tweet text out of the frame as a plain Python list and preview it.
tweets = df.full_text.to_list()
tweets[:10]

In [None]:
# First tweet as loaded, before the optional cleaning cell further down.
tweets[0]

In [None]:
# Tweet at index 30 as loaded, before the optional cleaning cell further down.
tweets[30]

### Cleaning Subroutine

In [None]:
# Switch for the cleaning cell below: when True, @-mentions and links in every
# tweet are replaced with a space; when False the text is left untouched.
clean = False

In [None]:
import re

# Raw strings keep the regex escapes (\S, \w) away from Python's own
# string-escape processing, which warns about unrecognised escapes in
# ordinary string literals. The patterns match exactly what they did before.
MENTION_RE = re.compile(r"@\S+")      # "@" followed by non-whitespace (user handles)
LINK_RE = re.compile(r"\w+://\S+")    # scheme://rest-of-link

if clean:
    # Replace mentions first, then links, each with a single space.
    # Slice assignment updates the existing list object in place.
    tweets[:] = [LINK_RE.sub(" ", MENTION_RE.sub(" ", text)) for text in tweets]
    df.full_text = tweets

In [None]:
# First tweet after the cleaning cell (unchanged when `clean` is False).
tweets[0]

In [None]:
# Tweet at index 30 after the cleaning cell (unchanged when `clean` is False).
tweets[30]

### Split Data

In [None]:
from sklearn.model_selection import train_test_split

# Keep 90% of the rows for training and hold out the rest for testing;
# the fixed random_state makes the split repeatable.
df_train, df_test = train_test_split(
    df,
    train_size=0.9,
    random_state=42,
)

In [None]:
# Class balance inside the training split.
df_train["emotion"].value_counts()

### Install Dependencies

In [None]:
# %pip installs into the environment of the running kernel, whereas !pip runs
# whichever pip is first on the shell PATH.
# datasets is held below 3.0 because the training cell imports
# datasets.load_metric, which later releases no longer provide.
%pip install -q "datasets<3.0" transformers torch

### Hyperparameters

In [None]:
batch_size = 32  # rows per batch when batch_data is called
epochs = 6  # passes over the batches made by runner when train=True
lr_init = 5e-5  # learning rate passed to the AdamW optimizer
max_len = 400 # placeholder; overwritten in the tokenizer cell below
warmup_steps = 2  # num_warmup_steps passed to the linear LR schedule

### Batched Inputs

In [None]:
from transformers import DistilBertTokenizer, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
#tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

# Length, in tokens, of the longest tweet once the special tokens are added.
# default=0 keeps the previous result (0) when there are no tweets.
longest = max(
    (len(tokenizer.encode(sent, add_special_tokens=True))
     for sent in df.full_text.to_list()),
    default=0,
)

# Never request more positions than the tokenizer reports the model accepts;
# batch_data truncates anything longer than max_len.
max_len = min(longest, tokenizer.model_max_length)

print('Max token length: ', max_len)

In [None]:

def batch_data(data, bsize):
    """Split a dataframe into tokenized mini-batches.

    Reads the 'full_text' and 'emote' columns of `data`, cuts them into
    consecutive chunks of at most `bsize` rows, and tokenizes each chunk of
    text with the module-level `tokenizer` (padded to the longest item of the
    chunk, truncated to the module-level `max_len`, attention mask included,
    token type ids omitted).

    Returns a list with one (encoding, labels, texts) tuple per chunk.
    """
    texts = data['full_text'].tolist()
    targets = data['emote'].tolist()

    def encode(chunk):
        # Same tokenizer settings for every chunk.
        return tokenizer.batch_encode_plus(
            chunk, max_length=max_len, padding='longest', truncation=True,
            return_attention_mask=True, return_token_type_ids=False)

    starts = range(0, len(texts), bsize)
    text_chunks = [texts[lo:lo + bsize] for lo in starts]
    label_chunks = [targets[lo:lo + bsize] for lo in starts]
    return [(encode(chunk), ys, chunk)
            for chunk, ys in zip(text_chunks, label_chunks)]

# Tokenize both splits with the same batch size.
train_batches, test_batches = (
    batch_data(split, bsize=batch_size) for split in (df_train, df_test)
)

In [None]:
# Inspect the first training batch: an (encoding, labels, texts) tuple.
train_batches[0]

### Load model

In [None]:
from tqdm import tqdm
import torch
from transformers import BertForSequenceClassification, DistilBertForSequenceClassification, \
  AdamW, get_linear_schedule_with_warmup


model = BertForSequenceClassification.from_pretrained(
    'bert-base-cased',
    #'bert-base-uncased',
    num_labels=5,
    # runner reads the hidden states to collect per-example embeddings.
    output_hidden_states=True)

if torch.cuda.is_available():
    device = torch.device('cuda:0') # GPU
    print("GPU")
else:
    device = torch.device('cpu') # CPU

model.to(device)

optimizer = AdamW(model.parameters(), lr=lr_init)

# The scheduler is stepped once per batch in every epoch, so the schedule has
# to span all epochs. With only len(train_batches) steps the linear decay
# reached a learning rate of 0 at the end of the first epoch and the remaining
# epochs trained with no effective updates.
total_training_steps = len(train_batches) * epochs
lr = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps,
    num_training_steps=total_training_steps)

### Training

In [None]:
from datasets import load_metric

def runner(batches, desc: str, train=True):
    """Run the module-level `model` over `batches`, optionally training it.

    Parameters
    ----------
    batches : list of (encoding, labels, texts) tuples, as built by batch_data.
    desc : label used in the progress-bar summary (e.g. 'train', 'test').
    train : when True, put the model in train mode, make `epochs` passes and,
        after every batch, back-propagate and step the module-level
        `optimizer` and `lr` scheduler. When False, put the model in eval
        mode and make a single pass under torch.no_grad.

    Returns
    -------
    (preds, embeds) collected during the final pass:
      preds  - list with one 1-D CPU tensor of predicted class ids per batch.
      embeds - 2-D CPU tensor with one row per example, taken from the first
               token position of hidden_states[1]; an empty tensor when
               nothing was processed.
    """
    grad_mode = torch.enable_grad if train else torch.no_grad
    n_epochs = epochs if train else 1
    preds, embeds = [], []

    if train:
        model.train()
    else:
        model.eval()

    for epoch in range(n_epochs):

        acc = load_metric("accuracy", keep_in_memory=True)
        f1 = load_metric("f1", keep_in_memory=True)
        cumloss = 0.0
        # Start both collectors afresh each epoch so the returned predictions
        # and embeddings describe the same (final) pass and line up with
        # `batches` one-to-one.
        preds, embeds = [], []

        with tqdm(total=len(batches)) as bar:

            for X, Y, _ in batches:
                batch = {
                    'input_ids': torch.tensor(X['input_ids'], device=device),
                    'attention_mask': torch.tensor(X['attention_mask'], device=device),
                    'labels': torch.tensor(Y, device=device),
                }
                with grad_mode():
                    # Ask for hidden states explicitly so this works however
                    # the model object was constructed or reloaded.
                    outputs = model(**batch, output_hidden_states=True)
                    loss = outputs.loss
                    if train:
                        loss.backward()
                        optimizer.step()
                        lr.step()
                        optimizer.zero_grad()

                # [:, 0, :] already yields (batch, hidden). A squeeze() here
                # would drop the batch axis for a one-example batch and make
                # the final torch.cat fail.
                embeds.append(outputs.hidden_states[1][:, 0, :].detach().cpu())
                Yhat = torch.argmax(outputs.logits, dim=-1).detach().cpu()
                preds.append(Yhat)
                cumloss += loss.item()
                acc.add_batch(predictions=Yhat, references=Y)
                f1.add_batch(predictions=Yhat, references=Y)

                bar.update(1)

            # Nothing to summarise when there were no batches.
            if batches:
                bar.set_description('epoch: %s, %s loss: %.5f, f1-score: %.5f, accuracy: %.5f' %
                                    (epoch + 1, desc,
                                     cumloss / len(batches),
                                     f1.compute(average="macro")['f1'],
                                     acc.compute()['accuracy']))

    # torch.cat rejects an empty list, so return an empty tensor in that case.
    if not embeds:
        return preds, torch.empty(0)

    embeds = torch.cat(embeds, dim=0)

    return preds, embeds

In [None]:
# Train model: runner defaults to train=True, so this makes `epochs` passes
# over train_batches. The trailing semicolon hides the returned tuple.
runner(train_batches, 'train');

### Evaluation

In [None]:
# Evaluate training set: a single pass with train=False (eval mode, no
# gradients) over the same batches the model was trained on.
preds, embeds = runner(train_batches, 'train', train=False)

### Define Confusion Matrix Function

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix
def conf_mat(tensor_preds, batches):
    """Build a confusion matrix from per-batch predictions.

    `tensor_preds` is a sequence of prediction tensors and `batches` the
    (encoding, labels, texts) tuples they were computed from. The tensors are
    concatenated and flattened to a list, the labels of every batch are
    gathered in order, and both are handed to sklearn's confusion_matrix
    (true labels first, predictions second).
    """
    predicted = torch.cat(tensor_preds).tolist()
    actual = [label for _, labels, _ in batches for label in labels]
    return confusion_matrix(actual, predicted)

In [None]:
# Confusion matrix for the training-set predictions returned above.
conf_mat(preds, train_batches)

In [None]:
# Evaluate test set: a single pass with train=False over the held-out batches.
# This rebinds preds/embeds, replacing the training-set results.
preds, embeds = runner(test_batches, 'test', train=False)

In [None]:
# Confusion matrix for the test-set predictions returned above.
conf_mat(preds, test_batches)

### Save Model Weights

In [None]:
# Persist the fine-tuned model under the relative directory model/BERT1.
model.save_pretrained("model/BERT1")

### Validation Set

In [None]:
import pandas as pd

# Load the validation set into a pandas dataframe.
# NOTE: this rebinds `df`, so the training dataframe loaded at the top of the
# notebook is no longer reachable under that name from here on.
df = pd.read_csv("validation_set.csv" )

In [None]:
# Copy the validation columns into the names batch_data reads
# ('full_text' for the text, 'emote' for the label).
# NOTE(review): LabelJasmine is assumed to already hold integer class ids that
# use the same encoding as the training labels — confirm.
df['full_text'] = df.text
df['emote'] = df.LabelJasmine
df

In [None]:
# Tokenize the validation frame with the same batch size as the other splits.
validation_batches = batch_data(df, bsize=batch_size)

In [None]:
# Reload the fine-tuned weights from the directory they were saved to by
# model.save_pretrained("model/BERT1"); the working directory itself holds no
# checkpoint after a fresh run. Hidden states are requested explicitly because
# runner reads them to collect embeddings.
model = BertForSequenceClassification.from_pretrained(
    "model/BERT1", num_labels=5, output_hidden_states=True)

if torch.cuda.is_available():
    device = torch.device('cuda:0') # GPU
    print("GPU")
else:
    device = torch.device('cpu') # CPU

# Trailing semicolon hides the printed model representation.
model.to(device);

In [None]:
# Evaluate validation set: a single pass with train=False over the
# validation batches, using whichever `model` is currently bound.
preds, embeds = runner(validation_batches, 'validation', train=False)

In [None]:
# Confusion matrix for the validation-set predictions returned above.
conf_mat(preds, validation_batches)