### `K-Fold Cross-validation with BERT`

The text is cleaned and should be tokenized in a way that the embeddings can be extracted for K-Fold cross-validation with the `BERT Sequence Classification` model.

First, let's install the PyTorch interface for BERT by Hugging Face (uncomment the install command in the next cell if the packages are not installed yet).

In [2]:
#!pip install pytorch-pretrained-bert pytorch-nlp

In [38]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertModel, BertTokenizer
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm, trange
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
# Load the preprocessed tweets; the relative path is resolved against the
# notebook's current working directory.
df_tweets = pd.read_csv('../data/preprocessed_tweets.csv')
# Show (rows, columns) as the cell output.
df_tweets.shape

(22830, 9)

In [8]:
# The CSV carries its old row index as an extra 'Unnamed: 0' column; remove it.
df_tweets = df_tweets.drop(columns=['Unnamed: 0'])
# Show ten random rows as the cell output.
df_tweets.sample(10)

Unnamed: 0,vader_sentiment_label,vader_score,tweet,tweet_length,url_link,pos_emoji,neg_emoji,profanity_word
1851,1,0.4101,finna go gma hou escap depress make happi,72,0,0,0,0
15938,1,0.5994,peopl anxieti depress flaunt around like medal...,107,0,0,0,0
10536,0,0.0258,relat newfoundland bankrupt depress effecti ha...,259,0,0,0,0
5517,0,-0.7606,depress hit got ta remind bitch like aye wtf d...,104,0,0,0,2
12501,0,-0.7236,watch coupl episod st season battl depress cho...,243,0,0,0,0
14757,0,-0.6771,must watch everyon especi one suffer mental di...,139,1,0,0,0
2412,0,-0.5719,ever thought area might contain cure depress,76,0,0,0,0
12181,0,-0.1531,friend plea pray mi wouldst depress pain deal ...,147,0,0,0,0
21198,0,-0.296,depress long therapi gap like,47,0,0,0,0
19318,1,0.9779,hi want say actual love ygt sm help thru depre...,234,0,0,0,0


### Inputs
**BERT** requires specifically formatted inputs. For each tokenized input sentence, we need to create:

- **input ids**: a sequence of integers identifying each input token to its index number in the BERT tokenizer vocabulary
- **segment mask**: (optional) a sequence of 1s and 0s used to identify whether the input is one sentence or two sentences long. For one sentence inputs, this is simply a sequence of 0s. For two sentence inputs, there is a 0 for each token of the first sentence, followed by a 1 for each token of the second sentence. Will not be used in this project.
- **attention mask**: (optional) a sequence of 1s and 0s, with 1s for all input tokens and 0s for all padding tokens (it is built in the `tokenization` function below)
- **labels**: a single value of 1 or 0. In our task this is the `vader_sentiment_label` column: in the sample above, 1 marks tweets with a clearly positive VADER score and 0 marks all other tweets

In [35]:
# Load the pretrained WordPiece tokenizer and a BERT classifier with a
# two-class output head.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
# Move the model to the GPU only when CUDA is available; an unconditional
# model.cuda() raises on machines without a CUDA device.
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

RuntimeError: CUDA error: unknown error
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [28]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Display the GPU name as the cell output; get_device_name(0) raises when no
# CUDA device is present, so report "cpu" in that case instead.
torch.cuda.get_device_name(0) if n_gpu > 0 else "cpu"

'NVIDIA GeForce GTX 1650 Ti'

In [9]:
# Number of folds; default for `k_folds` in crossvalidation().
K_FOLDS = 10
# Length every token-id sequence is padded/truncated to; default for `maxlen`
# in tokenization().
MAX_LEN = 128
# Passed to pad_sequences as `padding`: 'post' adds padding at the end.
PADDING = 'post'
# Passed to pad_sequences as `truncating`: 'post' cuts tokens from the end.
TRUNCATING = 'post'
# dtype requested from pad_sequences for the padded id array.
DTYPE = 'long'
# Mini-batch size; default for `batch_size` in crossvalidation().
BATCH_SIZE = 32

In [32]:
def tokenization(tweets, maxlen=MAX_LEN, dtype=DTYPE, truncating=TRUNCATING, padding=PADDING, tokenizer=tokenizer):
    """Tokenize tweets for BERT and build padded input ids and attention masks.

    Each tweet is wrapped in ``[CLS]`` / ``[SEP]`` markers, split into tokens
    with ``tokenizer``, mapped to vocabulary ids and padded/truncated to
    ``maxlen`` with ``pad_sequences``.

    Args:
        tweets: iterable of tweet strings.
        maxlen: length every id sequence is padded or truncated to.
        dtype: dtype requested from ``pad_sequences`` for the id array.
        truncating: ``'pre'`` or ``'post'``; side from which long sequences
            are cut.
        padding: ``'pre'`` or ``'post'``; side on which short sequences are
            padded.
        tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.

    Returns:
        A tuple ``(tokenized_texts, input_ids, attention_masks)``:
        the token lists (one per tweet), the padded id array returned by
        ``pad_sequences``, and a list of per-sequence masks holding ``1.0``
        for real tokens and ``0.0`` for padding.
    """
    marked_tweets = ["[CLS] " + tweet + " [SEP]" for tweet in tweets]
    tokenized_texts = [tokenizer.tokenize(tweet) for tweet in marked_tweets]

    # Map the token strings to their vocabulary indices.
    input_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts]

    # Pad/truncate with the caller's `maxlen` (previously the global MAX_LEN
    # was used here, so the argument had no effect).
    # NOTE: with truncating='post', a sequence longer than `maxlen` loses its
    # trailing [SEP] token.
    input_ids = pad_sequences(input_ids, maxlen=maxlen, dtype=dtype, truncating=truncating, padding=padding)

    # pad_sequences pads with 0 by default, so any id > 0 is a real token.
    attention_masks = [[float(token_id > 0) for token_id in seq] for seq in input_ids]

    return tokenized_texts, input_ids, attention_masks

In [37]:
LEARNING_RATE = 2e-5
EPOCHS = 3

# Split the parameters into two groups: bias and LayerNorm parameters are
# excluded from weight decay, everything else decays with rate 0.01.
# 'gamma' / 'beta' are kept for checkpoints that still use the old LayerNorm
# parameter names; 'LayerNorm.weight' covers the current naming.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
# BertAdam reads the per-group key 'weight_decay'. The previous key
# 'weight_decay_rate' was ignored, so every group fell back to the optimizer's
# default decay.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

# NOTE: BertAdam only applies the warmup schedule when `t_total` (the total
# number of optimizer steps) is given; without it, `warmup` has no effect.
optimizer = BertAdam(optimizer_grouped_parameters, lr=LEARNING_RATE, warmup=.1)

t_total value of -1 results in schedule not being applied


In [41]:
# Raw tweet texts and their binary sentiment labels as arrays.
tweets = df_tweets['tweet'].values
labels = df_tweets['vader_sentiment_label'].values

# The token lists are not needed here; keep only the ids and masks.
_, input_ids, attention_masks = tokenization(tweets)
dataset = dict(
    input_ids=input_ids,
    attention_masks=attention_masks,
    labels=torch.tensor(labels),
)

### K-fold cross-validation with BERT

In [43]:
def crossvalidation(df, dataset, device, optimizer, epochs=EPOCHS, k_folds=K_FOLDS, batch_size=BATCH_SIZE):
    """Run stratified k-fold cross-validation of the global ``model``.

    For every fold the model and optimizer are restored to the state they had
    when this function was called, trained for ``epochs`` epochs on the
    training split and scored on the validation split. Per-fold accuracy and
    the average accuracy are printed.

    Args:
        df: DataFrame with ``'tweet'`` and ``'vader_sentiment_label'`` columns,
            used only to compute the stratified splits.
        dataset: either a dict with ``'input_ids'``, ``'attention_masks'`` and
            ``'labels'`` entries of equal length, or an indexable dataset
            yielding ``(input_ids, attention_mask, label)`` tensors. Rows must
            be in the same order as ``df``.
        device: torch device used for training and evaluation.
        optimizer: optimizer built over the parameters of the global ``model``.
        epochs: number of training epochs per fold.
        k_folds: number of folds.
        batch_size: mini-batch size for both loaders.

    Returns:
        List with the validation accuracy of each fold.

    Raises:
        ValueError: if ``dataset`` and ``df`` differ in length.
    """
    import copy

    # Subset/DataLoader index the dataset with integers, which a plain dict of
    # arrays does not support; wrap dict input in a TensorDataset.
    if isinstance(dataset, dict):
        dataset = TensorDataset(
            torch.as_tensor(dataset['input_ids'], dtype=torch.long),
            torch.as_tensor(dataset['attention_masks'], dtype=torch.float),
            torch.as_tensor(dataset['labels'], dtype=torch.long),
        )
    if len(dataset) != len(df):
        raise ValueError(
            f"dataset has {len(dataset)} samples but df has {len(df)} rows"
        )

    # Define k-fold cross-validation
    skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

    # Snapshot the starting weights (on CPU, to avoid a second copy on the
    # device) and optimizer state so every fold starts from the same point.
    # Without this reset, later folds would be evaluated on samples the model
    # has already been trained on in earlier folds.
    model.to(device)
    initial_model_state = {
        name: tensor.detach().cpu().clone()
        for name, tensor in model.state_dict().items()
    }
    initial_optimizer_state = copy.deepcopy(optimizer.state_dict())

    # Accuracy of each fold
    fold_accuracies = []

    # Perform k-fold cross-validation
    for fold, (train_indices, val_indices) in enumerate(skf.split(df['tweet'], df['vader_sentiment_label'])):
        print(f"Training fold: {fold+1}/{k_folds}")

        # Restore the pristine model/optimizer for this fold. The optimizer
        # snapshot is copied again so training cannot modify it in place.
        model.load_state_dict(initial_model_state)
        optimizer.load_state_dict(copy.deepcopy(initial_optimizer_state))

        # Split dataset into train and validation sets for the current fold
        train_dataset = torch.utils.data.Subset(dataset, train_indices.tolist())
        val_dataset = torch.utils.data.Subset(dataset, val_indices.tolist())

        # Create data loaders
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        # Training loop
        model.train()
        for _ in trange(epochs, desc="Epoch"):
            for batch in train_loader:
                # clear out the gradients (by default they accumulate)
                optimizer.zero_grad()
                # move the batch to the target device and unpack it
                b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
                # forward pass
                outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
                # Some BERT implementations return an object with a `.loss`
                # attribute, others return the loss tensor directly.
                loss = getattr(outputs, "loss", outputs)
                # backward pass
                loss.backward()
                # update parameters using the computed gradient
                optimizer.step()

        # Validation: switch off dropout etc. and disable gradient tracking
        model.eval()
        val_predictions, val_labels = [], []
        with torch.no_grad():
            for batch in val_loader:
                b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
                # Score the current validation batch (not the global arrays or
                # the outputs left over from training).
                outputs = model(b_input_ids, attention_mask=b_input_mask)
                logits = getattr(outputs, "logits", outputs)
                predicted_labels = logits.argmax(dim=1)
                val_predictions.extend(predicted_labels.tolist())
                val_labels.extend(b_labels.tolist())

        fold_accuracy = accuracy_score(val_labels, val_predictions)
        fold_accuracies.append(fold_accuracy)
        print(f"Accuracy for Fold {fold+1}: {fold_accuracy}")

    # Calculate average accuracy across all folds
    average_accuracy = sum(fold_accuracies) / len(fold_accuracies)
    print(f"Average Accuracy: {average_accuracy}")

    return fold_accuracies
    