# CodeBERT Experiment

## Setup

Installs PyTorch and HuggingFace transformers.

In [1]:
%%capture
# Install the libraries this notebook imports; %%capture keeps pip's log out of the cell output.
!pip install torch
!pip install transformers

In [2]:
import os
import numpy as np
import torch
import pandas as pd

In [3]:
# Pick the compute device: use a CUDA GPU when PyTorch can see one, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    # The braces are required so the device name is interpolated instead of printed literally.
    print(f'We will use the GPU: {torch.cuda.get_device_name(0)}')
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


## Dataset

Loads the dataset from the specified `dpath` in Google Drive. Drops the `id` column and fills null values with empty strings.

In [4]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem.
drive.mount('/content/drive')
# Absolute path under the mount point, so dataset loading does not depend on the
# notebook's current working directory (the artifact paths below are absolute too).
dpath = '/content/drive/MyDrive/Research/Notebooks'

Mounted at /content/drive


In [5]:
# Read the raw CSV, discard the `id` column and replace missing values with ''.
df = pd.read_csv(os.path.join(dpath, "dataset2.csv"))
df = df.drop(['id'], axis=1).fillna('')
# `astype` returns a new frame rather than converting in place, so the result has to be
# assigned back; otherwise `df` keeps its original dtypes and only the displayed
# temporary frame carries the string columns.
df = df.astype({
    'projectName': 'string',
    'packageName': 'string',
    'className': 'string',
    'javadocTag': 'string',
    'methodJavadoc': 'string',
    'methodSourceCode': 'string',
    'classJavadoc': 'string',
    'token': 'string',
    'tokenInfo': 'string',
    'tokenClass': 'string',
    'notes': 'string',
})
# Display the resulting column dtypes as the cell output.
df.dtypes

label                 bool
projectName         string
packageName         string
className           string
javadocTag          string
methodJavadoc       string
methodSourceCode    string
classJavadoc        string
oracleSoFar         object
token               string
tokenClass          string
tokenInfo           string
notes               string
dtype: object

In [6]:
# Separate the target vector from the model inputs (every column except `label`).
df_label = df['label'].values
df_input = df.drop(columns=['label']).fillna('')

## Tokenization

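Loads the `microsoft/codebert-base` tokenizer via `AutoTokenizer`. Concatenates the features of each row into a single string (joined with the tokenizer's CLS token), tokenizes it, and creates training and validation `DataLoader` objects over `input_ids`, `attention_masks`, and `labels`.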

In [7]:
from transformers import AutoTokenizer

# CodeBERT is a RoBERTa-style encoder whose position embeddings cover at most 512 tokens,
# so the tokenizer's default maximum length must not exceed that; a larger value would let
# over-long sequences through whenever a call relies on the tokenizer default.
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base", model_max_length=512)

Downloading (…)okenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [8]:
def _concat_features(row):
  """Join the non-null values of one dataframe row into a single CLS-separated string."""
  return tokenizer.cls_token.join(row.dropna().astype(str).values)


# One concatenated string per sample, as a plain Python list.
cat_input = df_input.apply(_concat_features, axis=1).to_numpy().tolist()

# Pad or truncate every sample to exactly 512 tokens.
token_dict = tokenizer(
    cat_input,
    max_length=512,
    truncation='longest_first',
    padding='max_length',
)

In [9]:
# Every sample was padded to the same length, so the nested lists convert directly
# into rectangular tensors.
input_ids = torch.tensor(token_dict['input_ids'])
attention_masks = torch.tensor(token_dict['attention_mask'])
# Labels are stored as 32-bit integers.
labels = torch.tensor(df_label).to(torch.int32)

In [10]:
from torch.utils.data import TensorDataset, random_split

# Bundle the aligned tensors so every sample is an (input_ids, attention_mask, label) triple.
dataset = TensorDataset(input_ids, attention_masks, labels)

# 80/20 train/validation split.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# A seeded generator makes the split identical on every run, so losses and accuracies
# are comparable between executions of the notebook.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(dataset, [train_size, val_size],
                                          generator=split_generator)

print(f'{train_size} training samples')
print(f'{val_size} validation samples')

143 training samples
36 validation samples


In [11]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 2

# Training batches are drawn in random order; validation keeps the dataset order.
train_sampler = RandomSampler(train_dataset)
val_sampler = SequentialSampler(val_dataset)

train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler)
val_loader = DataLoader(val_dataset, batch_size=batch_size, sampler=val_sampler)

## Classifier
Modifies CodeBERT transformer for classification.

**Architecture:**
- BERT Transformer
- Linear Layer
- Sigmoid Activation

Uses a linear layer to transform the BERT output into a single node, then applies a Sigmoid activation to get a value between 0 and 1. The prediction is interpreted as True iff this value is greater than 0.5.

In [12]:
from torch.nn import Module, Linear, Sigmoid
from transformers import AutoModel

In [13]:
class OracleClassifier(Module):
  """Binary classifier: pretrained CodeBERT encoder, then a linear layer and a sigmoid.

  Args:
    device: Optional device (anything accepted by ``Module.to``) to move the model onto
      at construction time. With the default ``None`` the model is left where it was
      created and the caller can still move it later with ``.to(...)``.
  """

  def __init__(self, device=None):
    super().__init__()
    self.bert_transformer = AutoModel.from_pretrained("microsoft/codebert-base")
    # Size the classification head from the encoder's own configuration.
    hidden_size = self.bert_transformer.config.hidden_size
    self.linear = Linear(hidden_size, 1)
    self.sigmoid = Sigmoid()
    # Honour the `device` argument, which was previously accepted but ignored.
    if device is not None:
      self.to(device)

  def forward(self, input_ids, input_masks):
    """Run the encoder and map its pooled output to a value in (0, 1).

    Args:
      input_ids: Token-id tensor passed to the encoder.
      input_masks: Attention-mask tensor passed to the encoder as ``attention_mask``.

    Returns:
      The sigmoid of the linear head applied to the encoder's ``pooler_output``
      (last dimension of size 1).
    """
    # Pass the mask by keyword so it cannot be bound to a different positional parameter.
    encoded = self.bert_transformer(input_ids, attention_mask=input_masks)
    logits = self.linear(encoded.pooler_output)
    return self.sigmoid(logits)

In [22]:
%%capture
model = OracleClassifier()
model.to(device)

## Training

Uses the AdamW optimizer and a sum-of-squared-errors loss (summed over each batch); both choices are arbitrary.

**Training Time:** ~17 minutes.<br>
**Validation Time:** ~2 minutes.

Fine-tuning typically requires no more than 4 epochs.

In [18]:
from tqdm.auto import tqdm
import time

from torch.optim import AdamW

# AdamW's default step size (1e-3) is far too large for fine-tuning a pretrained
# transformer and tends to wipe out the pretrained weights; use a small rate instead.
learning_rate = 5e-5
optimizer = AdamW(model.parameters(), lr=learning_rate)

In [23]:
def calculate_loss(y_hat, target):
  """Return the summed squared error between predictions and targets.

  Both tensors are flattened to 1-D first, so a (batch, 1) prediction lines up
  element-wise with a (batch,) target.
  """
  residual = y_hat.flatten() - target.flatten()
  return torch.square(residual).sum()

In [None]:
num_epochs = 4

# Mean per-batch loss for every epoch, used for plotting afterwards.
training_loss = []
validation_loss = []
for i in tqdm(range(num_epochs)):
  start_time = time.time()

  ####################
  # TRAINING LOOP
  ####################
  total_loss = 0
  model.train()
  for batch in tqdm(train_loader):
    # Batch-local names: reusing `input_ids` / `labels` here would overwrite the
    # notebook-level tensors the dataset was built from.
    batch_ids, batch_masks, batch_labels = (t.to(device) for t in batch)

    # Clear gradients left over from the previous step.
    model.zero_grad()

    output = model(batch_ids, batch_masks)
    loss = calculate_loss(output, batch_labels)
    total_loss += loss.item()

    loss.backward()
    # Cap the gradient norm at 1.0 before the parameter update.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
  training_loss.append(total_loss / len(train_loader))

  ####################
  # VALIDATION LOOP
  ####################
  total_loss = 0
  model.eval()
  # No gradients are needed for validation; disabling autograd avoids building a
  # graph for every batch.
  with torch.no_grad():
    for batch in tqdm(val_loader):
      batch_ids, batch_masks, batch_labels = (t.to(device) for t in batch)

      output = model(batch_ids, batch_masks)
      loss = calculate_loss(output, batch_labels)
      total_loss += loss.item()
  validation_loss.append(total_loss / len(val_loader))

  print(f'Epoch {i} Summary: training_loss={training_loss[-1]}, validation_loss={validation_loss[-1]}, time={(time.time() - start_time)/60}m')

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/72 [00:00<?, ?it/s]

## Plot Results

In [31]:
import matplotlib.pyplot as plt

In [None]:
# Draw both loss curves on one set of axes, one point per epoch.
fig, ax = plt.subplots()
epochs = range(num_epochs)
ax.plot(epochs, training_loss, color='orange', label='training')
ax.plot(epochs, validation_loss, color='blue', label='validation')
ax.set_xlabel('epoch')
ax.set_ylabel('loss')
ax.set_title('CodeBERT fine tuning loss per epoch')
ax.legend()
plt.show()

## Evaluation
Evaluates the accuracy of the model (0/1 output) on the validation dataset.

In [16]:
def get_accuracy(y_hat, target):
  """Return the fraction of predictions that match the targets, as a 0-dim float tensor.

  Args:
    y_hat: Predicted probabilities; values above 0.5 count as class 1.
    target: Ground-truth 0/1 labels with the same number of elements as ``y_hat``.

  Both tensors are flattened before comparison. Without that, a (batch, 1) prediction
  compared against a (batch,) target broadcasts to a (batch, batch) matrix and the
  mean is taken over every prediction/label pair rather than the matching pairs.
  """
  predictions = (y_hat.flatten() > 0.5).to(target.dtype)
  matches = predictions == target.flatten()
  return matches.float().mean()

In [None]:
# Accumulates (batch accuracy * batch size) so a smaller final batch is weighted
# by the number of samples it actually contains.
total_accuracy = 0
num_samples = 0

model.eval()
# Inference only: skip autograd bookkeeping.
with torch.no_grad():
  for batch in tqdm(val_loader):
    # Batch-local names keep the notebook-level `input_ids` / `labels` tensors intact.
    batch_ids, batch_masks, batch_labels = (t.to(device) for t in batch)

    output = model(batch_ids, batch_masks)
    batch_len = batch_labels.size(0)
    total_accuracy += get_accuracy(output, batch_labels).item() * batch_len
    num_samples += batch_len
print(total_accuracy / num_samples)

## Save Artifact
Saves the model state to the specified path.

In [29]:
# Destination of the fine-tuned weights on the mounted Google Drive.
artifact_name = 'bert_experiment.pt'
path = "/content/drive/MyDrive/Research/Notebooks/Artifacts/" + artifact_name
# Saving is disabled; uncomment the next line to write the model's state dict to `path`.
# torch.save(model.state_dict(), path)

## Load Artifact
Loads the model state from the specified path. 

**NOTE:** Model instance must already be initialized before loading the state.

In [15]:
# Location of the previously saved weights on the mounted Google Drive.
artifact_name = 'bert_experiment.pt'
path = "/".join(["/content/drive/MyDrive/Research/Notebooks/Artifacts", artifact_name])
# Loading is disabled; uncomment the next two lines to restore the weights into an
# already-constructed `model` and switch it to evaluation mode.
# model.load_state_dict(torch.load(path))
# model.eval()

<All keys matched successfully>