In [None]:
!pip install transformers datasets scikit-learn

In [3]:
from datasets import load_dataset

# Train/test CSV splits are read from the working directory.
# When running on Colab with Drive mounted, the equivalent locations are
#   /content/drive/MyDrive/ColabWork/Inputs/train.csv
#   /content/drive/MyDrive/ColabWork/Inputs/test.csv
data_files = dict(train="train.csv", test="test.csv")

# Load both splits into a single DatasetDict and display its summary.
dataset = load_dataset("csv", data_files=data_files)
dataset

DatasetDict({
    train: Dataset({
        features: ['code', 'godclass', 'dataclass', 'featureenvy', 'longmethod', 'longparamlist'],
        num_rows: 26548
    })
    test: Dataset({
        features: ['code', 'godclass', 'dataclass', 'featureenvy', 'longmethod', 'longparamlist'],
        num_rows: 6637
    })
})

In [4]:
# Every column other than the raw 'code' text is used as a classification target.
labels = [column for column in dataset['train'].features if column != 'code']
labels

['godclass', 'dataclass', 'featureenvy', 'longmethod', 'longparamlist']

In [6]:
from transformers import RobertaTokenizer
import numpy as np

# Hugging Face checkpoint id; reused further down when the classification model is loaded.
model_name = "Salesforce/codet5-base"

# Load the checkpoint's tokenizer files through the RobertaTokenizer class.
tokenizer = RobertaTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenise a batch of code snippets and attach a multi-label target matrix.

    Args:
        examples: A batch mapping column name -> list of values. Must contain
            'code' plus one column for every name in the module-level ``labels``.

    Returns:
        The tokenizer output (padded/truncated to 128 tokens) with an extra
        'labels' entry: one list of floats per example, ordered like ``labels``.
    """
    snippets = examples['code']
    encoded = tokenizer(snippets, padding="max_length", truncation=True, max_length=128)

    # One row per example, one column per label; float dtype comes from np.zeros.
    target = np.zeros((len(snippets), len(labels)))
    for column, name in enumerate(labels):
        target[:, column] = examples[name]

    encoded['labels'] = target.tolist()
    return encoded

In [7]:
# Tokenise both splits batch-wise and drop the original CSV columns, so only the
# tokenizer outputs and the 'labels' field written by tokenize_function remain.
encoded_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset['train'].column_names,
)

# Have rows come back as torch tensors when the dataset is indexed.
encoded_dataset.set_format("torch")

Map: 100%|██████████| 26548/26548 [00:28<00:00, 931.24 examples/s] 
Map: 100%|██████████| 6637/6637 [00:09<00:00, 720.13 examples/s]


In [8]:
from transformers import AutoModelForSequenceClassification
import torch

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the checkpoint with a classification head sized to the number of labels
# and configured for multi-label classification.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    problem_type="multi_label_classification",
)
model = model.to(device)

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at Salesforce/codet5-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [9]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results_multilabel",
    # Schedule: three passes over the training split, evaluating after each one.
    num_train_epochs=3,
    eval_strategy="epoch",
    # Batch sizes (per device).
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Optimisation hyper-parameters.
    learning_rate=2e-5,
    weight_decay=0.01,
)

In [10]:
from sklearn.metrics import f1_score, accuracy_score
import sklearn.metrics as mt
import torch
from transformers import EvalPrediction

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute micro-averaged F1 and accuracy for multi-label logits.

    Args:
        predictions: Raw logits, one row per example and one column per label.
        labels: Ground-truth indicator matrix with the same layout.
        threshold: Sigmoid probability at or above which a label counts as predicted.

    Returns:
        dict with keys 'f1' (micro average) and 'accuracy' (as computed by
        sklearn's accuracy_score on the thresholded matrix).
    """
    # Logits -> probabilities.
    probabilities = torch.sigmoid(torch.Tensor(predictions))

    # Probabilities -> 0./1. indicator matrix (float64, like np.zeros would give).
    y_pred = (probabilities >= threshold).numpy().astype(np.float64)

    return {
        'f1': f1_score(y_true=labels, y_pred=y_pred, average='micro'),
        'accuracy': accuracy_score(labels, y_pred),
    }

def compute_metrics(p: EvalPrediction):
    """Adapter between the Trainer's EvalPrediction and multi_label_metrics.

    When the model's predictions arrive as a tuple, only the first element is
    used as the logits; otherwise the predictions are used as they are.
    """
    logits = p.predictions
    if isinstance(logits, tuple):
        logits = logits[0]
    return multi_label_metrics(predictions=logits, labels=p.label_ids)

In [11]:
# Wire model, data, metrics and optimizer into a Trainer.
trainer = Trainer(
    model,
    training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    # NOTE(review): the stray output under this cell looks like a warning about this
    # call; newer transformers releases prefer `processing_class=` over `tokenizer=`.
    # Kept as-is for compatibility with the installed version — confirm before switching.
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # A hand-built optimizer bypasses the Trainer's own optimizer setup, so every
    # hyper-parameter must be forwarded explicitly. Without weight_decay here the
    # value in training_args would be silently ignored (it only matched before
    # because torch's AdamW default happens to be 0.01 as well).
    # The scheduler slot is None, leaving the LR scheduler to the Trainer.
    optimizers=(
        torch.optim.AdamW(
            model.parameters(),
            lr=training_args.learning_rate,
            weight_decay=training_args.weight_decay,
        ),
        None,
    ),
)

  trainer = Trainer(


In [14]:
# Sanity-check forward pass on the first training example.
# Index the row first: `dataset['input_ids'][0]` would materialise the whole
# column just to read one entry.
sample = encoded_dataset['train'][0]

outputs = model(
    input_ids=sample['input_ids'].unsqueeze(0).to(device),
    # Inputs are padded to max_length, so pass the mask to keep padding tokens
    # out of attention (this is also what the Trainer feeds the model).
    attention_mask=sample['attention_mask'].unsqueeze(0).to(device),
    labels=sample['labels'].unsqueeze(0).to(device),
)
outputs

Seq2SeqSequenceClassifierOutput(loss=tensor(0.2613, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[-1.2559, -0.8674, -0.9618, -2.5982, -1.0123]], device='cuda:0',
       grad_fn=<AddmmBackward0>), past_key_values=None, decoder_hidden_states=None, decoder_attentions=None, cross_attentions=None, encoder_last_hidden_state=tensor([[[-0.0577, -0.2030, -0.3015,  ..., -0.0000, -0.1130, -0.0701],
         [ 0.0038,  0.3563,  0.6777,  ..., -0.7557, -0.3001,  0.0000],
         [-0.0180, -0.0000,  0.2003,  ..., -0.0802,  0.0402, -0.0000],
         ...,
         [ 0.1944,  0.0000, -0.0865,  ...,  0.2751, -0.0000,  0.3387],
         [ 0.4551, -0.0707, -0.0088,  ...,  0.5075,  0.0000,  0.0000],
         [ 0.8137,  0.1812,  0.5943,  ...,  0.2201,  0.0193, -0.1406]]],
       device='cuda:0', grad_fn=<NativeDropoutBackward0>), encoder_hidden_states=None, encoder_attentions=None)

In [None]:
# Run fine-tuning with the Trainer built above (settings come from training_args).
trainer.train()

In [None]:
# Score the eval_dataset given to the Trainer; the returned metrics dict is the cell output.
trainer.evaluate()

In [None]:
# Write the fine-tuned model and its tokenizer into the same directory.
# NOTE(review): this is a Colab/Google Drive path and spells the folder "My Drive",
# while the commented-out input paths earlier use "MyDrive" — confirm which one is
# valid in the target environment.
save_dir = "/content/drive/My Drive/ColabWork/Outputs/codet5_matrix"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

predictions, label_ids, _ = trainer.predict(encoded_dataset['test'])

# The Trainer can hand predictions back as a tuple (logits first, then further
# model outputs) — compute_metrics already unwraps that case. torch.tensor() on
# such a tuple of differently-shaped arrays fails, so keep only the logits here.
logits = predictions[0] if isinstance(predictions, tuple) else predictions
preds = torch.sigmoid(torch.tensor(logits)) > 0.5

# Give sklearn plain integer NumPy arrays instead of a float array / bool tensor mix.
y_true_all = label_ids.astype(int)
y_pred_all = preds.numpy().astype(int)

for i, label in enumerate(labels):
  print(f'\nLabel: {label}')

  y_true = y_true_all[:, i]
  y_pred = y_pred_all[:, i]

  # labels=[0, 1] keeps the matrix 2x2 even if one class never occurs for this label.
  cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
  print(f'\nConfusion matrix for {label}')
  print(cm)

  # Pinning labels also keeps target_names valid when only one class is present;
  # zero_division=0 reports 0 instead of warning for classes with no samples.
  report = classification_report(
      y_true,
      y_pred,
      labels=[0, 1],
      target_names=['Class 0', 'Class 1'],
      digits=4,
      zero_division=0,
  )
  print(f'\nClassification report for {label}')
  print(report)