In [16]:
!pip install transformers datasets scikit-learn



In [17]:
from datasets import load_dataset

# CSV location for each split (under /content/drive — presumably a mounted
# Google Drive; confirm the mount happens in an earlier cell).
data_files = {
    "train": "/content/drive/MyDrive/ColabWork/Inputs/train.csv",
    "test": "/content/drive/MyDrive/ColabWork/Inputs/test.csv",
}

# Build a DatasetDict with one Dataset per key of `data_files`.
dataset = load_dataset('csv', data_files=data_files)
dataset

DatasetDict({
    train: Dataset({
        features: ['code', 'godclass', 'dataclass', 'featureenvy', 'longmethod', 'longparamlist'],
        num_rows: 26548
    })
    test: Dataset({
        features: ['code', 'godclass', 'dataclass', 'featureenvy', 'longmethod', 'longparamlist'],
        num_rows: 6637
    })
})

In [19]:
# Every column except the source text ('code') is treated as a label; the
# column order of the train split fixes the label order used everywhere below.
labels = [column for column in dataset['train'].features if column != 'code']
labels

['godclass', 'dataclass', 'featureenvy', 'longmethod', 'longparamlist']

In [20]:
from transformers import RobertaTokenizer
import numpy as np

# Checkpoint identifier; the same name is reused when the classification model is loaded.
model_name = "microsoft/unixcoder-base"

# Tokenizer loaded from the same checkpoint so token ids match the model's vocabulary.
tokenizer = RobertaTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize a batch of code snippets and attach a multi-hot label row to each.

    Args:
        examples: Batch mapping of column name -> list of values. Must contain
            'code' plus one column for every name in the module-level `labels`.

    Returns:
        The tokenizer output for the batch, extended with a 'labels' entry:
        one list of floats per example, ordered like `labels`.
    """
    snippets = examples['code']
    # Fixed-length inputs: pad/truncate every snippet to exactly 128 tokens.
    encoded = tokenizer(snippets, padding="max_length", truncation=True, max_length=128)

    # One row per example, one column per label; filled column by column so the
    # column order always follows `labels`. The matrix is float-valued.
    targets = np.zeros((len(snippets), len(labels)))
    for column, name in enumerate(labels):
        targets[:, column] = examples[name]

    encoded['labels'] = targets.tolist()
    return encoded

In [21]:
# Tokenize every split in batches. All original columns (taken from the train
# split's column list) are dropped, leaving only what tokenize_function returns.
encoded_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset['train'].column_names,
)

# Switch the output format of the encoded splits to torch.
encoded_dataset.set_format("torch")

In [22]:
from transformers import AutoModelForSequenceClassification
import torch

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Sequence-classification model loaded from the checkpoint, configured for
# multi-label classification with one output per entry in `labels`.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    problem_type="multi_label_classification",
    num_labels=len(labels),
)
model = model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/unixcoder-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
from transformers import TrainingArguments, Trainer

# Fine-tuning configuration, grouped by concern.
training_args = TrainingArguments(
    # Output directory passed to the Trainer.
    output_dir="./results_multilabel",
    # Schedule: three passes over the training data, evaluating after each one.
    num_train_epochs=3,
    eval_strategy="epoch",
    # Batch sizes (per device) for training and evaluation.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Optimizer hyperparameters.
    learning_rate=2e-5,
    weight_decay=0.01,
)

In [24]:
from sklearn.metrics import f1_score, accuracy_score
import sklearn.metrics as mt
import torch
from transformers import EvalPrediction

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute micro-averaged F1 and exact-match accuracy from multi-label logits.

    Args:
        predictions: Raw logits, array-like of shape (n_samples, n_labels).
        labels: Ground-truth indicator matrix with the same shape.
        threshold: Probability cut-off; a label counts as predicted when its
            sigmoid probability is >= threshold.

    Returns:
        dict with two entries:
            'f1'       -- F1 micro-averaged over all (sample, label) pairs.
            'accuracy' -- subset accuracy: a sample is correct only when every
                          one of its labels matches, so this is stricter than
                          per-label accuracy.
    """
    # The logits are plain arrays, so stay in NumPy instead of round-tripping
    # through torch.
    logits = np.asarray(predictions, dtype=np.float64)

    # Overflow-safe sigmoid: sigmoid(x) == exp(-log(1 + exp(-x))).
    probs = np.exp(-np.logaddexp(0.0, -logits))

    # Vectorized thresholding into a 0/1 indicator matrix.
    y_pred = (probs >= threshold).astype(np.float64)
    y_true = labels

    return {
        # zero_division=0 keeps the score at 0 (without a warning) when nothing
        # is predicted positive and nothing is truly positive.
        'f1': f1_score(y_true=y_true, y_pred=y_pred, average='micro', zero_division=0),
        'accuracy': accuracy_score(y_true, y_pred),
    }

def compute_metrics(p: EvalPrediction):
    """Adapter between the Trainer's evaluation output and multi_label_metrics.

    Args:
        p: Evaluation result; `p.predictions` is either the logits themselves
            or a tuple whose first element is the logits, and `p.label_ids`
            holds the reference labels.

    Returns:
        The metrics dict produced by multi_label_metrics.
    """
    logits = p.predictions
    if isinstance(logits, tuple):
        # Only the first element of a tuple output is scored.
        logits = logits[0]
    return multi_label_metrics(predictions=logits, labels=p.label_ids)

In [25]:
# Assemble the Trainer for the multi-label fine-tuning run.
#  * `processing_class` replaces the deprecated `tokenizer` keyword, which emits
#    a FutureWarning on recent transformers releases (needs a release that
#    already supports `processing_class`).
#  * The optimizer is built by hand, so `weight_decay` is forwarded explicitly;
#    without it the value configured in `training_args` would never reach the
#    optimizer and only torch's own default would apply.
#  * The scheduler slot stays None so the Trainer supplies its default schedule.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    optimizers=(
        torch.optim.AdamW(
            model.parameters(),
            lr=training_args.learning_rate,
            weight_decay=training_args.weight_decay,
        ),
        None,
    ),
)

  trainer = Trainer(


In [26]:
# Run fine-tuning with the Trainer configured above; the cell output is the
# per-epoch metrics table followed by the TrainOutput summary.
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0452,0.03862,0.23946,0.9438
2,0.0347,0.037022,0.413199,0.948471
3,0.0217,0.038558,0.49694,0.947265


TrainOutput(global_step=4980, training_loss=0.03761810796806611, metrics={'train_runtime': 2021.3503, 'train_samples_per_second': 39.401, 'train_steps_per_second': 2.464, 'total_flos': 5238945334600704.0, 'train_loss': 0.03761810796806611, 'epoch': 3.0})

In [27]:
# Evaluate on the eval dataset given to the Trainer (the "test" split) and
# display the resulting metrics dict (loss, f1, accuracy, timing).
trainer.evaluate()

{'eval_loss': 0.03855816647410393,
 'eval_f1': 0.4969400244798042,
 'eval_accuracy': 0.9472653307217116,
 'eval_runtime': 44.0715,
 'eval_samples_per_second': 150.596,
 'eval_steps_per_second': 9.417,
 'epoch': 3.0}

In [28]:
# Write the trained model and the tokenizer files into the same output directory.
export_dir = "/content/drive/My Drive/ColabWork/Outputs/unixcoder_matrix"
trainer.save_model(export_dir)
tokenizer.save_pretrained(export_dir)

('/content/drive/My Drive/ColabWork/Outputs/unixcoder_matrix/tokenizer_config.json',
 '/content/drive/My Drive/ColabWork/Outputs/unixcoder_matrix/special_tokens_map.json',
 '/content/drive/My Drive/ColabWork/Outputs/unixcoder_matrix/vocab.json',
 '/content/drive/My Drive/ColabWork/Outputs/unixcoder_matrix/merges.txt',
 '/content/drive/My Drive/ColabWork/Outputs/unixcoder_matrix/added_tokens.json')

In [32]:
from sklearn.metrics import confusion_matrix, classification_report

# Predict on the test split and binarize the logits with the same inclusive
# 0.5 cut-off that multi_label_metrics applies (`>=`, not `>`), so the
# per-label reports agree with the metrics logged during training.
prediction_output = trainer.predict(encoded_dataset['test'])
predictions = prediction_output.predictions
label_ids = prediction_output.label_ids
preds = torch.sigmoid(torch.tensor(predictions)) >= 0.5

# Hand scikit-learn plain integer arrays rather than a float label matrix
# paired with a boolean torch tensor.
y_true = label_ids.astype(int)
y_pred = preds.numpy().astype(int)

for i, label in enumerate(labels):
    print(f'\nLabel: {label}')

    # labels=[0, 1] pins a 2x2 matrix even when a label has no positive
    # examples and no positive predictions in this split.
    cm = confusion_matrix(y_true[:, i], y_pred[:, i], labels=[0, 1])
    print(f'\nConfusion matrix for {label}')
    print(cm)

    # Same pinning for the report, so the two target_names always line up;
    # zero_division=0 reports 0 instead of warning for an empty class.
    report = classification_report(
        y_true[:, i],
        y_pred[:, i],
        labels=[0, 1],
        target_names=['Class 0', 'Class 1'],
        digits=4,
        zero_division=0,
    )
    print(f'\nClassification report for {label}')
    print(report)


Label: godclass

Confusion matrix for godclass
[[6444   36]
 [  77   80]]

Classification report for godclass
              precision    recall  f1-score   support

     Class 0     0.9882    0.9944    0.9913      6480
     Class 1     0.6897    0.5096    0.5861       157

    accuracy                         0.9830      6637
   macro avg     0.8389    0.7520    0.7887      6637
weighted avg     0.9811    0.9830    0.9817      6637


Label: dataclass

Confusion matrix for dataclass
[[6507   25]
 [  55   50]]

Classification report for dataclass
              precision    recall  f1-score   support

     Class 0     0.9916    0.9962    0.9939      6532
     Class 1     0.6667    0.4762    0.5556       105

    accuracy                         0.9879      6637
   macro avg     0.8291    0.7362    0.7747      6637
weighted avg     0.9865    0.9879    0.9870      6637


Label: featureenvy

Confusion matrix for featureenvy
[[6534   17]
 [  74   12]]

Classification report for featureenvy
 