In [1]:
# !pip install transformers accelerate
from transformers import AutoTokenizer, AutoModel, EarlyStoppingCallback, AutoModelForSequenceClassification, AutoConfig,Trainer, TrainingArguments,DataCollatorWithPadding
import torch
import numpy as np
import pandas as pd
# !pip install datasets
from datasets import load_metric
# %pip install evaluate
from evaluate import evaluator
from sklearn.model_selection import train_test_split
import csv

# notebook_login()

# Ask CUDA to run kernel launches synchronously so a device-side error is
# reported at the call that caused it. A plain Python assignment has no such
# effect: the flag is read from the process environment, so it must be
# exported through os.environ, and before any CUDA work starts in this kernel.
import os

CUDA_LAUNCH_BLOCKING = 1  # module-level name kept for backward compatibility
os.environ["CUDA_LAUNCH_BLOCKING"] = str(CUDA_LAUNCH_BLOCKING)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the train and test splits (in that order) and replace every missing
# value with an empty string. The utf_8_sig codec drops a leading BOM if the
# CSV has one.
train, test = (
    pd.read_csv(csv_name, encoding='utf_8_sig').fillna('')
    for csv_name in (r'train.csv', r'test.csv')
)

In [3]:
train

Unnamed: 0,label,text,diff
0,negative,Fix macro names - related to #448,diff --git a/src/ciphers/aes/aes.c b/src/ciphe...
1,negative,9.0.4,diff --git a/package.json b/package.json\ninde...
2,positive,https://github.com/ImageMagick/ImageMagick/iss...,diff --git a/coders/png.c b/coders/png.c\ninde...
3,negative,test: incr. disk size for TEST 35 ISCSI-MULTI,diff --git a/test/TEST-35-ISCSI-MULTI/test.sh ...
4,negative,git-svn-id: https://plugins.svn.wordpress.org/...,diff --git a/inc/define.php b/inc/define.php\n...
5,negative,[JBPM-9474] Produce events to Kafka by Signal ...,diff --git a/jbpm-bpmn2/src/main/java/org/jbpm...
6,positive,SPOOLSS: Try to avoid an infinite loop.\n\nUse...,diff --git a/epan/dissectors/packet-dcerpc-spo...
7,positive,Normalize resource URL in ResourceServlet\n\nI...,diff --git a/spring-webmvc/src/main/java/org/s...
8,negative,Merge branch '1.8.x',diff --git a/src/ClientCommand.cpp b/src/Clien...
9,negative,Change distribution URL.\n\ngit-svn-id: https:...,diff --git a/c/xml-security-c.spec b/c/xml-sec...


In [4]:
# The 'diff' column of each split as a plain Python list of strings; these
# lists are what the trainer receives as train_code_change / test_code_change.
train_code_change = train['diff'].astype(str).tolist()
test_code_change = test['diff'].astype(str).tolist()

In [5]:
# !pip install setfit
from commitfit import get_templated_dataset,sample_dataset
from datasets import Dataset, load_metric

In [6]:
# Convert both pandas frames into `datasets.Dataset` objects (train first).
Dataset_train, test_dataset = map(Dataset.from_pandas, (train, test))

In [7]:
# Build the few-shot training set from the training split.
# NOTE(review): the result has 66 rows for a 50-row training frame, i.e.
# 50 + 8 per candidate label. That suggests sample_size is the number of
# templated examples added per label; confirm against commitfit.
train_dataset = get_templated_dataset(
    Dataset_train,
    sample_size=8,
    candidate_labels=['positive', 'negative'],
)

In [8]:
# list(train['message'].astype(str).values)
train_dataset

Dataset({
    features: ['label', 'text', 'diff'],
    num_rows: 66
})

In [9]:
len(train)

50

In [10]:
train['label'].value_counts()

label
negative    31
positive    19
Name: count, dtype: int64

In [11]:
test['label'].value_counts()

label
negative    6316
positive    3746
Name: count, dtype: int64

In [12]:
len(train_dataset)

66

In [13]:
from sklearn import  metrics

def compute_metrics(y_pred, y_test):
    """Score predicted labels against reference labels.

    Precision, recall and F1 are computed per class and combined with
    ``average='weighted'``.

    Args:
        y_pred: Predicted labels, one per evaluated example.
        y_test: Reference labels, aligned with ``y_pred``.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``, each holding the corresponding sklearn score.
    """
    # sklearn's default zero_division="warn" already scores an undefined
    # per-class ratio as 0, but it emits UndefinedMetricWarning each time.
    # Passing 0 explicitly yields the same numbers without the warning.
    prf_options = {"average": "weighted", "zero_division": 0}

    # sklearn expects (y_true, y_pred), the reverse of this function's
    # parameter order, hence the swapped arguments below.
    return {
        "accuracy": metrics.accuracy_score(y_test, y_pred),
        "precision": metrics.precision_score(y_test, y_pred, **prf_options),
        "recall": metrics.recall_score(y_test, y_pred, **prf_options),
        "f1": metrics.f1_score(y_test, y_pred, **prf_options),
    }

In [14]:
test['label'].value_counts()

label
negative    6316
positive    3746
Name: count, dtype: int64

In [15]:
train_dataset

Dataset({
    features: ['label', 'text', 'diff'],
    num_rows: 66
})

In [16]:
train

Unnamed: 0,label,text,diff
0,negative,Fix macro names - related to #448,diff --git a/src/ciphers/aes/aes.c b/src/ciphe...
1,negative,9.0.4,diff --git a/package.json b/package.json\ninde...
2,positive,https://github.com/ImageMagick/ImageMagick/iss...,diff --git a/coders/png.c b/coders/png.c\ninde...
3,negative,test: incr. disk size for TEST 35 ISCSI-MULTI,diff --git a/test/TEST-35-ISCSI-MULTI/test.sh ...
4,negative,git-svn-id: https://plugins.svn.wordpress.org/...,diff --git a/inc/define.php b/inc/define.php\n...
5,negative,[JBPM-9474] Produce events to Kafka by Signal ...,diff --git a/jbpm-bpmn2/src/main/java/org/jbpm...
6,positive,SPOOLSS: Try to avoid an infinite loop.\n\nUse...,diff --git a/epan/dissectors/packet-dcerpc-spo...
7,positive,Normalize resource URL in ResourceServlet\n\nI...,diff --git a/spring-webmvc/src/main/java/org/s...
8,negative,Merge branch '1.8.x',diff --git a/src/ClientCommand.cpp b/src/Clien...
9,negative,Change distribution URL.\n\ngit-svn-id: https:...,diff --git a/c/xml-security-c.spec b/c/xml-sec...


In [17]:
# Checkpoint identifier passed to CommitFitModel.from_pretrained in the next cell.
model_id = "sentence-transformers/paraphrase-mpnet-base-v2"

In [18]:
from commitfit import CommitFitModel
# Load the checkpoint named by model_id. The loader logs that model_head.pkl
# was not found, so the classification head starts from random weights and
# the model has to be trained before its predictions mean anything.
model = CommitFitModel.from_pretrained(model_id)

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [19]:
from commitfit import CommitFitTrainer

# Assemble the trainer: few-shot training set, full test split for evaluation,
# and compute_metrics as the scoring function.
# NOTE(review): train_code_change is built from the 50-row `train` frame while
# train_dataset has 66 rows; confirm that commitfit does not require the two
# to be aligned row by row.
trainer = CommitFitTrainer(
    model=model,
    metric=compute_metrics,
    # datasets
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # diff strings taken from the 'diff' column of each split
    train_code_change=train_code_change,
    test_code_change=test_code_change,
    # schedule
    num_epochs=1,
    num_iterations=20,
)

In [20]:
%%time
# Train on the few-shot set, then evaluate on the trainer's eval_dataset.
trainer.train()
# evaluate() returns the metrics dict (accuracy / precision / recall / f1)
# that is displayed as this cell's output.
fewshot_metrics = trainer.evaluate()
fewshot_metrics

Generating Training Pairs: 100%|██████████| 20/20 [00:00<00:00, 168.68it/s]
***** Running training *****
  Num examples = 2640
  Num epochs = 1
  Total optimization steps = 165
  Total train batch size = 16
Epoch:   0%|          | 0/1 [00:00<?, ?it/s]
Iteration:   0%|          | 0/165 [00:00<?, ?it/s][A
Iteration:   1%|          | 1/165 [00:00<01:03,  2.58it/s][A
Iteration:   1%|          | 2/165 [00:00<00:41,  3.91it/s][A
Iteration:   2%|▏         | 3/165 [00:00<00:39,  4.14it/s][A
Iteration:   2%|▏         | 4/165 [00:00<00:37,  4.26it/s][A
Iteration:   3%|▎         | 5/165 [00:01<00:38,  4.19it/s][A
Iteration:   4%|▎         | 6/165 [00:01<00:34,  4.56it/s][A
Iteration:   4%|▍         | 7/165 [00:01<00:32,  4.88it/s][A
Iteration:   5%|▍         | 8/165 [00:01<00:33,  4.75it/s][A
Iteration:   5%|▌         | 9/165 [00:01<00:32,  4.79it/s][A
Iteration:   6%|▌         | 10/165 [00:02<00:32,  4.82it/s][A
Iteration:   7%|▋         | 11/165 [00:02<00:31,  4.86it/s][A
Iteration:

CPU times: user 16min 10s, sys: 1min 17s, total: 17min 27s
Wall time: 4min 31s


  _warn_prf(average, modifier, msg_start, len(result))


{'accuracy': 0.7676406281057444,
 'precision': 0.7681222764802478,
 'recall': 0.7676406281057444,
 'f1': 0.7678737969595669}