In [1]:
# !pip install transformers accelerate
from transformers import AutoTokenizer, AutoModel, EarlyStoppingCallback, AutoModelForSequenceClassification, AutoConfig,Trainer, TrainingArguments,DataCollatorWithPadding
import torch
import numpy as np
import pandas as pd
# !pip install datasets
from datasets import load_metric
# %pip install evaluate
from evaluate import evaluator
from sklearn.model_selection import train_test_split
import csv

# notebook_login()

# Debug aid: make CUDA kernel launches synchronous so errors surface at the
# offending call. A plain Python variable named CUDA_LAUNCH_BLOCKING is never
# read by CUDA/PyTorch; the setting has to live in the process environment,
# and it must be in place before the first CUDA call is made.
# NOTE(review): synchronous launches slow training — drop this once debugging is done.
import os

os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the train and test splits from the working directory (BOM-tolerant
# UTF-8) and replace every missing value with an empty string.
train = pd.read_csv(r'train.csv', encoding='utf_8_sig').fillna('')
test = pd.read_csv(r'test.csv', encoding='utf_8_sig').fillna('')

In [3]:
# Show the training frame; the rendered output is a truncated preview of its rows.
train

Unnamed: 0.1,Unnamed: 0,label,text,diff
0,746,positive,predicate builder should not recurse for deter...,diff --git a/activerecord/lib/active_record/as...
1,9509,positive,AMQP-590: Java Deserialization White List\n\nJ...,diff --git a/spring-amqp/src/main/java/org/spr...
2,192,negative,feat(api): add endpoints to API to add comment...,diff --git a/config/json_validator/latest/Cent...
3,8812,negative,fix(documentation): remove route redeclaration...,diff --git a/doc/API/centreon-api-v2.yaml b/do...
4,1417,positive,* tools/tiffcp.c: fix uint32 underflow/overflo...,diff --git a/ChangeLog b/ChangeLog\nindex 94be...
...,...,...,...,...
399,6320,positive,CVE-2017-12998/IS-IS: Check for 2 bytes if we'...,diff --git a/print-isoclns.c b/print-isoclns.c...
400,8769,positive,NIFI-5258 - Changed addHeader to setHeader whi...,diff --git a/nifi-nar-bundles/nifi-framework-b...
401,2679,positive,Fix bug #70345 (Multiple vulnerabilities relat...,diff --git a/ext/pcre/php_pcre.c b/ext/pcre/ph...
402,926,negative,Merge pull request #100 from adisingh007/hapi#...,diff --git a/lib/batch.js b/lib/batch.js\ninde...


In [4]:
# Pull the `diff` column of each split out as a plain list of strings;
# these lists are handed to the trainer as the code-change inputs.
train_code_change = train['diff'].astype(str).tolist()
test_code_change = test['diff'].astype(str).tolist()

In [5]:
# !pip install setfit
from commitfit import get_templated_dataset,sample_dataset
from datasets import Dataset, load_metric

In [6]:
# Wrap both pandas frames as `datasets.Dataset` objects (train first, then test).
Dataset_train, test_dataset = map(Dataset.from_pandas, (train, test))

In [7]:
# Build the training set from `Dataset_train` with commitfit's templated-dataset helper.
# The displayed result has 420 rows versus 404 in `train`, i.e. 16 extra rows —
# presumably 8 templated examples for each of the 2 candidate labels; confirm in commitfit.
# NOTE(review): `train_code_change` is built from the 404-row frame, so the extra rows
# have no matching code-change entry — verify the trainer tolerates the length mismatch.
train_dataset = get_templated_dataset(Dataset_train, candidate_labels=['positive','negative'], sample_size=8)

In [8]:
# Left-over scratch line; NOTE(review): no 'message' column appears in the features shown below.
# list(train['message'].astype(str).values)
# Inspect the dataset returned by get_templated_dataset (columns and row count).
train_dataset

Dataset({
    features: ['Unnamed: 0', 'label', 'text', 'diff'],
    num_rows: 420
})

In [9]:
# Number of rows in the pandas training frame.
len(train)

404

In [10]:
# Class balance of the training labels.
train['label'].value_counts()

label
negative    254
positive    150
Name: count, dtype: int64

In [11]:
# Class balance of the test labels.
test['label'].value_counts()

label
negative    6093
positive    3615
Name: count, dtype: int64

In [12]:
# Row count of `train_dataset`, the Dataset produced by get_templated_dataset.
len(train_dataset)

420

In [13]:
from sklearn import  metrics

def compute_metrics(y_pred, y_test):
    """Score predictions against reference labels with scikit-learn.

    Args:
        y_pred: Predicted labels.
        y_test: Reference labels; passed to scikit-learn as ``y_true``.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        Precision, recall and F1 are computed with ``average='weighted'``.
    """
    # scikit-learn takes (y_true, y_pred), the reverse of this function's argument order.
    truth, predicted = y_test, y_pred
    averaged = dict(average='weighted')
    return {
        "accuracy": metrics.accuracy_score(truth, predicted),
        "precision": metrics.precision_score(truth, predicted, **averaged),
        "recall": metrics.recall_score(truth, predicted, **averaged),
        "f1": metrics.f1_score(truth, predicted, **averaged),
    }

In [14]:
# Class balance of the test labels (repeats the earlier test value_counts check).
test['label'].value_counts()

label
negative    6093
positive    3615
Name: count, dtype: int64

In [15]:
# Re-display `train_dataset` to check its columns and row count before training.
train_dataset

Dataset({
    features: ['Unnamed: 0', 'label', 'text', 'diff'],
    num_rows: 420
})

In [16]:
# Re-display the training frame (truncated preview).
train

Unnamed: 0.1,Unnamed: 0,label,text,diff
0,746,positive,predicate builder should not recurse for deter...,diff --git a/activerecord/lib/active_record/as...
1,9509,positive,AMQP-590: Java Deserialization White List\n\nJ...,diff --git a/spring-amqp/src/main/java/org/spr...
2,192,negative,feat(api): add endpoints to API to add comment...,diff --git a/config/json_validator/latest/Cent...
3,8812,negative,fix(documentation): remove route redeclaration...,diff --git a/doc/API/centreon-api-v2.yaml b/do...
4,1417,positive,* tools/tiffcp.c: fix uint32 underflow/overflo...,diff --git a/ChangeLog b/ChangeLog\nindex 94be...
...,...,...,...,...
399,6320,positive,CVE-2017-12998/IS-IS: Check for 2 bytes if we'...,diff --git a/print-isoclns.c b/print-isoclns.c...
400,8769,positive,NIFI-5258 - Changed addHeader to setHeader whi...,diff --git a/nifi-nar-bundles/nifi-framework-b...
401,2679,positive,Fix bug #70345 (Multiple vulnerabilities relat...,diff --git a/ext/pcre/php_pcre.c b/ext/pcre/ph...
402,926,negative,Merge pull request #100 from adisingh007/hapi#...,diff --git a/lib/batch.js b/lib/batch.js\ninde...


In [17]:
# Identifier of the pretrained sentence-transformer checkpoint handed to
# CommitFitModel.from_pretrained below.
model_id = "sentence-transformers/paraphrase-mpnet-base-v2"

In [18]:
from commitfit import CommitFitModel
# Load the CommitFit model from the `model_id` checkpoint. The message logged by this
# call reports that no `model_head.pkl` was found, so the classification head starts
# from random weights and has to be trained before it is used for predictions.
model = CommitFitModel.from_pretrained(model_id)

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [19]:
from commitfit import CommitFitTrainer

# Assemble the trainer: the templated training set plus the raw diff strings of both
# splits, evaluated on the test split and scored with `compute_metrics`.
# With 420 training rows and num_iterations=20 the training log reports 16800 examples.
# NOTE(review): `train_dataset` has 420 rows but `train_code_change` was built from the
# 404-row frame — confirm CommitFitTrainer pairs texts and diffs correctly despite this.
trainer = CommitFitTrainer(
    model=model,
    train_dataset=train_dataset,
    train_code_change = train_code_change,
    test_code_change = test_code_change,
    eval_dataset=test_dataset,
    metric = compute_metrics,
    num_iterations=20,
    num_epochs=1
)

In [20]:
%%time
# Fine-tune the model, then evaluate it; the displayed result carries the
# accuracy/precision/recall/f1 keys produced by `compute_metrics`.
trainer.train()
fewshot_metrics = trainer.evaluate()
# Last expression of the cell, so the metrics dict is rendered as the cell output.
fewshot_metrics

Generating Training Pairs: 100%|██████████| 20/20 [00:00<00:00, 25.84it/s]
***** Running training *****
  Num examples = 16800
  Num epochs = 1
  Total optimization steps = 1050
  Total train batch size = 16
Epoch:   0%|          | 0/1 [00:00<?, ?it/s]
Iteration:   0%|          | 0/1050 [00:00<?, ?it/s][A
Iteration:   0%|          | 1/1050 [00:00<15:29,  1.13it/s][A
Iteration:   0%|          | 2/1050 [00:01<17:20,  1.01it/s][A
Iteration:   0%|          | 3/1050 [00:02<14:53,  1.17it/s][A
Iteration:   0%|          | 4/1050 [00:02<11:48,  1.48it/s][A
Iteration:   0%|          | 5/1050 [00:03<10:23,  1.68it/s][A
Iteration:   1%|          | 6/1050 [00:03<10:14,  1.70it/s][A
Iteration:   1%|          | 7/1050 [00:03<09:15,  1.88it/s][A
Iteration:   1%|          | 8/1050 [00:04<08:26,  2.06it/s][A
Iteration:   1%|          | 9/1050 [00:04<08:42,  1.99it/s][A
Iteration:   1%|          | 10/1050 [00:05<09:29,  1.83it/s][A
Iteration:   1%|          | 11/1050 [00:05<09:04,  1.91it/s]

CPU times: user 1h 40min 4s, sys: 3min 6s, total: 1h 43min 11s
Wall time: 15min 52s


{'accuracy': 0.8262257931602802,
 'precision': 0.8277127790301548,
 'recall': 0.8262257931602802,
 'f1': 0.8268192648109999}

In [21]:
import ipynbname
# Look up the running notebook's name via ipynbname; it is reused below as the
# path the trained model is saved under.
nb_fname = ipynbname.name()
print(nb_fname)

M-2-150-code change


In [22]:
# Store the checkpoint: save the trained model under a path named after this notebook.
# NOTE(review): the printed notebook name contains a space ("M-2-150-code change"),
# so quote the path if it is used from a shell.
trainer.model.save_pretrained(nb_fname)

In [23]:
# import os
# os.system('shutdown')