In [1]:
# !pip install transformers accelerate
from transformers import AutoTokenizer, AutoModel, EarlyStoppingCallback, AutoModelForSequenceClassification, AutoConfig,Trainer, TrainingArguments,DataCollatorWithPadding
import torch
import numpy as np
import pandas as pd
# !pip install datasets
from datasets import load_metric
# %pip install evaluate
from evaluate import evaluator
from sklearn.model_selection import train_test_split
# from ipywidgets import FloatProgress
import csv
from optuna import Trial
from typing import Dict, Union, Any
import os
import sys
# notebook_login()

# Raise the csv module's per-field size limit to 500 MiB.
csv.field_size_limit(500 * 1024 * 1024)

# The module-level name is kept for backward compatibility, but a bare Python
# assignment is invisible to CUDA: the setting is read from the process
# environment, so it is exported explicitly here.
# NOTE(review): presumably this must run before the first CUDA call in the
# kernel to take effect - confirm against the PyTorch/CUDA documentation.
CUDA_LAUNCH_BLOCKING=1
os.environ["CUDA_LAUNCH_BLOCKING"] = str(CUDA_LAUNCH_BLOCKING)

# Absolute path of the current working directory (where the notebook runs).
notebook_path = os.path.abspath('')

# Walk the path components from the root and keep everything up to and
# including the first component whose name contains 'CommitFit'.
commit_fit_path = None
path_parts = notebook_path.split(os.sep)
for depth, part in enumerate(path_parts):
    if 'CommitFit' in part:
        commit_fit_path = os.sep.join(path_parts[:depth + 1])
        break

if commit_fit_path is None:
    raise ValueError("Path containing 'CommitFit' not found in notebook path.")

# Add the CommitFit directory to the Python path so that modules in the
# commitfit folder can be imported directly.
if commit_fit_path not in sys.path:
    sys.path.append(commit_fit_path)

from commitfit import CommitFitModel, CommitFitTrainer 


CommitFit


In [2]:
# Load the training split, then the test split, from the working directory.
# 'utf_8_sig' is the UTF-8 codec that also drops a leading byte-order mark.
train, test = (
    pd.read_csv(csv_name, encoding='utf_8_sig')
    for csv_name in ('train.csv', 'test.csv')
)


In [3]:
# Display the training frame (label, commit message text and diff per commit).
train

Unnamed: 0,label,text,diff
0,negative,Fixed link errors,diff --git a/src/irisnet/CMakeLists.txt b/src/...
1,positive,Check types to avoid invalid reads/writes.,diff --git a/src/file.c b/src/file.c\nindex 4d...
2,positive,https://github.com/ImageMagick/ImageMagick/iss...,diff --git a/coders/png.c b/coders/png.c\ninde...
3,positive,XSS 취약점 수정,diff --git a/adm/boardgroup_form.php b/adm/boa...
4,negative,Change distribution URL.\n\ngit-svn-id: https:...,diff --git a/c/xml-security-c.spec b/c/xml-sec...
5,negative,Merge pull request #2426 from alvarobartt/deve...,diff --git a/nltk/tag/stanford.py b/nltk/tag/s...
6,negative,2021 license,diff --git a/Gruntfile.js b/Gruntfile.js\ninde...
7,positive,SPOOLSS: Try to avoid an infinite loop.\n\nUse...,diff --git a/epan/dissectors/packet-dcerpc-spo...
8,negative,Upgrade: Bump grape from 1.3.3 to 1.5.0\n\nBum...,diff --git a/Gemfile.lock b/Gemfile.lock\ninde...
9,negative,Merge branch '8.5' into 9.5,diff --git a/src/Framework/Assert.php b/src/Fr...


In [4]:
# Extract the 'diff' column of each split as a plain Python list of strings;
# both lists are handed to CommitFitTrainer later in the notebook.
train_code_change = train['diff'].astype(str).tolist()
test_code_change = test['diff'].astype(str).tolist()

In [5]:
# !pip install setfit
from commitfit import get_templated_dataset,sample_dataset
from datasets import Dataset, load_metric

In [6]:
# Wrap both pandas frames as `datasets.Dataset` objects: `Dataset_train` feeds
# the templating step, `test_dataset` is used as the trainer's eval set.
Dataset_train = Dataset.from_pandas(train)
test_dataset = Dataset.from_pandas(test)

In [7]:
# Build the few-shot training set from the 10-row training frame.
# The result reports 26 rows, i.e. 16 more than the input; presumably 8
# templated examples are added per candidate label (10 + 2 * 8) - verify
# against commitfit.get_templated_dataset.
train_dataset = get_templated_dataset(Dataset_train, candidate_labels=['positive','negative'], sample_size=8)

In [8]:
# Inspect the few-shot training set (feature names and row count).
train_dataset

Dataset({
    features: ['label', 'text', 'diff'],
    num_rows: 26
})

In [9]:
# Number of labelled commits in the raw training frame.
len(train)

10

In [10]:
# Class balance of the training frame.
train['label'].value_counts()

label
negative    6
positive    4
Name: count, dtype: int64

In [11]:
# Class balance of the test frame.
test['label'].value_counts()

label
negative    6341
positive    3761
Name: count, dtype: int64

In [12]:
# Row count of the few-shot training set built by get_templated_dataset.
len(train_dataset)

26

In [13]:
from sklearn import  metrics

def compute_metrics(y_pred, y_test):
    """Score predictions against reference labels.

    Precision, recall and F1 are averaged over classes with
    ``average='weighted'``; accuracy is the plain fraction of exact matches.

    Args:
        y_pred: Predicted labels, one per evaluated example.
        y_test: Reference labels aligned with ``y_pred``.

    Returns:
        A dict with the keys ``"precision"``, ``"recall"``, ``"f1"`` and
        ``"accuracy"``, each mapped to a float.
    """
    # sklearn takes (y_true, y_pred), the reverse of this function's own
    # parameter order. A single call yields precision, recall and F1 together.
    # zero_division=0 yields the same numbers as the default "warn" setting but
    # without an UndefinedMetricWarning on every evaluation.
    precision, recall, f1, _ = metrics.precision_recall_fscore_support(
        y_test, y_pred, average='weighted', zero_division=0
    )
    accuracy = metrics.accuracy_score(y_test, y_pred)

    return {
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
        "accuracy": float(accuracy),
    }

In [14]:
# Repeat of the earlier test-split class-balance check; the counts are unchanged.
test['label'].value_counts()

label
negative    6341
positive    3761
Name: count, dtype: int64

In [15]:
# Repeat of the earlier few-shot training set inspection (still 26 rows).
train_dataset

Dataset({
    features: ['label', 'text', 'diff'],
    num_rows: 26
})

In [16]:
# Repeat of the earlier training-frame display.
train

Unnamed: 0,label,text,diff
0,negative,Fixed link errors,diff --git a/src/irisnet/CMakeLists.txt b/src/...
1,positive,Check types to avoid invalid reads/writes.,diff --git a/src/file.c b/src/file.c\nindex 4d...
2,positive,https://github.com/ImageMagick/ImageMagick/iss...,diff --git a/coders/png.c b/coders/png.c\ninde...
3,positive,XSS 취약점 수정,diff --git a/adm/boardgroup_form.php b/adm/boa...
4,negative,Change distribution URL.\n\ngit-svn-id: https:...,diff --git a/c/xml-security-c.spec b/c/xml-sec...
5,negative,Merge pull request #2426 from alvarobartt/deve...,diff --git a/nltk/tag/stanford.py b/nltk/tag/s...
6,negative,2021 license,diff --git a/Gruntfile.js b/Gruntfile.js\ninde...
7,positive,SPOOLSS: Try to avoid an infinite loop.\n\nUse...,diff --git a/epan/dissectors/packet-dcerpc-spo...
8,negative,Upgrade: Bump grape from 1.3.3 to 1.5.0\n\nBum...,diff --git a/Gemfile.lock b/Gemfile.lock\ninde...
9,negative,Merge branch '8.5' into 9.5,diff --git a/src/Framework/Assert.php b/src/Fr...


In [17]:
# Checkpoint directory handed to CommitFitModel.from_pretrained in model_init.
# The path is relative, so it resolves against the notebook's working directory.
model_id = r"../../sentence-transformers/paraphrase-mpnet-base-v2"

In [18]:
def hp_space(trial: Trial) -> Dict[str, Union[float, int, str]]:
    """Sample one hyperparameter configuration for a search trial.

    Only the learning rate is tuned: it is drawn log-uniformly from
    [1e-6, 1e-3]. Other candidates (num_epochs, batch_size, seed,
    num_iterations, solver) are currently not part of the search.

    Args:
        trial: Object exposing ``suggest_float``; used to draw the value.

    Returns:
        A dict mapping ``"learning_rate"`` to the sampled value.
    """
    sampled: Dict[str, Union[float, int, str]] = {}
    sampled["learning_rate"] = trial.suggest_float(
        "learning_rate", 1e-6, 1e-3, log=True
    )
    return sampled

In [19]:
def model_init(params: Dict[str, Any]) -> CommitFitModel:
    """Create a new CommitFitModel from the checkpoint at ``model_id``.

    Args:
        params: Keyword arguments forwarded unchanged to
            ``CommitFitModel.from_pretrained``. A falsy value (``None`` or an
            empty dict) means no extra arguments are passed.

    Returns:
        The model produced by ``CommitFitModel.from_pretrained``.
    """
    # An earlier draft assembled a "head_params" dict (max_iter / solver) here;
    # that customisation is disabled and the params are passed through as-is.
    extra_kwargs = params if params else {}
    return CommitFitModel.from_pretrained(model_id, **extra_kwargs)

In [20]:
def my_compute_objective(metrics):
    """Return the value the hyperparameter search optimises.

    Prints the full metrics dict behind a '+' banner, then picks out accuracy.

    Args:
        metrics: Mapping of metric names to values; must contain ``'accuracy'``.

    Returns:
        The value stored under ``metrics['accuracy']``.
    """
    banner = '+++++++++++'
    print(banner, metrics)
    objective = metrics['accuracy']
    return objective

In [21]:
# Build the few-shot trainer. A `model_init` callable is supplied instead of a
# fixed model; the run log shows the model being initialised again for every trial.
# NOTE(review): `eval_dataset` is the test split and the search below maximises
# accuracy on it, so hyperparameters are selected on the same data that is later
# reported as the final score. Consider holding out a validation split.
# NOTE(review): `train_dataset` has 26 rows while `train_code_change` comes from
# the 10-row `train` frame - confirm CommitFitTrainer handles the length difference.
# NOTE(review): no random seed is set anywhere in view, so runs may not be reproducible.
trainer = CommitFitTrainer(
    train_dataset=train_dataset,
    train_code_change = train_code_change,
    test_code_change = test_code_change,
    eval_dataset=test_dataset,
    model_init=model_init,
    metric = compute_metrics,
    num_iterations=20,
    num_epochs=1
)
# Run 10 search trials over `hp_space`, keeping the trial with the highest
# value returned by `my_compute_objective` (accuracy).
best_run = trainer.hyperparameter_search(direction="maximize", hp_space=hp_space, compute_objective=my_compute_objective, n_trials=10)

model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
[I 2024-05-26 10:34:01,468] A new study created in memory with name: no-name-60ff1b94-82fe-4fcc-8b21-53b23a0ffa41
Trial: {'learning_rate': 2.4259405334828273e-06}
model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 1040
  Num epochs = 1
  Total optimization steps = 65
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/65 [00:00<?, ?it/s]

***** Running evaluation *****
  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-05-26 10:36:59,141] Trial 0 finished with value: 0.7005543456741239 and parameters: {'learning_rate': 2.4259405334828273e-06}. Best is trial 0 with value: 0.7005543456741239.
Trial: {'learning_rate': 5.821075126267048e-06}


+++++++++++ {'precision': 0.6942595231891902, 'recall': 0.7005543456741239, 'f1': 0.6774727919592385, 'accuracy': 0.7005543456741239}


model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 1040
  Num epochs = 1
  Total optimization steps = 65
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/65 [00:00<?, ?it/s]

***** Running evaluation *****
  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-05-26 10:39:54,306] Trial 1 finished with value: 0.7002573747772718 and parameters: {'learning_rate': 5.821075126267048e-06}. Best is trial 0 with value: 0.7005543456741239.
Trial: {'learning_rate': 7.588943266152602e-05}


+++++++++++ {'precision': 0.6911779934723907, 'recall': 0.7002573747772718, 'f1': 0.6854494640009944, 'accuracy': 0.7002573747772718}


model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 1040
  Num epochs = 1
  Total optimization steps = 65
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/65 [00:00<?, ?it/s]

***** Running evaluation *****
  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-05-26 10:42:50,363] Trial 2 finished with value: 0.6311621461096812 and parameters: {'learning_rate': 7.588943266152602e-05}. Best is trial 0 with value: 0.7005543456741239.
Trial: {'learning_rate': 4.577055525176349e-06}


+++++++++++ {'precision': 0.6466816613846208, 'recall': 0.6311621461096812, 'f1': 0.636385422788865, 'accuracy': 0.6311621461096812}


model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 1040
  Num epochs = 1
  Total optimization steps = 65
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/65 [00:00<?, ?it/s]

***** Running evaluation *****
  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-05-26 10:45:46,075] Trial 3 finished with value: 0.7016432389625816 and parameters: {'learning_rate': 4.577055525176349e-06}. Best is trial 3 with value: 0.7016432389625816.
Trial: {'learning_rate': 1.204468006092301e-05}


+++++++++++ {'precision': 0.6932665889674158, 'recall': 0.7016432389625816, 'f1': 0.6843368301300582, 'accuracy': 0.7016432389625816}


model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 1040
  Num epochs = 1
  Total optimization steps = 65
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/65 [00:00<?, ?it/s]

***** Running evaluation *****
[I 2024-05-26 10:48:40,944] Trial 4 finished with value: 0.6967927143139973 and parameters: {'learning_rate': 1.204468006092301e-05}. Best is trial 3 with value: 0.7016432389625816.
Trial: {'learning_rate': 6.469870700742102e-06}


+++++++++++ {'precision': 0.6871252497382073, 'recall': 0.6967927143139973, 'f1': 0.6838061486777136, 'accuracy': 0.6967927143139973}


model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 1040
  Num epochs = 1
  Total optimization steps = 65
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/65 [00:00<?, ?it/s]

***** Running evaluation *****
  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-05-26 10:51:35,500] Trial 5 finished with value: 0.7009503068699268 and parameters: {'learning_rate': 6.469870700742102e-06}. Best is trial 3 with value: 0.7016432389625816.
Trial: {'learning_rate': 1.2587191870478776e-06}


+++++++++++ {'precision': 0.6918821376782814, 'recall': 0.7009503068699268, 'f1': 0.6874932444671615, 'accuracy': 0.7009503068699268}


model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 1040
  Num epochs = 1
  Total optimization steps = 65
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/65 [00:00<?, ?it/s]

***** Running evaluation *****
[I 2024-05-26 10:54:29,333] Trial 6 finished with value: 0.6979805979014057 and parameters: {'learning_rate': 1.2587191870478776e-06}. Best is trial 3 with value: 0.7016432389625816.
Trial: {'learning_rate': 6.323045061064815e-05}


+++++++++++ {'precision': 0.6925855710816342, 'recall': 0.6979805979014057, 'f1': 0.6716634184159911, 'accuracy': 0.6979805979014057}


model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 1040
  Num epochs = 1
  Total optimization steps = 65
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/65 [00:00<?, ?it/s]

***** Running evaluation *****
  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-05-26 10:57:24,387] Trial 7 finished with value: 0.6501682835082162 and parameters: {'learning_rate': 6.323045061064815e-05}. Best is trial 3 with value: 0.7016432389625816.
Trial: {'learning_rate': 3.0933243919164293e-06}


+++++++++++ {'precision': 0.6413226942975739, 'recall': 0.6501682835082162, 'f1': 0.6435610104354436, 'accuracy': 0.6501682835082162}


model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 1040
  Num epochs = 1
  Total optimization steps = 65
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/65 [00:00<?, ?it/s]

***** Running evaluation *****
  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-05-26 11:00:19,630] Trial 8 finished with value: 0.7014452583646803 and parameters: {'learning_rate': 3.0933243919164293e-06}. Best is trial 3 with value: 0.7016432389625816.
Trial: {'learning_rate': 0.0003443106668220414}


+++++++++++ {'precision': 0.6944341683863268, 'recall': 0.7014452583646803, 'f1': 0.6802967962037373, 'accuracy': 0.7014452583646803}


model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 1040
  Num epochs = 1
  Total optimization steps = 65
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/65 [00:00<?, ?it/s]

***** Running evaluation *****
  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-05-26 11:03:15,619] Trial 9 finished with value: 0.5784993070679073 and parameters: {'learning_rate': 0.0003443106668220414}. Best is trial 3 with value: 0.7016432389625816.


+++++++++++ {'precision': 0.46579054526003033, 'recall': 0.5784993070679073, 'f1': 0.48654292518248027, 'accuracy': 0.5784993070679073}


In [22]:
# Load the best trial's hyperparameters into the trainer, then train with them.
# NOTE(review): the exact effect of `final_model=True` is defined by
# CommitFitTrainer.apply_hyperparameters - confirm there.
trainer.apply_hyperparameters(best_run.hyperparameters, final_model=True)
trainer.train()

model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 1040
  Num epochs = 1
  Total optimization steps = 65
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/65 [00:00<?, ?it/s]

In [24]:
# Show the winning trial (run id, objective value, hyperparameters).
# NOTE(review): this cell carries execution count 24 but sits before cell 23,
# i.e. the notebook was run out of order; re-run top to bottom before sharing.
best_run

BestRun(run_id='3', objective=0.7016432389625816, hyperparameters={'learning_rate': 4.577055525176349e-06}, backend=<optuna.study.study.Study object at 0x7fc099d1c790>)

In [23]:
# Evaluate the retrained model with the trainer's eval dataset and metric.
# NOTE(review): this is the same test split the hyperparameter search optimised
# on, so the score is likely optimistic. The accuracy reported here also differs
# slightly from the best trial's objective, which suggests run-to-run randomness.
fewshot_metrics = trainer.evaluate()
fewshot_metrics

***** Running evaluation *****
  _warn_prf(average, modifier, msg_start, len(result))


{'precision': 0.6941185078532881,
 'recall': 0.701544248663631,
 'f1': 0.6813972341741444,
 'accuracy': 0.701544248663631}