In [1]:
# !pip install transformers accelerate
from transformers import AutoTokenizer, AutoModel, EarlyStoppingCallback, AutoModelForSequenceClassification, AutoConfig,Trainer, TrainingArguments,DataCollatorWithPadding
import torch
import numpy as np
import pandas as pd
# !pip install datasets
from datasets import load_metric
# %pip install evaluate
from evaluate import evaluator
from sklearn.model_selection import train_test_split
# from ipywidgets import FloatProgress
import csv
from optuna import Trial
from typing import Dict, Union, Any
import os
import sys
# notebook_login()

csv.field_size_limit(500 * 1024 * 1024)
CUDA_LAUNCH_BLOCKING=1
# get pwd
notebook_path = os.path.abspath('')

# Find the part of the path that contains 'commitFit'
commit_fit_path = None
for part in notebook_path.split(os.sep):
    print(part)
    if 'CommitFit' in part:
        commit_fit_path = notebook_path.split(part)[0] + part
        break

if commit_fit_path is None:
    raise ValueError("Path containing 'commitFit' not found in notebook path.")

# Add commitFit directory to Python path, so we can import moudule from commitfit folder directly
if commit_fit_path not in sys.path:
    sys.path.append(commit_fit_path)

from commitfit import CommitFitModel, CommitFitTrainer 


CommitFit


In [2]:
train = pd.read_csv(r'train.csv', encoding='utf_8_sig')
train.fillna('', inplace=True)
test = pd.read_csv(r'test.csv', encoding='utf_8_sig')
test.fillna('', inplace=True)

In [3]:
# df

In [4]:
train_code_change = list(train['diff'].astype(str))
test_code_change = list(test['diff'].astype(str))

In [5]:
from commitfit import get_templated_dataset,sample_dataset
from datasets import Dataset, load_metric

In [6]:
# train = df.rename(columns={'3_labels':'label','comment':'text'})

In [7]:
Dataset_train = Dataset.from_pandas(train)
test_dataset = Dataset.from_pandas(test)


In [8]:
train_dataset = get_templated_dataset(Dataset_train, candidate_labels=['negative','positive'], sample_size=8)

In [9]:
# list(train['message'].astype(str).values)
train_dataset

Dataset({
    features: ['label', 'text', 'diff'],
    num_rows: 56
})

In [10]:
len(train)

40

In [11]:
# encoded_train = tokenizer(train_dataset['text'].astype(str).to_list(), return_tensors='pt',truncation=True, padding=True)
# print(encoded_train["input_ids"].shape)
# encoded_test = tokenizer(test['comment'].astype(str).to_list(), return_tensors='pt',truncation=True, padding=True)
# print(encoded_test["input_ids"].shape)
# # encoded_val = tokenizer(val['comment'].astype(str).to_list(), return_tensors='pt',truncation=True, padding='max_length')

In [12]:
# encoded_train

In [13]:
train['label'].value_counts()

label
negative    25
positive    15
Name: count, dtype: int64

In [14]:
test['label'].value_counts()

label
negative    6322
positive    3750
Name: count, dtype: int64

In [15]:
# huggingface-cli login
# train_dataset = CommitDataset(encoded_train, list(train['3_labels']))
# test_dataset = CommitDataset(encoded_test, list(test['3_labels']))
# val_dataset = CommitDataset(encoded_val, list(val['label']))

In [16]:
len(train_dataset)

56

In [17]:
from sklearn import  metrics

def compute_metrics(y_pred, y_test):
    # print(y_pred,y_test)
    # classification_report = metrics.classification_report(y_test,y_pred)
    accuracy_score = metrics.accuracy_score(y_test,y_pred)
    precision_score = metrics.precision_score(y_test,y_pred, average='weighted')
    recall_score = metrics.recall_score(y_test,y_pred,average='weighted')
    f1_score = metrics.f1_score(y_test,y_pred,average='weighted')

    # return {"classification_report": classification_report}
    return {"precision": precision_score,"recall": recall_score, "f1":f1_score, "accuracy": accuracy_score}

In [18]:
train_dataset

Dataset({
    features: ['label', 'text', 'diff'],
    num_rows: 56
})

In [19]:
model_id = "../../sentence-transformers/paraphrase-mpnet-base-v2"

In [20]:
def hp_space(trial: Trial) -> Dict[str, Union[float, int, str]]:
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-3, log=True),
        # "num_epochs": trial.suggest_int("num_epochs", 1, 3),
        # "batch_size": trial.suggest_categorical("batch_size", [8, 12, 16]),
        # "seed": trial.suggest_int("seed", 1, 40),
        # "num_iterations": trial.suggest_int("num_iterations", 10, 20),
        # "solver": trial.suggest_categorical("solver", ["newton-cg", "lbfgs", "liblinear"]),
    }

In [21]:
def model_init(params: Dict[str, Any]) -> CommitFitModel:
    params = params or {}
    # learning_rate = params.get("learning_rate")
    # num_iterations = params.get("num_iterations", 20)
    # solver = params.get("solver", "liblinear")
    # params = {
    #     "head_params": {
    #         # "max_iter": num_iterations,
    #         # "solver": solver,
    #     }
    # }
    return CommitFitModel.from_pretrained(model_id, **params)

In [22]:
def my_compute_objective(metrics):
    print('+++++++++++',metrics)
    return  metrics['accuracy']

In [23]:
trainer = CommitFitTrainer(
    train_dataset=train_dataset,
    train_code_change = train_code_change,
    test_code_change = test_code_change,
    eval_dataset=test_dataset,
    model_init=model_init,
    metric = compute_metrics,
    num_iterations=20,
    num_epochs=1
)
best_run = trainer.hyperparameter_search(direction="maximize", hp_space=hp_space, compute_objective=my_compute_objective, n_trials=10)

model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
[I 2024-05-26 14:42:20,401] A new study created in memory with name: no-name-d56a361d-7610-483b-b858-182e9578e18e
Trial: {'learning_rate': 1.2427503241705065e-06}
model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 2240
  Num epochs = 1
  Total optimization steps = 140
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/140 [00:00<?, ?it/s]

***** Running evaluation *****
[I 2024-05-26 14:45:32,709] Trial 0 finished with value: 0.7626092136616363 and parameters: {'learning_rate': 1.2427503241705065e-06}. Best is trial 0 with value: 0.7626092136616363.
Trial: {'learning_rate': 1.0628738961703904e-06}


+++++++++++ {'precision': 0.7604168922131918, 'recall': 0.7626092136616363, 'f1': 0.7611979364991678, 'accuracy': 0.7626092136616363}


model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 2240
  Num epochs = 1
  Total optimization steps = 140
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/140 [00:00<?, ?it/s]

***** Running evaluation *****
[I 2024-05-26 14:48:41,418] Trial 1 finished with value: 0.76131850675139 and parameters: {'learning_rate': 1.0628738961703904e-06}. Best is trial 0 with value: 0.7626092136616363.
Trial: {'learning_rate': 1.0522907996976595e-05}


+++++++++++ {'precision': 0.7586605919664738, 'recall': 0.76131850675139, 'f1': 0.7594853317844855, 'accuracy': 0.76131850675139}


model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 2240
  Num epochs = 1
  Total optimization steps = 140
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/140 [00:00<?, ?it/s]

***** Running evaluation *****
  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-05-26 14:51:49,080] Trial 2 finished with value: 0.7628077839555203 and parameters: {'learning_rate': 1.0522907996976595e-05}. Best is trial 2 with value: 0.7628077839555203.
Trial: {'learning_rate': 0.0009588651103756521}


+++++++++++ {'precision': 0.7606729214629288, 'recall': 0.7628077839555203, 'f1': 0.7614167840957015, 'accuracy': 0.7628077839555203}


model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 2240
  Num epochs = 1
  Total optimization steps = 140
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/140 [00:00<?, ?it/s]

***** Running evaluation *****
  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-05-26 14:54:58,121] Trial 3 finished with value: 0.6276806989674345 and parameters: {'learning_rate': 0.0009588651103756521}. Best is trial 2 with value: 0.7628077839555203.
Trial: {'learning_rate': 1.1137281949610779e-05}


+++++++++++ {'precision': 0.39398305985624715, 'recall': 0.6276806989674345, 'f1': 0.484103620699295, 'accuracy': 0.6276806989674345}


model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 2240
  Num epochs = 1
  Total optimization steps = 140
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/140 [00:00<?, ?it/s]

***** Running evaluation *****
  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-05-26 14:58:05,319] Trial 4 finished with value: 0.761517077045274 and parameters: {'learning_rate': 1.1137281949610779e-05}. Best is trial 2 with value: 0.7628077839555203.
Trial: {'learning_rate': 7.14605779039241e-06}


+++++++++++ {'precision': 0.7592682219458442, 'recall': 0.761517077045274, 'f1': 0.7600336231200207, 'accuracy': 0.761517077045274}


model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 2240
  Num epochs = 1
  Total optimization steps = 140
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/140 [00:00<?, ?it/s]

***** Running evaluation *****
  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-05-26 15:01:11,530] Trial 5 finished with value: 0.7402700555996823 and parameters: {'learning_rate': 7.14605779039241e-06}. Best is trial 2 with value: 0.7628077839555203.
Trial: {'learning_rate': 0.00014264841568764638}


+++++++++++ {'precision': 0.7478767065979263, 'recall': 0.7402700555996823, 'f1': 0.7427534547704648, 'accuracy': 0.7402700555996823}


model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 2240
  Num epochs = 1
  Total optimization steps = 140
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/140 [00:00<?, ?it/s]

***** Running evaluation *****
  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-05-26 15:04:18,110] Trial 6 finished with value: 0.7011517077045274 and parameters: {'learning_rate': 0.00014264841568764638}. Best is trial 2 with value: 0.7628077839555203.
Trial: {'learning_rate': 1.280543696174412e-06}


+++++++++++ {'precision': 0.731013620861637, 'recall': 0.7011517077045274, 'f1': 0.7061237088068539, 'accuracy': 0.7011517077045274}


model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 2240
  Num epochs = 1
  Total optimization steps = 140
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/140 [00:00<?, ?it/s]

***** Running evaluation *****
[I 2024-05-26 15:07:23,395] Trial 7 finished with value: 0.7624106433677522 and parameters: {'learning_rate': 1.280543696174412e-06}. Best is trial 2 with value: 0.7628077839555203.
Trial: {'learning_rate': 0.0005496743095830967}


+++++++++++ {'precision': 0.7602696619531975, 'recall': 0.7624106433677522, 'f1': 0.761043798164903, 'accuracy': 0.7624106433677522}


model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 2240
  Num epochs = 1
  Total optimization steps = 140
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/140 [00:00<?, ?it/s]

***** Running evaluation *****
  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-05-26 15:10:32,792] Trial 8 finished with value: 0.6276806989674345 and parameters: {'learning_rate': 0.0005496743095830967}. Best is trial 2 with value: 0.7628077839555203.
Trial: {'learning_rate': 2.8595428913109114e-05}


+++++++++++ {'precision': 0.39398305985624715, 'recall': 0.6276806989674345, 'f1': 0.484103620699295, 'accuracy': 0.6276806989674345}


model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 2240
  Num epochs = 1
  Total optimization steps = 140
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/140 [00:00<?, ?it/s]

***** Running evaluation *****
  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-05-26 15:13:39,370] Trial 9 finished with value: 0.7637013502779985 and parameters: {'learning_rate': 2.8595428913109114e-05}. Best is trial 9 with value: 0.7637013502779985.


+++++++++++ {'precision': 0.7631067059839151, 'recall': 0.7637013502779985, 'f1': 0.763380976790613, 'accuracy': 0.7637013502779985}


In [24]:
best_run

BestRun(run_id='9', objective=0.7637013502779985, hyperparameters={'learning_rate': 2.8595428913109114e-05}, backend=<optuna.study.study.Study object at 0x7f10d33ecf10>)

In [25]:
best_run.hyperparameters

{'learning_rate': 2.8595428913109114e-05}

In [26]:
trainer.apply_hyperparameters(best_run.hyperparameters, final_model=True)
trainer.train()

model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 2240
  Num epochs = 1
  Total optimization steps = 140
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/140 [00:00<?, ?it/s]

In [27]:
# best_run.hyperparameters

In [28]:
fewshot_metrics = trainer.evaluate()
fewshot_metrics

***** Running evaluation *****
  _warn_prf(average, modifier, msg_start, len(result))


{'precision': 0.7563777548610001,
 'recall': 0.7581413820492454,
 'f1': 0.7570653587911115,
 'accuracy': 0.7581413820492454}

In [29]:
# trainer.num_epochs, trainer.batch_size, trainer.seed, trainer.num_iterations, trainer.learning_rate,trainer.head.solver

In [30]:
# plot_optimization_history(trainer)

In [31]:
# from huggingface_hub import notebook_login, create_repo
# # create_repo("jiajun1992/my-awesome-model1", token="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")
# trainer.model.save_pretrained('my-awesome-model')

In [32]:
# train.to_csv('train.csv', index=False,encoding = 'utf_8_sig')

In [33]:
# test.to_csv('test.csv',index=False,encoding = 'utf_8_sig')