In [1]:
# !pip install transformers accelerate
from transformers import AutoTokenizer, AutoModel, EarlyStoppingCallback, AutoModelForSequenceClassification, AutoConfig,Trainer, TrainingArguments,DataCollatorWithPadding
import torch
import numpy as np
import pandas as pd
# !pip install datasets
from datasets import load_metric
# %pip install evaluate
from evaluate import evaluator
from sklearn.model_selection import train_test_split
# from ipywidgets import FloatProgress
import csv
from optuna import Trial
from typing import Dict, Union, Any
import os
import sys
# notebook_login()

# Raise the csv module's per-field size limit to 500 MiB.
csv.field_size_limit(500 * 1024 * 1024)

# A bare `CUDA_LAUNCH_BLOCKING=1` only binds a Python name; the setting has to be
# exported through the process environment to have any effect. The module-level
# name is kept for backward compatibility.
# NOTE(review): this assumes no CUDA call has happened yet in this kernel --
# the variable is expected to be read when CUDA is initialised; confirm.
CUDA_LAUNCH_BLOCKING = 1
os.environ["CUDA_LAUNCH_BLOCKING"] = str(CUDA_LAUNCH_BLOCKING)

# Absolute path of the current working directory.
notebook_path = os.path.abspath('')

# Locate the first path component whose name contains 'CommitFit' and keep the
# path up to and including that component.
commit_fit_path = None
for part in notebook_path.split(os.sep):
    if 'CommitFit' in part:
        commit_fit_path = notebook_path.split(part)[0] + part
        break

if commit_fit_path is None:
    raise ValueError(
        f"No path component containing 'CommitFit' found in notebook path {notebook_path!r}."
    )

# Add the CommitFit directory to the Python path so that modules from the
# commitfit folder can be imported directly.
if commit_fit_path not in sys.path:
    sys.path.append(commit_fit_path)

from commitfit import CommitFitModel, CommitFitTrainer 


CommitFit


In [3]:
# Read the commit dataset (first CSV column is used as the index) and replace
# every missing value with an empty string.
df = pd.read_csv(
    r'../../dataset/dataset.csv',
    index_col=0,
    encoding='utf_8_sig',
).fillna('')

# df = df.replace({"3_labels": label2id})
# # print(df)
# test_sample = df.sample(n=3, random_state=1)

In [4]:
df

Unnamed: 0,label,message,diff
0,negative,Merge pull request #46 from rufferson/saslx-tl...,diff --git a/lib/DJabberd.pm b/lib/DJabberd.pm...
1,positive,Fix leaks in kadmin server stubs [CVE-2015-863...,diff --git a/src/kadmin/server/server_stubs.c ...
2,positive,Validate authorization request on approval\n\n...,diff --git a/spring-security-oauth2/src/main/j...
3,positive,Release 2.72.4+171110,diff --git a/application/config/version.php b/...
4,negative,Fixing compiler warnings.,diff --git a/src/main.c b/src/main.c\nindex 50...
...,...,...,...
10107,negative,les: remove useless protocol defines (#22115)\...,diff --git a/les/benchmark.go b/les/benchmark....
10108,positive,Merge pull request #2067 from realm/tg-swift-l...,diff --git a/CHANGELOG.md b/CHANGELOG.md\ninde...
10109,negative,[fix] 新規アカウント作成ページの翻訳を修正,diff --git a/app/locales/ja_JP/LC_MESSAGES/mes...
10110,negative,Merge branch 'hotfixes',diff --git a/CHANGELOG.txt b/CHANGELOG.txt\nin...


In [5]:
# Expose the commit message under the column name 'text'.
df = df.rename(columns={'message': 'text'})

# Label-stratified split that holds out 99.8% of the rows as the test set;
# the small remainder is the few-shot training set.
train, test = train_test_split(
    df,
    test_size=0.998,
    random_state=1,
    stratify=df['label'],
)

In [6]:
from commitfit import get_templated_dataset,sample_dataset
from datasets import Dataset, load_metric

In [7]:
# train = df.rename(columns={'3_labels':'label','comment':'text'})

In [8]:
# Convert both pandas splits into `Dataset` objects.
# NOTE(review): the recorded dataset features include '__index_level_0__',
# presumably the pandas index carried over by from_pandas -- confirm it is wanted.
Dataset_train, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train, test)
)


In [9]:
# Build the final training set from the few-shot split.
# NOTE(review): presumably this appends `sample_size` templated examples per
# candidate label (recorded row count goes from 20 to 36) -- confirm against commitfit.
train_dataset = get_templated_dataset(
    Dataset_train,
    sample_size=8,
    candidate_labels=['negative', 'positive'],
)

In [10]:
# list(train['message'].astype(str).values)
train_dataset

Dataset({
    features: ['label', 'text', 'diff', '__index_level_0__'],
    num_rows: 36
})

In [11]:
len(train)

20

In [12]:
# encoded_train = tokenizer(train_dataset['text'].astype(str).to_list(), return_tensors='pt',truncation=True, padding=True)
# print(encoded_train["input_ids"].shape)
# encoded_test = tokenizer(test['comment'].astype(str).to_list(), return_tensors='pt',truncation=True, padding=True)
# print(encoded_test["input_ids"].shape)
# # encoded_val = tokenizer(val['comment'].astype(str).to_list(), return_tensors='pt',truncation=True, padding='max_length')

In [13]:
# encoded_train

In [14]:
train['label'].value_counts()

label
negative    13
positive     7
Name: count, dtype: int64

In [15]:
test['label'].value_counts()

label
negative    6334
positive    3758
Name: count, dtype: int64

In [16]:
# huggingface-cli login
# train_dataset = CommitDataset(encoded_train, list(train['3_labels']))
# test_dataset = CommitDataset(encoded_test, list(test['3_labels']))
# val_dataset = CommitDataset(encoded_val, list(val['label']))

In [17]:
len(train_dataset)

36

In [18]:
from sklearn import  metrics

def compute_metrics(y_pred, y_test):
    """Score predictions against reference labels.

    Args:
        y_pred: Predicted labels.
        y_test: Reference labels, aligned element-wise with ``y_pred``.

    Returns:
        Dict with the keys "precision", "recall" and "f1" (each computed with
        ``average='weighted'``) and "accuracy".
    """
    # One call yields precision, recall and F1 together instead of three
    # separate passes over the labels. zero_division=0 produces the same
    # scores as sklearn's default ("warn" scores the undefined case as 0) but
    # without the warning emitted whenever a class is never predicted.
    precision, recall, f1, _support = metrics.precision_recall_fscore_support(
        y_test,
        y_pred,
        average='weighted',
        zero_division=0,
    )
    accuracy = metrics.accuracy_score(y_test, y_pred)

    return {"precision": precision, "recall": recall, "f1": f1, "accuracy": accuracy}

In [19]:
train_dataset

Dataset({
    features: ['label', 'text', 'diff', '__index_level_0__'],
    num_rows: 36
})

In [20]:
model_id = "../../sentence-transformers/paraphrase-mpnet-base-v2"

In [21]:
def hp_space(trial: Trial) -> Dict[str, Union[float, int, str]]:
    """Describe the hyperparameter search space for one Optuna trial.

    Only ``learning_rate`` is searched, sampled on a log scale between 1e-6
    and 1e-3. Other candidate knobs that are currently not searched:
    num_epochs, batch_size, seed, num_iterations, solver.

    Args:
        trial: The trial object used to draw the suggestion.

    Returns:
        Mapping from hyperparameter name to the value suggested for this trial.
    """
    search_space = {}
    search_space["learning_rate"] = trial.suggest_float(
        "learning_rate", 1e-6, 1e-3, log=True
    )
    return search_space
def model_init(params: Dict[str, Any]) -> CommitFitModel:
    """Instantiate a CommitFitModel from the checkpoint at ``model_id``.

    Args:
        params: Keyword arguments forwarded to
            ``CommitFitModel.from_pretrained``. A falsy value (``None`` or an
            empty dict) means no extra arguments are forwarded.

    Returns:
        The model returned by ``CommitFitModel.from_pretrained``.
    """
    extra_kwargs = params if params else {}
    return CommitFitModel.from_pretrained(model_id, **extra_kwargs)
def my_compute_objective(metrics):
    """Return the value to optimise for a trial: the 'accuracy' entry.

    The full metrics mapping is printed first so each trial's scores show up
    in the cell output.

    Args:
        metrics: Mapping of metric name to value; must contain 'accuracy'.
    """
    print('+++++++++++', metrics)
    objective = metrics['accuracy']
    return objective

In [22]:
# Trainer that obtains its model through `model_init` and scores it with
# `compute_metrics` on `test_dataset`.
trainer = CommitFitTrainer(
    model_init=model_init,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    metric=compute_metrics,
    num_iterations=20,
    num_epochs=1,
)

# Run 10 trials over `hp_space`, maximising the value returned by
# `my_compute_objective` (the accuracy).
# NOTE(review): trials are scored on `test_dataset`, so the hyperparameters are
# selected on the same split that is used for evaluation -- confirm this is intended.
best_run = trainer.hyperparameter_search(
    hp_space=hp_space,
    compute_objective=my_compute_objective,
    direction="maximize",
    n_trials=10,
)

model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
[I 2024-05-26 11:20:18,830] A new study created in memory with name: no-name-e99647a2-61b3-48b1-8e2a-86716e0e79bd
Trial: {'learning_rate': 5.059337650568091e-06}
model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 1440
  Num epochs = 1
  Total optimization steps = 90
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

***** Running evaluation *****
[I 2024-05-26 11:20:45,683] Trial 0 finished with value: 0.7247324613555292 and parameters: {'learning_rate': 5.059337650568091e-06}. Best is trial 0 with value: 0.7247324613555292.
Trial: {'learning_rate': 1.196663788419191e-06}


+++++++++++ {'precision': 0.7181117400627292, 'recall': 0.7247324613555292, 'f1': 0.7165664071721723, 'accuracy': 0.7247324613555292}


model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 1440
  Num epochs = 1
  Total optimization steps = 90
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

***** Running evaluation *****
[I 2024-05-26 11:21:10,643] Trial 1 finished with value: 0.7161117717003567 and parameters: {'learning_rate': 1.196663788419191e-06}. Best is trial 0 with value: 0.7247324613555292.
Trial: {'learning_rate': 3.388396177913298e-06}


+++++++++++ {'precision': 0.7090504107682021, 'recall': 0.7161117717003567, 'f1': 0.7028691274090092, 'accuracy': 0.7161117717003567}


model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 1440
  Num epochs = 1
  Total optimization steps = 90
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

***** Running evaluation *****
[I 2024-05-26 11:21:35,731] Trial 2 finished with value: 0.7242370194213238 and parameters: {'learning_rate': 3.388396177913298e-06}. Best is trial 0 with value: 0.7247324613555292.
Trial: {'learning_rate': 2.9404642598088573e-06}


+++++++++++ {'precision': 0.717544900072075, 'recall': 0.7242370194213238, 'f1': 0.7149505537383545, 'accuracy': 0.7242370194213238}


model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 1440
  Num epochs = 1
  Total optimization steps = 90
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

***** Running evaluation *****
[I 2024-05-26 11:22:00,888] Trial 3 finished with value: 0.7241379310344828 and parameters: {'learning_rate': 2.9404642598088573e-06}. Best is trial 0 with value: 0.7247324613555292.
Trial: {'learning_rate': 6.555203848014364e-05}


+++++++++++ {'precision': 0.7174720436853581, 'recall': 0.7241379310344828, 'f1': 0.714267101554392, 'accuracy': 0.7241379310344828}


model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 1440
  Num epochs = 1
  Total optimization steps = 90
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

***** Running evaluation *****
  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-05-26 11:22:25,824] Trial 4 finished with value: 0.716607213634562 and parameters: {'learning_rate': 6.555203848014364e-05}. Best is trial 0 with value: 0.7247324613555292.
Trial: {'learning_rate': 0.00015458847420114763}


+++++++++++ {'precision': 0.7192959265246652, 'recall': 0.716607213634562, 'f1': 0.6921592206374233, 'accuracy': 0.716607213634562}


model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 1440
  Num epochs = 1
  Total optimization steps = 90
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

***** Running evaluation *****
  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-05-26 11:22:50,723] Trial 5 finished with value: 0.6644867221561633 and parameters: {'learning_rate': 0.00015458847420114763}. Best is trial 0 with value: 0.7247324613555292.
Trial: {'learning_rate': 1.3894839202112912e-06}


+++++++++++ {'precision': 0.6602293853002592, 'recall': 0.6644867221561633, 'f1': 0.6602812507727572, 'accuracy': 0.6644867221561633}


model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 1440
  Num epochs = 1
  Total optimization steps = 90
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

***** Running evaluation *****
[I 2024-05-26 11:23:15,679] Trial 6 finished with value: 0.7171026555687673 and parameters: {'learning_rate': 1.3894839202112912e-06}. Best is trial 0 with value: 0.7247324613555292.
Trial: {'learning_rate': 0.0008497248649187532}


+++++++++++ {'precision': 0.7101333929402251, 'recall': 0.7171026555687673, 'f1': 0.7040709085117267, 'accuracy': 0.7171026555687673}


model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 1440
  Num epochs = 1
  Total optimization steps = 90
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

***** Running evaluation *****
  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-05-26 11:23:40,692] Trial 7 finished with value: 0.6276258422512881 and parameters: {'learning_rate': 0.0008497248649187532}. Best is trial 0 with value: 0.7247324613555292.
Trial: {'learning_rate': 4.4256580984478515e-05}


+++++++++++ {'precision': 0.39391419786163884, 'recall': 0.6276258422512881, 'f1': 0.48403532020207707, 'accuracy': 0.6276258422512881}


model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 1440
  Num epochs = 1
  Total optimization steps = 90
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

***** Running evaluation *****
  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-05-26 11:24:05,735] Trial 8 finished with value: 0.7412802219579865 and parameters: {'learning_rate': 4.4256580984478515e-05}. Best is trial 8 with value: 0.7412802219579865.
Trial: {'learning_rate': 5.220427571339817e-06}


+++++++++++ {'precision': 0.7412983654766339, 'recall': 0.7412802219579865, 'f1': 0.7257452589843876, 'accuracy': 0.7412802219579865}


model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 1440
  Num epochs = 1
  Total optimization steps = 90
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

***** Running evaluation *****
[I 2024-05-26 11:24:30,707] Trial 9 finished with value: 0.7239397542608006 and parameters: {'learning_rate': 5.220427571339817e-06}. Best is trial 8 with value: 0.7412802219579865.


+++++++++++ {'precision': 0.717238958434173, 'recall': 0.7239397542608006, 'f1': 0.7155358827579565, 'accuracy': 0.7239397542608006}


In [23]:
best_run

BestRun(run_id='8', objective=0.7412802219579865, hyperparameters={'learning_rate': 4.4256580984478515e-05}, backend=<optuna.study.study.Study object at 0x7f10d6849cd0>)

In [24]:
best_run.hyperparameters

{'learning_rate': 4.4256580984478515e-05}

In [25]:
# Apply the best trial's hyperparameters to the trainer, then train.
best_hyperparameters = best_run.hyperparameters
trainer.apply_hyperparameters(best_hyperparameters, final_model=True)
trainer.train()

model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 1440
  Num epochs = 1
  Total optimization steps = 90
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

In [26]:
# Persist the training split without its index column.
train.to_csv(
    'train.csv',
    encoding='utf_8_sig',
    index=False,
)

In [27]:
# Persist the test split without its index column.
test.to_csv(
    'test.csv',
    encoding='utf_8_sig',
    index=False,
)