In [1]:
# !pip install transformers accelerate
from transformers import AutoTokenizer, AutoModel, EarlyStoppingCallback, AutoModelForSequenceClassification, AutoConfig,Trainer, TrainingArguments,DataCollatorWithPadding
import torch
import numpy as np
import pandas as pd
# !pip install datasets
from datasets import load_metric
# %pip install evaluate
from evaluate import evaluator
from sklearn.model_selection import train_test_split
# from ipywidgets import FloatProgress
import csv
from optuna import Trial
from typing import Dict, Union, Any
import os
import sys
# notebook_login()

csv.field_size_limit(500 * 1024 * 1024)
CUDA_LAUNCH_BLOCKING=1
# get pwd
notebook_path = os.path.abspath('')

# Find the part of the path that contains 'commitFit'
commit_fit_path = None
for part in notebook_path.split(os.sep):
    print(part)
    if 'CommitFit' in part:
        commit_fit_path = notebook_path.split(part)[0] + part
        break

if commit_fit_path is None:
    raise ValueError("Path containing 'commitFit' not found in notebook path.")

# Add commitFit directory to Python path, so we can import moudule from commitfit folder directly
if commit_fit_path not in sys.path:
    sys.path.append(commit_fit_path)

from commitfit import CommitFitModel, CommitFitTrainer 


CommitFit


In [2]:
df = pd.read_csv(r'../../dataset/dataset.csv', index_col=0, encoding='utf_8_sig')
df.fillna('', inplace=True)

In [3]:
df

Unnamed: 0,label,message,diff
0,negative,Merge pull request #46 from rufferson/saslx-tl...,diff --git a/lib/DJabberd.pm b/lib/DJabberd.pm...
1,positive,Fix leaks in kadmin server stubs [CVE-2015-863...,diff --git a/src/kadmin/server/server_stubs.c ...
2,positive,Validate authorization request on approval\n\n...,diff --git a/spring-security-oauth2/src/main/j...
3,positive,Release 2.72.4+171110,diff --git a/application/config/version.php b/...
4,negative,Fixing compiler warnings.,diff --git a/src/main.c b/src/main.c\nindex 50...
...,...,...,...
10107,negative,les: remove useless protocol defines (#22115)\...,diff --git a/les/benchmark.go b/les/benchmark....
10108,positive,Merge pull request #2067 from realm/tg-swift-l...,diff --git a/CHANGELOG.md b/CHANGELOG.md\ninde...
10109,negative,[fix] 新規アカウント作成ページの翻訳を修正,diff --git a/app/locales/ja_JP/LC_MESSAGES/mes...
10110,negative,Merge branch 'hotfixes',diff --git a/CHANGELOG.txt b/CHANGELOG.txt\nin...


In [4]:
df = df.rename(columns={'message':'text'})
train, test = train_test_split(df,test_size=0.96,stratify=df['label'],random_state=1)

In [5]:
from commitfit import get_templated_dataset,sample_dataset
from datasets import Dataset, load_metric

In [6]:
# train = df.rename(columns={'3_labels':'label','comment':'text'})

In [7]:
Dataset_train = Dataset.from_pandas(train)
test_dataset = Dataset.from_pandas(test)


In [8]:
train_dataset = get_templated_dataset(Dataset_train, candidate_labels=['negative','positive'], sample_size=8)

In [9]:
# list(train['message'].astype(str).values)
train_dataset

Dataset({
    features: ['label', 'text', 'diff', '__index_level_0__'],
    num_rows: 420
})

In [10]:
len(train)

404

In [11]:
# encoded_train

In [12]:
train['label'].value_counts()

label
negative    254
positive    150
Name: count, dtype: int64

In [13]:
test['label'].value_counts()

label
negative    6093
positive    3615
Name: count, dtype: int64

In [14]:
# huggingface-cli login
# train_dataset = CommitDataset(encoded_train, list(train['3_labels']))
# test_dataset = CommitDataset(encoded_test, list(test['3_labels']))
# val_dataset = CommitDataset(encoded_val, list(val['label']))

In [15]:
len(train_dataset)

420

In [16]:
from sklearn import  metrics

def compute_metrics(y_pred, y_test):
    # print(y_pred,y_test)
    # classification_report = metrics.classification_report(y_test,y_pred)
    accuracy_score = metrics.accuracy_score(y_test,y_pred)
    precision_score = metrics.precision_score(y_test,y_pred, average='weighted')
    recall_score = metrics.recall_score(y_test,y_pred,average='weighted')
    f1_score = metrics.f1_score(y_test,y_pred,average='weighted')

    # return {"classification_report": classification_report}
    return {"accuracy": accuracy_score, "precision": precision_score,"recall": recall_score, "f1":f1_score}

In [17]:
train_dataset

Dataset({
    features: ['label', 'text', 'diff', '__index_level_0__'],
    num_rows: 420
})

In [18]:
model_id = r"../../sentence-transformers/paraphrase-mpnet-base-v2"

In [19]:
# from commitfit import CommitFitModel
model = CommitFitModel.from_pretrained(model_id)

model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [20]:
def hp_space(trial: Trial) -> Dict[str, Union[float, int, str]]:
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-3, log=True),
        # "num_epochs": trial.suggest_int("num_epochs", 1, 3),
        # "batch_size": trial.suggest_categorical("batch_size", [8, 12, 16]),
        # "seed": trial.suggest_int("seed", 1, 40),
        # "num_iterations": trial.suggest_int("num_iterations", 10, 20),
        # "solver": trial.suggest_categorical("solver", ["newton-cg", "lbfgs", "liblinear"]),
    }

In [21]:
def model_init(params: Dict[str, Any]) -> CommitFitModel:
    params = params or {}
    # learning_rate = params.get("learning_rate")
    # num_iterations = params.get("num_iterations", 20)
    # solver = params.get("solver", "liblinear")
    # params = {
    #     "head_params": {
    #         # "max_iter": num_iterations,
    #         # "solver": solver,
    #     }
    # }
    return CommitFitModel.from_pretrained(model_id, **params)

In [22]:
def my_compute_objective(metrics):
    print('+++++++++++',metrics)
    return  metrics['accuracy']

In [23]:
trainer = CommitFitTrainer(
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    model_init=model_init,
    # model=model,
    metric = compute_metrics,
    num_iterations=20,
    num_epochs=1
)
best_run = trainer.hyperparameter_search(direction="maximize", hp_space=hp_space, compute_objective=my_compute_objective, n_trials=3)

model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
[I 2024-05-28 11:06:13,953] A new study created in memory with name: no-name-3d6e30bf-4cdc-4bb4-bd84-9f2fdf96881a
Trial: {'learning_rate': 1.4644522712624011e-05}
model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 16800
  Num epochs = 1
  Total optimization steps = 1050
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1050 [00:00<?, ?it/s]

***** Running evaluation *****
[I 2024-05-28 11:09:28,006] Trial 0 finished with value: 0.8296250515039143 and parameters: {'learning_rate': 1.4644522712624011e-05}. Best is trial 0 with value: 0.8296250515039143.
Trial: {'learning_rate': 0.0008543639630258854}


+++++++++++ {'accuracy': 0.8296250515039143, 'precision': 0.8299042608721662, 'recall': 0.8296250515039143, 'f1': 0.829757501134196}


model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 16800
  Num epochs = 1
  Total optimization steps = 1050
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1050 [00:00<?, ?it/s]

***** Running evaluation *****
  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-05-28 11:12:40,040] Trial 1 finished with value: 0.6276266996291718 and parameters: {'learning_rate': 0.0008543639630258854}. Best is trial 0 with value: 0.8296250515039143.
Trial: {'learning_rate': 1.2017285636660746e-06}


+++++++++++ {'accuracy': 0.6276266996291718, 'precision': 0.3939152740874067, 'recall': 0.6276266996291718, 'f1': 0.4840363876767982}


model_head.pkl not found in /CommitFit/sentence-transformers/paraphrase-mpnet-base-v2, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 16800
  Num epochs = 1
  Total optimization steps = 1050
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1050 [00:00<?, ?it/s]

***** Running evaluation *****
[I 2024-05-28 11:15:52,648] Trial 2 finished with value: 0.8078903996703749 and parameters: {'learning_rate': 1.2017285636660746e-06}. Best is trial 0 with value: 0.8296250515039143.


+++++++++++ {'accuracy': 0.8078903996703749, 'precision': 0.8060609922365986, 'recall': 0.8078903996703749, 'f1': 0.8064674903144499}


In [31]:
best_run

BestRun(run_id='0', objective=0.8296250515039143, hyperparameters={'learning_rate': 1.4644522712624011e-05}, backend=<optuna.study.study.Study object at 0x7f560de7a750>)

In [24]:
fewshot_metrics = trainer.evaluate()
fewshot_metrics

***** Running evaluation *****


{'accuracy': 0.8078903996703749,
 'precision': 0.8060609922365986,
 'recall': 0.8078903996703749,
 'f1': 0.8064674903144499}

In [25]:
import ipynbname
nb_fname = ipynbname.name()
print(nb_fname)

O-2-150


In [26]:
# store the checkpoints
# trainer.model.save_pretrained(nb_fname)

In [27]:
print(best_run)

BestRun(run_id='0', objective=0.8296250515039143, hyperparameters={'learning_rate': 1.4644522712624011e-05}, backend=<optuna.study.study.Study object at 0x7f560de7a750>)


In [28]:
# import os
# os.system('shutdown')

In [29]:
train.to_csv('train.csv',encoding='utf_8_sig')

In [30]:
test.to_csv('test.csv',encoding='utf_8_sig')