In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# !pip install transformers accelerate
from transformers import AutoTokenizer, AutoModel, EarlyStoppingCallback, AutoModelForSequenceClassification, AutoConfig,Trainer, TrainingArguments,DataCollatorWithPadding
import torch
import numpy as np
import pandas as pd
# !pip install datasets
from datasets import load_metric
# !pip install evaluate
from evaluate import evaluator
from sklearn.model_selection import train_test_split
import csv


# CUDA_LAUNCH_BLOCKING=1


In [3]:
label2id = {'negative':0,'positive':1}

train = pd.read_csv(r'/content/drive/MyDrive/train_50.csv', encoding='utf_8_sig')
train = train.replace({"label": label2id})
train.fillna('', inplace=True)
test = pd.read_csv(r'/content/drive/MyDrive/test_50.csv', encoding='utf_8_sig')
test = test.replace({"label": label2id})
test.fillna('', inplace=True)

In [4]:
train

Unnamed: 0,label,text,diff
0,0,Fix #2197 extended regex pattern example,diff --git a/docs/content/manual/manual.yml b/...
1,0,some test spec fixes,diff --git a/asset-pipeline-grails/src/test/gr...
2,1,"FILEUPLOAD-279: Introduce a system property, w...",diff --git a/.gitignore b/.gitignore\nnew file...
3,0,fix(documentation): remove route redeclaration...,diff --git a/doc/API/centreon-api-v2.yaml b/do...
4,0,[JENKINS-49744] - Generalize the permission ha...,diff --git a/src/main/java/com/synopsys/arc/je...
...,...,...,...
96,0,[maven-release-plugin] prepare release checkst...,diff --git a/pom.xml b/pom.xml\nindex 2043823....
97,0,Merge pull request #360 from joaosantos81/mast...,diff --git a/control/Session.php b/control/Ses...
98,0,Repairing: implement invalid node checksum fix.,diff --git a/libexfat/exfat.h b/libexfat/exfat...
99,1,[MJ2] Avoid index out of bounds access to pi->...,diff --git a/src/lib/openmj2/pi.c b/src/lib/op...


In [5]:
len(train)

101

In [6]:
train['label'].value_counts()

0    63
1    38
Name: label, dtype: int64

In [7]:
test['label'].value_counts()

0    6284
1    3727
Name: label, dtype: int64

In [8]:
from sklearn import  metrics

def compute_metrics(y_pred, y_test):
    # print(y_pred,y_test)
    # classification_report = metrics.classification_report(y_test,y_pred)
    accuracy_score = metrics.accuracy_score(y_test,y_pred)
    recall_score = metrics.recall_score(y_test,y_pred,average='weighted')
    f1_score = metrics.f1_score(y_test,y_pred,average='weighted')

    # return {"classification_report": classification_report}
    return {"accuracy": accuracy_score, "recall": recall_score, "f1":f1_score}

In [9]:
train

Unnamed: 0,label,text,diff
0,0,Fix #2197 extended regex pattern example,diff --git a/docs/content/manual/manual.yml b/...
1,0,some test spec fixes,diff --git a/asset-pipeline-grails/src/test/gr...
2,1,"FILEUPLOAD-279: Introduce a system property, w...",diff --git a/.gitignore b/.gitignore\nnew file...
3,0,fix(documentation): remove route redeclaration...,diff --git a/doc/API/centreon-api-v2.yaml b/do...
4,0,[JENKINS-49744] - Generalize the permission ha...,diff --git a/src/main/java/com/synopsys/arc/je...
...,...,...,...
96,0,[maven-release-plugin] prepare release checkst...,diff --git a/pom.xml b/pom.xml\nindex 2043823....
97,0,Merge pull request #360 from joaosantos81/mast...,diff --git a/control/Session.php b/control/Ses...
98,0,Repairing: implement invalid node checksum fix.,diff --git a/libexfat/exfat.h b/libexfat/exfat...
99,1,[MJ2] Avoid index out of bounds access to pi->...,diff --git a/src/lib/openmj2/pi.c b/src/lib/op...


In [10]:
# id2label = {0: "Adaptive", 1: "POSITIVE"}
# label2id = {"NEGATIVE": 0, "POSITIVE": 1}

In [11]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased", num_labels=2
)
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
training_args = TrainingArguments(
    output_dir="./my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

In [13]:
encoded_train = tokenizer(train['text'].astype(str).to_list(), return_tensors='pt',truncation=True, padding=True)
encoded_test = tokenizer(test['text'].astype(str).to_list(), return_tensors='pt',truncation=True, padding=True)

In [14]:
class CommitDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [15]:
train_dataset = CommitDataset(encoded_train, list(train['label']))
test_dataset = CommitDataset(encoded_test, list(test['label']))

In [16]:
def compute_metrics(eval_pred):
    accuracy = load_metric('accuracy')
    precision = load_metric("precision")
    recall = load_metric("recall")
    f1 = load_metric("f1")
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    accuracy = accuracy.compute(predictions=predictions, references=labels)
    precision = precision.compute(predictions=predictions, references=labels, average='weighted')
    recall = recall.compute(predictions=predictions, references=labels,average='weighted')
    f1 = f1.compute(predictions=predictions, references=labels,average='weighted')
    return {"precision": precision, "recall": recall, "f1": f1, "accuracy": accuracy}

In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    # data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.678072,{'precision': 0.5197161807560303},{'recall': 0.6057336929377685},{'f1': 0.508816476678477},{'accuracy': 0.6057336929377685}


  accuracy = load_metric('accuracy')
Trainer is attempting to log a value of "{'precision': 0.5197161807560303}" of type <class 'dict'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'recall': 0.6057336929377685}" of type <class 'dict'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.508816476678477}" of type <class 'dict'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.6057336929377685}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


TrainOutput(global_step=7, training_loss=0.7266293253217425, metrics={'train_runtime': 329.8005, 'train_samples_per_second': 0.306, 'train_steps_per_second': 0.021, 'total_flos': 19411634775720.0, 'train_loss': 0.7266293253217425, 'epoch': 1.0})