In [1]:
import sys
import os
import evaluate
import pandas as pd
import numpy as np
from pathlib import Path

# Put the parent of the current working directory on the import path so
# packages that live one level above the notebook can be imported.
# The membership check keeps the cell idempotent: re-running it no longer
# appends the same entry again.
_parent_dir = str(Path.cwd().parent)
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [9]:
DATASET_PATH = "../1K_issues.gzip"

# Read the gzip-compressed CSV and discard every row that has a missing value.
# The compression is passed explicitly and only "\n" is treated as a line
# terminator.
dataset = (
    pd.read_csv(DATASET_PATH, compression="gzip", lineterminator="\n")
    .dropna()
)

In [10]:

# Preview the first five rows of the cleaned frame.
dataset.head(n=5)

Unnamed: 0,identifier,assignee,summary,body,creation_time,completion_time,extra_data
3,229807,alexdima,Fix editing session lifecycle issues,<!-- Thank you for submitting a Pull Request. ...,2024-09-26 09:38:28+00:00,2024-09-26 09:57:45+00:00,{}
4,229806,aiday-mar,Using different backgrounds on composition ins...,in relation to https://github.com/microsoft/vs...,2024-09-26 09:37:04+00:00,2024-09-26 09:55:59+00:00,{}
5,229805,aeschli,Cancelling application of edits results in a d...,Fixes https://github.com/microsoft/vscode-copi...,2024-09-26 09:28:12+00:00,2024-09-26 09:47:11+00:00,{}
8,229802,jrieken,"Revert ""Revert ""chat command center polish (#2...",This reverts commit 11fb5bbfdb1a8926b202983465...,2024-09-26 08:52:25+00:00,2024-09-26 09:11:36+00:00,{}
17,229793,aeschli,When i paste code it keeps auto closing the fi...,"When I paste a perfectly good chunk of code, t...",2024-09-26 06:36:32+00:00,2024-09-26 09:05:34+00:00,{}


In [4]:
# Forward mapping: label string -> integer class id.
label_as_id = {"irr": 0, "pbr": 1, "inq": 2}
# Reverse mapping, derived from the forward one so the two cannot drift apart.
id_as_label = {class_id: name for name, class_id in label_as_id.items()}


def parse_label_to_id(label):
    """Return the integer class id registered for ``label``.

    Raises ``KeyError`` when ``label`` is not a key of ``label_as_id``.
    """
    class_id = label_as_id[label]
    return class_id

In [2]:
from transformers import AutoTokenizer, DataCollatorWithPadding
from datasets import Dataset

# Load the DistilBERT tokenizer once and bind the same instance to both
# `distilbert_tokenizer` and the generic `tokenizer` name.
tokenizer = distilbert_tokenizer = AutoTokenizer.from_pretrained(
    "distilbert/distilbert-base-uncased"
)

In [7]:
def process_df(df, max_length=512):
    """Turn a labelled DataFrame into a tokenized Hugging Face ``Dataset``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``category`` column whose values are keys of ``label_as_id``
        and a ``text`` column with the strings to tokenize.
    max_length : int, default 512
        Upper bound handed to the tokenizer; longer inputs are truncated.

    Returns
    -------
    datasets.Dataset
        Holds the ``label`` and ``text`` columns plus the fields returned by
        the module-level ``tokenizer``.

    Raises
    ------
    KeyError
        If a required column is missing or a ``category`` value is not a
        known label.
    """
    def process_data_entry(entries):
        # Invoked with batched=True, so `entries` holds whole columns of a batch.
        return tokenizer(entries["text"], max_length=max_length, truncation=True)

    # Build the two columns directly instead of a row-wise df.apply(axis=1):
    # the per-row dict construction is avoided and an empty input still yields
    # a frame that has both `label` and `text` columns.
    clean_df = pd.DataFrame(
        {
            "label": df["category"].map(parse_label_to_id),
            "text": df["text"],
        }
    )
    df_as_ds = Dataset.from_pandas(clean_df)
    return df_as_ds.map(process_data_entry, batched=True)

# NOTE(review): `train_df` and `test_df` are not defined in any cell visible
# in this notebook — presumably they come from a removed or out-of-order cell;
# confirm before relying on Restart & Run All.
tokenized_train, tokenized_test = map(process_df, (train_df, test_df))

# Collator built from the same tokenizer that produced the encodings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/15079 [00:00<?, ? examples/s]

Map:   0%|          | 0/1677 [00:00<?, ? examples/s]

In [8]:
accuracy_evaluator = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass with the ``accuracy`` metric.

    ``eval_pred`` is unpacked into ``(predictions, labels)``. The predicted
    class is the arg-max along axis 1 of the predictions (presumably one row
    of class scores per example — confirm against the Trainer output).
    Returns whatever ``accuracy_evaluator.compute`` returns.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy_evaluator.compute(
        predictions=predicted_ids,
        references=references,
    )

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# DistilBERT with a sequence-classification head. The number of classes is
# taken from the label map so it always agrees with id2label / label2id.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=len(label_as_id),
    id2label=id_as_label,
    label2id=label_as_id,
)

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` and Trainer's `tokenizer` argument to `processing_class` —
# confirm against the installed version before changing them.
training_args = TrainingArguments(
    output_dir="./test_model",
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Evaluate and checkpoint on the same per-epoch schedule, alongside
    # load_best_model_at_end.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [None]:
import os
import torch

# Release unused cached GPU memory before training starts.
torch.cuda.empty_cache()

# Optional CUDA allocator settings, left disabled:
# os.environ["PYTORCH_NO_CUDA_MEMORY_CACHING"] = "1"
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32,garbage_collection_threshold:0.8"

trainer.train()

# Save into the directory the inference cell loads from
# ("../models/simple_feedback_classifier"). The previous target,
# "simple_feedback_classifier" in the working directory, did not match that
# path, so a fresh Restart & Run All could not find the trained model.
trainer.save_model("../models/simple_feedback_classifier")

In [8]:
from transformers import pipeline

# Build a text-classification pipeline from the saved model directory,
# reusing the notebook's tokenizer, and classify one sample sentence.
sample_review = "I can't access one of my recorded trips"
classifier = pipeline(
    task="text-classification",
    model="../models/simple_feedback_classifier",
    tokenizer=tokenizer,
)
result = classifier(sample_review)

# The last expression is the cell output: the label of the first prediction.
result[0]["label"]

'inq'

In [4]:
from deeperMatcher.src.user_feedback.classifier.hf_model_based_classifier.hf_model_based_classifier import HF_Model_Based_Classifier 

# Classify the same sample sentence with the project's wrapper class.
wrapper_sample = "I can't access one of my recorded trips"
new_classifier = HF_Model_Based_Classifier()
new_classifier.classify_review(wrapper_sample)

'bug_report'