In [1]:
import sys
import os
import evaluate
import pandas as pd
import numpy as np
from pathlib import Path

sys.path.append(str(Path(os.path.abspath('')).absolute().parent))

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
TRAINING_DATASET_PATH = "../1K_issues.gzip"
TEST_DATASET_PATH = "../1K_issues.gzip"

training_dataset = pd.read_csv(TRAINING_DATASET_PATH, compression='gzip', lineterminator='\n')
training_dataset = training_dataset.dropna()

test_dataset = pd.read_csv(TEST_DATASET_PATH, compression='gzip', lineterminator='\n')
print(len(test_dataset))
test_dataset = test_dataset.dropna()
print("Size without NaNs ",len(test_dataset))

1000
Size without NaNs  471


In [3]:

training_dataset.head()

Unnamed: 0,identifier,assignee,summary,body,creation_time,completion_time,extra_data
3,229807,alexdima,Fix editing session lifecycle issues,<!-- Thank you for submitting a Pull Request. ...,2024-09-26 09:38:28+00:00,2024-09-26 09:57:45+00:00,{}
4,229806,aiday-mar,Using different backgrounds on composition ins...,in relation to https://github.com/microsoft/vs...,2024-09-26 09:37:04+00:00,2024-09-26 09:55:59+00:00,{}
5,229805,aeschli,Cancelling application of edits results in a d...,Fixes https://github.com/microsoft/vscode-copi...,2024-09-26 09:28:12+00:00,2024-09-26 09:47:11+00:00,{}
8,229802,jrieken,"Revert ""Revert ""chat command center polish (#2...",This reverts commit 11fb5bbfdb1a8926b202983465...,2024-09-26 08:52:25+00:00,2024-09-26 09:11:36+00:00,{}
17,229793,aeschli,When i paste code it keeps auto closing the fi...,"When I paste a perfectly good chunk of code, t...",2024-09-26 06:36:32+00:00,2024-09-26 09:05:34+00:00,{}


In [4]:
label_as_id = {}
id_as_label = {}

new_id = 0
for assignee in training_dataset["assignee"].unique():
	label_as_id[assignee] = new_id
	id_as_label[new_id] = assignee
	new_id+=1

def parse_label_to_id(label):
    return label_as_id[label]

# print(id_as_label)

### Importing our model and tokenizer

In [5]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer#, DataCollatorWithPadding
from datasets import Dataset

MODEL_NAME = "distilbert/distilbert-base-uncased"
CONTEXT_LENGTH = 512

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
	MODEL_NAME, num_labels=len(id_as_label), id2label =id_as_label, label2id=label_as_id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Defining our Sentinel Tokens

In [6]:
TITLE_BEGIN_SENTINEL = "<BoT>"
TITLE_END_SENTINEL = "<EoT>"
CODE_BEGIN_SENTINEL = "<BoC>"
CODE_END_SENTINEL = "<EoC>"

special_tokens_dict = {'pad_token': '[PAD]', 'additional_special_tokens': [TITLE_BEGIN_SENTINEL,TITLE_END_SENTINEL,CODE_BEGIN_SENTINEL,CODE_END_SENTINEL]}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print(tokenizer.all_special_tokens) 

model.resize_token_embeddings(len(tokenizer))

['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]', '<BoT>', '<EoT>', '<BoC>', '<EoC>']


Embedding(30526, 768, padding_idx=0)

### Preparing the Datasets

In [14]:
def create_input_text(issue_row):
	padded_title = TITLE_BEGIN_SENTINEL + issue_row["summary"] + TITLE_END_SENTINEL + "\n"
	return padded_title + issue_row["body"]

def process_row(row):
	return {"label": parse_label_to_id(row['assignee']),"text":create_input_text(row)}   

def process_dataset_entry(entries):
	entries =  tokenizer(entries["text"], max_length=CONTEXT_LENGTH, padding="max_length", truncation=True)
	# entries["label"] = entries["input_ids"].copy()	
	return entries

def process_df(df):   
	clean_df = df.apply(process_row, axis=1,result_type='expand')
	df_as_ds = Dataset.from_pandas(clean_df)
	df_as_ds = df_as_ds.remove_columns(["__index_level_0__"])

	df_as_ds = df_as_ds.filter(lambda x: len(tokenizer(x["text"])["input_ids"]) < CONTEXT_LENGTH)
	df_as_ds = df_as_ds.map(process_dataset_entry,remove_columns=["text"], batched=True, batch_size=16, num_proc=6) 
	return df_as_ds


tokenized_train = process_df(training_dataset)
tokenized_test = process_df(test_dataset)

print("Left with " + str(len(tokenized_train)) + " training entries")
print("      and " + str(len(tokenized_test)) + " test entries.")
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Filter: 100%|██████████| 471/471 [00:01<00:00, 317.43 examples/s]
Map (num_proc=6): 100%|██████████| 401/401 [00:00<00:00, 418.04 examples/s]
Filter: 100%|██████████| 471/471 [00:01<00:00, 409.74 examples/s]
Map (num_proc=6): 100%|██████████| 401/401 [00:00<00:00, 425.65 examples/s]


Left with 401 training entries
      and 401 test entries.


### Choosing an evaluation metric

In [15]:
accuracy_evaluator = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy_evaluator.compute(predictions=predictions, references=labels)

In [20]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./training_model",
    learning_rate=2e-5,
    num_train_epochs=5,
    # weight_decay=0.01,
    save_strategy="epoch",
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [21]:
import os
import torch

torch.cuda.empty_cache() 

# os.environ["PYTORCH_NO_CUDA_MEMORY_CACHING"] = "1"
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32,garbage_collection_threshold:0.8"

trainer.train()
trainer.save_model("simple_issue_classifier")

 13%|█▎        | 32/255 [11:46<1:22:00, 22.07s/it]
                                       
100%|██████████| 255/255 [1:16:31<00:00, 18.01s/it]


{'train_runtime': 4591.6029, 'train_samples_per_second': 0.437, 'train_steps_per_second': 0.056, 'train_loss': 2.940982594209559, 'epoch': 5.0}


In [25]:
from transformers import pipeline

classifier = pipeline("text-classification", model="simple_issue_classifier", tokenizer=tokenizer)
result = classifier("I can't access one of my recorded trips")

result[0]

{'label': 'lszomoru', 'score': 0.05464104562997818}