In [None]:
%pip install scikit-learn
%pip install datasets
%pip install transformers
%pip install transformers[torch]
%pip install evaluate
%pip install pandas==2.0.3
%pip install torch
%pip install jobli
%pip install tqdm
%pip install progressbar

In [None]:
# this cell is required for running in google collab VM
import os
if os.getenv("COLAB_RELEASE_TAG"):
    print("Running in Colab")
    import sys
    from google.colab import drive
    drive.mount('/content/drive/')
    sys.path.append('/content/drive/')
    %cd /content/drive/MyDrive/Faks/research_uiktp
else:
   print("NOT in Colab")

In [3]:
import joblib
import torch
import sklearn
import evaluate
import numpy as np
import pandas as pd
from train_model import softmax, validate, predict_durations_for_tokenized_tensor_inputs as run_prediction
from get_task_durations import plot_durations_histogram
from data_utils import rename_columns, get_global_constants
from datasets import DatasetDict, Dataset
from make_dataset import split_dataset
from transformers import BertTokenizer, BertModel, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [None]:
# Shared settings object built by data_utils.get_global_constants(); later cells
# read CSV_DATASET_PATH, DEVICE, DEVICE_STRING and MODEL_PATH from it.
GLOBAL_CONSTANTS = get_global_constants()

In [None]:
# This cell is required only when the processed dataset is not already saved at the path provided in GLOBAL_CONSTANTS.
# NOTE(review): presumably get_jira_tasks() also writes the CSV that the following cell reads — confirm in make_dataset.
from make_dataset import get_jira_tasks
generated_dataframe = get_jira_tasks()
print(generated_dataframe)

In [None]:
# Read the processed CSV and run it through rename_columns in a single expression.
# The histogram below is drawn over the 'label' column of the resulting frame.
dataframe = rename_columns(pd.read_csv(GLOBAL_CONSTANTS.CSV_DATASET_PATH))
print(dataframe)
plot_durations_histogram(dataframe, column_name='label')

In [None]:
# Split the frame 80% / 10% / 10% into train, test and validation parts.
train_set, test_set, validation_set = split_dataset(
    dataframe,
    train_set_length=.8,
    test_set_length=.1,
    validation_set_length=.1,
    axis=0,
)
# One print call with a newline separator emits the same text as three separate prints.
print(train_set, test_set, validation_set, sep="\n")

In [None]:
# Build one Dataset per split from the column -> list mapping returned by
# `to_dict('list')`, and collect them under the keys "train", "test" and "validation".
dataset = DatasetDict({
    split_name: Dataset.from_dict(frame.to_dict('list'))
    for split_name, frame in (
        ("train", train_set),
        ("test", test_set),
        ("validation", validation_set),
    )
})
print(dataset)

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(jira_tasks, column_name="text"):
    """Tokenize one column of ``jira_tasks`` with the module-level ``tokenizer``.

    Args:
        jira_tasks: Mapping-like object indexed by column name (it is passed
            by ``dataset.map`` in the cell that uses this function).
        column_name: Key of the column to tokenize; defaults to ``"text"``.

    Returns:
        Whatever the tokenizer returns for that column, with truncation
        enabled and padding set to ``"max_length"``.
    """
    return tokenizer(jira_tasks[column_name], truncation=True, padding="max_length")

# Tokenize every split in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# NOTE: train_set / test_set / validation_set are rebound here, replacing the
# objects returned by split_dataset with their tokenized counterparts.
train_set, test_set, validation_set = (
    tokenized_datasets[split_name] for split_name in ("train", "test", "validation")
)
print(train_set, test_set, validation_set, sep="\n")

In [None]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score a ``(logits, labels)`` pair with the module-level accuracy metric.

    Args:
        eval_pred: Two-item iterable unpacked as ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class of each row
        of logits (taken over the last axis) against ``labels``.
    """
    class_scores, references = eval_pred
    predicted_classes = np.argmax(class_scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

In [None]:
# Sequence-classification head with 11 output classes; the same num_labels has
# to be used wherever the saved weights are loaded back into a fresh model.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=11)

model.to(GLOBAL_CONSTANTS.DEVICE)

# transformers renamed the `evaluation_strategy` argument to `eval_strategy`.
# Because the install cell does not pin transformers, pick whichever field the
# installed TrainingArguments dataclass actually declares so this cell runs on
# both older and newer releases.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)
training_args = TrainingArguments(
    output_dir="training_logs",
    **{_eval_strategy_kwarg: "epoch"},  # evaluate once per epoch
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=test_set,  # per-epoch evaluation runs on the "test" split
    compute_metrics=compute_metrics
)
trainer.train()
print("Training finished")

In [None]:
# Persist only the model's state_dict (not the model object itself) to GLOBAL_CONSTANTS.MODEL_PATH.
torch.save(model.state_dict(), GLOBAL_CONSTANTS.MODEL_PATH)
print("Model serialized")

In [None]:
# Instantiate the same architecture used for training (bert-base-cased, 11 labels),
# then overwrite its weights with the state_dict stored at MODEL_PATH.
loaded_model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=11,
)
loaded_model.load_state_dict(
    torch.load(
        GLOBAL_CONSTANTS.MODEL_PATH,
        map_location=GLOBAL_CONSTANTS.DEVICE_STRING,
    )
)
print("Model loaded")

In [None]:
# Switch the reloaded model to evaluation mode before predicting.
loaded_model.eval()
# Currently unused: every column below is taken in full. Append
# [:VALIDATION_SAMPLE_SIZE] to each column read to score only a subset.
VALIDATION_SAMPLE_SIZE = 16
input_text = validation_set['text']
true_durations = validation_set['label']
# Token ids and attention masks are converted to tensors straight from the split.
input_ids = torch.tensor(validation_set['input_ids'])
input_masks = torch.tensor(validation_set['attention_mask'])
print("Input sample (of type {}): {}".format(type(input_text), input_text))
print("Input ids (of type {}): {}".format(type(input_ids), input_ids))
print("Input masks (of type {}): {}".format(type(input_masks), input_masks))
predicted_duration = run_prediction(loaded_model, input_ids, input_masks)
print("Predicted duration: {}".format(predicted_duration))
print("Expected durations: {}".format(true_durations))

In [None]:
# Score the predictions from the previous cell against the true labels using
# validate() from train_model, and show the resulting metrics.
metrics = validate(true_durations, predicted_duration)
print(metrics)