In [None]:
%pip install scikit-learn
%pip install datasets
%pip install transformers
%pip install transformers[torch]
%pip install evaluate
%pip install pandas==2.0.3
%pip install torch
%pip install joblib

In [None]:
# This cell only has an effect on a Google Colab VM: it mounts Google Drive and
# switches into the project folder. On any other kernel it is a harmless no-op,
# so "Restart & Run All" works both locally and on Colab.

import os
import sys

COLAB_DRIVE_MOUNT = '/content/drive/'
COLAB_PROJECT_DIR = '/content/drive/MyDrive/Faks/research_uiktp'

try:
    from google.colab import drive
except ImportError:
    # `google.colab` exists only inside Colab runtimes; skip instead of crashing.
    IN_COLAB = False
    print("google.colab is not available - skipping Google Drive mount")
else:
    IN_COLAB = True
    drive.mount(COLAB_DRIVE_MOUNT)

    # Guarded appends keep the cell idempotent when it is re-run.
    if COLAB_DRIVE_MOUNT not in sys.path:
        sys.path.append(COLAB_DRIVE_MOUNT)
    # Presumably the project-local modules (data_utils, make_dataset) live in the
    # project folder, so it is made importable as well - TODO confirm their location.
    if COLAB_PROJECT_DIR not in sys.path:
        sys.path.append(COLAB_PROJECT_DIR)

    # Work from the project folder so relative paths used later resolve there,
    # then show where we are and what is available.
    os.chdir(COLAB_PROJECT_DIR)
    print(os.getcwd())
    print(os.listdir())

In [3]:
import inspect

import evaluate
import joblib
import numpy as np
import pandas as pd
import sklearn
import torch
from datasets import DatasetDict, Dataset
from transformers import BertTokenizer, BertModel, AutoModelForSequenceClassification, TrainingArguments, Trainer

from data_utils import rename_columns
from make_dataset import split_dataset      # Module not found on google collab

In [None]:
# Location of the preprocessed CSV, relative to the notebook's working directory.
CSV_DATASET_PATH = "./processed_data/processed_data.csv"
# Absolute Google Drive alternative (was reported as "directory not found" on Colab):
#CSV_DATASET_PATH = "/content/drive/MyDrive/Faks/research_uiktp/processed_data/processed_data.csv"

# Read the CSV and pass it straight through the project helper `rename_columns`.
dataframe = rename_columns(pd.read_csv(CSV_DATASET_PATH))
print(dataframe)

In [None]:
# Partition the frame 80 % / 10 % / 10 % into train, test and validation parts
# using the project helper `split_dataset`.
train_set, test_set, validation_set = split_dataset(
    dataframe,
    train_set_length=0.8,
    test_set_length=0.1,
    validation_set_length=0.1,
    axis=0,
)

# Show each part in the same order as before: train, test, validation.
for split_frame in (train_set, test_set, validation_set):
    print(split_frame)

In [None]:
# Wrap every split in a Hugging Face `Dataset` (built from a column -> list
# mapping) and bundle the three of them in one `DatasetDict`.
split_frames = {
    "train": train_set,
    "test": test_set,
    "validation": validation_set,
}
dataset = DatasetDict(
    {
        split_name: Dataset.from_dict(split_frame.to_dict('list'))
        for split_name, split_frame in split_frames.items()
    }
)
print(dataset)

In [None]:
# Tokenizer for the same pretrained checkpoint that is fine-tuned later on.
tokenizer_checkpoint = "bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(tokenizer_checkpoint)

def tokenize_function(jira_tasks, column_name="text"):
    """Tokenize one column of a batch with the notebook-level `tokenizer`.

    Args:
        jira_tasks: Mapping from column name to the batch's values, as handed
            over by `Dataset.map(..., batched=True)`.
        column_name: Key of the column whose values are tokenized.

    Returns:
        Whatever `tokenizer` returns for that column, with padding to the
        maximum length and truncation both enabled.
    """
    texts = jira_tasks[column_name]
    return tokenizer(texts, padding="max_length", truncation=True)

# Tokenize all splits batch-wise, then rebind the split names to the tokenized
# versions (from here on they are Hugging Face datasets, not the earlier frames).
tokenized_datasets = dataset.map(tokenize_function, batched=True)
train_set, test_set, validation_set = (
    tokenized_datasets[split_name] for split_name in ("train", "test", "validation")
)

for tokenized_split in (train_set, test_set, validation_set):
    print(tokenized_split)

In [None]:
# Accuracy metric from the `evaluate` library; used by `compute_metrics`.
metric_name = "accuracy"
metric = evaluate.load(metric_name)

def compute_metrics(eval_pred):
    """Score a batch of predictions with the notebook-level `metric`.

    Args:
        eval_pred: A pair `(logits, labels)`; the predicted class of each row is
            the index of its largest logit along the last axis.

    Returns:
        The result of `metric.compute` for the predicted classes against `labels`.
    """
    scores, references = eval_pred
    predicted_classes = np.asarray(scores).argmax(axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

In [None]:
# Fine-tune bert-base-cased with a freshly initialised 11-class classification head.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=11)

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    print("GPU is not available. CPU will be used to train the model")
    device = torch.device("cpu")
model.to(device)

# The per-epoch evaluation option is called `eval_strategy` in recent transformers
# releases and `evaluation_strategy` in older ones. The install cell does not pin
# transformers, so pick whichever keyword the installed version accepts.
eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)
training_args = TrainingArguments(output_dir="training_logs", **{eval_strategy_kwarg: "epoch"})

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    # Monitor training on the validation split; the test split stays held out
    # for a final, unbiased evaluation.
    eval_dataset=validation_set,
    compute_metrics=compute_metrics
)
trainer.train()
print("Training finished")

In [8]:
# Where the fine-tuned weights are written to and later read back from.
# Only one assignment is active; the alternatives are kept for reference
# (previously they were assigned and then immediately overwritten).
#model_path = "/content/drive/MyDrive/Faks/research_uiktp/trained_classifier.sav"
#model_path = "./trained_classifier.sav"
#model_path = "/content/drive/MyDrive/pytorch_model.bin"
model_path = "pytorch_model.bin"

In [None]:
# Persist only the model's state dict (not the whole model object) to `model_path`.
trained_weights = model.state_dict()
torch.save(trained_weights, model_path)
print("Model serialized")

In [None]:
# Device onto which the stored tensors are mapped while loading.
MAP_LOCATION = 'cpu'
#MAP_LOCATION = 'cuda'

# Rebuild the same architecture (11-class head), then overwrite its weights with
# the ones saved at `model_path`.
# NOTE(review): torch.load unpickles the file - only load checkpoints you trust.
loaded_model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=11)
restored_weights = torch.load(model_path, map_location=MAP_LOCATION)
loaded_model.load_state_dict(restored_weights)
print("Model loaded")

In [None]:
# Smoke test: run the first validation example through the reloaded model.
loaded_model.eval()  # inference mode (e.g. dropout disabled)

sample = validation_set[0]
input_text = sample['text']
input_ids = sample['input_ids']
input_masks = sample['attention_mask']
print("Input sample: {}".format(input_text))
print("Input ids: {}".format(input_ids))
print("Input masks: {}".format(input_masks))

# The dataset yields plain Python lists, but the model needs integer tensors with
# a leading batch dimension, placed on the same device as its weights.
model_device = next(loaded_model.parameters()).device
batch_ids = torch.tensor([input_ids], dtype=torch.long, device=model_device)
batch_masks = torch.tensor([input_masks], dtype=torch.long, device=model_device)

# Call the module itself (not `.forward`) so hooks run; no gradients are needed.
with torch.no_grad():
    predicted_duration = loaded_model(input_ids=batch_ids, attention_mask=batch_masks)
print("Predicted duration: {}".format(predicted_duration))

# The predicted class is the index of the largest logit.
predicted_class = int(predicted_duration.logits.argmax(dim=-1).item())
print("Predicted class index: {}".format(predicted_class))