In [17]:
pip install pandas comet_ml datasets transformers scikit-learn

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [4]:
import comet_ml
import pandas as pd
import re

def clean_todo_text(text):
    # Regex to match patterns like # TODO, # TODO:, # TODO (name)
    cleaned_text = re.sub(r'#\s*TODO\s*(?:\([\w@]+\))?:?\s*', '', text, flags=re.IGNORECASE)
    return cleaned_text.strip()

class CometDatasetLoader:
    def __init__(self, api_key, project_name, workspace, artifact_name):
        self.api_key = api_key
        self.project_name = project_name
        self.workspace = workspace
        self.artifact_name = artifact_name
        self.experiment = self._initialize_experiment()

    def _initialize_experiment(self):
        return comet_ml.Experiment(
            api_key=self.api_key,
            project_name=self.project_name,
            workspace=self.workspace
        )

    def get_experiment(self):
        return self.experiment

    def download_dataset(self):
        logged_artifact = self.experiment.get_artifact(artifact_name=self.artifact_name)
        logged_artifact.download("./data")
        data = pd.read_csv(f"./data/{self.artifact_name}_200.csv")  # Ensure filename matches
        data['todo_text'] = data['todo_text'].apply(clean_todo_text)
        return data

    def end_experiment(self):
        self.experiment.end()

In [9]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, DataCollatorWithPadding, Trainer
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss
import numpy as np
import random

# Set a fixed seed for reproducibility
def set_seed(seed=50):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Set the seed
set_seed(50)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')
    f1 = f1_score(labels, preds, average='weighted')

    probs = np.exp(pred.predictions) / np.sum(np.exp(pred.predictions), axis=1, keepdims=True)
    loss = log_loss(labels, probs)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'cross_entropy_loss': loss
    }


class TodoClassifierTrainer:
    def __init__(self, experiment, dataset, tokenizer_name="bert-base-uncased"):
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name)
        self.model = BertForSequenceClassification.from_pretrained(tokenizer_name, num_labels=3)

        # Move model to GPU if available
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

        self.data = dataset
        self.experiment = experiment
        self.train_dataset, self.val_dataset = self._prepare_datasets()

    def _prepare_datasets(self):
        # Split dataset into training and validation sets
        train_texts, val_texts, train_labels, val_labels = train_test_split(
            self.data["todo_text"].tolist(),
            self.data["priority"].tolist(),
            test_size=0.2,
            random_state=42
        )

        # Convert to Hugging Face Dataset
        train_data = Dataset.from_pandas(pd.DataFrame({"text": train_texts, "label": train_labels}))
        val_data = Dataset.from_pandas(pd.DataFrame({"text": val_texts, "label": val_labels}))

        # Tokenize dataset
        train_data = train_data.map(self._tokenize_function, batched=True)
        val_data = val_data.map(self._tokenize_function, batched=True)

        return train_data, val_data

    def _tokenize_function(self, examples):
        return self.tokenizer(examples["text"], padding="max_length", truncation=True)

    def train_model(self):
        training_args = TrainingArguments(
            output_dir="./results",
            evaluation_strategy="epoch",
            learning_rate=2e-5,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs=4,
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=2,
            report_to="comet_ml",
            seed=50
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.val_dataset,
            tokenizer=self.tokenizer,
            data_collator=DataCollatorWithPadding(self.tokenizer),
            compute_metrics=compute_metrics
        )

        trainer.train()

        # Save the model
        model_path = "./todo-priority-model"
        trainer.save_model(model_path)
        self.tokenizer.save_pretrained(model_path)

        # Log model as an artifact to Comet ML
        self.experiment.log_asset_folder(model_path, log_file_name="todo-priority-model")
        print("Model saved to Comet ML as an artifact.")


In [10]:
COMET_API_KEY = "<COMET_API_KEY>"
COMET_PROJECT_NAME = "todo_prioritizer"
COMET_WORKSPACE = "915-muscalagiu-ancaioana"
ARTIFACT_NAME = "TODO_dataset"

# Initialize and load dataset from Comet
comet_loader = CometDatasetLoader(COMET_API_KEY, COMET_PROJECT_NAME, COMET_WORKSPACE, ARTIFACT_NAME)
data = comet_loader.download_dataset()

#Initialize trainer and start training
trainer = TodoClassifierTrainer(experiment=comet_loader.get_experiment(),dataset=data)
trainer.train_model()
# End Comet experiment
comet_loader.end_experiment()

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/915-muscalagiu-ancaioana/todo-prioritizer/70cf9487620f40559573b0ee0080dfa5

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/content' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.
[1;38;5;39mCOMET INFO:[0m Artifact '915-muscalagiu-ancaioana/TODO_dataset:5.0.0' download has been started asynchronously
[1;38;5;39mCOMET INFO:[0m Still downloading 4 file(s), remaining 38.37 KB/38.37 KB
[1;38;5;39mCOMET INFO:[0m Artifact '915-muscalagiu-ancaioana/TODO_dataset:5.0.0' has been successfully downloaded
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

[1;38;5;39mCOMET INFO:[0m An experiment with the same configuration options is already running and will be reused.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Cross Entropy Loss,Runtime,Samples Per Second,Steps Per Second
1,1.1343,0.994317,0.5,0.263158,0.5,0.344828,0.994317,1.099,36.397,4.55
2,0.9878,0.992377,0.525,0.386937,0.525,0.392054,0.992377,1.122,35.651,4.456
3,0.9188,0.973459,0.55,0.607473,0.55,0.531772,0.973459,1.1187,35.755,4.469
4,0.8405,0.979137,0.525,0.489583,0.525,0.472955,0.979137,1.1859,33.73,4.216


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model saved to Comet ML as an artifact.


[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : genetic_peninsula_1984
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/915-muscalagiu-ancaioana/todo-prioritizer/70cf9487620f40559573b0ee0080dfa5
[1;38;5;39mCOMET INFO:[0m   Downloads:
[1;38;5;39mCOMET INFO:[0m     artifact assets : 4 (38.37 KB)
[1;38;5;39mCOMET INFO:[0m     artifacts       : 1
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     epoch [45]                  : (0.1, 4.0)
[1;38;5;39mCOMET INFO:[0m     eval_accuracy [4]           : (0.5, 0.55)
[1;38;5;39mCOMET INFO:[