# Multi-class classification using RoBERTa

### Installing necessary packages absent in Google Collab

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/280.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m276.5/280.0 kB[0m [31m8.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.27.2


### The main_model function responsible for running the model

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer
import torch
import numpy as np
from sklearn.metrics import classification_report, accuracy_score
from transformers import TrainerCallback
import os
import shutil
import re
import time
from pathlib import Path
#[4.478758514361407e-05, 0.17277768585588338, 5, 10, 8]
def main_model(file_name, ext, type):

    path_type = "Balanced" if type == 1 else "Unbalanced"

    current_file_path = Path(__file__).parent
    path_to_project = current_file_path.parents[1]

    df = pd.read_excel(f"{path_to_project}/Data/Datasets/{path_type}/{file_name}.{ext}")

    results_dir = f"{path_to_project}/Models/RoBERTa/Output/{path_type}/{file_name}"
    dump_dir = results_dir+"/Dump"

    if os.path.isdir(results_dir):
        shutil.rmtree(results_dir)

    os.mkdir(results_dir)
    os.mkdir(dump_dir)

    df = df[df['review'].notna() & (df['review'] != '')]
    # Select the text and label columns
    df['review'] = df['review'].str.replace('[^\x20-\x7E]', '', regex=True)
    X = df['review'].values
    y = df['label'].values

    X_train_CV, X_test_full, y_train_CV, y_test_full = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


    # Encode the labels to a numeric format
    label_encoder = LabelEncoder()
    y_train_CV_encoded = label_encoder.fit_transform(y_train_CV)
    y_test_full_encoded = label_encoder.transform(y_test_full)

    # Initialize the tokenizer for RoBERTa
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    # Tokenization function
    def tokenize_function(texts):
        return tokenizer(texts, padding="max_length", truncation=True, max_length=128)

    loss_logging_callback = LossLoggingCallback()

    # Stratified K-Fold Cross-Validation
    n_splits = 5
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Variables to accumulate scores
    best_accuracy = 0
    best_model = None
    accuracy_scores = []
    metrics_df = pd.DataFrame()


    for fold, (train_index, val_index) in enumerate(kf.split(X_train_CV, y_train_CV_encoded)):
        print(f"Fold {fold+1}/{n_splits}")
        start_time = time.time()
        # Split the data
        X_train, X_val = X_train_CV[train_index], X_train_CV[val_index]
        y_train, y_val = y_train_CV_encoded[train_index], y_train_CV_encoded[val_index]


        # Tokenize the data
        train_encodings = tokenize_function(X_train.tolist())
        val_encodings = tokenize_function(X_val.tolist())

        # Create dataset objects
        train_dataset = ReviewDataset(train_encodings, y_train)
        val_dataset = ReviewDataset(val_encodings, y_val)

        # Initialize the model for each fold
        model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

        # Define training arguments for each fold, adjust hyperparameters as needed
        training_args = TrainingArguments(
            output_dir=f"{dump_dir}/res",
            num_train_epochs=5,
            per_device_train_batch_size=10,
            per_device_eval_batch_size=8,
            warmup_steps=500,
            weight_decay=0.17277768585588338,
            logging_dir=f"{dump_dir}/logs",
            logging_strategy="epoch",
            evaluation_strategy="epoch",
            learning_rate=4.478758514361407e-05,
            max_grad_norm=1.0,
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            save_strategy="epoch",
            save_total_limit=2,
            lr_scheduler_type='linear'
        )

        # Initialize Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=lambda p: {"accuracy": accuracy_score(p.predictions.argmax(-1), p.label_ids)},
            callbacks=[loss_logging_callback]
        )

        # Train
        trainer.train()

        loss_logging_callback.save_logs_to_excel(f"{results_dir}/fold_loss.xlsx")

        # Evaluate
        results = trainer.evaluate()
        accuracy_scores.append(results['eval_accuracy'])

        if results['eval_accuracy'] > best_accuracy:
            best_accuracy = results['eval_accuracy']
            best_model = model  # Assign the best model

        # Get predictions and true labels
        predictions = trainer.predict(val_dataset)
        pred_labels = np.argmax(predictions.predictions, axis=-1)
        true_labels = y_val

        # Calculate accuracy
        accuracy = accuracy_score(true_labels, pred_labels)
        label_names = label_encoder.inverse_transform(range(len(label_encoder.classes_)))

        # Calculate precision, recall, and F1-score
        report_dict = classification_report(true_labels, pred_labels, output_dict=True, zero_division=0, target_names=label_names)
        # avg_metrics = report_dict['weighted avg']  # Use 'macro avg' or 'weighted avg' based on your preference
        end_time = time.time()
        # Append the metrics for this fold to the DataFrame
        metrics_df = metrics_df.append({
            ('Fold', ''): fold + 1,
            ('Accuracy', ''): accuracy,
            ('Train Time', ''): str(end_time - start_time)+" s",
            ('Bug Report', 'P'): report_dict['bug report']['precision'],
            ('Bug Report', 'R'): report_dict['bug report']['recall'],
            ('Bug Report', 'F1'): report_dict['bug report']['f1-score'],
            ('Feature Request', 'P'): report_dict['feature request']['precision'],
            ('Feature Request', 'R'): report_dict['feature request']['recall'],
            ('Feature Request', 'F1'): report_dict['feature request']['f1-score'],
            ('Rating', 'P'): report_dict['rating']['precision'],
            ('Rating', 'R'): report_dict['rating']['recall'],
            ('Rating', 'F1'): report_dict['rating']['f1-score'],
            ('User Experience', 'P'): report_dict['user experience']['precision'],
            ('User Experience', 'R'): report_dict['user experience']['recall'],
            ('User Experience', 'F1'): report_dict['user experience']['f1-score']
        }, ignore_index=True)

    # Save the DataFrame to a CSV file after completing all folds
    metrics_df.columns = pd.MultiIndex.from_tuples([(c,) if isinstance(c, str) else c for c in metrics_df.columns])
    metrics_df.to_excel(f"{results_dir}/fold_metrics.xlsx", index=True)

    # Evaluate the best model on the test set
    test_encodings = tokenize_function(X_test_full.tolist())
    test_dataset = ReviewDataset(test_encodings, y_test_full_encoded)
    test_trainer = Trainer(model=best_model)
    test_results = test_trainer.predict(test_dataset)
    test_predictions = np.argmax(test_results.predictions, axis=-1)
    test_accuracy = accuracy_score(y_test_full_encoded, test_predictions)

    label_names_full = label_encoder.inverse_transform(range(len(label_encoder.classes_)))

    # Calculate precision, recall, and F1-score
    report_dict_full = classification_report(y_test_full_encoded, test_predictions, output_dict=True, zero_division=0, target_names=label_names_full)

    full_metrics_df = pd.DataFrame()

    full_metrics_df = full_metrics_df.append({
            ('Accuracy', ''): test_accuracy,
            ('Bug Report', 'P'): report_dict_full['bug report']['precision'],
            ('Bug Report', 'R'): report_dict_full['bug report']['recall'],
            ('Bug Report', 'F1'): report_dict_full['bug report']['f1-score'],
            ('Feature Request', 'P'): report_dict_full['feature request']['precision'],
            ('Feature Request', 'R'): report_dict_full['feature request']['recall'],
            ('Feature Request', 'F1'): report_dict_full['feature request']['f1-score'],
            ('Rating', 'P'): report_dict_full['rating']['precision'],
            ('Rating', 'R'): report_dict_full['rating']['recall'],
            ('Rating', 'F1'): report_dict_full['rating']['f1-score'],
            ('User Experience', 'P'): report_dict_full['user experience']['precision'],
            ('User Experience', 'R'): report_dict_full['user experience']['recall'],
            ('User Experience', 'F1'): report_dict_full['user experience']['f1-score']
        }, ignore_index=True)

    full_metrics_df.columns = pd.MultiIndex.from_tuples([(c,) if isinstance(c, str) else c for c in full_metrics_df.columns])
    full_metrics_df.to_excel(f"{results_dir}/metrics_results_full_test.xlsx", index=True)

    print(f"Test Accuracy: {test_accuracy}")

    # Generate and print the classification report
    print(classification_report(y_test_full_encoded, test_predictions, target_names=label_encoder.classes_, zero_division=0))

    shutil.rmtree(dump_dir)

# Custom dataset class
class ReviewDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

class LossLoggingCallback(TrainerCallback):
    """A custom callback to log training and validation loss."""
    def __init__(self):
        super().__init__()
        self.log_history = []
        self.log_train_loss_history = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        # This method captures both training and evaluation logs, so it's more general than on_epoch_end
        if logs is not None:
            # Capture both training and evaluation steps
            if 'loss' in logs:  # Indicates a training step
                self.log_train_loss_history.append({
                    'epoch': state.epoch,
                    'training_loss': logs.get('loss'),
                })
            elif 'eval_loss' in logs:  # Indicates an evaluation step
                # Make sure to capture the last training loss as well
                last_training_loss = self.log_train_loss_history[-1]['training_loss'] if self.log_train_loss_history else None
                self.log_history.append({
                    'epoch': state.epoch,
                    'training_loss': last_training_loss,  # Include last known training loss for reference
                    'validation_loss': logs.get('eval_loss'),
                    'eval_runtime':logs.get('eval_runtime')
                })

    def save_logs_to_excel(self, file_name):
        """Save the recorded logs to a Excel file."""
        pd.DataFrame(self.log_history).to_excel(file_name, index=False)

__file__ = "/content/drive/MyDrive/FinalProject/Models/RoBERTa/RoBERTa.ipynb"
current_file_path = Path(__file__).parent
path_to_project = current_file_path.parents[1]

directory_path_multi = path_to_project / 'Data' / 'Datasets' / 'Balanced'

files_multi = [(file.name, file.stat().st_size)
               for file in directory_path_multi.iterdir()
               if file.is_file() and not file.name.startswith('.')]

files_multi.sort(key=lambda x: x[1])





Running the model on each dataset separately due to storage constraints

In [None]:
print(f"Now doing: {files_multi[1][0].split('.')[0]}")
main_model(files_multi[1][0].split('.')[0], files_multi[1][0].split('.')[1], 1)
print(f"Now doing: {files_multi[2][0].split('.')[0]}")
main_model(files_multi[2][0].split('.')[0], files_multi[2][0].split('.')[1], 1)

Now doing: dataset_balanced_4000


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Fold 1/5


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1263,0.689387,0.734375
2,0.7344,0.78186,0.7375
3,0.5431,0.716608,0.773438
4,0.296,0.837276,0.804688
5,0.1548,1.078871,0.789062


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0956,0.865004,0.673438
2,0.7424,0.762906,0.703125
3,0.5345,0.695878,0.759375
4,0.3186,1.037818,0.771875
5,0.1468,1.155005,0.76875


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Balanced/dataset_balanced_4000/Dump/res/checkpoint-1280 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.099,0.779159,0.70625
2,0.6935,0.798775,0.678125
3,0.5488,0.736804,0.7375
4,0.2928,1.117141,0.771875
5,0.1546,1.21074,0.760938


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Balanced/dataset_balanced_4000/Dump/res/checkpoint-1280 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1013,0.791442,0.707812
2,0.7249,0.730323,0.729688
3,0.5329,0.817248,0.735938
4,0.316,1.193109,0.746875
5,0.1498,1.354792,0.754687


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Balanced/dataset_balanced_4000/Dump/res/checkpoint-1280 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1045,0.741583,0.732812
2,0.7228,0.738733,0.7375
3,0.5335,0.806608,0.759375
4,0.3082,0.90818,0.773438
5,0.1439,1.151327,0.779687


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Balanced/dataset_balanced_4000/Dump/res/checkpoint-1280 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


  full_metrics_df = full_metrics_df.append({


Test Accuracy: 0.755
                 precision    recall  f1-score   support

     bug report       0.82      0.82      0.82       200
feature request       0.68      0.86      0.76       200
         rating       0.79      0.69      0.74       200
user experience       0.74      0.64      0.69       200

       accuracy                           0.76       800
      macro avg       0.76      0.76      0.75       800
   weighted avg       0.76      0.76      0.75       800

Now doing: dataset_balanced_8000
Fold 1/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9569,0.733282,0.735156
2,0.6361,0.558607,0.805469
3,0.3706,0.720856,0.828906
4,0.1994,0.568613,0.888281
5,0.0981,0.580346,0.895312


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9389,0.729496,0.730469
2,0.5789,0.613692,0.801562
3,0.3154,0.727835,0.838281
4,0.1621,0.729055,0.878125
5,0.0825,0.764449,0.878906


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Balanced/dataset_balanced_8000/Dump/res/checkpoint-2560 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9462,0.710437,0.730469
2,0.5953,0.621168,0.800781
3,0.3531,0.57549,0.865625
4,0.1891,0.676078,0.871875
5,0.0866,0.715128,0.882812


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Balanced/dataset_balanced_8000/Dump/res/checkpoint-2560 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9458,0.67118,0.760156
2,0.5984,0.549296,0.799219
3,0.3417,0.534622,0.884375
4,0.1818,0.577803,0.895312
5,0.0854,0.550851,0.90625


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Balanced/dataset_balanced_8000/Dump/res/checkpoint-2560 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9424,0.886129,0.711719
2,0.5934,0.641226,0.797656
3,0.3396,0.530001,0.874219
4,0.1763,0.580558,0.890625
5,0.0716,0.601541,0.896875


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Balanced/dataset_balanced_8000/Dump/res/checkpoint-2560 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


  full_metrics_df = full_metrics_df.append({


Test Accuracy: 0.895
                 precision    recall  f1-score   support

     bug report       0.93      0.94      0.93       400
feature request       0.90      0.91      0.90       400
         rating       0.90      0.90      0.90       400
user experience       0.86      0.84      0.85       400

       accuracy                           0.90      1600
      macro avg       0.89      0.90      0.89      1600
   weighted avg       0.89      0.90      0.89      1600



In [None]:
print(f"Now doing: {files_multi[3][0].split('.')[0]}")
main_model(files_multi[3][0].split('.')[0], files_multi[3][0].split('.')[1], 1)
print(f"Now doing: {files_multi[4][0].split('.')[0]}")
main_model(files_multi[4][0].split('.')[0], files_multi[4][0].split('.')[1], 1)

Now doing: dataset_gpt_balanced_4000


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Fold 1/5


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2605,0.615521,0.778125
2,0.4269,0.72512,0.839063
3,0.3157,0.294058,0.940625
4,0.2289,0.225339,0.95
5,0.1736,0.27332,0.946875


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1683,0.488183,0.853125
2,0.3627,0.369359,0.93125
3,0.2436,0.265273,0.945312
4,0.2062,0.294901,0.940625
5,0.1753,0.27014,0.95


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Balanced/dataset_gpt_balanced_4000/Dump/res/checkpoint-1280 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2044,0.586132,0.820312
2,0.5232,0.357578,0.9125
3,0.2891,0.322424,0.9375
4,0.2235,0.233396,0.95625
5,0.1866,0.210168,0.960938


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Balanced/dataset_gpt_balanced_4000/Dump/res/checkpoint-1280 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2065,0.574211,0.80625
2,0.3915,0.362126,0.942187
3,0.2641,0.256754,0.953125
4,0.2222,0.234711,0.951562
5,0.1956,0.255686,0.95


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Balanced/dataset_gpt_balanced_4000/Dump/res/checkpoint-1280 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1754,0.590562,0.818466
2,0.4182,0.433,0.898279
3,0.3091,0.309747,0.935837
4,0.249,0.303158,0.942097
5,0.1943,0.280797,0.943662


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Balanced/dataset_gpt_balanced_4000/Dump/res/checkpoint-1280 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


  full_metrics_df = full_metrics_df.append({


Test Accuracy: 0.9675
                 precision    recall  f1-score   support

     bug report       0.99      0.96      0.98       200
feature request       1.00      0.97      0.98       200
         rating       0.96      0.95      0.96       200
user experience       0.92      0.98      0.95       200

       accuracy                           0.97       800
      macro avg       0.97      0.97      0.97       800
   weighted avg       0.97      0.97      0.97       800

Now doing: dataset_gpt_balanced_8000
Fold 1/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7959,0.198276,0.960938
2,0.206,0.145301,0.973437
3,0.1488,0.141358,0.975
4,0.1196,0.126682,0.977344
5,0.0976,0.136613,0.974219


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.8023,0.25805,0.953125
2,0.234,0.160784,0.970313
3,0.1537,0.14657,0.973437
4,0.1171,0.169134,0.970313
5,0.1115,0.146045,0.973437


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Balanced/dataset_gpt_balanced_8000/Dump/res/checkpoint-2560 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.8047,0.202973,0.961719
2,0.2004,0.13036,0.975
3,0.1578,0.138168,0.975
4,0.1194,0.16717,0.973437
5,0.0966,0.134561,0.975781


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Balanced/dataset_gpt_balanced_8000/Dump/res/checkpoint-2560 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7242,0.227783,0.953125
2,0.1929,0.167608,0.96875
3,0.1366,0.163646,0.970313
4,0.1199,0.143162,0.971875
5,0.099,0.149091,0.974219


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Balanced/dataset_gpt_balanced_8000/Dump/res/checkpoint-2560 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7508,0.217496,0.956998
2,0.1764,0.198922,0.965598
3,0.1631,0.164219,0.970289
4,0.1298,0.167792,0.971853
5,0.1083,0.14745,0.974199


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Balanced/dataset_gpt_balanced_8000/Dump/res/checkpoint-2560 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


  full_metrics_df = full_metrics_df.append({


Test Accuracy: 0.978125
                 precision    recall  f1-score   support

     bug report       0.99      0.96      0.97       400
feature request       1.00      0.98      0.99       400
         rating       0.98      0.97      0.98       400
user experience       0.95      1.00      0.97       400

       accuracy                           0.98      1600
      macro avg       0.98      0.98      0.98      1600
   weighted avg       0.98      0.98      0.98      1600



In [None]:
print(f"Now doing: {files_multi[5][0].split('.')[0]}")
main_model(files_multi[5][0].split('.')[0], files_multi[5][0].split('.')[1], 1)

Now doing: dataset_gpt_balanced_20000
Fold 1/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3822,0.130308,0.978125
2,0.113,0.107747,0.98375
3,0.0797,0.077812,0.986563
4,0.0571,0.067712,0.988125
5,0.0418,0.072069,0.989375


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3655,0.079962,0.9875
2,0.0904,0.064016,0.99
3,0.0744,0.056475,0.99125
4,0.0542,0.06761,0.990625
5,0.0319,0.068518,0.990625


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Balanced/dataset_gpt_balanced_20000/Dump/res/checkpoint-6400 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3758,0.142606,0.974375
2,0.0921,0.11801,0.982812
3,0.0741,0.140125,0.977812
4,0.058,0.085526,0.987187
5,0.0407,0.068828,0.989375


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Balanced/dataset_gpt_balanced_20000/Dump/res/checkpoint-6400 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3821,0.104719,0.982187
2,0.1167,0.072657,0.988437
3,0.0827,0.059504,0.990313
4,0.0644,0.055003,0.991875
5,0.0478,0.064685,0.989688


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Balanced/dataset_gpt_balanced_20000/Dump/res/checkpoint-6400 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3719,0.221502,0.958112
2,0.0992,0.103079,0.982807
3,0.0706,0.091446,0.986246
4,0.06,0.089156,0.986558
5,0.0485,0.07798,0.986558


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Balanced/dataset_gpt_balanced_20000/Dump/res/checkpoint-6400 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


  full_metrics_df = full_metrics_df.append({


Test Accuracy: 0.99225
                 precision    recall  f1-score   support

     bug report       1.00      0.99      0.99      1000
feature request       1.00      0.99      1.00      1000
         rating       0.99      0.99      0.99      1000
user experience       0.98      0.99      0.99      1000

       accuracy                           0.99      4000
      macro avg       0.99      0.99      0.99      4000
   weighted avg       0.99      0.99      0.99      4000



In [None]:
print(f"Now doing: {files_multi[6][0].split('.')[0]}")
main_model(files_multi[6][0].split('.')[0], files_multi[6][0].split('.')[1], 1)

Now doing: dataset_gpt_balanced_32000


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Fold 1/5


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2568,0.10167,0.985352
2,0.065,0.069281,0.989844
3,0.0476,0.052402,0.991602
4,0.0365,0.047813,0.993164
5,0.0239,0.039796,0.994141


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2518,0.054762,0.991797
2,0.065,0.039868,0.993945
3,0.0469,0.038439,0.994727
4,0.0378,0.029385,0.995508
5,0.0237,0.025543,0.996094


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Balanced/dataset_gpt_balanced_32000/Dump/res/checkpoint-10240 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2545,0.088782,0.985547
2,0.0684,0.060776,0.991406
3,0.0516,0.045531,0.992969
4,0.0367,0.02929,0.994531
5,0.0244,0.037755,0.99375


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Balanced/dataset_gpt_balanced_32000/Dump/res/checkpoint-10240 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2322,0.120894,0.978516
2,0.0591,0.051543,0.991211
3,0.0431,0.042816,0.993945
4,0.0324,0.033906,0.994336
5,0.0222,0.041921,0.994141


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Balanced/dataset_gpt_balanced_32000/Dump/res/checkpoint-10240 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2612,0.077455,0.989256
2,0.0761,0.069016,0.990232
3,0.0516,0.076611,0.989451
4,0.0392,0.047816,0.993749
5,0.0272,0.045141,0.993944


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Balanced/dataset_gpt_balanced_32000/Dump/res/checkpoint-10240 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Test Accuracy: 0.9953125
                 precision    recall  f1-score   support

     bug report       1.00      0.99      1.00      1600
feature request       1.00      0.99      1.00      1600
         rating       0.99      1.00      1.00      1600
user experience       0.99      1.00      0.99      1600

       accuracy                           1.00      6400
      macro avg       1.00      1.00      1.00      6400
   weighted avg       1.00      1.00      1.00      6400



  full_metrics_df = full_metrics_df.append({


In [None]:
directory_path_unbalanced = path_to_project / 'Data' / 'Datasets' / 'Unbalanced'

files_unbalanced = [(file.name, file.stat().st_size)
               for file in directory_path_unbalanced.iterdir()
               if file.is_file() and not file.name.startswith('.')]

files_unbalanced.sort(key=lambda x: x[1])

In [None]:
print(f"Now doing: {files_unbalanced[0][0].split('.')[0]}")
main_model(files_unbalanced[0][0].split('.')[0], files_unbalanced[0][0].split('.')[1], 2)

Now doing: dataset_unbalanced_4000
Fold 1/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0957,0.838877,0.69375
2,0.6956,0.554681,0.804688
3,0.4741,0.6851,0.8125
4,0.2454,0.688187,0.84375
5,0.1192,0.7197,0.859375


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0612,0.723473,0.73125
2,0.6488,0.719636,0.746875
3,0.4638,0.846652,0.775
4,0.2767,0.726322,0.835938
5,0.0922,0.898243,0.829688


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Unbalanced/dataset_unbalanced_4000/Dump/res/checkpoint-1280 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0431,0.634933,0.759375
2,0.6821,0.676549,0.757812
3,0.4868,0.628034,0.807813
4,0.2481,0.718534,0.848437
5,0.1176,0.728829,0.860938


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Unbalanced/dataset_unbalanced_4000/Dump/res/checkpoint-1280 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.043,0.640885,0.7625
2,0.6439,0.555074,0.80625
3,0.4552,0.769565,0.8
4,0.2719,0.842062,0.828125
5,0.1177,0.779311,0.860938


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Unbalanced/dataset_unbalanced_4000/Dump/res/checkpoint-1280 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0538,0.736622,0.7375
2,0.6428,0.686627,0.760938
3,0.4315,0.812506,0.767188
4,0.2567,0.909222,0.80625
5,0.0911,1.023585,0.817187


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Unbalanced/dataset_unbalanced_4000/Dump/res/checkpoint-1280 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


  full_metrics_df = full_metrics_df.append({


Test Accuracy: 0.84
                 precision    recall  f1-score   support

     bug report       0.84      0.89      0.86       251
feature request       0.87      0.86      0.86       282
         rating       0.86      0.76      0.81       148
user experience       0.75      0.80      0.78       119

       accuracy                           0.84       800
      macro avg       0.83      0.83      0.83       800
   weighted avg       0.84      0.84      0.84       800



In [None]:
print(f"Now doing: {files_unbalanced[1][0].split('.')[0]}")
main_model(files_unbalanced[1][0].split('.')[0], files_unbalanced[1][0].split('.')[1], 2)

Now doing: dataset_gpt_unbalanced_4000
Fold 1/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1493,0.399292,0.901563
2,0.2818,0.134276,0.979688
3,0.1395,0.069249,0.989062
4,0.0642,0.05379,0.992188
5,0.0447,0.07913,0.989062


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2016,0.487687,0.85625
2,0.2956,0.198911,0.965625
3,0.125,0.139182,0.979688
4,0.0831,0.065089,0.990625
5,0.0462,0.046581,0.99375


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Unbalanced/dataset_gpt_unbalanced_4000/Dump/res/checkpoint-1280 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.169,0.613935,0.821875
2,0.3295,0.387611,0.920312
3,0.1403,0.095667,0.984375
4,0.0801,0.044928,0.99375
5,0.0575,0.04702,0.992188


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Unbalanced/dataset_gpt_unbalanced_4000/Dump/res/checkpoint-1280 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0872,0.415215,0.879687
2,0.2574,0.249668,0.960938
3,0.1266,0.075191,0.9875
4,0.0497,0.111377,0.98125
5,0.0369,0.097681,0.9875


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Unbalanced/dataset_gpt_unbalanced_4000/Dump/res/checkpoint-1280 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1509,0.600145,0.817187
2,0.2472,0.202452,0.96875
3,0.1107,0.194105,0.973437
4,0.0434,0.118762,0.982812
5,0.0244,0.130002,0.984375


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Unbalanced/dataset_gpt_unbalanced_4000/Dump/res/checkpoint-1280 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


  full_metrics_df = full_metrics_df.append({


Test Accuracy: 0.98375
                 precision    recall  f1-score   support

     bug report       0.98      0.97      0.97       100
feature request       0.99      0.98      0.99       150
         rating       1.00      0.98      0.99       250
user experience       0.97      0.99      0.98       300

       accuracy                           0.98       800
      macro avg       0.98      0.98      0.98       800
   weighted avg       0.98      0.98      0.98       800



In [None]:
print(f"Now doing: {files_unbalanced[2][0].split('.')[0]}")
main_model(files_unbalanced[2][0].split('.')[0], files_unbalanced[2][0].split('.')[1], 2)

Now doing: dataset_gpt_unbalanced_8000
Fold 1/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7321,0.267593,0.951562
2,0.1544,0.06925,0.989844
3,0.071,0.066554,0.989062
4,0.0359,0.042005,0.99375
5,0.0319,0.036634,0.994531


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7016,0.127557,0.967187
2,0.1139,0.078842,0.988281
3,0.0717,0.113727,0.98125
4,0.0532,0.069825,0.989844
5,0.0304,0.076654,0.989062


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Unbalanced/dataset_gpt_unbalanced_8000/Dump/res/checkpoint-2560 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6748,0.133248,0.978906
2,0.1227,0.094919,0.985156
3,0.0682,0.134866,0.978906
4,0.0556,0.050336,0.991406
5,0.0322,0.074935,0.989062


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Unbalanced/dataset_gpt_unbalanced_8000/Dump/res/checkpoint-2560 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7439,0.156971,0.976562
2,0.1167,0.082446,0.9875
3,0.0656,0.069463,0.989844
4,0.0518,0.069215,0.990625
5,0.0407,0.054341,0.992188


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Unbalanced/dataset_gpt_unbalanced_8000/Dump/res/checkpoint-2560 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7486,0.216995,0.959343
2,0.1413,0.077555,0.988272
3,0.0764,0.055724,0.992181
4,0.0439,0.042666,0.993745
5,0.0345,0.053724,0.992963


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Unbalanced/dataset_gpt_unbalanced_8000/Dump/res/checkpoint-2560 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


  full_metrics_df = full_metrics_df.append({


Test Accuracy: 0.99375
                 precision    recall  f1-score   support

     bug report       1.00      0.98      0.99       200
feature request       1.00      1.00      1.00       300
         rating       1.00      0.99      0.99       500
user experience       0.99      1.00      0.99       600

       accuracy                           0.99      1600
      macro avg       1.00      0.99      0.99      1600
   weighted avg       0.99      0.99      0.99      1600



In [None]:
print(f"Now doing: {files_unbalanced[3][0].split('.')[0]}")
main_model(files_unbalanced[3][0].split('.')[0], files_unbalanced[3][0].split('.')[1], 2)

Now doing: dataset_gpt_unbalanced_16000
Fold 1/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4034,0.10012,0.985547
2,0.0712,0.109032,0.984766
3,0.0471,0.086477,0.989844
4,0.0318,0.077915,0.990625
5,0.0209,0.073261,0.990625


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4084,0.149748,0.976953
2,0.0729,0.12039,0.98125
3,0.0526,0.06503,0.989844
4,0.0343,0.056673,0.991016
5,0.0239,0.050728,0.992969


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Unbalanced/dataset_gpt_unbalanced_16000/Dump/res/checkpoint-5120 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4319,0.095596,0.986328
2,0.0796,0.066098,0.989844
3,0.054,0.056405,0.992969
4,0.0412,0.047965,0.99375
5,0.024,0.040829,0.994922


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Unbalanced/dataset_gpt_unbalanced_16000/Dump/res/checkpoint-5120 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4364,0.06934,0.989453
2,0.077,0.052585,0.991797
3,0.0531,0.047137,0.992578
4,0.045,0.042994,0.99375
5,0.032,0.043878,0.993359


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Unbalanced/dataset_gpt_unbalanced_16000/Dump/res/checkpoint-5120 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3843,0.065578,0.988667
2,0.075,0.067063,0.989449
3,0.0467,0.047685,0.993357
4,0.0358,0.042327,0.994138
5,0.0248,0.043971,0.99492


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Unbalanced/dataset_gpt_unbalanced_16000/Dump/res/checkpoint-5120 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


  full_metrics_df = full_metrics_df.append({


Test Accuracy: 0.995625
                 precision    recall  f1-score   support

     bug report       0.99      0.99      0.99       400
feature request       1.00      0.99      1.00       600
         rating       1.00      0.99      1.00      1000
user experience       0.99      1.00      1.00      1200

       accuracy                           1.00      3200
      macro avg       1.00      1.00      1.00      3200
   weighted avg       1.00      1.00      1.00      3200



## Multi-label classification using RoBERTa

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer
import torch
from torch.utils.data import Dataset
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report
from transformers import TrainerCallback
import os
import shutil
import re
import time
from pathlib import Path

def multi_main_model(file_name, ext):

    current_file_path = Path(__file__).parent

    path_to_project = current_file_path.parents[1]

    df = pd.read_csv(f"{path_to_project}/Data/Datasets/Multi-label/{file_name}.{ext}")

    results_dir = f"{path_to_project}/Models/RoBERTa/Output/Multi-label/{file_name}"
    dump_dir = results_dir+"/Dump"

    if os.path.isdir(results_dir):
        shutil.rmtree(results_dir)

    os.mkdir(results_dir)
    os.mkdir(dump_dir)

    df = df[df['review'].notna() & (df['review'] != '')]
    df['review'] = df['review'].str.replace('[^\x20-\x7E]', '', regex=True)

    X = df['review'].values
    y = df[['bug report', 'user experience', 'rating', 'feature request']].values

    X_train_CV, X_test_full, y_train_CV, y_test_full = train_test_split(X, y, test_size=0.2, random_state=42)

    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    def tokenize_function(examples):
        return tokenizer(examples, padding="max_length", truncation=True, max_length=128)

    loss_logging_callback = LossLoggingCallback()

    # K-Fold Cross-Validation
    n_splits = 5
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Variables to accumulate scores
    best_f1 = 0
    best_model = None
    metrics_df = pd.DataFrame()

    for fold, (train_index, val_index) in enumerate(kf.split(X_train_CV, y_train_CV)):
        print(f"Fold {fold+1}/{n_splits}")
        start_time = time.time()

        X_train, X_val = X_train_CV[train_index], X_train_CV[val_index]
        y_train, y_val = y_train_CV[train_index], y_train_CV[val_index]

        train_encodings = tokenize_function(X_train.tolist())
        val_encodings = tokenize_function(X_val.tolist())

        train_dataset = MultiLabelDataset(train_encodings, y_train)
        val_dataset = MultiLabelDataset(val_encodings, y_val)

        model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=4, problem_type="multi_label_classification")

        training_args = TrainingArguments(
            output_dir=f"{dump_dir}/res",
            num_train_epochs=5,
            per_device_train_batch_size=10,
            per_device_eval_batch_size=8,
            warmup_steps=500,
            weight_decay=0.17277768585588338,
            logging_dir=f"{dump_dir}/logs",
            logging_strategy="epoch",
            evaluation_strategy="epoch",
            learning_rate=4.478758514361407e-05,
            max_grad_norm=1.0,
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            save_strategy="epoch",
            save_total_limit=2,
            lr_scheduler_type='linear'
        )

        def compute_metrics(p):
            predictions, labels = p
            predictions = torch.sigmoid(torch.tensor(predictions)).numpy()
            threshold = 0.5
            predictions = (predictions > threshold).astype(int)
            precision = precision_score(labels, predictions, average='micro')
            recall = recall_score(labels, predictions, average='micro')
            f1 = f1_score(labels, predictions, average='micro')
            return {'precision': precision, 'recall': recall, 'f1': f1}


        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics,
            callbacks=[loss_logging_callback]
        )

        trainer.train()

        loss_logging_callback.save_logs_to_excel(f"{results_dir}/fold_loss.xlsx")

        results = trainer.evaluate()

        if results['eval_f1'] > best_f1:
            best_f1 = results['eval_f1']
            best_model = model


        predictions = trainer.predict(val_dataset)
        pred_probs = torch.sigmoid(torch.tensor(predictions.predictions)).numpy()
        threshold = 0.5
        binary_predictions = (pred_probs > threshold).astype(int)

        # True labels
        true_labels = predictions.label_ids
        f1 = f1_score(true_labels, binary_predictions, average='micro')

        report_dict = classification_report(true_labels, binary_predictions, output_dict=True, zero_division=0, target_names=['bug report', 'user experience', 'rating', 'feature request'])
        # avg_metrics = report_dict['weighted avg']  # Use 'macro avg' or 'weighted avg' based on your preference
        end_time = time.time()
        # Append the metrics for this fold to the DataFrame
        metrics_df = metrics_df.append({
            ('Fold', ''): fold + 1,
            ('F1-Score', ''): f1,
            ('Train Time', ''): str(end_time - start_time)+" s",
            ('Bug Report', 'P'): report_dict['bug report']['precision'],
            ('Bug Report', 'R'): report_dict['bug report']['recall'],
            ('Bug Report', 'F1'): report_dict['bug report']['f1-score'],
            ('Feature Request', 'P'): report_dict['feature request']['precision'],
            ('Feature Request', 'R'): report_dict['feature request']['recall'],
            ('Feature Request', 'F1'): report_dict['feature request']['f1-score'],
            ('Rating', 'P'): report_dict['rating']['precision'],
            ('Rating', 'R'): report_dict['rating']['recall'],
            ('Rating', 'F1'): report_dict['rating']['f1-score'],
            ('User Experience', 'P'): report_dict['user experience']['precision'],
            ('User Experience', 'R'): report_dict['user experience']['recall'],
            ('User Experience', 'F1'): report_dict['user experience']['f1-score']
        }, ignore_index=True)

    metrics_df.columns = pd.MultiIndex.from_tuples([(c,) if isinstance(c, str) else c for c in metrics_df.columns])
    metrics_df.to_excel(f"{results_dir}/fold_metrics.xlsx", index=True)

    test_encodings = tokenize_function(X_test_full.tolist())
    test_dataset = MultiLabelDataset(test_encodings, y_test_full)
    test_trainer = Trainer(model=best_model)
    test_predictions = test_trainer.predict(test_dataset)
    test_pred_probs = torch.sigmoid(torch.tensor(test_predictions.predictions)).numpy()
    threshold = 0.5
    test_binary_predictions = (test_pred_probs > threshold).astype(int)

    test_true_labels = test_predictions.label_ids
    test_f1 = f1_score(test_true_labels, test_binary_predictions, average='micro')

    test_report_dict = classification_report(test_true_labels, test_binary_predictions, output_dict=True, zero_division=0, target_names=['bug report', 'user experience', 'rating', 'feature request'])
    # avg_metrics = report_dict['weighted avg']  # Use 'macro avg' or 'weighted avg' based on your preference
    # Append the metrics for this fold to the DataFrame
    test_metrics_df = pd.DataFrame()

    test_metrics_df = test_metrics_df.append({
            ('F1', ''): test_f1,
            ('Bug Report', 'P'): test_report_dict['bug report']['precision'],
            ('Bug Report', 'R'): test_report_dict['bug report']['recall'],
            ('Bug Report', 'F1'): test_report_dict['bug report']['f1-score'],
            ('Feature Request', 'P'): test_report_dict['feature request']['precision'],
            ('Feature Request', 'R'): test_report_dict['feature request']['recall'],
            ('Feature Request', 'F1'): test_report_dict['feature request']['f1-score'],
            ('Rating', 'P'): test_report_dict['rating']['precision'],
            ('Rating', 'R'): test_report_dict['rating']['recall'],
            ('Rating', 'F1'): test_report_dict['rating']['f1-score'],
            ('User Experience', 'P'): test_report_dict['user experience']['precision'],
            ('User Experience', 'R'): test_report_dict['user experience']['recall'],
            ('User Experience', 'F1'): test_report_dict['user experience']['f1-score']
        }, ignore_index=True)

    test_metrics_df.columns = pd.MultiIndex.from_tuples([(c,) if isinstance(c, str) else c for c in test_metrics_df.columns])
    test_metrics_df.to_excel(f"{results_dir}/metrics_results_full_test.xlsx", index=True)

    print(f"Test F1: {test_f1}")

    # Generate and print the classification report
    print(test_report_dict)

    shutil.rmtree(dump_dir)

class MultiLabelDataset(Dataset):

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float32)  # Ensure float32 for BCEWithLogitsLoss
        return item

    def __len__(self):
        return len(self.labels)
class LossLoggingCallback(TrainerCallback):
    """A custom callback to log training and validation loss."""
    def __init__(self):
        super().__init__()
        self.log_history = []
        self.log_train_loss_history = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        # This method captures both training and evaluation logs, so it's more general than on_epoch_end
        if logs is not None:
            # Capture both training and evaluation steps
            if 'loss' in logs:  # Indicates a training step
                self.log_train_loss_history.append({
                    'epoch': state.epoch,
                    'training_loss': logs.get('loss'),
                })
            elif 'eval_loss' in logs:  # Indicates an evaluation step
                # Make sure to capture the last training loss as well
                last_training_loss = self.log_train_loss_history[-1]['training_loss'] if self.log_train_loss_history else None
                self.log_history.append({
                    'epoch': state.epoch,
                    'training_loss': last_training_loss,  # Include last known training loss for reference
                    'validation_loss': logs.get('eval_loss'),
                    'eval_runtime':logs.get('eval_runtime')
                })

    def save_logs_to_excel(self, file_name):
        """Save the recorded logs to a Excel file."""
        pd.DataFrame(self.log_history).to_excel(file_name, index=False)

__file__ = "/content/drive/MyDrive/FinalProject/Models/RoBERTa/RoBERTa.ipynb"
current_file_path = Path(__file__).parent
path_to_project = current_file_path.parents[1]

directory_path_multi_label = path_to_project / 'Data' / 'Datasets' / 'Multi-label'

files_multi_label = [(file.name, file.stat().st_size)
               for file in directory_path_multi_label.iterdir()
               if file.is_file() and not file.name.startswith('.')]

files_multi_label.sort(key=lambda x: x[1])

In [4]:
print(f"Now doing: {files_multi_label[1][0].split('.')[0]}")
multi_main_model(files_multi_label[1][0].split('.')[0], files_multi_label[1][0].split('.')[1])

Now doing: dataset_gpt_multi_label_4000


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Fold 1/5


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.6072,0.493197,0.772052,0.685136,0.726002
2,0.3314,0.250265,0.981462,0.838171,0.904175
3,0.1922,0.192962,0.99311,0.887423,0.937297
4,0.1574,0.189824,0.984481,0.8927,0.936347
5,0.1328,0.17755,0.988327,0.89358,0.938568


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.6237,0.484546,0.821965,0.61346,0.702569
2,0.3797,0.259125,0.992769,0.829163,0.90362
3,0.2228,0.19165,0.987619,0.894737,0.938886
4,0.1793,0.176265,0.990521,0.901639,0.943993
5,0.1488,0.161239,0.985942,0.907679,0.945193


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Multi-label/dataset_gpt_multi_label_4000/Dump/res/checkpoint-1280 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.6165,0.478471,0.819398,0.636915,0.716724
2,0.349,0.257383,0.91831,0.847487,0.881478
3,0.2492,0.180932,0.990385,0.892548,0.938924
4,0.1791,0.174286,0.98482,0.89948,0.940217
5,0.1548,0.182016,0.97026,0.904679,0.936323


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Multi-label/dataset_gpt_multi_label_4000/Dump/res/checkpoint-1280 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.6136,0.516153,0.747485,0.625421,0.681027
2,0.3495,0.273543,0.951315,0.822391,0.882167
3,0.2262,0.192204,0.972801,0.903199,0.936709
4,0.172,0.17486,0.98,0.907407,0.942308
5,0.1375,0.178667,0.976534,0.910774,0.942509


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Multi-label/dataset_gpt_multi_label_4000/Dump/res/checkpoint-1280 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.6248,0.501887,0.754579,0.677075,0.713729
2,0.3698,0.27658,0.985119,0.815941,0.892584
3,0.2212,0.200876,0.986351,0.890715,0.936097
4,0.1655,0.18229,0.98741,0.902219,0.942894
5,0.1339,0.183941,0.976148,0.90797,0.940826


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Multi-label/dataset_gpt_multi_label_4000/Dump/res/checkpoint-1280 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


  test_metrics_df = test_metrics_df.append({


Test F1: 0.9316725978647686
{'bug report': {'precision': 1.0, 'recall': 0.9513888888888888, 'f1-score': 0.9750889679715302, 'support': 288}, 'user experience': {'precision': 0.9770318021201413, 'recall': 0.9388794567062818, 'f1-score': 0.9575757575757575, 'support': 589}, 'rating': {'precision': 0.9657142857142857, 'recall': 0.6575875486381323, 'f1-score': 0.7824074074074074, 'support': 257}, 'feature request': {'precision': 0.963076923076923, 'recall': 0.9315476190476191, 'f1-score': 0.9470499243570348, 'support': 336}, 'micro avg': {'precision': 0.9768656716417911, 'recall': 0.8904761904761904, 'f1-score': 0.9316725978647686, 'support': 1470}, 'macro avg': {'precision': 0.9764557527278375, 'recall': 0.8698508783202304, 'f1-score': 0.9155305143279325, 'support': 1470}, 'weighted avg': {'precision': 0.9763633666878783, 'recall': 0.8904761904761904, 'f1-score': 0.9279763416840744, 'support': 1470}, 'samples avg': {'precision': 0.8810416666666666, 'recall': 0.8252083333333333, 'f1-score'

In [5]:
print(f"Now doing: {files_multi_label[2][0].split('.')[0]}")
multi_main_model(files_multi_label[2][0].split('.')[0], files_multi_label[2][0].split('.')[1])

Now doing: dataset_gpt_multi_label_8000
Fold 1/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.4427,0.204325,0.989275,0.86978,0.925687
2,0.1709,0.133817,0.99564,0.922766,0.957819
3,0.1266,0.121994,0.998073,0.9304,0.963049
4,0.1121,0.110343,0.998081,0.933992,0.964973
5,0.0988,0.114882,0.991441,0.936237,0.963048


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.4553,0.200509,0.988542,0.872644,0.926984
2,0.1563,0.153874,0.977262,0.928736,0.952381
3,0.1288,0.127809,0.989701,0.927816,0.95776
4,0.1104,0.120126,0.990201,0.929195,0.958729
5,0.0872,0.12185,0.984563,0.938391,0.960923


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Multi-label/dataset_gpt_multi_label_8000/Dump/res/checkpoint-2560 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.4522,0.214876,0.995063,0.847664,0.915468
2,0.1764,0.147628,0.998976,0.911682,0.953335
3,0.1372,0.139167,0.996945,0.914953,0.954191
4,0.1236,0.131509,0.994934,0.917757,0.954789
5,0.108,0.125602,0.991457,0.921963,0.955448


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Multi-label/dataset_gpt_multi_label_8000/Dump/res/checkpoint-2560 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.4606,0.202547,0.995659,0.859485,0.922574
2,0.1864,0.147229,0.987903,0.918033,0.951687
3,0.1349,0.131091,0.992944,0.922717,0.956543
4,0.1148,0.122753,0.979902,0.9363,0.957605
5,0.091,0.109293,0.98963,0.938642,0.963462


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Multi-label/dataset_gpt_multi_label_8000/Dump/res/checkpoint-2560 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.4621,0.21296,0.979224,0.850024,0.910061
2,0.179,0.142793,0.993326,0.918367,0.954377
3,0.1262,0.130781,0.996914,0.919791,0.956801
4,0.1161,0.13205,0.987836,0.925012,0.955392
5,0.1026,0.136749,0.982872,0.925961,0.953568


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Multi-label/dataset_gpt_multi_label_8000/Dump/res/checkpoint-2560 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


  test_metrics_df = test_metrics_df.append({


Test F1: 0.9621684867394695
{'bug report': {'precision': 1.0, 'recall': 0.9663366336633663, 'f1-score': 0.9828801611278952, 'support': 505}, 'user experience': {'precision': 0.997131931166348, 'recall': 0.953382084095064, 'f1-score': 0.9747663551401868, 'support': 1094}, 'rating': {'precision': 0.9892183288409704, 'recall': 0.7858672376873662, 'f1-score': 0.8758949880668259, 'support': 467}, 'feature request': {'precision': 0.9982456140350877, 'recall': 0.969335604770017, 'f1-score': 0.9835782195332756, 'support': 587}, 'micro avg': {'precision': 0.9967676767676767, 'recall': 0.9298906897851489, 'f1-score': 0.9621684867394695, 'support': 2653}, 'macro avg': {'precision': 0.9961489685106015, 'recall': 0.9187303900539534, 'f1-score': 0.9542799309670459, 'support': 2653}, 'weighted avg': {'precision': 0.9965312731637069, 'recall': 0.9298906897851489, 'f1-score': 0.9608564825428542, 'support': 2653}, 'samples avg': {'precision': 0.8660416666666666, 'recall': 0.8311458333333335, 'f1-score':

In [6]:
print(f"Now doing: {files_multi_label[3][0].split('.')[0]}")
multi_main_model(files_multi_label[3][0].split('.')[0], files_multi_label[3][0].split('.')[1])

Now doing: dataset_gpt_multi_label_16000
Fold 1/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.2962,0.133154,0.968395,0.937991,0.952951
2,0.1097,0.097482,0.99255,0.94113,0.966156
3,0.0891,0.0725,0.996451,0.954997,0.975284
4,0.0619,0.054692,0.997844,0.968864,0.983141
5,0.042,0.052534,0.993054,0.972527,0.982683


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.2846,0.122388,0.998304,0.922173,0.958729
2,0.0927,0.094533,1.0,0.938626,0.968342
3,0.0782,0.084709,0.995883,0.947506,0.971092
4,0.0561,0.073816,0.988701,0.959781,0.974026
5,0.0393,0.072761,0.990599,0.963176,0.976695


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Multi-label/dataset_gpt_multi_label_16000/Dump/res/checkpoint-5120 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.2994,0.118921,0.992808,0.927151,0.958857
2,0.1122,0.118741,0.975642,0.941617,0.958328
3,0.0956,0.099421,0.990484,0.9411,0.965161
4,0.0812,0.076735,0.994624,0.955825,0.974839
5,0.0609,0.073108,0.992248,0.958925,0.975302


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Multi-label/dataset_gpt_multi_label_16000/Dump/res/checkpoint-5120 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.2941,0.116192,0.994224,0.933385,0.962845
2,0.1145,0.096422,0.995896,0.93984,0.967056
3,0.0838,0.076661,0.99355,0.954557,0.973663
4,0.0674,0.072229,0.990415,0.960496,0.975226
5,0.0529,0.066968,0.991501,0.963852,0.977481


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Multi-label/dataset_gpt_multi_label_16000/Dump/res/checkpoint-5120 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.2798,0.131679,0.981687,0.929341,0.954797
2,0.1064,0.104019,0.997206,0.937484,0.966423
3,0.0871,0.092153,0.994467,0.944313,0.968742
4,0.0668,0.070491,0.989437,0.959548,0.974263
5,0.046,0.071937,0.989755,0.964276,0.976849


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/RoBERTa/Output/Multi-label/dataset_gpt_multi_label_16000/Dump/res/checkpoint-5120 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


  test_metrics_df = test_metrics_df.append({


Test F1: 0.9793326157158235
{'bug report': {'precision': 0.9988674971687429, 'recall': 0.9767441860465116, 'f1-score': 0.9876819708846585, 'support': 903}, 'user experience': {'precision': 0.9995007488766849, 'recall': 0.9713731198447355, 'f1-score': 0.9852362204724409, 'support': 2061}, 'rating': {'precision': 0.9970014992503748, 'recall': 0.8902275769745649, 'f1-score': 0.9405940594059405, 'support': 747}, 'feature request': {'precision': 0.998003992015968, 'recall': 0.9765625, 'f1-score': 0.9871668311944719, 'support': 1024}, 'micro avg': {'precision': 0.9986827661909989, 'recall': 0.9607180570221753, 'f1-score': 0.9793326157158235, 'support': 4735}, 'macro avg': {'precision': 0.9983434343279427, 'recall': 0.9537268457164529, 'f1-score': 0.975169770489378, 'support': 4735}, 'weighted avg': {'precision': 0.9986620065771075, 'recall': 0.9607180570221753, 'f1-score': 0.9790773532464465, 'support': 4735}, 'samples avg': {'precision': 0.84359375, 'recall': 0.8265885416666666, 'f1-score':