## Multi-class classification using ELECTRA

In [12]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
pip install accelerate -U

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from transformers import ElectraTokenizer, ElectraForSequenceClassification, TrainingArguments, Trainer
import torch
import numpy as np
from sklearn.metrics import classification_report, accuracy_score
from transformers import TrainerCallback
import os
import shutil
import re
import time
from pathlib import Path
# [5e-05, 0.135161130336292, 5, 13, 13]

def main_model(file_name, ext, type):

    path_type = "Balanced" if type == 1 else "Unbalanced"

    current_file_path = Path(__file__).parent
    path_to_project = current_file_path.parents[1]

    df = pd.read_excel(f"{path_to_project}/Data/Datasets/{path_type}/{file_name}.{ext}")

    results_dir = f"{path_to_project}/Models/ELECTRA/Output/{path_type}/{file_name}"
    dump_dir = results_dir+"/Dump"

    if os.path.isdir(results_dir):
        shutil.rmtree(results_dir)

    os.mkdir(results_dir)
    os.mkdir(dump_dir)

    df = df[df['review'].notna() & (df['review'] != '')]
    # Select the text and label columns
    df['review'] = df['review'].str.replace('[^\x20-\x7E]', '', regex=True)
    X = df['review'].values
    y = df['label'].values

    X_train_CV, X_test_full, y_train_CV, y_test_full = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


    # Encode the labels to a numeric format
    label_encoder = LabelEncoder()
    y_train_CV_encoded = label_encoder.fit_transform(y_train_CV)
    y_test_full_encoded = label_encoder.transform(y_test_full)

    # Initialize the tokenizer for RoBERTa
    tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')

    # Tokenization function
    def tokenize_function(texts):
        return tokenizer(texts, padding="max_length", truncation=True, max_length=128)

    loss_logging_callback = LossLoggingCallback()

    # Stratified K-Fold Cross-Validation
    n_splits = 5
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Variables to accumulate scores
    best_accuracy = 0
    best_model = None
    accuracy_scores = []
    metrics_df = pd.DataFrame()


    for fold, (train_index, val_index) in enumerate(kf.split(X_train_CV, y_train_CV_encoded)):
        print(f"Fold {fold+1}/{n_splits}")
        start_time = time.time()
        # Split the data
        X_train, X_val = X_train_CV[train_index], X_train_CV[val_index]
        y_train, y_val = y_train_CV_encoded[train_index], y_train_CV_encoded[val_index]


        # Tokenize the data
        train_encodings = tokenize_function(X_train.tolist())
        val_encodings = tokenize_function(X_val.tolist())

        # Create dataset objects
        train_dataset = ReviewDataset(train_encodings, y_train)
        val_dataset = ReviewDataset(val_encodings, y_val)

        # Initialize the model for each fold
        model = ElectraForSequenceClassification.from_pretrained('google/electra-base-discriminator', num_labels=len(label_encoder.classes_))

        # Define training arguments for each fold, adjust hyperparameters as needed
        training_args = TrainingArguments(
            output_dir=f"{dump_dir}/res",
            num_train_epochs=5,
            per_device_train_batch_size=13,
            per_device_eval_batch_size=13,
            warmup_steps=500,
            weight_decay=0.135161130336292,
            logging_dir=f"{dump_dir}/logs",
            logging_strategy="epoch",
            evaluation_strategy="epoch",
            learning_rate=5e-05,
            max_grad_norm=1.0,
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            save_strategy="epoch",
            save_total_limit=2,
            lr_scheduler_type='linear'
        )

        # Initialize Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=lambda p: {"accuracy": accuracy_score(p.predictions.argmax(-1), p.label_ids)},
            callbacks=[loss_logging_callback]
        )

        # Train
        trainer.train()

        loss_logging_callback.save_logs_to_excel(f"{results_dir}/fold_loss.xlsx")

        # Evaluate
        results = trainer.evaluate()
        accuracy_scores.append(results['eval_accuracy'])

        if results['eval_accuracy'] > best_accuracy:
            best_accuracy = results['eval_accuracy']
            best_model = model  # Assign the best model

        # Get predictions and true labels
        predictions = trainer.predict(val_dataset)
        pred_labels = np.argmax(predictions.predictions, axis=-1)
        true_labels = y_val

        # Calculate accuracy
        accuracy = accuracy_score(true_labels, pred_labels)
        label_names = label_encoder.inverse_transform(range(len(label_encoder.classes_)))

        # Calculate precision, recall, and F1-score
        report_dict = classification_report(true_labels, pred_labels, output_dict=True, zero_division=0, target_names=label_names)
        # avg_metrics = report_dict['weighted avg']  # Use 'macro avg' or 'weighted avg' based on your preference
        end_time = time.time()
        # Append the metrics for this fold to the DataFrame
        metrics_df = metrics_df.append({
            ('Fold', ''): fold + 1,
            ('Accuracy', ''): accuracy,
            ('Train Time', ''): str(end_time - start_time)+" s",
            ('Bug Report', 'P'): report_dict['bug report']['precision'],
            ('Bug Report', 'R'): report_dict['bug report']['recall'],
            ('Bug Report', 'F1'): report_dict['bug report']['f1-score'],
            ('Feature Request', 'P'): report_dict['feature request']['precision'],
            ('Feature Request', 'R'): report_dict['feature request']['recall'],
            ('Feature Request', 'F1'): report_dict['feature request']['f1-score'],
            ('Rating', 'P'): report_dict['rating']['precision'],
            ('Rating', 'R'): report_dict['rating']['recall'],
            ('Rating', 'F1'): report_dict['rating']['f1-score'],
            ('User Experience', 'P'): report_dict['user experience']['precision'],
            ('User Experience', 'R'): report_dict['user experience']['recall'],
            ('User Experience', 'F1'): report_dict['user experience']['f1-score']
        }, ignore_index=True)

    # Save the DataFrame to a CSV file after completing all folds
    metrics_df.columns = pd.MultiIndex.from_tuples([(c,) if isinstance(c, str) else c for c in metrics_df.columns])
    metrics_df.to_excel(f"{results_dir}/fold_metrics.xlsx", index=True)

    # Evaluate the best model on the test set
    test_encodings = tokenize_function(X_test_full.tolist())
    test_dataset = ReviewDataset(test_encodings, y_test_full_encoded)
    test_trainer = Trainer(model=best_model)
    test_results = test_trainer.predict(test_dataset)
    test_predictions = np.argmax(test_results.predictions, axis=-1)
    test_accuracy = accuracy_score(y_test_full_encoded, test_predictions)

    label_names_full = label_encoder.inverse_transform(range(len(label_encoder.classes_)))

    # Calculate precision, recall, and F1-score
    report_dict_full = classification_report(y_test_full_encoded, test_predictions, output_dict=True, zero_division=0, target_names=label_names_full)
    print(report_dict_full)
    full_metrics_df = pd.DataFrame()

    full_metrics_df = full_metrics_df.append({
            ('Accuracy', ''): test_accuracy,
            ('Bug Report', 'P'): report_dict_full['bug report']['precision'],
            ('Bug Report', 'R'): report_dict_full['bug report']['recall'],
            ('Bug Report', 'F1'): report_dict_full['bug report']['f1-score'],
            ('Feature Request', 'P'): report_dict_full['feature request']['precision'],
            ('Feature Request', 'R'): report_dict_full['feature request']['recall'],
            ('Feature Request', 'F1'): report_dict_full['feature request']['f1-score'],
            ('Rating', 'P'): report_dict_full['rating']['precision'],
            ('Rating', 'R'): report_dict_full['rating']['recall'],
            ('Rating', 'F1'): report_dict_full['rating']['f1-score'],
            ('User Experience', 'P'): report_dict_full['user experience']['precision'],
            ('User Experience', 'R'): report_dict_full['user experience']['recall'],
            ('User Experience', 'F1'): report_dict_full['user experience']['f1-score']
        }, ignore_index=True)

    full_metrics_df.columns = pd.MultiIndex.from_tuples([(c,) if isinstance(c, str) else c for c in full_metrics_df.columns])
    full_metrics_df.to_excel(f"{results_dir}/metrics_results_full_test.xlsx", index=True)

    print(f"Test Accuracy: {test_accuracy}")

    # Generate and print the classification report
    print(classification_report(y_test_full_encoded, test_predictions, target_names=label_encoder.classes_, zero_division=0))

    shutil.rmtree(dump_dir)

# Custom dataset class
class ReviewDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

class LossLoggingCallback(TrainerCallback):
    """A custom callback to log training and validation loss."""
    def __init__(self):
        super().__init__()
        self.log_history = []
        self.log_train_loss_history = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        # This method captures both training and evaluation logs, so it's more general than on_epoch_end
        if logs is not None:
            # Capture both training and evaluation steps
            if 'loss' in logs:  # Indicates a training step
                self.log_train_loss_history.append({
                    'epoch': state.epoch,
                    'training_loss': logs.get('loss'),
                })
            elif 'eval_loss' in logs:  # Indicates an evaluation step
                # Make sure to capture the last training loss as well
                last_training_loss = self.log_train_loss_history[-1]['training_loss'] if self.log_train_loss_history else None
                self.log_history.append({
                    'epoch': state.epoch,
                    'training_loss': last_training_loss,  # Include last known training loss for reference
                    'validation_loss': logs.get('eval_loss'),
                    'eval_runtime':logs.get('eval_runtime')
                })

    def save_logs_to_excel(self, file_name):
        """Save the recorded logs to a Excel file."""
        pd.DataFrame(self.log_history).to_excel(file_name, index=False)

__file__ = "/content/drive/MyDrive/FinalProject/Models/ELECTRA/ELECTRA.ipynb"
current_file_path = Path(__file__).parent
path_to_project = current_file_path.parents[1]

directory_path_multi = path_to_project / 'Data' / 'Datasets' / 'Balanced'

files_multi = [(file.name, file.stat().st_size)
               for file in directory_path_multi.iterdir()
               if file.is_file() and not file.name.startswith('.')]

files_multi.sort(key=lambda x: x[1])





Running each dataset on the model separately due to storage constraints

In [None]:
print(f"Now doing: {files_multi[1][0].split('.')[0]}")
main_model(files_multi[1][0].split('.')[0], files_multi[1][0].split('.')[1], 1)
print(f"Now doing: {files_multi[2][0].split('.')[0]}")
main_model(files_multi[2][0].split('.')[0], files_multi[2][0].split('.')[1], 1)

Now doing: dataset_balanced_4000


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

Fold 1/5


pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2641,0.931262,0.665625
2,0.8049,0.731725,0.71875
3,0.5964,0.635211,0.785937
4,0.368,0.75719,0.776563
5,0.1638,0.79477,0.796875


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2837,0.937463,0.660937
2,0.786,0.770115,0.714063
3,0.6015,0.689276,0.746875
4,0.3568,0.845167,0.76875
5,0.1804,0.94774,0.76875


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Balanced/dataset_balanced_4000/Dump/res/checkpoint-985 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2825,0.954132,0.665625
2,0.8082,0.728378,0.748437
3,0.5742,0.793369,0.6875
4,0.3305,0.863912,0.767188
5,0.1719,0.963478,0.76875


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Balanced/dataset_balanced_4000/Dump/res/checkpoint-985 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2841,0.910351,0.696875
2,0.7837,0.710084,0.742188
3,0.5609,0.755914,0.7375
4,0.3513,0.926712,0.7375
5,0.1737,1.024713,0.75


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Balanced/dataset_balanced_4000/Dump/res/checkpoint-985 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2752,0.913003,0.701562
2,0.7974,0.848625,0.7
3,0.5949,0.803305,0.734375
4,0.3635,0.847295,0.76875
5,0.1894,0.884359,0.782813


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Balanced/dataset_balanced_4000/Dump/res/checkpoint-985 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


{'bug report': {'precision': 0.8291457286432161, 'recall': 0.825, 'f1-score': 0.8270676691729322, 'support': 200}, 'feature request': {'precision': 0.7727272727272727, 'recall': 0.85, 'f1-score': 0.8095238095238095, 'support': 200}, 'rating': {'precision': 0.7833333333333333, 'recall': 0.705, 'f1-score': 0.7421052631578948, 'support': 200}, 'user experience': {'precision': 0.6965174129353234, 'recall': 0.7, 'f1-score': 0.6982543640897755, 'support': 200}, 'accuracy': 0.77, 'macro avg': {'precision': 0.7704309369097864, 'recall': 0.77, 'f1-score': 0.769237776486103, 'support': 800}, 'weighted avg': {'precision': 0.7704309369097864, 'recall': 0.77, 'f1-score': 0.7692377764861029, 'support': 800}}
Test Accuracy: 0.77
                 precision    recall  f1-score   support

     bug report       0.83      0.82      0.83       200
feature request       0.77      0.85      0.81       200
         rating       0.78      0.70      0.74       200
user experience       0.70      0.70      0.70 

  full_metrics_df = full_metrics_df.append({


Now doing: dataset_balanced_8000
Fold 1/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0408,0.810023,0.698438
2,0.6073,0.564191,0.808594
3,0.3256,0.464299,0.860156
4,0.159,0.54495,0.875
5,0.0778,0.576757,0.88125


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0687,0.761495,0.721094
2,0.6185,0.603637,0.799219
3,0.3462,0.59584,0.846094
4,0.1926,0.596771,0.8625
5,0.0981,0.602818,0.871094


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Balanced/dataset_balanced_8000/Dump/res/checkpoint-1970 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0686,0.753059,0.717187
2,0.5995,0.598113,0.796094
3,0.3308,0.507877,0.863281
4,0.172,0.521583,0.885938
5,0.0845,0.527466,0.889062


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Balanced/dataset_balanced_8000/Dump/res/checkpoint-1970 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0687,0.699973,0.740625
2,0.6242,0.615373,0.785937
3,0.3395,0.562533,0.863281
4,0.1853,0.529581,0.885938
5,0.0827,0.551732,0.890625


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Balanced/dataset_balanced_8000/Dump/res/checkpoint-1970 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0612,0.721894,0.746875
2,0.6358,0.663521,0.789062
3,0.3527,0.539904,0.853906
4,0.1747,0.578044,0.878125
5,0.0906,0.547317,0.89375


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Balanced/dataset_balanced_8000/Dump/res/checkpoint-1970 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


{'bug report': {'precision': 0.9082125603864735, 'recall': 0.94, 'f1-score': 0.9238329238329238, 'support': 400}, 'feature request': {'precision': 0.905, 'recall': 0.905, 'f1-score': 0.905, 'support': 400}, 'rating': {'precision': 0.9093264248704663, 'recall': 0.8775, 'f1-score': 0.8931297709923663, 'support': 400}, 'user experience': {'precision': 0.8575, 'recall': 0.8575, 'f1-score': 0.8575, 'support': 400}, 'accuracy': 0.895, 'macro avg': {'precision': 0.8950097463142349, 'recall': 0.895, 'f1-score': 0.8948656737063225, 'support': 1600}, 'weighted avg': {'precision': 0.8950097463142349, 'recall': 0.895, 'f1-score': 0.8948656737063225, 'support': 1600}}
Test Accuracy: 0.895
                 precision    recall  f1-score   support

     bug report       0.91      0.94      0.92       400
feature request       0.91      0.91      0.91       400
         rating       0.91      0.88      0.89       400
user experience       0.86      0.86      0.86       400

       accuracy             

  full_metrics_df = full_metrics_df.append({


In [None]:
print(f"Now doing: {files_multi[3][0].split('.')[0]}")
main_model(files_multi[3][0].split('.')[0], files_multi[3][0].split('.')[1], 1)
print(f"Now doing: {files_multi[4][0].split('.')[0]}")
main_model(files_multi[4][0].split('.')[0], files_multi[4][0].split('.')[1], 1)

Now doing: dataset_gpt_balanced_4000
Fold 1/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.3627,1.165973,0.45
2,0.6897,0.329568,0.914062
3,0.309,0.242556,0.942187
4,0.234,0.271135,0.939063
5,0.1927,0.186608,0.953125


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.3713,1.221511,0.485938
2,0.6166,0.674775,0.821875
3,0.2553,0.276518,0.940625
4,0.2032,0.238706,0.945312
5,0.1761,0.27136,0.942187


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Balanced/dataset_gpt_balanced_4000/Dump/res/checkpoint-985 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.3503,1.131753,0.564063
2,0.6074,0.260498,0.940625
3,0.2844,0.215962,0.957812
4,0.2157,0.206285,0.957812
5,0.195,0.198513,0.957812


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Balanced/dataset_gpt_balanced_4000/Dump/res/checkpoint-985 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.3699,1.259069,0.492188
2,0.6044,0.253765,0.939063
3,0.3058,0.302571,0.935937
4,0.2211,0.184294,0.95625
5,0.1929,0.202911,0.95


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Balanced/dataset_gpt_balanced_4000/Dump/res/checkpoint-985 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.3668,1.255395,0.446009
2,0.6422,0.397323,0.904538
3,0.3347,0.471449,0.888889
4,0.3036,0.305421,0.929577
5,0.2686,0.264732,0.938967


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Balanced/dataset_gpt_balanced_4000/Dump/res/checkpoint-985 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


{'bug report': {'precision': 1.0, 'recall': 0.94, 'f1-score': 0.9690721649484536, 'support': 200}, 'feature request': {'precision': 0.9846938775510204, 'recall': 0.965, 'f1-score': 0.9747474747474748, 'support': 200}, 'rating': {'precision': 0.9545454545454546, 'recall': 0.945, 'f1-score': 0.949748743718593, 'support': 200}, 'user experience': {'precision': 0.9128440366972477, 'recall': 0.995, 'f1-score': 0.9521531100478469, 'support': 200}, 'accuracy': 0.96125, 'macro avg': {'precision': 0.9630208421984306, 'recall': 0.9612499999999999, 'f1-score': 0.9614303733655921, 'support': 800}, 'weighted avg': {'precision': 0.9630208421984306, 'recall': 0.96125, 'f1-score': 0.961430373365592, 'support': 800}}
Test Accuracy: 0.96125
                 precision    recall  f1-score   support

     bug report       1.00      0.94      0.97       200
feature request       0.98      0.96      0.97       200
         rating       0.95      0.94      0.95       200
user experience       0.91      0.99  

  full_metrics_df = full_metrics_df.append({


Now doing: dataset_gpt_balanced_8000
Fold 1/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9512,0.202127,0.953125
2,0.1742,0.123243,0.976562
3,0.136,0.153415,0.970313
4,0.1173,0.143609,0.971875
5,0.1045,0.132055,0.974219


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9436,0.243963,0.95
2,0.2317,0.233348,0.95
3,0.2108,0.219348,0.953906
4,0.173,0.143179,0.971094
5,0.107,0.148024,0.967969


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Balanced/dataset_gpt_balanced_8000/Dump/res/checkpoint-1970 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.962,0.343204,0.921875
2,0.2079,0.113159,0.978125
3,0.134,0.107821,0.978906
4,0.1166,0.113139,0.979688
5,0.1036,0.102408,0.980469


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Balanced/dataset_gpt_balanced_8000/Dump/res/checkpoint-1970 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9212,0.260052,0.936719
2,0.1851,0.145233,0.971094
3,0.1252,0.140957,0.971875
4,0.1138,0.137918,0.96875
5,0.0979,0.110504,0.973437


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Balanced/dataset_gpt_balanced_8000/Dump/res/checkpoint-1970 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9189,0.256047,0.935887
2,0.1733,0.187253,0.967162
3,0.144,0.125035,0.97498
4,0.112,0.140384,0.97498
5,0.0918,0.132801,0.972635


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Balanced/dataset_gpt_balanced_8000/Dump/res/checkpoint-1970 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


{'bug report': {'precision': 0.9897172236503856, 'recall': 0.9625, 'f1-score': 0.9759188846641318, 'support': 400}, 'feature request': {'precision': 0.9974619289340102, 'recall': 0.9825, 'f1-score': 0.9899244332493703, 'support': 400}, 'rating': {'precision': 0.9775561097256857, 'recall': 0.98, 'f1-score': 0.978776529338327, 'support': 400}, 'user experience': {'precision': 0.9543269230769231, 'recall': 0.9925, 'f1-score': 0.9730392156862746, 'support': 400}, 'accuracy': 0.979375, 'macro avg': {'precision': 0.9797655463467512, 'recall': 0.979375, 'f1-score': 0.9794147657345259, 'support': 1600}, 'weighted avg': {'precision': 0.9797655463467511, 'recall': 0.979375, 'f1-score': 0.9794147657345259, 'support': 1600}}
Test Accuracy: 0.979375
                 precision    recall  f1-score   support

     bug report       0.99      0.96      0.98       400
feature request       1.00      0.98      0.99       400
         rating       0.98      0.98      0.98       400
user experience       0.

  full_metrics_df = full_metrics_df.append({


In [None]:
print(f"Now doing: {files_multi[5][0].split('.')[0]}")
main_model(files_multi[5][0].split('.')[0], files_multi[5][0].split('.')[1], 1)

Now doing: dataset_gpt_balanced_20000
Fold 1/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4412,0.083262,0.985
2,0.0834,0.065835,0.987187
3,0.0621,0.097049,0.98375
4,0.0539,0.063919,0.987812
5,0.0462,0.067694,0.98875


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4216,0.094919,0.984688
2,0.0771,0.080336,0.986875
3,0.0675,0.069888,0.9875
4,0.0569,0.060708,0.988125
5,0.0511,0.070779,0.988125


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Balanced/dataset_gpt_balanced_20000/Dump/res/checkpoint-4925 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4301,0.189776,0.964063
2,0.0775,0.070711,0.989375
3,0.0661,0.140714,0.96
4,0.059,0.057676,0.990625
5,0.046,0.061193,0.989688


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Balanced/dataset_gpt_balanced_20000/Dump/res/checkpoint-4925 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4132,0.064307,0.989688
2,0.0906,0.074056,0.987812
3,0.076,0.054644,0.99125
4,0.066,0.043858,0.9925
5,0.057,0.04539,0.992188


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Balanced/dataset_gpt_balanced_20000/Dump/res/checkpoint-4925 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4198,0.09106,0.982495
2,0.075,0.082893,0.984058
3,0.0614,0.073411,0.988121
4,0.0539,0.069624,0.987496
5,0.0423,0.071269,0.986558


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Balanced/dataset_gpt_balanced_20000/Dump/res/checkpoint-4925 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


{'bug report': {'precision': 0.9989868287740629, 'recall': 0.986, 'f1-score': 0.992450931051837, 'support': 1000}, 'feature request': {'precision': 1.0, 'recall': 0.99, 'f1-score': 0.9949748743718593, 'support': 1000}, 'rating': {'precision': 0.9919759277833501, 'recall': 0.989, 'f1-score': 0.9904857285928894, 'support': 1000}, 'user experience': {'precision': 0.9746588693957114, 'recall': 1.0, 'f1-score': 0.9871668311944718, 'support': 1000}, 'accuracy': 0.99125, 'macro avg': {'precision': 0.9914054064882811, 'recall': 0.99125, 'f1-score': 0.9912695913027643, 'support': 4000}, 'weighted avg': {'precision': 0.9914054064882811, 'recall': 0.99125, 'f1-score': 0.9912695913027643, 'support': 4000}}
Test Accuracy: 0.99125
                 precision    recall  f1-score   support

     bug report       1.00      0.99      0.99      1000
feature request       1.00      0.99      0.99      1000
         rating       0.99      0.99      0.99      1000
user experience       0.97      1.00      0.

  full_metrics_df = full_metrics_df.append({


In [4]:
print(f"Now doing: {files_multi[6][0].split('.')[0]}")
main_model(files_multi[6][0].split('.')[0], files_multi[6][0].split('.')[1], 1)

Now doing: dataset_gpt_balanced_32000


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

Fold 1/5


pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2793,0.088106,0.984375
2,0.0641,0.061234,0.989844
3,0.0483,0.043499,0.991797
4,0.0403,0.035809,0.992578
5,0.0325,0.038069,0.993164


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2686,0.057705,0.990234
2,0.0685,0.057927,0.989648
3,0.0532,0.043438,0.992578
4,0.0453,0.03566,0.99375
5,0.0391,0.03089,0.994531


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Balanced/dataset_gpt_balanced_32000/Dump/res/checkpoint-7880 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2887,0.126815,0.975195
2,0.0725,0.063021,0.989453
3,0.0532,0.056331,0.99043
4,0.0446,0.043543,0.991797
5,0.028,0.044042,0.991797


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Balanced/dataset_gpt_balanced_32000/Dump/res/checkpoint-7880 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.273,0.07339,0.988477
2,0.0582,0.043373,0.992188
3,0.0445,0.06256,0.991602
4,0.0382,0.066576,0.990625
5,0.0312,0.046963,0.992188


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Balanced/dataset_gpt_balanced_32000/Dump/res/checkpoint-7880 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2781,0.064412,0.991209
2,0.062,0.050653,0.992772
3,0.0436,0.046889,0.992967
4,0.0403,0.041409,0.992967
5,0.0303,0.036905,0.993944


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Balanced/dataset_gpt_balanced_32000/Dump/res/checkpoint-7880 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


{'bug report': {'precision': 0.9993706733794839, 'recall': 0.9925, 'f1-score': 0.9959234869865162, 'support': 1600}, 'feature request': {'precision': 1.0, 'recall': 0.9925, 'f1-score': 0.9962358845671266, 'support': 1600}, 'rating': {'precision': 0.9931120851596744, 'recall': 0.99125, 'f1-score': 0.9921801689083515, 'support': 1600}, 'user experience': {'precision': 0.984009840098401, 'recall': 1.0, 'f1-score': 0.9919404835709857, 'support': 1600}, 'accuracy': 0.9940625, 'macro avg': {'precision': 0.9941231496593899, 'recall': 0.9940625000000001, 'f1-score': 0.994070006008245, 'support': 6400}, 'weighted avg': {'precision': 0.9941231496593899, 'recall': 0.9940625, 'f1-score': 0.9940700060082451, 'support': 6400}}
Test Accuracy: 0.9940625
                 precision    recall  f1-score   support

     bug report       1.00      0.99      1.00      1600
feature request       1.00      0.99      1.00      1600
         rating       0.99      0.99      0.99      1600
user experience       0

  full_metrics_df = full_metrics_df.append({


In [5]:
directory_path_unbalanced = path_to_project / 'Data' / 'Datasets' / 'Unbalanced'

files_unbalanced = [(file.name, file.stat().st_size)
               for file in directory_path_unbalanced.iterdir()
               if file.is_file() and not file.name.startswith('.')]

files_unbalanced.sort(key=lambda x: x[1])

In [6]:
print(f"Now doing: {files_unbalanced[0][0].split('.')[0]}")
main_model(files_unbalanced[0][0].split('.')[0], files_unbalanced[0][0].split('.')[1], 2)
print(f"Now doing: {files_unbalanced[1][0].split('.')[0]}")
main_model(files_unbalanced[1][0].split('.')[0], files_unbalanced[1][0].split('.')[1], 2)

Now doing: dataset_unbalanced_4000
Fold 1/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.232,0.9259,0.679688
2,0.7772,0.58883,0.79375
3,0.5394,0.613534,0.79375
4,0.2767,0.645255,0.83125
5,0.1301,0.685296,0.84375


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2475,0.973273,0.635938
2,0.7218,0.825489,0.721875
3,0.4971,0.81122,0.748437
4,0.2874,0.823042,0.7875
5,0.1254,0.897053,0.81875


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Unbalanced/dataset_unbalanced_4000/Dump/res/checkpoint-985 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2468,0.927407,0.646875
2,0.7865,0.640841,0.765625
3,0.5118,0.538854,0.815625
4,0.2868,0.660182,0.817187
5,0.1365,0.712613,0.835938


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Unbalanced/dataset_unbalanced_4000/Dump/res/checkpoint-985 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2405,0.882363,0.7125
2,0.7902,0.610852,0.782813
3,0.5194,0.635199,0.784375
4,0.2521,0.641488,0.842187
5,0.1014,0.705836,0.84375


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Unbalanced/dataset_unbalanced_4000/Dump/res/checkpoint-985 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2452,0.94827,0.684375
2,0.7671,0.689071,0.754687
3,0.5131,0.701725,0.775
4,0.2778,0.857548,0.784375
5,0.104,0.925903,0.790625


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Unbalanced/dataset_unbalanced_4000/Dump/res/checkpoint-985 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


{'bug report': {'precision': 0.8503937007874016, 'recall': 0.8605577689243028, 'f1-score': 0.8554455445544554, 'support': 251}, 'feature request': {'precision': 0.8424657534246576, 'recall': 0.8723404255319149, 'f1-score': 0.8571428571428571, 'support': 282}, 'rating': {'precision': 0.8813559322033898, 'recall': 0.7027027027027027, 'f1-score': 0.7819548872180451, 'support': 148}, 'user experience': {'precision': 0.6764705882352942, 'recall': 0.773109243697479, 'f1-score': 0.7215686274509803, 'support': 119}, 'accuracy': 0.8225, 'macro avg': {'precision': 0.8126714936626858, 'recall': 0.8021775352140998, 'f1-score': 0.8040279790915845, 'support': 800}, 'weighted avg': {'precision': 0.8274560491618662, 'recall': 0.8225, 'f1-score': 0.8225338842154892, 'support': 800}}
Test Accuracy: 0.8225
                 precision    recall  f1-score   support

     bug report       0.85      0.86      0.86       251
feature request       0.84      0.87      0.86       282
         rating       0.88   

  full_metrics_df = full_metrics_df.append({


Now doing: dataset_gpt_unbalanced_4000
Fold 1/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2961,1.109917,0.567187
2,0.5235,0.08646,0.98125
3,0.0959,0.080812,0.9875
4,0.0539,0.069469,0.989062
5,0.0439,0.049602,0.992188


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.279,1.044288,0.615625
2,0.5625,0.1969,0.95
3,0.1218,0.105322,0.982812
4,0.0618,0.064372,0.989062
5,0.0411,0.059948,0.989062


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Unbalanced/dataset_gpt_unbalanced_4000/Dump/res/checkpoint-985 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2856,1.03251,0.651563
2,0.5635,0.199986,0.959375
3,0.1425,0.086206,0.984375
4,0.0883,0.12745,0.98125
5,0.0668,0.013335,0.996875


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Unbalanced/dataset_gpt_unbalanced_4000/Dump/res/checkpoint-985 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2916,1.055521,0.645312
2,0.6083,0.180158,0.95625
3,0.1262,0.070354,0.985938
4,0.0807,0.090199,0.984375
5,0.0456,0.105791,0.982812


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Unbalanced/dataset_gpt_unbalanced_4000/Dump/res/checkpoint-985 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2858,1.005161,0.667188
2,0.5001,0.239002,0.959375
3,0.0916,0.172834,0.973437
4,0.0455,0.112512,0.982812
5,0.0267,0.115472,0.982812


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Unbalanced/dataset_gpt_unbalanced_4000/Dump/res/checkpoint-985 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


{'bug report': {'precision': 0.98989898989899, 'recall': 0.98, 'f1-score': 0.9849246231155778, 'support': 100}, 'feature request': {'precision': 0.9932885906040269, 'recall': 0.9866666666666667, 'f1-score': 0.9899665551839465, 'support': 150}, 'rating': {'precision': 0.9959514170040485, 'recall': 0.984, 'f1-score': 0.9899396378269618, 'support': 250}, 'user experience': {'precision': 0.980327868852459, 'recall': 0.9966666666666667, 'f1-score': 0.9884297520661156, 'support': 300}, 'accuracy': 0.98875, 'macro avg': {'precision': 0.9898667165898811, 'recall': 0.9868333333333333, 'f1-score': 0.9883151420481505, 'support': 800}, 'weighted avg': {'precision': 0.9888367531090662, 'recall': 0.98875, 'f1-score': 0.9887516008321562, 'support': 800}}
Test Accuracy: 0.98875
                 precision    recall  f1-score   support

     bug report       0.99      0.98      0.98       100
feature request       0.99      0.99      0.99       150
         rating       1.00      0.98      0.99       25

  full_metrics_df = full_metrics_df.append({


In [7]:
print(f"Now doing: {files_unbalanced[2][0].split('.')[0]}")
main_model(files_unbalanced[2][0].split('.')[0], files_unbalanced[2][0].split('.')[1], 2)

Now doing: dataset_gpt_unbalanced_8000
Fold 1/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.8856,0.192889,0.960156
2,0.1002,0.071206,0.9875
3,0.0682,0.055661,0.991406
4,0.0402,0.045501,0.992188
5,0.032,0.044189,0.992969


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9065,0.169752,0.96875
2,0.1076,0.073145,0.986719
3,0.052,0.086382,0.984375
4,0.033,0.061338,0.991406
5,0.0262,0.060797,0.990625


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Unbalanced/dataset_gpt_unbalanced_8000/Dump/res/checkpoint-1970 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9403,0.111115,0.975781
2,0.0857,0.116509,0.979688
3,0.0527,0.045519,0.992969
4,0.038,0.038643,0.99375
5,0.0288,0.040124,0.99375


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Unbalanced/dataset_gpt_unbalanced_8000/Dump/res/checkpoint-1970 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.8889,0.143528,0.975
2,0.1081,0.044565,0.992969
3,0.0448,0.045891,0.99375
4,0.0375,0.045667,0.992969
5,0.0314,0.040697,0.992969


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Unbalanced/dataset_gpt_unbalanced_8000/Dump/res/checkpoint-1970 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9601,0.159222,0.969507
2,0.101,0.060379,0.9914
3,0.0493,0.047026,0.993745
4,0.0359,0.050152,0.992181
5,0.024,0.055655,0.9914


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Unbalanced/dataset_gpt_unbalanced_8000/Dump/res/checkpoint-1970 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


{'bug report': {'precision': 0.9849246231155779, 'recall': 0.98, 'f1-score': 0.9824561403508771, 'support': 200}, 'feature request': {'precision': 0.9966555183946488, 'recall': 0.9933333333333333, 'f1-score': 0.9949916527545909, 'support': 300}, 'rating': {'precision': 0.9979838709677419, 'recall': 0.99, 'f1-score': 0.9939759036144579, 'support': 500}, 'user experience': {'precision': 0.9834983498349835, 'recall': 0.9933333333333333, 'f1-score': 0.9883913764510779, 'support': 600}, 'accuracy': 0.990625, 'macro avg': {'precision': 0.9907655905782381, 'recall': 0.9891666666666665, 'f1-score': 0.989953768292751, 'support': 1600}, 'weighted avg': {'precision': 0.9906703284539822, 'recall': 0.990625, 'f1-score': 0.9906321884840176, 'support': 1600}}
Test Accuracy: 0.990625
                 precision    recall  f1-score   support

     bug report       0.98      0.98      0.98       200
feature request       1.00      0.99      0.99       300
         rating       1.00      0.99      0.99   

  full_metrics_df = full_metrics_df.append({


In [8]:
print(f"Now doing: {files_unbalanced[3][0].split('.')[0]}")
main_model(files_unbalanced[3][0].split('.')[0], files_unbalanced[3][0].split('.')[1], 2)

Now doing: dataset_gpt_unbalanced_16000
Fold 1/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4979,0.071143,0.989453
2,0.0528,0.064746,0.991016
3,0.0388,0.064255,0.991406
4,0.0314,0.055914,0.992188
5,0.0264,0.057325,0.992188


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5532,0.065621,0.989062
2,0.0618,0.087037,0.986328
3,0.0443,0.099833,0.985547
4,0.0342,0.065515,0.991797
5,0.0268,0.05391,0.992969


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Unbalanced/dataset_gpt_unbalanced_16000/Dump/res/checkpoint-3940 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4726,0.133023,0.977734
2,0.0806,0.036278,0.994531
3,0.0481,0.038682,0.994531
4,0.0329,0.045131,0.992969
5,0.0189,0.032536,0.994922


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Unbalanced/dataset_gpt_unbalanced_16000/Dump/res/checkpoint-3940 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5236,0.083968,0.987109
2,0.0653,0.036928,0.994141
3,0.0416,0.027822,0.995313
4,0.0354,0.024075,0.995703
5,0.0244,0.02679,0.995313


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Unbalanced/dataset_gpt_unbalanced_16000/Dump/res/checkpoint-3940 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5149,0.101965,0.981633
2,0.0558,0.050255,0.992966
3,0.0457,0.036748,0.994529
4,0.0392,0.028362,0.994529
5,0.0249,0.049673,0.992575


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Unbalanced/dataset_gpt_unbalanced_16000/Dump/res/checkpoint-3940 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


{'bug report': {'precision': 1.0, 'recall': 0.985, 'f1-score': 0.9924433249370278, 'support': 400}, 'feature request': {'precision': 1.0, 'recall': 0.9966666666666667, 'f1-score': 0.9983305509181971, 'support': 600}, 'rating': {'precision': 0.996, 'recall': 0.996, 'f1-score': 0.996, 'support': 1000}, 'user experience': {'precision': 0.9917218543046358, 'recall': 0.9983333333333333, 'f1-score': 0.9950166112956811, 'support': 1200}, 'accuracy': 0.995625, 'macro avg': {'precision': 0.9969304635761589, 'recall': 0.994, 'f1-score': 0.9954476217877266, 'support': 3200}, 'weighted avg': {'precision': 0.9956456953642384, 'recall': 0.995625, 'f1-score': 0.9956236231501708, 'support': 3200}}
Test Accuracy: 0.995625
                 precision    recall  f1-score   support

     bug report       1.00      0.98      0.99       400
feature request       1.00      1.00      1.00       600
         rating       1.00      1.00      1.00      1000
user experience       0.99      1.00      1.00      1200

  full_metrics_df = full_metrics_df.append({


## Multi-label classification using ELECTRA

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer
import torch
from torch.utils.data import Dataset
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report
from transformers import TrainerCallback
import os
import shutil
import re
import time
from pathlib import Path

def multi_main_model(file_name, ext):

    current_file_path = Path(__file__).parent

    path_to_project = current_file_path.parents[1]

    df = pd.read_csv(f"{path_to_project}/Data/Datasets/Multi-label/{file_name}.{ext}")

    results_dir = f"{path_to_project}/Models/ELECTRA/Output/Multi-label/{file_name}"
    dump_dir = results_dir+"/Dump"

    if os.path.isdir(results_dir):
        shutil.rmtree(results_dir)

    os.mkdir(results_dir)
    os.mkdir(dump_dir)

    df = df[df['review'].notna() & (df['review'] != '')]
    df['review'] = df['review'].str.replace('[^\x20-\x7E]', '', regex=True)

    X = df['review'].values
    y = df[['bug report', 'user experience', 'rating', 'feature request']].values

    X_train_CV, X_test_full, y_train_CV, y_test_full = train_test_split(X, y, test_size=0.2, random_state=42)

    tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')

    def tokenize_function(examples):
        return tokenizer(examples, padding="max_length", truncation=True, max_length=128)

    loss_logging_callback = LossLoggingCallback()

    # K-Fold Cross-Validation
    n_splits = 5
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Variables to accumulate scores
    best_f1 = 0
    best_model = None
    metrics_df = pd.DataFrame()

    for fold, (train_index, val_index) in enumerate(kf.split(X_train_CV, y_train_CV)):
        print(f"Fold {fold+1}/{n_splits}")
        start_time = time.time()

        X_train, X_val = X_train_CV[train_index], X_train_CV[val_index]
        y_train, y_val = y_train_CV[train_index], y_train_CV[val_index]

        train_encodings = tokenize_function(X_train.tolist())
        val_encodings = tokenize_function(X_val.tolist())

        train_dataset = MultiLabelDataset(train_encodings, y_train)
        val_dataset = MultiLabelDataset(val_encodings, y_val)

        model = ElectraForSequenceClassification.from_pretrained('google/electra-base-discriminator', num_labels=4, problem_type="multi_label_classification")

        training_args = TrainingArguments(
            output_dir=f"{dump_dir}/res",
            num_train_epochs=5,
            per_device_train_batch_size=13,
            per_device_eval_batch_size=13,
            warmup_steps=500,
            weight_decay=0.135161130336292,
            logging_dir=f"{dump_dir}/logs",
            logging_strategy="epoch",
            evaluation_strategy="epoch",
            learning_rate=5e-05,
            max_grad_norm=1.0,
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            save_strategy="epoch",
            save_total_limit=2,
            lr_scheduler_type='linear'
        )

        def compute_metrics(p):
            predictions, labels = p
            predictions = torch.sigmoid(torch.tensor(predictions)).numpy()
            threshold = 0.5
            predictions = (predictions > threshold).astype(int)
            precision = precision_score(labels, predictions, average='micro')
            recall = recall_score(labels, predictions, average='micro')
            f1 = f1_score(labels, predictions, average='micro')
            return {'precision': precision, 'recall': recall, 'f1': f1}


        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics,
            callbacks=[loss_logging_callback]
        )

        trainer.train()

        loss_logging_callback.save_logs_to_excel(f"{results_dir}/fold_loss.xlsx")

        results = trainer.evaluate()

        if results['eval_f1'] > best_f1:
            best_f1 = results['eval_f1']
            best_model = model


        predictions = trainer.predict(val_dataset)
        pred_probs = torch.sigmoid(torch.tensor(predictions.predictions)).numpy()
        threshold = 0.5
        binary_predictions = (pred_probs > threshold).astype(int)

        # True labels
        true_labels = predictions.label_ids
        f1 = f1_score(true_labels, binary_predictions, average='micro')

        report_dict = classification_report(true_labels, binary_predictions, output_dict=True, zero_division=0, target_names=['bug report', 'user experience', 'rating', 'feature request'])
        # avg_metrics = report_dict['weighted avg']  # Use 'macro avg' or 'weighted avg' based on your preference
        end_time = time.time()
        # Append the metrics for this fold to the DataFrame
        metrics_df = metrics_df.append({
            ('Fold', ''): fold + 1,
            ('F1-Score', ''): f1,
            ('Train Time', ''): str(end_time - start_time)+" s",
            ('Bug Report', 'P'): report_dict['bug report']['precision'],
            ('Bug Report', 'R'): report_dict['bug report']['recall'],
            ('Bug Report', 'F1'): report_dict['bug report']['f1-score'],
            ('Feature Request', 'P'): report_dict['feature request']['precision'],
            ('Feature Request', 'R'): report_dict['feature request']['recall'],
            ('Feature Request', 'F1'): report_dict['feature request']['f1-score'],
            ('Rating', 'P'): report_dict['rating']['precision'],
            ('Rating', 'R'): report_dict['rating']['recall'],
            ('Rating', 'F1'): report_dict['rating']['f1-score'],
            ('User Experience', 'P'): report_dict['user experience']['precision'],
            ('User Experience', 'R'): report_dict['user experience']['recall'],
            ('User Experience', 'F1'): report_dict['user experience']['f1-score']
        }, ignore_index=True)

    metrics_df.columns = pd.MultiIndex.from_tuples([(c,) if isinstance(c, str) else c for c in metrics_df.columns])
    metrics_df.to_excel(f"{results_dir}/fold_metrics.xlsx", index=True)

    test_encodings = tokenize_function(X_test_full.tolist())
    test_dataset = MultiLabelDataset(test_encodings, y_test_full)
    test_trainer = Trainer(model=best_model)
    test_predictions = test_trainer.predict(test_dataset)
    test_pred_probs = torch.sigmoid(torch.tensor(test_predictions.predictions)).numpy()
    threshold = 0.5
    test_binary_predictions = (test_pred_probs > threshold).astype(int)

    test_true_labels = test_predictions.label_ids
    test_f1 = f1_score(test_true_labels, test_binary_predictions, average='micro')

    test_report_dict = classification_report(test_true_labels, test_binary_predictions, output_dict=True, zero_division=0, target_names=['bug report', 'user experience', 'rating', 'feature request'])
    # avg_metrics = report_dict['weighted avg']  # Use 'macro avg' or 'weighted avg' based on your preference
    # Append the metrics for this fold to the DataFrame
    test_metrics_df = pd.DataFrame()

    test_metrics_df = test_metrics_df.append({
            ('F1', ''): test_f1,
            ('Bug Report', 'P'): test_report_dict['bug report']['precision'],
            ('Bug Report', 'R'): test_report_dict['bug report']['recall'],
            ('Bug Report', 'F1'): test_report_dict['bug report']['f1-score'],
            ('Feature Request', 'P'): test_report_dict['feature request']['precision'],
            ('Feature Request', 'R'): test_report_dict['feature request']['recall'],
            ('Feature Request', 'F1'): test_report_dict['feature request']['f1-score'],
            ('Rating', 'P'): test_report_dict['rating']['precision'],
            ('Rating', 'R'): test_report_dict['rating']['recall'],
            ('Rating', 'F1'): test_report_dict['rating']['f1-score'],
            ('User Experience', 'P'): test_report_dict['user experience']['precision'],
            ('User Experience', 'R'): test_report_dict['user experience']['recall'],
            ('User Experience', 'F1'): test_report_dict['user experience']['f1-score']
        }, ignore_index=True)

    test_metrics_df.columns = pd.MultiIndex.from_tuples([(c,) if isinstance(c, str) else c for c in test_metrics_df.columns])
    test_metrics_df.to_excel(f"{results_dir}/metrics_results_full_test.xlsx", index=True)

    print(f"Test F1: {test_f1}")

    # Generate and print the classification report
    print(test_report_dict)

    shutil.rmtree(dump_dir)

class MultiLabelDataset(Dataset):

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float32)  # Ensure float32 for BCEWithLogitsLoss
        return item

    def __len__(self):
        return len(self.labels)
class LossLoggingCallback(TrainerCallback):
    """A custom callback to log training and validation loss."""
    def __init__(self):
        super().__init__()
        self.log_history = []
        self.log_train_loss_history = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        # This method captures both training and evaluation logs, so it's more general than on_epoch_end
        if logs is not None:
            # Capture both training and evaluation steps
            if 'loss' in logs:  # Indicates a training step
                self.log_train_loss_history.append({
                    'epoch': state.epoch,
                    'training_loss': logs.get('loss'),
                })
            elif 'eval_loss' in logs:  # Indicates an evaluation step
                # Make sure to capture the last training loss as well
                last_training_loss = self.log_train_loss_history[-1]['training_loss'] if self.log_train_loss_history else None
                self.log_history.append({
                    'epoch': state.epoch,
                    'training_loss': last_training_loss,  # Include last known training loss for reference
                    'validation_loss': logs.get('eval_loss'),
                    'eval_runtime':logs.get('eval_runtime')
                })

    def save_logs_to_excel(self, file_name):
        """Save the recorded logs to a Excel file."""
        pd.DataFrame(self.log_history).to_excel(file_name, index=False)

__file__ = "/content/drive/MyDrive/FinalProject/Models/ELECTRA/ELECTRA.ipynb"
current_file_path = Path(__file__).parent
path_to_project = current_file_path.parents[1]

directory_path_multi_label = path_to_project / 'Data' / 'Datasets' / 'Multi-label'

files_multi_label = [(file.name, file.stat().st_size)
               for file in directory_path_multi_label.iterdir()
               if file.is_file() and not file.name.startswith('.')]

files_multi_label.sort(key=lambda x: x[1])

In [10]:
print(f"Now doing: {files_multi_label[1][0].split('.')[0]}")
multi_main_model(files_multi_label[1][0].split('.')[0], files_multi_label[1][0].split('.')[1])

Now doing: dataset_gpt_multi_label_4000
Fold 1/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.6534,0.582976,0.705948,0.55321,0.620316
2,0.4983,0.425053,0.827263,0.699208,0.757865
3,0.3242,0.297678,0.966551,0.737027,0.836327
4,0.2656,0.266205,0.998819,0.744063,0.852823
5,0.2467,0.262646,0.995316,0.747581,0.853842


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.6541,0.571552,0.769492,0.587575,0.666341
2,0.4804,0.395828,0.860317,0.701467,0.772814
3,0.3275,0.260638,0.99096,0.756687,0.858121
4,0.2537,0.283914,0.956897,0.766178,0.850982
5,0.236,0.260447,0.97062,0.769629,0.858518


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Multi-label/dataset_gpt_multi_label_4000/Dump/res/checkpoint-985 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.65,0.580677,0.734191,0.593588,0.656445
2,0.487,0.317482,0.98374,0.733969,0.840695
3,0.2988,0.257079,0.960385,0.777296,0.859195
4,0.2496,0.247903,0.986784,0.77643,0.869059
5,0.2321,0.244109,0.979235,0.77643,0.866119


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Multi-label/dataset_gpt_multi_label_4000/Dump/res/checkpoint-985 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.6499,0.606558,0.7122,0.574916,0.636237
2,0.4584,0.322108,0.940426,0.744108,0.830827
3,0.2934,0.282368,0.966738,0.758418,0.85
4,0.2549,0.252645,0.986885,0.760101,0.858773
5,0.2304,0.251185,0.976369,0.765152,0.857952


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Multi-label/dataset_gpt_multi_label_4000/Dump/res/checkpoint-985 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.6513,0.607845,0.71215,0.62613,0.666375
2,0.4715,0.372665,0.918675,0.751849,0.826932
3,0.2952,0.272045,0.993583,0.763353,0.863383
4,0.2472,0.257396,0.992585,0.769926,0.867191
5,0.2211,0.247315,0.987578,0.783895,0.874027


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Multi-label/dataset_gpt_multi_label_4000/Dump/res/checkpoint-985 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


  test_metrics_df = test_metrics_df.append({


Test F1: 0.8806026365348399
{'bug report': {'precision': 0.996309963099631, 'recall': 0.9375, 'f1-score': 0.966010733452594, 'support': 288}, 'user experience': {'precision': 0.9823321554770318, 'recall': 0.9439728353140917, 'f1-score': 0.9627705627705628, 'support': 589}, 'rating': {'precision': 1.0, 'recall': 0.12062256809338522, 'f1-score': 0.2152777777777778, 'support': 257}, 'feature request': {'precision': 0.9842271293375394, 'recall': 0.9285714285714286, 'f1-score': 0.9555895865237367, 'support': 336}, 'micro avg': {'precision': 0.9864978902953586, 'recall': 0.7952380952380952, 'f1-score': 0.8806026365348399, 'support': 1470}, 'macro avg': {'precision': 0.9907173119785506, 'recall': 0.7326667079947264, 'f1-score': 0.7749121651311679, 'support': 1470}, 'weighted avg': {'precision': 0.9885926696639991, 'recall': 0.7952380952380952, 'f1-score': 0.8310798929707979, 'support': 1470}, 'samples avg': {'precision': 0.8677083333333334, 'recall': 0.7525, 'f1-score': 0.7934642857142857, 's

In [11]:
print(f"Now doing: {files_multi_label[2][0].split('.')[0]}")
multi_main_model(files_multi_label[2][0].split('.')[0], files_multi_label[2][0].split('.')[1])
print(f"Now doing: {files_multi_label[3][0].split('.')[0]}")
multi_main_model(files_multi_label[3][0].split('.')[0], files_multi_label[3][0].split('.')[1])

Now doing: dataset_gpt_multi_label_8000
Fold 1/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.5555,0.329015,0.927575,0.776381,0.84527
2,0.2438,0.192972,0.993264,0.860799,0.9223
3,0.1769,0.171468,0.996399,0.86978,0.928794
4,0.1609,0.170048,0.997433,0.872474,0.930778
5,0.1521,0.163847,0.996921,0.872474,0.930556


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.5413,0.279965,0.949511,0.847356,0.89553
2,0.2123,0.190769,0.994667,0.857471,0.920988
3,0.1673,0.164393,0.993743,0.876322,0.931346
4,0.1277,0.122659,0.995522,0.92,0.956272
5,0.1008,0.124626,0.992563,0.92046,0.955153


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Multi-label/dataset_gpt_multi_label_8000/Dump/res/checkpoint-1970 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.5421,0.341868,0.965152,0.724766,0.827862
2,0.2319,0.180236,0.99946,0.865421,0.927623
3,0.1692,0.161372,0.995765,0.878972,0.93373
4,0.1396,0.115136,0.989563,0.930374,0.959056
5,0.0991,0.113246,0.989076,0.930841,0.959076


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Multi-label/dataset_gpt_multi_label_8000/Dump/res/checkpoint-1970 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.5444,0.299843,0.921107,0.842155,0.879863
2,0.2261,0.175955,0.98792,0.88103,0.931419
3,0.1667,0.156846,0.986068,0.895082,0.938375
4,0.132,0.105591,0.987129,0.933958,0.959807
5,0.0962,0.100914,0.991067,0.935363,0.96241


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Multi-label/dataset_gpt_multi_label_8000/Dump/res/checkpoint-1970 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.5416,0.306471,0.919958,0.829141,0.872192
2,0.2151,0.173811,0.998912,0.871381,0.930798
3,0.1586,0.126723,0.997419,0.916944,0.95549
4,0.1222,0.120698,0.995363,0.916944,0.954545
5,0.1035,0.128236,0.983342,0.924537,0.953033


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Multi-label/dataset_gpt_multi_label_8000/Dump/res/checkpoint-1970 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


  test_metrics_df = test_metrics_df.append({


Test F1: 0.9602173491170192
{'bug report': {'precision': 0.9938775510204082, 'recall': 0.9643564356435643, 'f1-score': 0.9788944723618089, 'support': 505}, 'user experience': {'precision': 0.9858757062146892, 'recall': 0.9570383912248629, 'f1-score': 0.9712430426716141, 'support': 1094}, 'rating': {'precision': 0.9893899204244032, 'recall': 0.7987152034261242, 'f1-score': 0.8838862559241707, 'support': 467}, 'feature request': {'precision': 0.9929947460595446, 'recall': 0.9659284497444633, 'f1-score': 0.9792746113989637, 'support': 587}, 'micro avg': {'precision': 0.9896, 'recall': 0.9325292122125896, 'f1-score': 0.9602173491170192, 'support': 2653}, 'macro avg': {'precision': 0.9905344809297613, 'recall': 0.9215096200097538, 'f1-score': 0.9533245955891394, 'support': 2653}, 'weighted avg': {'precision': 0.9895926101165945, 'recall': 0.9325292122125896, 'f1-score': 0.9590993877245529, 'support': 2653}, 'samples avg': {'precision': 0.8611458333333334, 'recall': 0.8324479166666667, 'f1-s

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.4045,0.183681,0.966004,0.877289,0.919512
2,0.1557,0.145918,0.996467,0.885662,0.937803
3,0.123,0.101883,0.998312,0.92831,0.962039
4,0.1084,0.101919,0.998874,0.92831,0.9623
5,0.0855,0.075389,0.998076,0.950026,0.973458


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.3453,0.149548,0.996758,0.883259,0.936583
2,0.1118,0.099131,0.996113,0.937059,0.965684
3,0.0797,0.088282,0.996963,0.943066,0.969266
4,0.0636,0.075853,0.986323,0.960564,0.973273
5,0.0427,0.064753,0.992465,0.963176,0.977601


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Multi-label/dataset_gpt_multi_label_16000/Dump/res/checkpoint-3940 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.3425,0.150756,0.995924,0.883751,0.936491
2,0.1139,0.099306,0.994799,0.938776,0.965976
3,0.0852,0.089601,0.996193,0.946267,0.970588
4,0.0733,0.081283,0.991922,0.951692,0.971391
5,0.0623,0.072289,0.995144,0.952984,0.973608


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Multi-label/dataset_gpt_multi_label_16000/Dump/res/checkpoint-3940 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.3475,0.145297,0.999709,0.885618,0.939211
2,0.1435,0.134904,0.976757,0.922282,0.948738
3,0.1005,0.092316,0.993153,0.936225,0.963849
4,0.0902,0.083936,0.995631,0.941389,0.96775
5,0.0755,0.078124,0.99566,0.947844,0.971164


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Multi-label/dataset_gpt_multi_label_16000/Dump/res/checkpoint-3940 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.3612,0.151996,0.994135,0.890465,0.939449
2,0.1335,0.110774,0.999717,0.927239,0.962115
3,0.1008,0.100266,0.998589,0.929603,0.962862
4,0.0794,0.072384,0.995625,0.956396,0.975616
5,0.0542,0.062248,0.990294,0.964802,0.977382


Checkpoint destination directory /content/drive/MyDrive/FinalProject/Models/ELECTRA/Output/Multi-label/dataset_gpt_multi_label_16000/Dump/res/checkpoint-3940 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  metrics_df = metrics_df.append({


  test_metrics_df = test_metrics_df.append({


Test F1: 0.9786187727175539
{'bug report': {'precision': 0.997737556561086, 'recall': 0.9767441860465116, 'f1-score': 0.987129266927812, 'support': 903}, 'user experience': {'precision': 0.9891572203055693, 'recall': 0.9737991266375546, 'f1-score': 0.9814180929095354, 'support': 2061}, 'rating': {'precision': 0.9827833572453372, 'recall': 0.9170013386880856, 'f1-score': 0.948753462603878, 'support': 747}, 'feature request': {'precision': 0.9940535183349851, 'recall': 0.9794921875, 'f1-score': 0.9867191342843089, 'support': 1024}, 'micro avg': {'precision': 0.9909071227538429, 'recall': 0.9666314677930307, 'f1-score': 0.9786187727175539, 'support': 4735}, 'macro avg': {'precision': 0.9909329131117445, 'recall': 0.961759209718038, 'f1-score': 0.9760049891813836, 'support': 4735}, 'weighted avg': {'precision': 0.9908468881228575, 'recall': 0.9666314677930307, 'f1-score': 0.9785004535574648, 'support': 4735}, 'samples avg': {'precision': 0.8426822916666665, 'recall': 0.831328125, 'f1-score