## Multi-class classification using ERNIE

In [None]:
# Mount Google Drive at /content/drive; later cells read the datasets and
# write the results under /content/drive/MyDrive/FinalProject.
# force_remount=True is passed so the mount is redone when the cell is re-run.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

In [None]:
# Install or upgrade (-U) the `accelerate` package in the notebook environment.
# NOTE(review): presumably required by the Hugging Face Trainer used below - confirm.
pip install accelerate -U

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
import numpy as np
from sklearn.metrics import classification_report, accuracy_score
from transformers import TrainerCallback
import os
import shutil
import re
import time
from pathlib import Path
# [1e-05, 0.01, 2, 32, 64]

def main_model(file_name, ext, type):
    """Fine-tune ERNIE 2.0 on one review dataset with stratified 5-fold CV.

    The dataset is read from
    ``<project>/Data/Datasets/<Balanced|Unbalanced>/<file_name>.<ext>`` and must
    contain a ``review`` (text) and a ``label`` column. 20% of the rows are held
    out as a test set; the remaining 80% are used for 5-fold cross-validation.
    The fold model with the highest validation accuracy is then evaluated on the
    held-out test set.

    Files written to ``<project>/Models/ERNIE/Output/<Balanced|Unbalanced>/<file_name>``
    (the directory is wiped first if it already exists):

    * ``fold_loss.xlsx`` - losses recorded by ``LossLoggingCallback``
    * ``fold_metrics.xlsx`` - accuracy, wall time and per-class P/R/F1 per fold
    * ``metrics_results_full_test.xlsx`` - the same metrics on the test set

    Args:
        file_name: Dataset file name without its extension.
        ext: Dataset file extension (the file is read with ``pandas.read_excel``).
        type: ``1`` selects the ``Balanced`` directory, anything else
            ``Unbalanced``. (The name shadows the builtin but is kept so that
            existing callers keep working.)

    Raises:
        KeyError: If the classification report lacks one of the four expected
            classes (bug report, feature request, rating, user experience).
    """
    path_type = "Balanced" if type == 1 else "Unbalanced"

    # ``__file__`` is read from module scope; in the notebook it has to be
    # assigned manually before this function is called.
    current_file_path = Path(__file__).parent
    path_to_project = current_file_path.parents[1]

    df = pd.read_excel(f"{path_to_project}/Data/Datasets/{path_type}/{file_name}.{ext}")

    results_dir = f"{path_to_project}/Models/ERNIE/Output/{path_type}/{file_name}"
    dump_dir = results_dir + "/Dump"

    # Start from a clean output directory. makedirs also creates any missing
    # parent directories (plain mkdir fails on the first run for a new dataset).
    if os.path.isdir(results_dir):
        shutil.rmtree(results_dir)
    os.makedirs(dump_dir)

    # Drop empty reviews. The explicit copy keeps the column assignment below
    # from operating on a view of the original frame.
    df = df[df['review'].notna() & (df['review'] != '')].copy()
    # Strip everything outside printable ASCII, then select text and labels.
    df['review'] = df['review'].str.replace('[^\x20-\x7E]', '', regex=True)
    X = df['review'].values
    y = df['label'].values

    X_train_CV, X_test_full, y_train_CV, y_test_full = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Encode the labels to a numeric format (fitted on the training part only).
    label_encoder = LabelEncoder()
    y_train_CV_encoded = label_encoder.fit_transform(y_train_CV)
    y_test_full_encoded = label_encoder.transform(y_test_full)
    label_names = label_encoder.inverse_transform(range(len(label_encoder.classes_)))

    # Initialize the tokenizer for ERNIE 2.0
    tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-2.0-base-en")

    def tokenize_function(texts):
        """Tokenize a list of strings, padded/truncated to 128 tokens."""
        return tokenizer(texts, padding="max_length", truncation=True, max_length=128)

    def compute_accuracy(eval_pred):
        """Accuracy metric handed to the Trainer."""
        predicted = eval_pred.predictions.argmax(-1)
        return {"accuracy": accuracy_score(eval_pred.label_ids, predicted)}

    # One callback instance is shared by all folds, so its history accumulates
    # and fold_loss.xlsx is rewritten after every fold with everything so far.
    loss_logging_callback = LossLoggingCallback()

    # Stratified K-Fold Cross-Validation
    n_splits = 5
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # -inf guarantees that the first fold always provides a model, even if its
    # validation accuracy is 0.
    best_accuracy = float("-inf")
    best_model = None
    fold_rows = []

    for fold, (train_index, val_index) in enumerate(kf.split(X_train_CV, y_train_CV_encoded)):
        print(f"Fold {fold+1}/{n_splits}")
        start_time = time.time()

        # Split the data
        X_train, X_val = X_train_CV[train_index], X_train_CV[val_index]
        y_train, y_val = y_train_CV_encoded[train_index], y_train_CV_encoded[val_index]

        # Tokenize the data
        train_encodings = tokenize_function(X_train.tolist())
        val_encodings = tokenize_function(X_val.tolist())

        # Create dataset objects
        train_dataset = ReviewDataset(train_encodings, y_train)
        val_dataset = ReviewDataset(val_encodings, y_val)

        # A fresh pretrained model per fold so that folds do not leak into each other.
        model = AutoModelForSequenceClassification.from_pretrained(
            "nghuyong/ernie-2.0-base-en", num_labels=len(label_encoder.classes_)
        )

        # Define training arguments for each fold, adjust hyperparameters as needed
        training_args = TrainingArguments(
            output_dir=f"{dump_dir}/res",
            num_train_epochs=5,
            per_device_train_batch_size=10,
            per_device_eval_batch_size=49,
            warmup_steps=500,
            weight_decay=0.07996589256970411,
            logging_dir=f"{dump_dir}/logs",
            logging_strategy="epoch",
            evaluation_strategy="epoch",
            learning_rate=3.592687488204789e-05,
            max_grad_norm=1.0,
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            save_strategy="epoch",
            save_total_limit=2,
            lr_scheduler_type='linear'
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_accuracy,
            callbacks=[loss_logging_callback]
        )

        trainer.train()

        loss_logging_callback.save_logs_to_excel(f"{results_dir}/fold_loss.xlsx")

        # Evaluate and keep the model of the best fold for the final test run.
        results = trainer.evaluate()
        if results['eval_accuracy'] > best_accuracy:
            best_accuracy = results['eval_accuracy']
            best_model = model

        # Get predictions and true labels
        predictions = trainer.predict(val_dataset)
        pred_labels = np.argmax(predictions.predictions, axis=-1)
        true_labels = y_val

        accuracy = accuracy_score(true_labels, pred_labels)
        report_dict = classification_report(
            true_labels, pred_labels, output_dict=True, zero_division=0, target_names=label_names
        )
        end_time = time.time()

        fold_rows.append({
            ('Fold', ''): fold + 1,
            ('Accuracy', ''): accuracy,
            ('Train Time', ''): str(end_time - start_time) + " s",
            **_per_class_metrics(report_dict),
        })

    # Save the per-fold metrics to an Excel file after completing all folds.
    # index=True is kept because the columns are a MultiIndex.
    _rows_to_frame(fold_rows).to_excel(f"{results_dir}/fold_metrics.xlsx", index=True)

    # Evaluate the best model on the test set
    test_encodings = tokenize_function(X_test_full.tolist())
    test_dataset = ReviewDataset(test_encodings, y_test_full_encoded)
    test_trainer = Trainer(model=best_model)
    test_results = test_trainer.predict(test_dataset)
    test_predictions = np.argmax(test_results.predictions, axis=-1)
    test_accuracy = accuracy_score(y_test_full_encoded, test_predictions)

    # Calculate precision, recall, and F1-score
    report_dict_full = classification_report(
        y_test_full_encoded, test_predictions, output_dict=True, zero_division=0, target_names=label_names
    )
    print(report_dict_full)

    full_row = {
        ('Accuracy', ''): test_accuracy,
        **_per_class_metrics(report_dict_full),
    }
    _rows_to_frame([full_row]).to_excel(f"{results_dir}/metrics_results_full_test.xlsx", index=True)

    print(f"Test Accuracy: {test_accuracy}")

    # Generate and print the classification report
    print(classification_report(y_test_full_encoded, test_predictions, target_names=label_encoder.classes_, zero_division=0))

    # The checkpoints and logs are only needed during training.
    shutil.rmtree(dump_dir)


def _per_class_metrics(report_dict):
    """Flatten a classification report into ``(class title, P/R/F1)`` keyed values."""
    class_columns = (
        ('Bug Report', 'bug report'),
        ('Feature Request', 'feature request'),
        ('Rating', 'rating'),
        ('User Experience', 'user experience'),
    )
    metric_columns = (('P', 'precision'), ('R', 'recall'), ('F1', 'f1-score'))
    return {
        (title, short): report_dict[label][metric]
        for title, label in class_columns
        for short, metric in metric_columns
    }


def _rows_to_frame(rows):
    """Build a DataFrame with two-level columns from dicts keyed by tuples.

    All rows must share the same keys in the same order; the keys of the first
    row define the column MultiIndex.
    """
    columns = pd.MultiIndex.from_tuples(list(rows[0]))
    return pd.DataFrame([list(row.values()) for row in rows], columns=columns)

# Custom dataset class
class ReviewDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-sample sequence and
    ``labels`` an indexable sequence of the same length; each item is a dict of
    tensors with the encoding fields plus a ``labels`` entry.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

class LossLoggingCallback(TrainerCallback):
    """Trainer callback that records training and validation losses from log events."""

    def __init__(self):
        super().__init__()
        # One entry per evaluation log: epoch, last seen training loss,
        # validation loss and evaluation runtime.
        self.log_history = []
        # One entry per training log: epoch and training loss.
        self.log_train_loss_history = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        # on_log receives both training and evaluation logs; they are told
        # apart by the keys they carry.
        if logs is None:
            return

        if 'loss' in logs:
            # Training log.
            train_record = {
                'epoch': state.epoch,
                'training_loss': logs.get('loss'),
            }
            self.log_train_loss_history.append(train_record)
            return

        if 'eval_loss' not in logs:
            return

        # Evaluation log: attach the most recent training loss, if any.
        if self.log_train_loss_history:
            previous_loss = self.log_train_loss_history[-1]['training_loss']
        else:
            previous_loss = None

        eval_record = {
            'epoch': state.epoch,
            'training_loss': previous_loss,
            'validation_loss': logs.get('eval_loss'),
            'eval_runtime': logs.get('eval_runtime'),
        }
        self.log_history.append(eval_record)

    def save_logs_to_excel(self, file_name):
        """Write the recorded evaluation entries to an Excel file."""
        frame = pd.DataFrame(self.log_history)
        frame.to_excel(file_name, index=False)

# __file__ is assigned by hand to the notebook's location on the mounted
# Drive; main_model() reads it to resolve the project root.
__file__ = "/content/drive/MyDrive/FinalProject/Models/ERNIE/ERNIE.ipynb"
current_file_path = Path(__file__).parent
path_to_project = current_file_path.parents[1]

directory_path_multi = path_to_project.joinpath('Data', 'Datasets', 'Balanced')

# (file name, size in bytes) for every non-hidden regular file, smallest first.
files_multi = sorted(
    (
        (entry.name, entry.stat().st_size)
        for entry in directory_path_multi.iterdir()
        if entry.is_file() and not entry.name.startswith('.')
    ),
    key=lambda name_and_size: name_and_size[1],
)





Running the model on each dataset separately due to storage constraints

In [4]:
# Train on the datasets at positions 1 and 2 of the size-sorted file list.
for position in (1, 2):
    name_parts = files_multi[position][0].split('.')
    print(f"Now doing: {name_parts[0]}")
    main_model(name_parts[0], name_parts[1], 1)

Now doing: dataset_balanced_4000


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/464 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Fold 1/5


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9734,0.872371,0.678125
2,0.6861,0.668687,0.757812
3,0.4905,0.672112,0.792188
4,0.2487,0.842687,0.803125
5,0.1139,0.969064,0.78125


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9895,0.728995,0.740625
2,0.6931,0.740818,0.746875
3,0.4781,0.757608,0.742188
4,0.2272,0.992024,0.75625
5,0.0969,1.165606,0.767188


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9854,0.74516,0.723437
2,0.6634,0.756357,0.709375
3,0.4596,0.799884,0.746875
4,0.2486,1.021074,0.760938
5,0.1143,1.17157,0.760938


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0018,0.738738,0.71875
2,0.641,0.874451,0.703125
3,0.4373,0.938832,0.720313
4,0.2042,1.093882,0.757812
5,0.093,1.239169,0.74375


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9971,0.653119,0.760938
2,0.6933,0.821538,0.714063
3,0.4737,0.694076,0.764062
4,0.2293,0.98935,0.778125
5,0.0942,1.05814,0.778125


  metrics_df = metrics_df.append({


{'bug report': {'precision': 0.7426160337552743, 'recall': 0.88, 'f1-score': 0.8054919908466819, 'support': 200}, 'feature request': {'precision': 0.7616580310880829, 'recall': 0.735, 'f1-score': 0.7480916030534353, 'support': 200}, 'rating': {'precision': 0.7888198757763976, 'recall': 0.635, 'f1-score': 0.703601108033241, 'support': 200}, 'user experience': {'precision': 0.6794258373205742, 'recall': 0.71, 'f1-score': 0.6943765281173594, 'support': 200}, 'accuracy': 0.74, 'macro avg': {'precision': 0.7431299444850823, 'recall': 0.74, 'f1-score': 0.7378903075126794, 'support': 800}, 'weighted avg': {'precision': 0.7431299444850822, 'recall': 0.74, 'f1-score': 0.7378903075126795, 'support': 800}}
Test Accuracy: 0.74
                 precision    recall  f1-score   support

     bug report       0.74      0.88      0.81       200
feature request       0.76      0.73      0.75       200
         rating       0.79      0.64      0.70       200
user experience       0.68      0.71      0.69

  full_metrics_df = full_metrics_df.append({


Now doing: dataset_balanced_8000
Fold 1/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.8684,0.657379,0.767188
2,0.5223,0.532269,0.809375
3,0.2677,0.640789,0.850781
4,0.1261,0.608108,0.8875
5,0.0518,0.631092,0.894531


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.8673,0.647065,0.76875
2,0.4994,0.544725,0.828906
3,0.2545,0.580071,0.871094
4,0.1084,0.633275,0.884375
5,0.0419,0.642106,0.889062


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.8634,0.767896,0.73125
2,0.5324,0.523709,0.828906
3,0.2713,0.520372,0.870313
4,0.1207,0.602805,0.885156
5,0.0542,0.608581,0.890625


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.863,0.625155,0.792188
2,0.5097,0.55117,0.842187
3,0.2423,0.622827,0.871094
4,0.1288,0.583187,0.892969
5,0.0626,0.558971,0.898438


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.872,0.658779,0.769531
2,0.5307,0.459666,0.850781
3,0.2633,0.474178,0.888281
4,0.1145,0.510199,0.903906
5,0.0486,0.523981,0.910937


  metrics_df = metrics_df.append({


{'bug report': {'precision': 0.9026128266033254, 'recall': 0.95, 'f1-score': 0.9257003654080389, 'support': 400}, 'feature request': {'precision': 0.9007444168734491, 'recall': 0.9075, 'f1-score': 0.904109589041096, 'support': 400}, 'rating': {'precision': 0.902200488997555, 'recall': 0.9225, 'f1-score': 0.9122373300370828, 'support': 400}, 'user experience': {'precision': 0.8991825613079019, 'recall': 0.825, 'f1-score': 0.8604954367666231, 'support': 400}, 'accuracy': 0.90125, 'macro avg': {'precision': 0.9011850734455578, 'recall': 0.9012499999999999, 'f1-score': 0.9006356803132102, 'support': 1600}, 'weighted avg': {'precision': 0.9011850734455578, 'recall': 0.90125, 'f1-score': 0.9006356803132103, 'support': 1600}}
Test Accuracy: 0.90125
                 precision    recall  f1-score   support

     bug report       0.90      0.95      0.93       400
feature request       0.90      0.91      0.90       400
         rating       0.90      0.92      0.91       400
user experience    

  full_metrics_df = full_metrics_df.append({


In [5]:
# Train on the datasets at positions 3 and 4 of the size-sorted file list.
for position in (3, 4):
    name_parts = files_multi[position][0].split('.')
    print(f"Now doing: {name_parts[0]}")
    main_model(name_parts[0], name_parts[1], 1)

Now doing: dataset_gpt_balanced_4000
Fold 1/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1888,0.617782,0.790625
2,0.3619,0.297733,0.935937
3,0.2629,0.250673,0.951562
4,0.2154,0.213353,0.953125
5,0.1523,0.219592,0.95


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1446,0.479405,0.865625
2,0.3612,0.56839,0.878125
3,0.2514,0.292677,0.939063
4,0.1943,0.294539,0.940625
5,0.1326,0.334869,0.939063


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1658,0.53306,0.825
2,0.3763,0.247012,0.953125
3,0.2608,0.219071,0.95625
4,0.2235,0.195024,0.959375
5,0.1602,0.233017,0.946875


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.166,0.648787,0.784375
2,0.3644,0.267665,0.942187
3,0.2569,0.344584,0.920312
4,0.203,0.275941,0.940625
5,0.1612,0.231939,0.953125


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1754,0.581321,0.812207
2,0.3744,0.291303,0.934272
3,0.2521,0.2577,0.945227
4,0.1932,0.259711,0.943662
5,0.1542,0.245549,0.949922


  metrics_df = metrics_df.append({


{'bug report': {'precision': 0.979381443298969, 'recall': 0.95, 'f1-score': 0.9644670050761421, 'support': 200}, 'feature request': {'precision': 0.9947916666666666, 'recall': 0.955, 'f1-score': 0.9744897959183673, 'support': 200}, 'rating': {'precision': 0.9597989949748744, 'recall': 0.955, 'f1-score': 0.9573934837092732, 'support': 200}, 'user experience': {'precision': 0.9162790697674419, 'recall': 0.985, 'f1-score': 0.9493975903614459, 'support': 200}, 'accuracy': 0.96125, 'macro avg': {'precision': 0.962562793676988, 'recall': 0.9612499999999999, 'f1-score': 0.9614369687663071, 'support': 800}, 'weighted avg': {'precision': 0.9625627936769879, 'recall': 0.96125, 'f1-score': 0.9614369687663072, 'support': 800}}
Test Accuracy: 0.96125
                 precision    recall  f1-score   support

     bug report       0.98      0.95      0.96       200
feature request       0.99      0.95      0.97       200
         rating       0.96      0.95      0.96       200
user experience       0

  full_metrics_df = full_metrics_df.append({


Now doing: dataset_gpt_balanced_8000
Fold 1/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7043,0.343019,0.934375
2,0.1872,0.139384,0.975
3,0.1306,0.123044,0.975781
4,0.1164,0.107928,0.977344
5,0.0895,0.120033,0.978125


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7201,0.26438,0.95
2,0.1902,0.247556,0.954688
3,0.1434,0.147445,0.973437
4,0.1252,0.137434,0.973437
5,0.0983,0.148003,0.970313


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7102,0.137163,0.975
2,0.1605,0.221214,0.963281
3,0.1358,0.187586,0.96875
4,0.1185,0.138619,0.976562
5,0.084,0.170339,0.972656


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7031,0.286728,0.942969
2,0.1766,0.151493,0.971094
3,0.1319,0.138008,0.972656
4,0.1148,0.11608,0.974219
5,0.0926,0.132796,0.973437


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6826,0.213205,0.962471
2,0.1656,0.207592,0.963253
3,0.1254,0.17109,0.971853
4,0.1133,0.144443,0.970289
5,0.0832,0.142981,0.971071


  metrics_df = metrics_df.append({


{'bug report': {'precision': 0.9922680412371134, 'recall': 0.9625, 'f1-score': 0.9771573604060914, 'support': 400}, 'feature request': {'precision': 1.0, 'recall': 0.98, 'f1-score': 0.98989898989899, 'support': 400}, 'rating': {'precision': 0.9799498746867168, 'recall': 0.9775, 'f1-score': 0.9787234042553192, 'support': 400}, 'user experience': {'precision': 0.9477434679334917, 'recall': 0.9975, 'f1-score': 0.971985383678441, 'support': 400}, 'accuracy': 0.979375, 'macro avg': {'precision': 0.9799903459643304, 'recall': 0.979375, 'f1-score': 0.9794412845597105, 'support': 1600}, 'weighted avg': {'precision': 0.9799903459643307, 'recall': 0.979375, 'f1-score': 0.9794412845597102, 'support': 1600}}
Test Accuracy: 0.979375
                 precision    recall  f1-score   support

     bug report       0.99      0.96      0.98       400
feature request       1.00      0.98      0.99       400
         rating       0.98      0.98      0.98       400
user experience       0.95      1.00     

  full_metrics_df = full_metrics_df.append({


In [7]:
# Train on the dataset at position 5 of the size-sorted file list.
name_parts = files_multi[5][0].split('.')
print(f"Now doing: {name_parts[0]}")
main_model(name_parts[0], name_parts[1], 1)

Now doing: dataset_gpt_balanced_20000
Fold 1/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3263,0.105641,0.982187
2,0.0696,0.0725,0.988125
3,0.0592,0.070755,0.988437
4,0.0526,0.065328,0.987812
5,0.0396,0.067753,0.988125


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3119,0.097456,0.985
2,0.0839,0.069631,0.989375
3,0.0658,0.061883,0.989375
4,0.0547,0.049732,0.989688
5,0.0448,0.062302,0.989688


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3038,0.073555,0.989062
2,0.079,0.062453,0.989062
3,0.074,0.063428,0.989688
4,0.0628,0.062838,0.989375
5,0.0495,0.04832,0.990313


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3098,0.071601,0.988437
2,0.0802,0.054698,0.990938
3,0.0649,0.051774,0.990938


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3098,0.071601,0.988437
2,0.0802,0.054698,0.990938
3,0.0649,0.051774,0.990938
4,0.0572,0.051639,0.992188
5,0.0457,0.052382,0.991875


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3309,0.086606,0.985933
2,0.0753,0.071269,0.986871
3,0.0665,0.076213,0.987809
4,0.0521,0.066323,0.988434
5,0.0401,0.075243,0.987496


  metrics_df = metrics_df.append({


{'bug report': {'precision': 0.9989878542510121, 'recall': 0.987, 'f1-score': 0.9929577464788732, 'support': 1000}, 'feature request': {'precision': 1.0, 'recall': 0.991, 'f1-score': 0.9954796584630838, 'support': 1000}, 'rating': {'precision': 0.992, 'recall': 0.992, 'f1-score': 0.992, 'support': 1000}, 'user experience': {'precision': 0.9774730656219393, 'recall': 0.998, 'f1-score': 0.987629886194953, 'support': 1000}, 'accuracy': 0.992, 'macro avg': {'precision': 0.9921152299682379, 'recall': 0.992, 'f1-score': 0.9920168227842275, 'support': 4000}, 'weighted avg': {'precision': 0.9921152299682379, 'recall': 0.992, 'f1-score': 0.9920168227842275, 'support': 4000}}
Test Accuracy: 0.992
                 precision    recall  f1-score   support

     bug report       1.00      0.99      0.99      1000
feature request       1.00      0.99      1.00      1000
         rating       0.99      0.99      0.99      1000
user experience       0.98      1.00      0.99      1000

       accuracy  

  full_metrics_df = full_metrics_df.append({


In [8]:
# Run the balanced (type 1) pipeline on the seventh dataset of the size-sorted listing.
# The file name is split once into "<stem>.<extension>" parts and reused.
selected_parts = files_multi[6][0].split('.')
print(f"Now doing: {selected_parts[0]}")
main_model(selected_parts[0], selected_parts[1], 1)

Now doing: dataset_gpt_balanced_32000
Fold 1/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2035,0.071254,0.99043
2,0.0537,0.053818,0.991406
3,0.0453,0.040816,0.992188
4,0.0409,0.051745,0.99043
5,0.0361,0.04325,0.992969


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2068,0.055431,0.991406
2,0.0562,0.052944,0.991406
3,0.042,0.03217,0.99375
4,0.0339,0.035222,0.99375
5,0.0241,0.046731,0.992773


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1978,0.054246,0.991211
2,0.0512,0.042471,0.991992
3,0.0418,0.038735,0.992383
4,0.0352,0.044861,0.991211
5,0.0274,0.04703,0.992773


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2014,0.050885,0.990234
2,0.0581,0.054209,0.991016
3,0.0528,0.048329,0.993359
4,0.0412,0.043807,0.993945
5,0.0331,0.030086,0.993945


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2139,0.0548,0.9916
2,0.0558,0.044293,0.992577
3,0.0415,0.042649,0.992772
4,0.0403,0.040241,0.993163
5,0.0338,0.042012,0.993163


  metrics_df = metrics_df.append({


{'bug report': {'precision': 1.0, 'recall': 0.9925, 'f1-score': 0.9962358845671266, 'support': 1600}, 'feature request': {'precision': 0.9993710691823899, 'recall': 0.993125, 'f1-score': 0.9962382445141066, 'support': 1600}, 'rating': {'precision': 0.9931292941911305, 'recall': 0.99375, 'f1-score': 0.9934395501405809, 'support': 1600}, 'user experience': {'precision': 0.987045033929673, 'recall': 1.0, 'f1-score': 0.9934802856255821, 'support': 1600}, 'accuracy': 0.99484375, 'macro avg': {'precision': 0.9948863493257983, 'recall': 0.99484375, 'f1-score': 0.9948484912118492, 'support': 6400}, 'weighted avg': {'precision': 0.9948863493257984, 'recall': 0.99484375, 'f1-score': 0.994848491211849, 'support': 6400}}
Test Accuracy: 0.99484375
                 precision    recall  f1-score   support

     bug report       1.00      0.99      1.00      1600
feature request       1.00      0.99      1.00      1600
         rating       0.99      0.99      0.99      1600
user experience       0.99

  full_metrics_df = full_metrics_df.append({


In [9]:
directory_path_unbalanced = path_to_project / 'Data' / 'Datasets' / 'Unbalanced'

# (file name, size in bytes) for every non-hidden regular file in the
# directory, ordered from the smallest file to the largest.
files_unbalanced = sorted(
    (
        (entry.name, entry.stat().st_size)
        for entry in directory_path_unbalanced.iterdir()
        if entry.is_file() and not entry.name.startswith('.')
    ),
    key=lambda name_and_size: name_and_size[1],
)

In [10]:
# Run the unbalanced (type 2) pipeline on the two smallest datasets, in order.
for position in (0, 1):
    selected_parts = files_unbalanced[position][0].split('.')
    print(f"Now doing: {selected_parts[0]}")
    main_model(selected_parts[0], selected_parts[1], 2)

Now doing: dataset_unbalanced_4000
Fold 1/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9684,0.639955,0.776563
2,0.6313,0.550206,0.796875
3,0.3894,0.730154,0.8
4,0.2159,0.719427,0.842187
5,0.0695,0.726607,0.853125


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9654,0.82598,0.671875
2,0.631,0.75856,0.734375
3,0.3957,0.765885,0.778125
4,0.1974,0.908934,0.8
5,0.0816,0.916788,0.815625


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9738,0.697608,0.748437


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9738,0.697608,0.748437
2,0.6157,0.64721,0.742188
3,0.3899,0.578295,0.823438
4,0.1925,0.789595,0.814063
5,0.0576,0.757313,0.845313


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9734,0.778535,0.698438
2,0.6358,0.66121,0.776563
3,0.3808,0.614429,0.828125
4,0.168,0.811596,0.842187
5,0.0584,0.874856,0.845313


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9882,0.724335,0.729688
2,0.5956,0.715124,0.754687
3,0.3596,0.764371,0.8
4,0.168,1.012417,0.792188
5,0.0609,1.011942,0.814063


  metrics_df = metrics_df.append({


{'bug report': {'precision': 0.8264150943396227, 'recall': 0.8725099601593626, 'f1-score': 0.8488372093023255, 'support': 251}, 'feature request': {'precision': 0.8372093023255814, 'recall': 0.8936170212765957, 'f1-score': 0.8644939965694683, 'support': 282}, 'rating': {'precision': 0.8849557522123894, 'recall': 0.6756756756756757, 'f1-score': 0.7662835249042146, 'support': 148}, 'user experience': {'precision': 0.7272727272727273, 'recall': 0.7394957983193278, 'f1-score': 0.7333333333333334, 'support': 119}, 'accuracy': 0.82375, 'macro avg': {'precision': 0.8189632190375802, 'recall': 0.7953246138577403, 'f1-score': 0.8032370160273354, 'support': 800}, 'weighted avg': {'precision': 0.8263026472599343, 'recall': 0.82375, 'f1-score': 0.8219025936499552, 'support': 800}}
Test Accuracy: 0.82375
                 precision    recall  f1-score   support

     bug report       0.83      0.87      0.85       251
feature request       0.84      0.89      0.86       282
         rating       0.8

  full_metrics_df = full_metrics_df.append({


Now doing: dataset_gpt_unbalanced_4000
Fold 1/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1083,0.341507,0.9125
2,0.1776,0.063436,0.989062
3,0.0836,0.045825,0.99375
4,0.0466,0.044342,0.99375
5,0.0296,0.045217,0.992188


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.056,0.453727,0.859375
2,0.2184,0.095274,0.98125
3,0.0727,0.062286,0.990625
4,0.0351,0.064156,0.9875
5,0.0212,0.058615,0.990625


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0113,0.325261,0.903125
2,0.1578,0.045051,0.989062
3,0.0813,0.063443,0.990625
4,0.04,0.069804,0.989062
5,0.0206,0.038743,0.99375


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0247,0.312583,0.909375
2,0.1485,0.114026,0.978125
3,0.0777,0.105369,0.979688
4,0.0396,0.052826,0.992188
5,0.0324,0.050547,0.99375


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0337,0.503624,0.835938
2,0.1322,0.265148,0.95625
3,0.0676,0.14997,0.975
4,0.0296,0.136568,0.982812
5,0.0203,0.132609,0.982812


  metrics_df = metrics_df.append({


{'bug report': {'precision': 0.98, 'recall': 0.98, 'f1-score': 0.98, 'support': 100}, 'feature request': {'precision': 0.9868421052631579, 'recall': 1.0, 'f1-score': 0.9933774834437086, 'support': 150}, 'rating': {'precision': 1.0, 'recall': 0.98, 'f1-score': 0.98989898989899, 'support': 250}, 'user experience': {'precision': 0.9801980198019802, 'recall': 0.99, 'f1-score': 0.9850746268656716, 'support': 300}, 'accuracy': 0.9875, 'macro avg': {'precision': 0.9867600312662845, 'recall': 0.9875, 'f1-score': 0.9870877750520926, 'support': 800}, 'weighted avg': {'precision': 0.9876071521625847, 'recall': 0.9875, 'f1-score': 0.9875046975637565, 'support': 800}}
Test Accuracy: 0.9875
                 precision    recall  f1-score   support

     bug report       0.98      0.98      0.98       100
feature request       0.99      1.00      0.99       150
         rating       1.00      0.98      0.99       250
user experience       0.98      0.99      0.99       300

       accuracy            

  full_metrics_df = full_metrics_df.append({


In [11]:
# Run the unbalanced (type 2) pipeline on the third dataset of the size-sorted listing.
selected_parts = files_unbalanced[2][0].split('.')
print(f"Now doing: {selected_parts[0]}")
main_model(selected_parts[0], selected_parts[1], 2)

Now doing: dataset_gpt_unbalanced_8000
Fold 1/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6192,0.132124,0.975
2,0.0756,0.088967,0.986719
3,0.0474,0.056272,0.992188
4,0.0399,0.063727,0.990625
5,0.0256,0.052724,0.990625


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6335,0.121813,0.980469
2,0.0813,0.097383,0.982812
3,0.0444,0.050078,0.992969
4,0.0347,0.050146,0.992969
5,0.0162,0.057059,0.992188


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5692,0.123196,0.979688
2,0.0729,0.121464,0.979688
3,0.0523,0.068088,0.990625
4,0.0411,0.049615,0.992188
5,0.0218,0.041747,0.99375


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5941,0.07097,0.9875
2,0.081,0.048504,0.992188
3,0.0465,0.029822,0.995313
4,0.0366,0.045512,0.992969
5,0.0256,0.034805,0.994531


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5766,0.119268,0.982799
2,0.0598,0.046429,0.993745
3,0.0412,0.070789,0.990618
4,0.0324,0.071396,0.989054
5,0.0213,0.072983,0.988272


  metrics_df = metrics_df.append({


{'bug report': {'precision': 1.0, 'recall': 0.98, 'f1-score': 0.98989898989899, 'support': 200}, 'feature request': {'precision': 0.9900332225913622, 'recall': 0.9933333333333333, 'f1-score': 0.9916805324459235, 'support': 300}, 'rating': {'precision': 1.0, 'recall': 0.988, 'f1-score': 0.993963782696177, 'support': 500}, 'user experience': {'precision': 0.9819376026272578, 'recall': 0.9966666666666667, 'f1-score': 0.9892473118279571, 'support': 600}, 'accuracy': 0.99125, 'macro avg': {'precision': 0.992992706304655, 'recall': 0.9894999999999999, 'f1-score': 0.9911976542172619, 'support': 1600}, 'weighted avg': {'precision': 0.9913578302211021, 'recall': 0.99125, 'f1-score': 0.9912588975990236, 'support': 1600}}
Test Accuracy: 0.99125
                 precision    recall  f1-score   support

     bug report       1.00      0.98      0.99       200
feature request       0.99      0.99      0.99       300
         rating       1.00      0.99      0.99       500
user experience       0.98 

  full_metrics_df = full_metrics_df.append({


In [13]:
# Run the unbalanced (type 2) pipeline on the fourth dataset of the size-sorted listing.
selected_parts = files_unbalanced[3][0].split('.')
print(f"Now doing: {selected_parts[0]}")
main_model(selected_parts[0], selected_parts[1], 2)

Now doing: dataset_gpt_unbalanced_16000
Fold 1/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3608,0.065576,0.989453
2,0.0467,0.053249,0.991797
3,0.0416,0.051581,0.992188
4,0.0317,0.063536,0.991406
5,0.0245,0.062026,0.991797


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.327,0.133655,0.980859
2,0.0481,0.043407,0.994141
3,0.0345,0.054024,0.992188
4,0.032,0.046759,0.993359
5,0.024,0.043468,0.99375


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3365,0.06383,0.991016
2,0.052,0.031686,0.995313
3,0.0389,0.042265,0.994141
4,0.0303,0.032716,0.994531
5,0.0165,0.035918,0.994141


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3354,0.081449,0.987109


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3354,0.081449,0.987109
2,0.0541,0.040544,0.993359
3,0.0451,0.030175,0.995313
4,0.0335,0.032274,0.994922
5,0.0211,0.029187,0.994922


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3263,0.041155,0.993357
2,0.0535,0.039599,0.99492
3,0.0424,0.032715,0.995701
4,0.0349,0.028007,0.995701
5,0.017,0.039934,0.99492


  metrics_df = metrics_df.append({


{'bug report': {'precision': 1.0, 'recall': 0.985, 'f1-score': 0.9924433249370278, 'support': 400}, 'feature request': {'precision': 1.0, 'recall': 0.9966666666666667, 'f1-score': 0.9983305509181971, 'support': 600}, 'rating': {'precision': 0.9979959919839679, 'recall': 0.996, 'f1-score': 0.9969969969969971, 'support': 1000}, 'user experience': {'precision': 0.9917355371900827, 'recall': 1.0, 'f1-score': 0.995850622406639, 'support': 1200}, 'accuracy': 0.99625, 'macro avg': {'precision': 0.9974328822935126, 'recall': 0.9944166666666667, 'f1-score': 0.9959053738147152, 'support': 3200}, 'weighted avg': {'precision': 0.9962745739412711, 'recall': 0.99625, 'f1-score': 0.9962479388783416, 'support': 3200}}
Test Accuracy: 0.99625
                 precision    recall  f1-score   support

     bug report       1.00      0.98      0.99       400
feature request       1.00      1.00      1.00       600
         rating       1.00      1.00      1.00      1000
user experience       0.99      1.00

  full_metrics_df = full_metrics_df.append({


## Multi-label classification using ERNIE

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
from torch.utils.data import Dataset
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report
from transformers import TrainerCallback
import os
import shutil
import re
import time
from pathlib import Path

# Label columns read from the dataset; the order also fixes the column order of
# the model outputs and the `target_names` passed to classification_report.
_MULTI_LABEL_TARGETS = ['bug report', 'user experience', 'rating', 'feature request']


def _binarize_logits(logits, threshold=0.5):
    """Map raw logits to 0/1 predictions: sigmoid, then `> threshold`."""
    probabilities = torch.sigmoid(torch.tensor(logits)).numpy()
    return (probabilities > threshold).astype(int)


def _per_class_metric_columns(report_dict):
    """Flatten per-class precision/recall/F1 into {(header, sub-header): value}."""
    headers = (
        ('Bug Report', 'bug report'),
        ('Feature Request', 'feature request'),
        ('Rating', 'rating'),
        ('User Experience', 'user experience'),
    )
    flat = {}
    for header, label in headers:
        scores = report_dict[label]
        flat[(header, 'P')] = scores['precision']
        flat[(header, 'R')] = scores['recall']
        flat[(header, 'F1')] = scores['f1-score']
    return flat


def multi_main_model(file_name, ext):
    """Fine-tune ERNIE 2.0 for multi-label classification with 5-fold CV.

    Reads ``Data/Datasets/Multi-label/{file_name}.{ext}`` (as CSV) relative to
    the project root, holds out 20% of the rows as a test set, and runs 5-fold
    cross-validation on the rest. The fold model with the highest validation
    micro-F1 is then evaluated on the held-out test set.

    Files written under ``Models/ERNIE/Output/Multi-label/{file_name}``
    (any existing directory of that name is deleted first):

    * ``fold_loss.xlsx``  - losses recorded by ``LossLoggingCallback``
    * ``fold_metrics.xlsx`` - per-fold micro-F1, train time and per-class P/R/F1
    * ``metrics_results_full_test.xlsx`` - the same metrics on the test set

    Args:
        file_name: Dataset file name without extension.
        ext: Dataset file extension (without the dot).

    Note:
        The project root is derived from ``__file__``, which the notebook
        assigns manually at module level before this function is called.
    """
    current_file_path = Path(__file__).parent
    path_to_project = current_file_path.parents[1]

    df = pd.read_csv(f"{path_to_project}/Data/Datasets/Multi-label/{file_name}.{ext}")

    results_dir = f"{path_to_project}/Models/ERNIE/Output/Multi-label/{file_name}"
    dump_dir = results_dir+"/Dump"

    # Start from a clean output directory. makedirs (rather than mkdir) also
    # creates missing parents such as Output/Multi-label on a first run.
    if os.path.isdir(results_dir):
        shutil.rmtree(results_dir)
    os.makedirs(dump_dir)

    # Drop empty reviews; .copy() so the column assignment below does not
    # write into a view of the original frame (SettingWithCopyWarning).
    df = df[df['review'].notna() & (df['review'] != '')].copy()
    # Strip everything outside printable ASCII.
    df['review'] = df['review'].str.replace('[^\x20-\x7E]', '', regex=True)

    X = df['review'].values
    y = df[_MULTI_LABEL_TARGETS].values

    X_train_CV, X_test_full, y_train_CV, y_test_full = train_test_split(X, y, test_size=0.2, random_state=42)

    tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-2.0-base-en")

    def tokenize_function(examples):
        return tokenizer(examples, padding="max_length", truncation=True, max_length=128)

    def compute_metrics(p):
        # Micro-averaged metrics over all (sample, label) decisions.
        predictions, labels = p
        predictions = _binarize_logits(predictions)
        precision = precision_score(labels, predictions, average='micro')
        recall = recall_score(labels, predictions, average='micro')
        f1 = f1_score(labels, predictions, average='micro')
        return {'precision': precision, 'recall': recall, 'f1': f1}

    # One callback instance is shared by all folds, so fold_loss.xlsx
    # accumulates the history of every fold trained so far.
    loss_logging_callback = LossLoggingCallback()

    # K-Fold Cross-Validation
    n_splits = 5
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    best_f1 = 0
    best_model = None
    # Rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append is deprecated and no longer exists in pandas >= 2.0.
    fold_rows = []

    for fold, (train_index, val_index) in enumerate(kf.split(X_train_CV, y_train_CV)):
        print(f"Fold {fold+1}/{n_splits}")
        start_time = time.time()

        X_train, X_val = X_train_CV[train_index], X_train_CV[val_index]
        y_train, y_val = y_train_CV[train_index], y_train_CV[val_index]

        train_encodings = tokenize_function(X_train.tolist())
        val_encodings = tokenize_function(X_val.tolist())

        train_dataset = MultiLabelDataset(train_encodings, y_train)
        val_dataset = MultiLabelDataset(val_encodings, y_val)

        # Fresh pretrained weights for every fold.
        model = AutoModelForSequenceClassification.from_pretrained("nghuyong/ernie-2.0-base-en", num_labels=4, problem_type="multi_label_classification")

        training_args = TrainingArguments(
            output_dir=f"{dump_dir}/res",
            num_train_epochs=5,
            per_device_train_batch_size=10,
            per_device_eval_batch_size=49,
            warmup_steps=500,
            weight_decay=0.07,
            logging_dir=f"{dump_dir}/logs",
            logging_strategy="epoch",
            evaluation_strategy="epoch",
            learning_rate=3e-5,
            max_grad_norm=1.0,
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            save_strategy="epoch",
            save_total_limit=2,
            lr_scheduler_type='linear'
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics,
            callbacks=[loss_logging_callback]
        )

        trainer.train()

        loss_logging_callback.save_logs_to_excel(f"{results_dir}/fold_loss.xlsx")

        results = trainer.evaluate()

        # Always keep the first fold's model so best_model is never None,
        # even if no fold reaches an F1 above 0.
        if best_model is None or results['eval_f1'] > best_f1:
            best_f1 = results['eval_f1']
            best_model = model

        predictions = trainer.predict(val_dataset)
        binary_predictions = _binarize_logits(predictions.predictions)

        # True labels
        true_labels = predictions.label_ids
        f1 = f1_score(true_labels, binary_predictions, average='micro')

        report_dict = classification_report(true_labels, binary_predictions, output_dict=True, zero_division=0, target_names=_MULTI_LABEL_TARGETS)
        end_time = time.time()

        fold_row = {
            ('Fold', ''): fold + 1,
            ('F1-Score', ''): f1,
            ('Train Time', ''): str(end_time - start_time)+" s",
        }
        fold_row.update(_per_class_metric_columns(report_dict))
        fold_rows.append(fold_row)

    metrics_df = pd.DataFrame(fold_rows)
    # Normalise the columns to a two-level MultiIndex for the Excel header.
    metrics_df.columns = pd.MultiIndex.from_tuples([(c,) if isinstance(c, str) else c for c in metrics_df.columns])
    metrics_df.to_excel(f"{results_dir}/fold_metrics.xlsx", index=True)

    # Evaluate the best fold model on the held-out test split.
    test_encodings = tokenize_function(X_test_full.tolist())
    test_dataset = MultiLabelDataset(test_encodings, y_test_full)
    test_trainer = Trainer(model=best_model)
    test_predictions = test_trainer.predict(test_dataset)
    test_binary_predictions = _binarize_logits(test_predictions.predictions)

    test_true_labels = test_predictions.label_ids
    test_f1 = f1_score(test_true_labels, test_binary_predictions, average='micro')

    test_report_dict = classification_report(test_true_labels, test_binary_predictions, output_dict=True, zero_division=0, target_names=_MULTI_LABEL_TARGETS)

    test_row = {('F1', ''): test_f1}
    test_row.update(_per_class_metric_columns(test_report_dict))
    test_metrics_df = pd.DataFrame([test_row])

    test_metrics_df.columns = pd.MultiIndex.from_tuples([(c,) if isinstance(c, str) else c for c in test_metrics_df.columns])
    test_metrics_df.to_excel(f"{results_dir}/metrics_results_full_test.xlsx", index=True)

    print(f"Test F1: {test_f1}")

    # Print the full classification report dictionary
    print(test_report_dict)

    # Checkpoints and trainer logs are only needed during training.
    shutil.rmtree(dump_dir)

class MultiLabelDataset(Dataset):
    """Dataset pairing tokenizer encodings with multi-label target rows.

    `encodings` is a mapping from field name to an indexable collection with
    one entry per sample; `labels` is an indexable collection with one label
    row per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of samples is defined by the number of label rows.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # Labels are stored as float32 (needed for BCEWithLogitsLoss).
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return sample
class LossLoggingCallback(TrainerCallback):
    """Trainer callback that collects training and validation loss records.

    Attributes are plain lists of dicts:
      * ``log_train_loss_history`` - one record per log entry containing 'loss'.
      * ``log_history`` - one record per log entry containing 'eval_loss',
        paired with the most recently recorded training loss.
    """

    def __init__(self):
        super().__init__()
        self.log_history = []
        self.log_train_loss_history = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Record the loss values found in ``logs`` (if any) for the current epoch."""
        if logs is None:
            return

        # A 'loss' key marks a training log entry; it takes priority over 'eval_loss'.
        if 'loss' in logs:
            train_record = {
                'epoch': state.epoch,
                'training_loss': logs.get('loss'),
            }
            self.log_train_loss_history.append(train_record)
            return

        if 'eval_loss' not in logs:
            return

        # Evaluation entry: attach the latest known training loss for reference,
        # or None when no training loss has been logged yet.
        previous_train_loss = None
        if self.log_train_loss_history:
            previous_train_loss = self.log_train_loss_history[-1]['training_loss']

        eval_record = {
            'epoch': state.epoch,
            'training_loss': previous_train_loss,
            'validation_loss': logs.get('eval_loss'),
            'eval_runtime': logs.get('eval_runtime'),
        }
        self.log_history.append(eval_record)

    def save_logs_to_excel(self, file_name):
        """Write the evaluation records in ``log_history`` to an Excel file."""
        records_frame = pd.DataFrame(self.log_history)
        records_frame.to_excel(file_name, index=False)

# The model functions derive the project root from __file__, so it is assigned
# by hand to the notebook's location on the mounted Drive (presumably because
# the notebook session does not provide one - confirm).
__file__ = "/content/drive/MyDrive/FinalProject/Models/ERNIE/ERNIE.ipynb"
current_file_path = Path(__file__).parent
# Two levels above the notebook's folder is the project root.
path_to_project = current_file_path.parents[1]

directory_path_multi_label = path_to_project.joinpath('Data', 'Datasets', 'Multi-label')

# (file name, size in bytes) for every non-hidden regular file, smallest first.
files_multi_label = sorted(
    (
        (entry.name, entry.stat().st_size)
        for entry in directory_path_multi_label.iterdir()
        if entry.is_file() and not entry.name.startswith('.')
    ),
    key=lambda name_and_size: name_and_size[1],
)

In [5]:
# Run the multi-label pipeline on the second-smallest dataset file
# (index 1 of the size-sorted list), passing its name and extension separately.
name_parts = files_multi_label[1][0].split('.')
print(f"Now doing: {name_parts[0]}")
multi_main_model(name_parts[0], name_parts[1])

Now doing: dataset_gpt_multi_label_4000


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/464 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Fold 1/5


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.6021,0.506183,0.822134,0.548813,0.658228
2,0.3366,0.250721,0.986096,0.810906,0.889961
3,0.2043,0.205241,0.97561,0.879507,0.925069
4,0.1416,0.193023,0.990119,0.881266,0.932527
5,0.0995,0.200561,0.966507,0.888303,0.925756


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.602,0.487226,0.702319,0.810181,0.752404
2,0.3563,0.253575,0.961765,0.846419,0.900413
3,0.2361,0.203417,0.935714,0.904228,0.919702
4,0.1636,0.160195,0.988722,0.907679,0.946469
5,0.1206,0.162845,0.97786,0.914582,0.945163


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.6067,0.491276,0.742358,0.736568,0.739452
2,0.3606,0.244161,0.98,0.84922,0.909935
3,0.2308,0.21352,0.987915,0.850087,0.913833
4,0.1741,0.180231,0.966667,0.904679,0.934646
5,0.1302,0.174651,0.970288,0.905546,0.9368


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.6007,0.510278,0.736641,0.649832,0.690519
2,0.3569,0.265984,0.971087,0.819865,0.889092
3,0.2341,0.226135,0.986042,0.832492,0.902784
4,0.1762,0.19243,0.958148,0.905724,0.931199
5,0.1206,0.180328,0.953671,0.91835,0.935678


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.6016,0.505865,0.789855,0.626952,0.699038
2,0.3593,0.282151,0.943171,0.845522,0.891681
3,0.2302,0.23189,0.993184,0.838127,0.909091
4,0.1606,0.191561,0.981115,0.896467,0.936883
5,0.11,0.180542,0.978667,0.904684,0.940222


  metrics_df = metrics_df.append({


  test_metrics_df = test_metrics_df.append({


Test F1: 0.932905067808708
{'bug report': {'precision': 0.9889705882352942, 'recall': 0.9340277777777778, 'f1-score': 0.9607142857142857, 'support': 288}, 'user experience': {'precision': 0.980565371024735, 'recall': 0.9422750424448217, 'f1-score': 0.961038961038961, 'support': 589}, 'rating': {'precision': 0.9941520467836257, 'recall': 0.6614785992217899, 'f1-score': 0.794392523364486, 'support': 257}, 'feature request': {'precision': 0.9690402476780186, 'recall': 0.9315476190476191, 'f1-score': 0.9499241274658573, 'support': 336}, 'micro avg': {'precision': 0.9812312312312312, 'recall': 0.8891156462585034, 'f1-score': 0.932905067808708, 'support': 1470}, 'macro avg': {'precision': 0.9831820634304184, 'recall': 0.8673322596230021, 'f1-score': 0.9165174743958975, 'support': 1470}, 'weighted avg': {'precision': 0.9819531511486664, 'recall': 0.8891156462585034, 'f1-score': 0.9293000324291586, 'support': 1470}, 'samples avg': {'precision': 0.8754166666666666, 'recall': 0.8228125, 'f1-scor

In [6]:
# Run the multi-label pipeline on the next two dataset files in ascending
# size order (indices 2 and 3 of the size-sorted list), one after the other.
for dataset_index in (2, 3):
    name_parts = files_multi_label[dataset_index][0].split('.')
    print(f"Now doing: {name_parts[0]}")
    multi_main_model(name_parts[0], name_parts[1])

Now doing: dataset_gpt_multi_label_8000
Fold 1/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.4434,0.212085,0.998404,0.842838,0.914049
2,0.186,0.177014,0.99383,0.867984,0.926654
3,0.1353,0.116267,0.992802,0.929053,0.95987
4,0.0975,0.115741,0.986275,0.935788,0.960369
5,0.0747,0.117154,0.984899,0.937135,0.960423


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.4565,0.205713,0.992662,0.870805,0.927749
2,0.1506,0.125449,0.992157,0.930575,0.96038
3,0.1084,0.120451,0.987781,0.929195,0.957593
4,0.0894,0.115795,0.992651,0.931494,0.961101
5,0.0686,0.120744,0.988797,0.933333,0.960265


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.4474,0.20765,0.995645,0.854673,0.919789
2,0.1793,0.136093,0.993384,0.91215,0.951035
3,0.1249,0.113034,0.9945,0.929439,0.96087
4,0.0961,0.112417,0.993503,0.928972,0.960155
5,0.0762,0.1139,0.986673,0.934112,0.959674


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.4585,0.211222,0.998349,0.849649,0.918016
2,0.1889,0.159602,0.999466,0.877283,0.934398
3,0.1358,0.118776,0.991935,0.92178,0.955572
4,0.1073,0.108135,0.994015,0.933489,0.962802
5,0.0807,0.109191,0.983317,0.938642,0.96046


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.46,0.220859,0.950285,0.870907,0.908866
2,0.1707,0.114656,0.994934,0.932131,0.962509
3,0.1114,0.110927,0.993949,0.935453,0.963814
4,0.0932,0.114856,0.992436,0.934029,0.962347
5,0.0729,0.117761,0.988454,0.934504,0.960722


  metrics_df = metrics_df.append({


  test_metrics_df = test_metrics_df.append({


Test F1: 0.9622714896927266
{'bug report': {'precision': 1.0, 'recall': 0.9623762376237623, 'f1-score': 0.9808274470232089, 'support': 505}, 'user experience': {'precision': 0.9905482041587902, 'recall': 0.9579524680073126, 'f1-score': 0.9739776951672862, 'support': 1094}, 'rating': {'precision': 0.9946524064171123, 'recall': 0.7965738758029979, 'f1-score': 0.8846611177170035, 'support': 467}, 'feature request': {'precision': 0.9947460595446584, 'recall': 0.9676320272572402, 'f1-score': 0.9810017271157166, 'support': 587}, 'micro avg': {'precision': 0.9939734833266372, 'recall': 0.9325292122125896, 'f1-score': 0.9622714896927266, 'support': 2653}, 'macro avg': {'precision': 0.9949866675301402, 'recall': 0.9211336521728282, 'f1-score': 0.9551169967558037, 'support': 2653}, 'weighted avg': {'precision': 0.993998622728693, 'recall': 0.9325292122125896, 'f1-score': 0.9611135375237458, 'support': 2653}, 'samples avg': {'precision': 0.8630208333333335, 'recall': 0.8323958333333334, 'f1-score

Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.2854,0.152536,0.998223,0.881737,0.936371
2,0.1,0.078814,0.999172,0.947148,0.972465
3,0.0678,0.055177,0.998103,0.963893,0.9807
4,0.0483,0.050635,0.996777,0.970958,0.983698
5,0.033,0.053173,0.994378,0.971743,0.98293


  metrics_df = metrics_df.append({


Fold 2/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.2863,0.113242,0.994941,0.924523,0.958441
2,0.0953,0.089407,0.995851,0.940193,0.967222
3,0.0749,0.076839,0.995618,0.949334,0.971925
4,0.055,0.057353,0.995152,0.965004,0.979846
5,0.0323,0.05821,0.992761,0.967093,0.979759


  metrics_df = metrics_df.append({


Fold 3/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.2797,0.099753,0.992111,0.942134,0.966477
2,0.087,0.085861,0.995926,0.9473,0.971005
3,0.0771,0.071406,0.990634,0.956342,0.973186
4,0.048,0.050354,0.996821,0.9721,0.984306
5,0.0297,0.05138,0.996825,0.973134,0.984837


  metrics_df = metrics_df.append({


Fold 4/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.2866,0.099688,0.99617,0.940098,0.967322
2,0.0927,0.088975,0.993243,0.948877,0.970553
3,0.0808,0.07937,0.994323,0.949651,0.971474
4,0.0693,0.071546,0.993294,0.956106,0.974345
5,0.053,0.072944,0.994091,0.95559,0.97446


  metrics_df = metrics_df.append({


Fold 5/5


Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.284,0.101874,1.0,0.937746,0.967873
2,0.091,0.079703,0.995867,0.949304,0.972028
3,0.0724,0.075381,0.992886,0.953244,0.972661
4,0.0591,0.067742,0.994295,0.961387,0.977564
5,0.0354,0.063873,0.992453,0.967166,0.979646


  metrics_df = metrics_df.append({


  test_metrics_df = test_metrics_df.append({


Test F1: 0.9825798867158276
{'bug report': {'precision': 0.9977324263038548, 'recall': 0.9745293466223699, 'f1-score': 0.9859943977591036, 'support': 903}, 'user experience': {'precision': 0.9950641658440277, 'recall': 0.9781659388646288, 'f1-score': 0.9865426963542941, 'support': 2061}, 'rating': {'precision': 0.9914772727272727, 'recall': 0.9344042838018741, 'f1-score': 0.9620951068228808, 'support': 747}, 'feature request': {'precision': 0.9930693069306931, 'recall': 0.9794921875, 'f1-score': 0.9862340216322517, 'support': 1024}, 'micro avg': {'precision': 0.9945910861099091, 'recall': 0.9708553326293559, 'f1-score': 0.9825798867158276, 'support': 4735}, 'macro avg': {'precision': 0.9943357929514621, 'recall': 0.9666479391972183, 'f1-score': 0.9802165556421325, 'support': 4735}, 'weighted avg': {'precision': 0.9945757380741762, 'recall': 0.9708553326293559, 'f1-score': 0.9825144923570832, 'support': 4735}, 'samples avg': {'precision': 0.8463802083333334, 'recall': 0.83546875, 'f1-sc