# DistilBERT

### Import necessary packages

In [1]:
# import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns
# from sklearn.model_selection import train_test_split
import mlflow
# from mlflow.transformers import log_model
import logging 
from mlflow.sklearn import save_model

# from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# from sklearn.preprocessing import LabelEncoder
# from mlflow.models.signature import infer_signature
# from sklearn.utils.class_weight import compute_class_weight
# from sklearn.preprocessing import LabelBinarizer
# from scipy.special import softmax
import numpy as np

# from torch import nn
# import mlflow.pytorch

# import sentencepiece
import os

# Tell Hugging Face tokenizers not to tokenize in parallel, to avoid possible deadlocks.
# Set here, before the `transformers` import below.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# from torch.utils.data import Dataset, DataLoader
import torch

from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, AutoModel, AutoTokenizer, AutoConfig

import config 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from basic_functions import(
    get_encode_tokenize_data,
    createTrainer,
    get_eval_metrics
)

### Setup

In [3]:
MODEL_NAME = "distilbert-base-uncased"  # general-purpose DistilBERT checkpoint; also used as the MLflow "model_name" tag

# Read the tracking URI through a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open("../.mlflow_uri") as uri_file:
    TRACKING_URI = uri_file.read().strip()
EXPERIMENT_NAME = config.EXPERIMENT_NAME

# Show a timestamp before every message.
# force=True replaces any handlers that earlier imports already attached to the
# root logger; without it basicConfig() silently does nothing in that situation
# and the format below is never applied.
logging.basicConfig(format="%(asctime)s: %(message)s", force=True)

logger = logging.getLogger()
logger.setLevel(logging.INFO)  # show INFO and more severe records (WARNING, ERROR); ignore DEBUG

In [4]:
# Input data.
DATA_PATH = "../data/data_dropped_duplicates_small.csv"

# Checkpoint identifiers: MODEL_PATH is handed to the data/tokenization helper,
# MODEL_TRAINING_PATH to the model loader. Both point at the same checkpoint.
MODEL_PATH = MODEL_TRAINING_PATH = "distilbert-base-uncased"

# Everything produced by this fine-tuning run lives under one directory.
_FINETUNED_DIR = "../models/distilbert_finetuned_1"
OUTPUT_DIR = _FINETUNED_DIR + "/trainer_output"  # trainer output directory
SAVE_PATH = _FINETUNED_DIR + "/pytorch_model"    # destination of the saved model

### Get data

In [5]:
# Build the train/test datasets from the CSV at DATA_PATH using the project helper.
# Judging by the helper's log output it loads the data, splits it, encodes the
# labels and tokenizes the text — see basic_functions for the exact steps.
# `y_train` is used below to count the classes; `le` is passed to get_eval_metrics.
train_dataset, test_dataset, y_train, le = get_encode_tokenize_data(DATA_PATH, MODEL_PATH)

INFO:basic_functions:Loading data...
INFO:basic_functions:Train test split, test-size 0.3
INFO:root:encode the label column
INFO:root:tokenize
INFO:basic_functions:create tokenizer & load model
INFO:basic_functions:create tokenizer & load model
INFO:root:create TextDatasets (train & test)


### Model Initialization

In [6]:
# Count the distinct label values in the training targets; this sizes the
# classification head of the model.
num_classes = np.unique(y_train).size
num_classes

6

In [7]:
# Load the checkpoint named by MODEL_TRAINING_PATH as a sequence classifier with
# `num_classes` output labels, configured for single-label classification.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_TRAINING_PATH,
    num_labels=num_classes,
    problem_type="single_label_classification"
)

# Optional, currently disabled: model.gradient_checkpointing_enable()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Hyperparameters recorded with the MLflow run.
params = {
    "learning_rate": 3e-5,
    "weight_decay": 0.01,
    "num_train_epochs": 4,
    "evaluation_strategy": "epoch",
    "class_weight": True,
}


# setting the MLFlow connection and experiment
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)


# Re-running this cell while a previous run is still open would make
# start_run() raise. Close the stale run first so the cell stays re-runnable.
stale_run = mlflow.active_run()
if stale_run is not None:
    logger.warning("Ending stale MLflow run %s before starting a new one", stale_run.info.run_id)
    mlflow.end_run()

# start_run() returns the run it opened, so no separate active_run() lookup is needed.
run = mlflow.start_run()
print("Active run_id: {}".format(run.info.run_id))

mlflow.set_tag("model_name", MODEL_NAME)
mlflow.log_params(params)

Active run_id: c6af1fede7ef45b1aa5248d23633d585


In [9]:
# Build the trainer from the same `params` dict that is logged to MLflow, so the
# hyperparameters recorded for the run cannot drift from the ones actually used.
# Batch sizes are not part of `params` and stay as explicit values here.
trainer = createTrainer(
    model=model,
    train_dataset=train_dataset,
    test_dataset=test_dataset,
    output_dir=OUTPUT_DIR,
    y_train=y_train,
    class_weight=params["class_weight"],
    epochs=params["num_train_epochs"],
    learning_rate=params["learning_rate"],
    weight_decay=params["weight_decay"],
    train_batch_size=4,
    eval_batch_size=8,
)

INFO:root:defining training arguments
INFO:root:get weighted loss trainer


### Execute Training

In [10]:
# Clears unused GPU memory on Apple-silicon (MPS) devices.
# Guarded so the notebook also runs on machines without an MPS backend, where
# calling into torch.mps would raise.
if torch.backends.mps.is_available():
    torch.mps.empty_cache()

In [11]:
# disable upper limit for memory
# NOTE(review): presumably this only takes effect if it is set before PyTorch's
# MPS allocator is initialised — confirm, and consider moving it to the import cell.
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

# Allow this process to use a memory fraction of 1.0 on the MPS device.
# Guarded so the cell does not raise on machines without an MPS backend.
if torch.backends.mps.is_available():
    torch.mps.set_per_process_memory_fraction(1.0)

In [12]:
# Announce the start of training in the log, then run the training loop
# configured on `trainer`.
logger.info('training is running')
trainer.train()

INFO:root:training is running


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9881,1.056663,0.618
2,0.7166,1.176395,0.735333
3,0.3872,1.548002,0.740667
4,0.2964,1.736059,0.732


2025/04/07 10:14:09 ERROR mlflow.utils.async_logging.async_logging_queue: Run Id c6af1fede7ef45b1aa5248d23633d585: Failed to log run data: Exception: API request to http://127.0.0.1:5001/api/2.0/mlflow/runs/log-batch failed with exception HTTPConnectionPool(host='127.0.0.1', port=5001): Max retries exceeded with url: /api/2.0/mlflow/runs/log-batch (Caused by ResponseError('too many 500 error responses'))


KeyboardInterrupt: 

### Evaluation


In [13]:
def log_metrics(cr, brier, split):
    """Log a classification report and a Brier score to the active MLflow run.

    Args:
        cr: Mapping of report entries. An entry whose value is itself a
            mapping (metric name -> number) is logged as
            ``{split}_{key}_{metric}``; any other value (e.g. ``accuracy``)
            is logged as ``{split}_{key}``.
        brier: Brier score, logged as ``{split}_brier``.
        split: Prefix for every metric name, e.g. ``"test"`` or ``"train"``.

    Returns:
        None. All metrics are sent to MLflow in one batched call.
    """
    metrics = {f"{split}_brier": brier}

    for key, value in cr.items():
        if hasattr(value, "items"):
            # Nested entry: one metric per inner name.
            for metric, score in value.items():
                metrics[f"{split}_{key}_{metric}"] = score
        else:
            # Scalar entry such as the overall accuracy.
            metrics[f"{split}_{key}"] = value

    # One batched request instead of one request per metric keeps the load on
    # the tracking server low.
    mlflow.log_metrics(metrics)

    

In [14]:
# Predict on the test dataset, compute the evaluation metrics and log them to
# MLflow under the "test" prefix.
logger.info('predict on test_dataset')
test_output = trainer.predict(test_dataset)

# `classification_report` here is the report object returned by get_eval_metrics
# (log_metrics iterates it as a mapping), not a scikit-learn function.
classification_report, brier = get_eval_metrics(test_output, le)
log_metrics(classification_report, brier, "test")


INFO:root:predict on test_dataset
INFO:basic_functions:get evaluation metrics
INFO:basic_functions:classification_report
INFO:basic_functions:confusion_matrix
INFO:basic_functions:brier score


                       precision    recall  f1-score   support

           ad_hominem       0.69      0.61      0.65       142
  appeal_to_authority       0.71      0.56      0.62        97
    appeal_to_emotion       0.68      0.73      0.70       216
        false_dilemma       0.73      0.69      0.71       137
faulty_generalization       0.61      0.49      0.55       192
                 none       0.78      0.85      0.82       716

             accuracy                           0.73      1500
            macro avg       0.70      0.66      0.67      1500
         weighted avg       0.73      0.73      0.73      1500

[[ 87   3  22   1   4  25]
 [  5  54   6   2   3  27]
 [ 16   3 157   3  12  25]
 [  3   1   1  95   4  33]
 [  7   8  20   2  95  60]
 [  9   7  25  27  38 610]]
Multiclass Brier score: 0.47276299786606957


In [15]:
# Predict on the training dataset, compute the same evaluation metrics and log
# them to MLflow under the "train" prefix.
logger.info('predict on train_dataset')
train_output = trainer.predict(train_dataset)

# NOTE: this rebinds `classification_report` and `brier`, replacing the values
# computed for the test dataset.
classification_report, brier= get_eval_metrics(train_output, le)
log_metrics(classification_report, brier, "train")


INFO:root:predict on train_dataset
INFO:basic_functions:get evaluation metrics
INFO:basic_functions:classification_report
INFO:basic_functions:confusion_matrix
INFO:basic_functions:brier score


                       precision    recall  f1-score   support

           ad_hominem       0.98      0.98      0.98       331
  appeal_to_authority       0.99      0.99      0.99       227
    appeal_to_emotion       0.98      0.97      0.98       504
        false_dilemma       0.98      0.93      0.95       319
faulty_generalization       0.96      0.94      0.95       449
                 none       0.97      0.99      0.98      1670

             accuracy                           0.98      3500
            macro avg       0.98      0.97      0.97      3500
         weighted avg       0.98      0.98      0.98      3500

[[ 324    0    2    0    4    1]
 [   0  225    0    0    0    2]
 [   3    0  490    2    2    7]
 [   2    0    1  296    6   14]
 [   0    1    4    0  422   22]
 [   2    1    2    4    5 1656]]
Multiclass Brier score: 0.040556967015328314


In [16]:
# End the currently active MLflow run.
mlflow.end_run()

🏃 View run funny-mouse-162 at: http://127.0.0.1:5001/#/experiments/445989666823303942/runs/c6af1fede7ef45b1aa5248d23633d585
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/445989666823303942


### Save model

In [17]:
# Save the model to SAVE_PATH using MLflow's PyTorch flavor.
# NOTE(review): this runs after mlflow.end_run(), so the model appears to be written
# to local disk only and not attached to the tracked run — confirm that is intended.
# NOTE(review): save_model presumably refuses to write into an existing, non-empty
# path — confirm before re-running this cell.
mlflow.pytorch.save_model(model, path=SAVE_PATH)