# DistilBERT

In [None]:
pip install transformers[torch] datasets optuna scikit-learn



In [None]:
pip install --upgrade transformers datasets accelerate



In [None]:
import optuna
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict, ClassLabel
from transformers import AutoTokenizer
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 1 Load dataset from local CSV
csv_file_path = 'Philippine_Business_TrustPilot_Reviews_Labeled.csv'
df_raw = pd.read_csv(csv_file_path, encoding='ISO-8859-1')

# Rename columns for consistency with the DistilBERT pipeline
df_raw = df_raw.rename(columns={'User Review Body': 'sentence', 'Ground Label': 'label'})

# Map 'Positive' to 1 and 'Negative' to 0; any other label (e.g. 'Neutral') maps to NaN.
label_mapping = {'Positive': 1, 'Negative': 0}

# Build the filtered frame as one chain that returns a new DataFrame at every step:
#   * rows whose label did not map (NaN) are dropped, as are rows with no review text,
#     which the tokenizer could not encode;
#   * `.astype({'label': int})` really changes the dtype. Assigning through
#     `df.loc[:, 'label'] = ...` writes into the existing float column in place on
#     pandas >= 2.0 and would leave the labels as floats;
#   * the index is reset so `Dataset.from_pandas` does not carry the old, gappy index
#     along as an extra `__index_level_0__` column.
df_filtered = (
    df_raw
    .assign(label=df_raw['label'].map(label_mapping))
    .dropna(subset=['label', 'sentence'])
    .astype({'label': int})
    .reset_index(drop=True)
)

# Convert DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df_filtered)

# Cast 'label' to ClassLabel (index 0 = negative, 1 = positive, matching label_mapping);
# the stratified split below is requested on this column.
features = dataset.features.copy()
features['label'] = ClassLabel(names=['negative', 'positive'])
dataset = dataset.cast(features)

# Split into train and validation sets: 80/20, stratified on the label, fixed seed.
train_test_split = dataset.train_test_split(test_size=0.2, stratify_by_column="label", seed=42)
dataset_dict = DatasetDict({
    'train': train_test_split['train'],
    'validation': train_test_split['test'] # Rename 'test' to 'validation' for consistency
})


# Tokenizer matching the checkpoint that is fine-tuned below
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess(examples):
    """Tokenize a batch of reviews into fixed-length model inputs.

    Args:
        examples: batch mapping that provides the review texts under ``"sentence"``.

    Returns:
        The tokenizer output for the batch, truncated and padded to 128 tokens.
    """
    review_texts = examples["sentence"]
    return tokenizer(
        review_texts,
        truncation=True,
        padding="max_length",
        max_length=128,
    )

# Tokenize both splits in batches, then expose the target column under the
# name `labels`, which is what the Trainer expects.
encoded_dataset = dataset_dict.map(preprocess, batched=True).rename_column("label", "labels")
encoded_dataset.set_format("torch")

# Convenience handles for the two splits used by every Trainer below
train_dataset, eval_dataset = encoded_dataset["train"], encoded_dataset["validation"]

def compute_metrics(eval_pred):
    """Score predictions with accuracy and weighted precision/recall/F1.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class of each row is
            the argmax of ``logits`` along axis 1.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)
    # Index 3 of the result (the support) is unused with average='weighted'.
    weighted = precision_recall_fscore_support(labels, predicted, average='weighted')
    return {
        "accuracy": accuracy_score(labels, predicted),
        "precision": weighted[0],
        "recall": weighted[1],
        "f1": weighted[2],
    }


Casting the dataset:   0%|          | 0/10698 [00:00<?, ? examples/s]

Map:   0%|          | 0/8558 [00:00<?, ? examples/s]

Map:   0%|          | 0/2140 [00:00<?, ? examples/s]

In [None]:
#  2 Define the Optuna objective
def objective(trial):
    """Fine-tune DistilBERT with one sampled configuration and score it.

    Args:
        trial: Optuna trial used to sample the learning rate, weight decay,
            per-device train batch size and number of epochs.

    Returns:
        Accuracy on ``eval_dataset`` after training (the value the study maximizes).
    """
    # Every trial starts from the same pretrained checkpoint with a fresh 2-class head.
    model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

    # Optuna suggests hyperparameters for AdamW and training
    learning_rate = trial.suggest_float("learning_rate", 1e-6, 5e-4, log=True)
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.3)
    batch_size = trial.suggest_categorical("batch_size", [32, 48])
    num_train_epochs = trial.suggest_int("num_train_epochs", 2, 3)

    training_args = TrainingArguments(
        output_dir=f"./results/{trial.number}",
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        per_device_train_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        report_to="none",
        # Trial checkpoints are never reloaded, so do not write them: with the
        # default step-based saving every trial leaves model + optimizer
        # snapshots under ./results/<trial>.
        save_strategy="no",
        # No intermediate evaluation is configured; the trial is scored once,
        # by the explicit `trainer.evaluate()` call below.
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # 3 Training — internally uses AdamW optimizer
    trainer.train()
    metrics = trainer.evaluate()

    # 4 Report evaluation metric back to Optuna
    # (`compute_metrics` returns "accuracy"; evaluate() reports it as "eval_accuracy".)
    return metrics["eval_accuracy"]


In [None]:
# 5 Run Optuna study
# Seed the sampler so the sequence of suggested hyperparameters is the same on
# every Restart & Run All. TPE is Optuna's default sampler for a single-objective
# study, so the search strategy itself is unchanged.
# NOTE: the trial scores can still vary slightly with hardware-level training
# nondeterminism.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)

print("Best hyperparameters:", study.best_params)
print("Best validation accuracy:", study.best_value)


[I 2025-11-16 12:27:18,981] A new study created in memory with name: no-name-39aad061-fc98-41aa-b574-0ee31e087bce


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.107


[I 2025-11-16 12:31:47,378] Trial 0 finished with value: 0.9616822429906542 and parameters: {'learning_rate': 4.240581583059883e-05, 'weight_decay': 0.2367395889909453, 'batch_size': 48, 'num_train_epochs': 3}. Best is trial 0 with value: 0.9616822429906542.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-11-16 12:34:46,143] Trial 1 finished with value: 0.9593457943925233 and parameters: {'learning_rate': 7.466851495719788e-06, 'weight_decay': 0.09045961074739281, 'batch_size': 48, 'num_train_epochs': 2}. Best is trial 0 with value: 0.9616822429906542.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.4229


[I 2025-11-16 12:39:19,444] Trial 2 finished with value: 0.9551401869158879 and parameters: {'learning_rate': 1.2281711771117244e-06, 'weight_decay': 0.1373073940233538, 'batch_size': 32, 'num_train_epochs': 3}. Best is trial 0 with value: 0.9616822429906542.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.1004


[I 2025-11-16 12:43:50,560] Trial 3 finished with value: 0.9663551401869159 and parameters: {'learning_rate': 7.668938387754336e-05, 'weight_decay': 0.17315718570924743, 'batch_size': 48, 'num_train_epochs': 3}. Best is trial 3 with value: 0.9663551401869159.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-11-16 12:46:51,318] Trial 4 finished with value: 0.9640186915887851 and parameters: {'learning_rate': 0.00017667311125427597, 'weight_decay': 0.03685512102513615, 'batch_size': 48, 'num_train_epochs': 2}. Best is trial 3 with value: 0.9663551401869159.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-11-16 12:49:53,866] Trial 5 finished with value: 0.9574766355140187 and parameters: {'learning_rate': 4.1520873508195074e-06, 'weight_decay': 0.10048811416241134, 'batch_size': 48, 'num_train_epochs': 2}. Best is trial 3 with value: 0.9663551401869159.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-11-16 12:52:54,784] Trial 6 finished with value: 0.9570093457943926 and parameters: {'learning_rate': 3.5750479757738046e-06, 'weight_decay': 0.12955046180009736, 'batch_size': 48, 'num_train_epochs': 2}. Best is trial 3 with value: 0.9663551401869159.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.1579


[I 2025-11-16 12:57:43,708] Trial 7 finished with value: 0.9579439252336449 and parameters: {'learning_rate': 0.0003444086440984969, 'weight_decay': 0.20066456199535332, 'batch_size': 48, 'num_train_epochs': 3}. Best is trial 3 with value: 0.9663551401869159.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.1164


[I 2025-11-16 13:03:22,219] Trial 8 finished with value: 0.9630841121495327 and parameters: {'learning_rate': 6.427007056335462e-05, 'weight_decay': 0.21483890674356595, 'batch_size': 32, 'num_train_epochs': 3}. Best is trial 3 with value: 0.9663551401869159.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.2458


[I 2025-11-16 13:09:19,377] Trial 9 finished with value: 0.9588785046728971 and parameters: {'learning_rate': 3.402758515363457e-06, 'weight_decay': 0.06248560753142308, 'batch_size': 32, 'num_train_epochs': 3}. Best is trial 3 with value: 0.9663551401869159.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.1284


[I 2025-11-16 13:14:20,087] Trial 10 finished with value: 0.9635514018691589 and parameters: {'learning_rate': 2.1216940726237158e-05, 'weight_decay': 0.26995424325709333, 'batch_size': 32, 'num_train_epochs': 3}. Best is trial 3 with value: 0.9663551401869159.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-11-16 13:17:42,268] Trial 11 finished with value: 0.9626168224299065 and parameters: {'learning_rate': 0.00023178640101368652, 'weight_decay': 0.004789919671147086, 'batch_size': 48, 'num_train_epochs': 2}. Best is trial 3 with value: 0.9663551401869159.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-11-16 13:21:32,653] Trial 12 finished with value: 0.9649532710280374 and parameters: {'learning_rate': 0.00012327589313612875, 'weight_decay': 0.0018012853676639962, 'batch_size': 48, 'num_train_epochs': 2}. Best is trial 3 with value: 0.9663551401869159.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-11-16 13:24:47,460] Trial 13 finished with value: 0.9644859813084112 and parameters: {'learning_rate': 8.865656989651001e-05, 'weight_decay': 0.1855759569077591, 'batch_size': 48, 'num_train_epochs': 2}. Best is trial 3 with value: 0.9663551401869159.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.1158


[I 2025-11-16 13:30:37,986] Trial 14 finished with value: 0.9649532710280374 and parameters: {'learning_rate': 2.193584982652083e-05, 'weight_decay': 0.2957975017859309, 'batch_size': 48, 'num_train_epochs': 3}. Best is trial 3 with value: 0.9663551401869159.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-11-16 13:34:27,368] Trial 15 finished with value: 0.9644859813084112 and parameters: {'learning_rate': 0.00013277908931809564, 'weight_decay': 0.166162305195391, 'batch_size': 48, 'num_train_epochs': 2}. Best is trial 3 with value: 0.9663551401869159.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.2344


[I 2025-11-16 13:39:28,027] Trial 16 finished with value: 0.9467289719626168 and parameters: {'learning_rate': 0.0004352484290845848, 'weight_decay': 0.01574816376321602, 'batch_size': 48, 'num_train_epochs': 3}. Best is trial 3 with value: 0.9663551401869159.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-11-16 13:43:11,448] Trial 17 finished with value: 0.9644859813084112 and parameters: {'learning_rate': 4.023752424695986e-05, 'weight_decay': 0.10140254969464088, 'batch_size': 48, 'num_train_epochs': 2}. Best is trial 3 with value: 0.9663551401869159.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.1461


[I 2025-11-16 13:47:16,595] Trial 18 finished with value: 0.9616822429906542 and parameters: {'learning_rate': 1.2381196477439789e-05, 'weight_decay': 0.05098809348597362, 'batch_size': 32, 'num_train_epochs': 2}. Best is trial 3 with value: 0.9663551401869159.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.0999


[I 2025-11-16 13:53:04,938] Trial 19 finished with value: 0.9649532710280374 and parameters: {'learning_rate': 8.61944579407336e-05, 'weight_decay': 0.16098450700140338, 'batch_size': 48, 'num_train_epochs': 3}. Best is trial 3 with value: 0.9663551401869159.


Best hyperparameters: {'learning_rate': 7.668938387754336e-05, 'weight_decay': 0.17315718570924743, 'batch_size': 48, 'num_train_epochs': 3}
Best validation accuracy: 0.9663551401869159


In [None]:
print("Best hyperparameters:", study.best_params)
print("Best validation accuracy:", study.best_value)

# Start from the pretrained checkpoint again: the models trained inside the
# Optuna trials were local to `objective` and are not available here.
best_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Set the best hyperparameters
best_training_args = TrainingArguments(
    output_dir="./best_results",
    learning_rate=study.best_params["learning_rate"],
    weight_decay=study.best_params["weight_decay"],
    per_device_train_batch_size=study.best_params["batch_size"],
    num_train_epochs=study.best_params["num_train_epochs"],
    report_to="none",
)

best_trainer = Trainer(
    model=best_model,
    args=best_training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

# The freshly loaded model has a newly initialized classification head, so it
# must be fine-tuned before it is scored. Evaluating it untrained only measures
# a random head (roughly chance-level accuracy), not the best configuration.
best_trainer.train()

# Evaluate the model trained with the best hyperparameters
final_metrics = best_trainer.evaluate()

print("\nFinal Evaluation Metrics with Best Hyperparameters:")
for key, value in final_metrics.items():
    print(f"{key}: {value:.4f}")

Best hyperparameters: {'learning_rate': 7.668938387754336e-05, 'weight_decay': 0.17315718570924743, 'batch_size': 48, 'num_train_epochs': 3}
Best validation accuracy: 0.9663551401869159


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Final Evaluation Metrics with Best Hyperparameters:
eval_loss: 0.6879
eval_model_preparation_time: 0.0022
eval_accuracy: 0.5949
eval_precision: 0.5831
eval_recall: 0.5949
eval_f1: 0.5882
eval_runtime: 8.2054
eval_samples_per_second: 260.8030
eval_steps_per_second: 32.6610


In [None]:
# Run the trainer in prediction mode over the evaluation split
predictions = best_trainer.predict(eval_dataset)

# Raw class scores and reference labels returned by the prediction run
logits, labels = predictions.predictions, predictions.label_ids

# Re-score them with the same metric function the Trainer uses
calculated_metrics = compute_metrics((logits, labels))

print(
    "\nExplicitly Calculated Metrics on Evaluation Set:",
    f"Accuracy: {calculated_metrics['accuracy']:.4f}",
    f"Precision: {calculated_metrics['precision']:.4f}",
    f"Recall: {calculated_metrics['recall']:.4f}",
    f"F1 Score: {calculated_metrics['f1']:.4f}",
    sep="\n",
)

# The loss is taken from the earlier `evaluate()` result when it is present there
print(
    f"Loss: {final_metrics['eval_loss']:.4f}"
    if 'eval_loss' in final_metrics
    else "Loss not available directly from final_metrics. Please refer to eval_loss in previous output."
)


Explicitly Calculated Metrics on Evaluation Set:
Accuracy: 0.5949
Precision: 0.5831
Recall: 0.5949
F1 Score: 0.5882
Loss: 0.6879


In [None]:
# Recap of the study outcome (same values as printed right after the search)
for caption, outcome in (
    ("Best hyperparameters:", study.best_params),
    ("Best validation accuracy:", study.best_value),
):
    print(caption, outcome)

Best hyperparameters: {'learning_rate': 7.668938387754336e-05, 'weight_decay': 0.17315718570924743, 'batch_size': 48, 'num_train_epochs': 3}
Best validation accuracy: 0.9663551401869159


In [None]:
# Recap of the metrics held in `calculated_metrics`, one per line
print(
    "DistilBERT Metrics from calculated_metrics:",
    f"Accuracy: {calculated_metrics['accuracy']:.4f}",
    f"Precision: {calculated_metrics['precision']:.4f}",
    f"Recall: {calculated_metrics['recall']:.4f}",
    f"F1 Score: {calculated_metrics['f1']:.4f}",
    sep="\n",
)

DistilBERT Metrics from calculated_metrics:
Accuracy: 0.5949
Precision: 0.5831
Recall: 0.5949
F1 Score: 0.5882


## Corrected DistilBERT Model Evaluation


In [None]:
import torch
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Retrain DistilBERT with the best Optuna hyperparameters, evaluate it, and save
# the model plus tokenizer to ./best_distilbert_model.
#
# Guard: the cell needs the `study` object created by the Optuna cell.
# NOTE(review): when `study` is missing this only prints a message; nothing is
# trained or saved, so any later step that loads ./best_distilbert_model will
# not find a model from this run.
if 'study' not in globals() or study is None:
    print("Error: Optuna study not found. Please run the Optuna optimization cells first.")
else:
    print("Best hyperparameters:", study.best_params)
    print("Best validation accuracy from Optuna study:", study.best_value)

    # Load a fresh copy of the pretrained checkpoint. Its classification head is
    # newly initialized, so it has to be fine-tuned before evaluation means anything.
    model_to_train = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

    # Training configuration built from the best trial's parameters.
    # NOTE: this rebinds the notebook-level name `best_training_args`.
    best_training_args = TrainingArguments(
        output_dir="./best_model_trained_results", # separate directory from the other training runs
        learning_rate=study.best_params["learning_rate"],
        weight_decay=study.best_params["weight_decay"],
        per_device_train_batch_size=study.best_params["batch_size"],
        num_train_epochs=study.best_params["num_train_epochs"],
        report_to="none",
        # Optional (currently disabled): per-epoch evaluation/checkpointing and
        # restoring the best checkpoint at the end.
        # evaluation_strategy="epoch",
        # save_strategy="epoch",
        # load_best_model_at_end=True, # Requires evaluation strategy
    )

    best_trainer_retrained = Trainer(
        model=model_to_train,
        args=best_training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # Fine-tune on the training split with the best hyperparameters
    print("\nTraining DistilBERT model with best hyperparameters...")
    best_trainer_retrained.train()
    print("Training complete.\n")

    # Score the fine-tuned model on the validation split
    final_retrained_metrics = best_trainer_retrained.evaluate()

    print("\nFinal Evaluation Metrics for **Retrained** DistilBERT Model with Best Hyperparameters:")
    for key, value in final_retrained_metrics.items():
        print(f"{key}: {value:.4f}")

    # Second pass over the same split: take the raw logits/labels and recompute the
    # metrics directly with `compute_metrics` (same split and metric function as above).
    predictions_retrained = best_trainer_retrained.predict(eval_dataset)
    logits_retrained = predictions_retrained.predictions
    labels_retrained = predictions_retrained.label_ids

    calculated_metrics_retrained = compute_metrics((logits_retrained, labels_retrained))

    print("\nExplicitly Calculated Metrics for Retrained DistilBERT on Evaluation Set:")
    print(f"Accuracy: {calculated_metrics_retrained['accuracy']:.4f}")
    print(f"Precision: {calculated_metrics_retrained['precision']:.4f}")
    print(f"Recall: {calculated_metrics_retrained['recall']:.4f}")
    print(f"F1 Score: {calculated_metrics_retrained['f1']:.4f}")

    # Persist model weights and tokenizer files side by side so both can be
    # reloaded from the same directory.
    model_to_train.save_pretrained("./best_distilbert_model")
    tokenizer.save_pretrained("./best_distilbert_model")
    print("\nRetrained DistilBERT model and tokenizer saved to './best_distilbert_model'")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Best hyperparameters: {'learning_rate': 7.668938387754336e-05, 'weight_decay': 0.17315718570924743, 'batch_size': 48, 'num_train_epochs': 3}
Best validation accuracy from Optuna study: 0.9663551401869159

Training DistilBERT model with best hyperparameters...


Step,Training Loss
500,0.101


Training complete.




Final Evaluation Metrics for **Retrained** DistilBERT Model with Best Hyperparameters:
eval_loss: 0.1229
eval_accuracy: 0.9654
eval_precision: 0.9658
eval_recall: 0.9654
eval_f1: 0.9651
eval_runtime: 7.6080
eval_samples_per_second: 281.2830
eval_steps_per_second: 35.2260
epoch: 3.0000

Explicitly Calculated Metrics for Retrained DistilBERT on Evaluation Set:
Accuracy: 0.9654
Precision: 0.9658
Recall: 0.9654
F1 Score: 0.9651

Retrained DistilBERT model and tokenizer saved to './best_distilbert_model'


## Sentiment Prediction with DistilBERT

This code block defines a function `predict_sentiment` that takes a text input and uses the retrained DistilBERT model to classify its sentiment. Since the model was originally trained on binary (positive/negative) labels, a heuristic is applied to infer 'neutral' sentiment:

*   If the model's highest predicted probability for either positive or negative is below a certain `confidence_threshold` (e.g., 0.6), the sentiment is classified as 'Neutral'.
*   Otherwise, the sentiment is classified as 'Positive' or 'Negative' based on the highest probability.

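This heuristic only approximates 'neutral', because the model was never trained on a neutral class. A fine-tuned binary classifier is often highly confident even on neutral-sounding text, so such inputs can still come back as 'Positive' or 'Negative' (see the example outputs below). For reliable 'neutral' predictions, the model would need to be re-trained on a dataset that includes a dedicated 'neutral' class.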

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

# Reload the fine-tuned tokenizer and model from the directory they were saved to
model_path = "./best_distilbert_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)

# `.eval()` returns the module itself, so loading and switching to evaluation
# mode can be chained.
model = AutoModelForSequenceClassification.from_pretrained(model_path).eval()

def predict_sentiment(text, confidence_threshold=0.6):
    """Classify ``text`` as Positive, Negative or Neutral.

    The binary model's logits are turned into probabilities with softmax
    (index 0 = negative, index 1 = positive). A class is reported only when it
    is the more likely one and its probability exceeds ``confidence_threshold``;
    otherwise the result is "Neutral".

    Args:
        text: input passed to the module-level ``tokenizer``.
        confidence_threshold: probability the winning class must exceed.

    Returns:
        Tuple ``(sentiment, probs)`` where ``probs`` is a dict with the keys
        ``"Negative_prob"`` and ``"Positive_prob"``.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(**encoded).logits

    class_probs = F.softmax(logits, dim=-1)[0].tolist()
    negative_prob, positive_prob = class_probs[0], class_probs[1]
    scores = {"Negative_prob": negative_prob, "Positive_prob": positive_prob}

    if positive_prob > negative_prob and positive_prob > confidence_threshold:
        return "Positive", scores
    if negative_prob > positive_prob and negative_prob > confidence_threshold:
        return "Negative", scores
    return "Neutral", scores

# Example usage: two clearly polarized reviews and two neutral-sounding ones
text1 = "This product is absolutely fantastic! I love it."
text2 = "I am very disappointed with this service, it was terrible."
text3 = "The product is okay, nothing special, just average."
text4 = "This is neither good nor bad, just a product."

# Classify all samples first, in order ...
(sentiment1, probs1), (sentiment2, probs2), (sentiment3, probs3), (sentiment4, probs4) = [
    predict_sentiment(sample) for sample in (text1, text2, text3, text4)
]

# ... then report them in the same order
print(f"Text: '{text1}'\nSentiment: {sentiment1}, Probabilities: {probs1}\n")
print(f"Text: '{text2}'\nSentiment: {sentiment2}, Probabilities: {probs2}\n")
print(f"Text: '{text3}'\nSentiment: {sentiment3}, Probabilities: {probs3}\n")
print(f"Text: '{text4}'\nSentiment: {sentiment4}, Probabilities: {probs4}\n")


Text: 'This product is absolutely fantastic! I love it.'
Sentiment: Positive, Probabilities: {'Negative_prob': 0.000790383608546108, 'Positive_prob': 0.9992096424102783}

Text: 'I am very disappointed with this service, it was terrible.'
Sentiment: Negative, Probabilities: {'Negative_prob': 0.9949712753295898, 'Positive_prob': 0.0050287023186683655}

Text: 'The product is okay, nothing special, just average.'
Sentiment: Positive, Probabilities: {'Negative_prob': 0.0011189732467755675, 'Positive_prob': 0.9988810420036316}

Text: 'This is neither good nor bad, just a product.'
Sentiment: Positive, Probabilities: {'Negative_prob': 0.011385664343833923, 'Positive_prob': 0.9886143207550049}



## Interactive Sentiment Prediction

Use the input box below to enter text and see the DistilBERT model's sentiment prediction (Positive, Negative, or Neutral). Type `quit` to stop the interactive session.

In [None]:
# Read-classify-print loop; ends when the user types 'quit' (or 'exit').
while True:
    # Strip once so surrounding whitespace cannot defeat the exit check.
    user_input = input("\nEnter text (type 'quit' to exit): ").strip()

    # 'exit' is accepted as an alias: users type it naturally, and otherwise it
    # would be sent to the model as if it were a review.
    if user_input.lower() in {'quit', 'exit'}:
        print("Exiting interactive prediction.")
        break

    # Nothing to classify on an empty line; prompt again instead of scoring it.
    if not user_input:
        continue

    sentiment, probabilities = predict_sentiment(user_input)
    print(f"Sentiment: {sentiment}, Probabilities: {probabilities}")


Enter text (type 'quit' to exit): #reallyglobe? You collect payment online but you can't do online refund? How come you have no 211 live rep to speak to with? bcoz of plandemic?!  but you wanted the customer to get refund at your store? What is the use of online banking?  Your Agent JayDetl7071..said it was ok for me to complain him for adding heat to my fire...you have no customer service globe..everytime I complaint you don't always give satisfactory resolution..#reallyglobe# Remove your customer service if they can't help us and your company to grow! Always unresolved, always unsatisfied, always bad service!
Sentiment: Negative, Probabilities: {'Negative_prob': 0.9985458850860596, 'Positive_prob': 0.001454177894629538}

Enter text (type 'quit' to exit): exit
Sentiment: Neutral, Probabilities: {'Negative_prob': 0.49149182438850403, 'Positive_prob': 0.5085081458091736}

Enter text (type 'quit' to exit): quit
Exiting interactive prediction.


## Acquire and Preprocess Authenticity Data

### Subtask:
Acquire a dataset labeled for authenticity (original vs. computer-generated) and preprocess it for DistilBERT fine-tuning, including tokenization and creating train/validation splits.


In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict, ClassLabel
from transformers import AutoTokenizer

# 1. Load the labeled review dataset
# NOTE(review): this section is titled "authenticity", but the file loaded here is the
# TrustPilot review CSV and the labels mapped in step 3 are 'Positive'/'Negative'
# sentiment labels. No original-vs-computer-generated label is read in this cell.
# TODO: point this at an authenticity-labeled dataset, or rename the section.
csv_file_path = 'Philippine_Business_TrustPilot_Reviews_Labeled.csv'
df = pd.read_csv(csv_file_path, encoding='ISO-8859-1')

# 2. Rename the column containing the text content to 'sentence' and labels to 'label'
df = df.rename(columns={'User Review Body': 'sentence', 'Ground Label': 'label'})

# rename() silently ignores source columns that do not exist, so check the result
# explicitly and fail here with a readable message.
missing_columns = {'sentence', 'label'} - set(df.columns)
if missing_columns:
    raise KeyError(f"Expected columns missing from {csv_file_path}: {sorted(missing_columns)}")

# 3. Define a label mapping and apply it. 'Positive' to 1, 'Negative' to 0.
# Any other value (e.g. 'Neutral') becomes NaN and is dropped in step 4.
label_mapping = {'Positive': 1, 'Negative': 0}
df['label'] = df['label'].map(label_mapping)

# 4. Remove rows where 'sentence' is missing/blank or 'label' could not be mapped
df = df.dropna(subset=['label', 'sentence'])
has_text = df['sentence'].astype(str).str.strip() != ''
# reset_index(drop=True) returns a new frame with a contiguous index. Assigning to it
# below does not trigger SettingWithCopyWarning, and the gappy post-filter index is not
# carried into the Hugging Face dataset as an extra column.
df = df.loc[has_text].reset_index(drop=True)
df['label'] = df['label'].astype(int)  # safe now that the NaN labels are gone

if df.empty:
    raise ValueError("No rows left after cleaning; check the 'Ground Label' values in the CSV.")

print(f"Dataset shape after initial cleaning and label mapping: {df.shape}")
print("Label distribution after cleaning:")
print(df['label'].value_counts())

# 5. Convert preprocessed DataFrame into a Hugging Face `Dataset` object
# preserve_index=False keeps the pandas index out of the dataset columns.
dataset = Dataset.from_pandas(df, preserve_index=False)

# 6. Cast the 'label' column to ClassLabel (required for the stratified split below)
# Class names follow label_mapping: 0 -> 'negative', 1 -> 'positive'.
features = dataset.features.copy()
features['label'] = ClassLabel(names=['negative', 'positive'])
dataset = dataset.cast(features)

# 7. Split the `Dataset` into training and validation sets (80/20, stratified, fixed seed)
train_test_split = dataset.train_test_split(test_size=0.2, stratify_by_column="label", seed=42)
dataset_dict = DatasetDict({
    'train': train_test_split['train'],
    'validation': train_test_split['test'] # Rename 'test' to 'validation' for consistency
})

print(f"Train dataset size: {len(dataset_dict['train'])}")
print(f"Validation dataset size: {len(dataset_dict['validation'])}")

# 8. Initialize the `AutoTokenizer`
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# 9. Define a preprocessing function and apply it
def preprocess_function(examples):
    """Tokenize the ``sentence`` entry of ``examples`` with the notebook-level ``tokenizer``.

    Args:
        examples: Mapping with a "sentence" entry holding the text(s) to encode
            (a batch of rows when called through ``map(..., batched=True)``).

    Returns:
        The tokenizer output, with every sequence truncated and padded to
        exactly 128 tokens.
    """
    tokenizer_options = {"truncation": True, "padding": "max_length", "max_length": 128}
    sentences = examples["sentence"]
    return tokenizer(sentences, **tokenizer_options)

# Tokenize both splits in batches, then (step 10) expose the target column
# under the name 'labels' instead of 'label'.
encoded_dataset = (
    dataset_dict
    .map(preprocess_function, batched=True)
    .rename_column("label", "labels")
)

# Switch the dataset's output format to 'torch' (done in place on encoded_dataset).
encoded_dataset.set_format("torch")

# Show one encoded training row as a sanity check.
print("Preprocessing complete. Encoded dataset example:")
first_train_example = encoded_dataset["train"][0]
print(first_train_example)

Dataset shape after initial cleaning and label mapping: (10698, 13)
Label distribution after cleaning:
label
1    7181
0    3517
Name: count, dtype: int64


Casting the dataset:   0%|          | 0/10698 [00:00<?, ? examples/s]

Train dataset size: 8558
Validation dataset size: 2140


Map:   0%|          | 0/8558 [00:00<?, ? examples/s]

Map:   0%|          | 0/2140 [00:00<?, ? examples/s]

Preprocessing complete. Encoded dataset example:
{'Business Name': 'Globe Telecom', 'Business Average Rating': tensor(1.1000), 'Business Review Grade': 'Bad', 'User Review Title': 'Globe Telecom Please Close Down the doors of Your company.', 'sentence': 'Globe should be closed down. I am both a gamer and an online class student. Whenever I have online classes, I will disconnect because of how much the wifi is so low, and plus There is no rain or wind that is disrupting our internet connection, Its just that the connection is really bad. Whenever I play online games such as Call of Duty Mobile, I will get a good internet connection before the game but whenever the game starts I get 199ms then I disconnect. Plus we always pay our internet bill in time then you give us a bad wifi? Well if that is how it is whats the use of even paying internet bill? I do not care about how hard you convince people that your wifi is good but the truth is you should just close down your company.', 'User Rev

## Train DistilBERT for Authenticity Classification

### Subtask:
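Initialize a new DistilBERT model and fine-tune it on the authenticity-labeled dataset using the best hyperparameters found by Optuna. The cell reuses the `study` object created by the earlier Optuna optimization cells (it must still be present in the kernel), so the hyperparameters are taken from that study rather than re-tuned for this dataset.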


In [None]:
import torch
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Fine-tune a fresh DistilBERT classifier with the hyperparameters stored in the Optuna
# study, evaluate it on the validation split, and save the model and tokenizer.
# Relies on names created by earlier cells: `study`, `encoded_dataset`,
# `compute_metrics` and `tokenizer`.
# NOTE(review): the preprocessing cell that builds `encoded_dataset` maps
# 'Positive'/'Negative' labels, so the two classes trained here appear to be sentiment
# classes rather than original vs. computer-generated. Confirm before relying on
# the "authenticity" naming used below.
if 'study' in globals() and study is not None:
    tuned_params = study.best_params
    print("Best hyperparameters from Optuna study:", tuned_params)
    print("Best validation accuracy from Optuna study:", study.best_value)

    # Checkpoints written during training and the final saved model share this directory.
    authenticity_model_dir = "./distilbert_authenticity_fine_tuned_model"

    # 1. Load the pretrained checkpoint with a two-class classification head.
    # A new model instance is created here instead of reusing one from the
    # hyperparameter search.
    model_for_authenticity_classification = AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=2,
    )

    # 2. Training configuration: every tunable value comes from the study's best trial.
    best_training_args_authenticity = TrainingArguments(
        output_dir=authenticity_model_dir,
        learning_rate=tuned_params["learning_rate"],
        weight_decay=tuned_params["weight_decay"],
        per_device_train_batch_size=tuned_params["batch_size"],
        num_train_epochs=tuned_params["num_train_epochs"],
        report_to="none",
    )

    # 3. Wire the model, arguments, both dataset splits and the metric function together.
    authenticity_trainer = Trainer(
        model=model_for_authenticity_classification,
        args=best_training_args_authenticity,
        train_dataset=encoded_dataset["train"],
        eval_dataset=encoded_dataset["validation"],
        compute_metrics=compute_metrics,
    )

    # 4. Run the fine-tuning.
    print("\nStarting fine-tuning of DistilBERT for authenticity classification...")
    authenticity_trainer.train()
    print("Fine-tuning complete.\n")

    # 5. Evaluate on the validation split and print each metric to four decimals.
    authenticity_metrics = authenticity_trainer.evaluate()

    print("\nEvaluation Metrics for Fine-tuned DistilBERT on Authenticity Dataset:")
    for metric_name, metric_value in authenticity_metrics.items():
        print(f"{metric_name}: {metric_value:.4f}")

    # 6. Save the model and the tokenizer side by side in the same directory.
    authenticity_trainer.save_model(authenticity_model_dir)
    tokenizer.save_pretrained(authenticity_model_dir)
    print("\nFine-tuned DistilBERT model and tokenizer for authenticity saved to './distilbert_authenticity_fine_tuned_model'")
else:
    # No usable study in the kernel namespace: report it and do nothing else.
    print("Error: Optuna study not found. Please run the Optuna optimization cells first.")

Best hyperparameters from Optuna study: {'learning_rate': 7.668938387754336e-05, 'weight_decay': 0.17315718570924743, 'batch_size': 48, 'num_train_epochs': 3}
Best validation accuracy from Optuna study: 0.9663551401869159


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting fine-tuning of DistilBERT for authenticity classification...


Step,Training Loss
