# Setup

In [3]:
import os, re, json, math, random, hashlib, time
import numpy as np, pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack, csr_matrix
import spacy

# Small English spaCy pipeline, loaded once for the whole notebook.
nlp = spacy.load("en_core_web_sm")

# Fix the global RNGs (Python stdlib and NumPy) so stochastic steps repeat.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Directory that holds the input data files.
DATA_DIR = "data/"


# ML Model Training

In [4]:
import pandas as pd
from datasets import Dataset
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
from sklearn.metrics import f1_score, accuracy_score
from torch import nn  # <-- Added for class-weighted loss
import torch.nn.functional as F # For Focal CE Loss

from sklearn.model_selection import train_test_split
from sklearn.utils import resample # for over-sampling

class FocalLoss(nn.Module):
    """Multi-class focal loss built on top of softmax cross-entropy.

    Per-sample loss is ``(1 - p_t) ** gamma * CE`` where ``p_t = exp(-CE)`` is
    the probability the model assigns to the true class. An optional per-class
    weight ``alpha`` rescales each sample by the weight of its target class.

    Parameters
    ----------
    alpha : sequence or torch.Tensor, optional
        Per-class weights indexed by class id. ``None`` disables weighting.
    gamma : float
        Focusing exponent; ``0`` reduces to (optionally weighted) cross-entropy.
    reduction : {'mean', 'sum', 'none'}
        How per-sample losses are aggregated.

    Raises
    ------
    ValueError
        If ``reduction`` is not one of the supported modes.
    """

    def __init__(self, alpha=None, gamma=2, reduction='mean'):
        super().__init__()
        if reduction not in ("mean", "sum", "none"):
            raise ValueError(
                f"reduction must be 'mean', 'sum' or 'none', got {reduction!r}"
            )
        self.alpha = alpha
        self.gamma = gamma
        # Previously accepted but silently ignored; now honoured in forward().
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Return the focal loss for ``inputs`` (logits) against ``targets`` (class ids)."""
        ce_loss = F.cross_entropy(inputs, targets, reduction="none")
        pt = torch.exp(-ce_loss)  # probability assigned to the true class
        loss = (1 - pt) ** self.gamma * ce_loss
        if self.alpha is not None:
            # Accept lists/arrays and make sure the weights live on the same
            # device / dtype as the loss before indexing by target class.
            alpha = torch.as_tensor(self.alpha, dtype=loss.dtype, device=loss.device)
            loss = loss * alpha[targets]
        if self.reduction == "sum":
            return loss.sum()
        if self.reduction == "none":
            return loss
        return loss.mean()


# Load the labelled reviews (JSON Lines). The path is built from DATA_DIR so it
# works on any OS instead of relying on a Windows-style backslash literal.
df_clean = pd.read_json(os.path.join(DATA_DIR, "ground_truth-2.json"), lines=True)
print(df_clean.head())

# Label name -> class id, plus the inverse for decoding predictions.
label_map = {"Advertisement":0, "Irrelevant Content":1,
             "Rant without visiting":2, "None":3}

inv_label_map = {v:k for k,v in label_map.items()}

# Map labels to integers; rows whose label is not in label_map become NaN and
# are dropped. `.copy()` detaches the filtered frame so the column assignments
# below do not write into a view of the original.
df_clean["labels"] = df_clean["GPT-label"].map(label_map)
df_clean = df_clean.dropna(subset=["labels"]).copy()
df_clean["labels"] = df_clean["labels"].astype(int)

# Fill missing text BEFORE casting to str: casting first turns NaN into the
# literal string "nan", after which fillna("") has nothing left to fill.
df_clean["text_clean"] = df_clean["text_clean"].fillna("").astype(str)

# Model input = cleaned text followed by the three rule-based flags rendered
# as "True"/"False" tags (missing flags are treated as False).
df_clean["text_for_model"] = (
    df_clean["text_clean"] +
    #" [SENTIMENT: " + df_clean["sentiment_label"].astype(str) + "]" +
    " [RULE_AD: " + df_clean["rule_advertisement"].fillna(False).astype(str) + "]" +
    " [RULE_IRR: " + df_clean["rule_irrelevant"].fillna(False).astype(str) + "]" +
    " [RULE_RANT: " + df_clean["rule_rant_without_visit"].fillna(False).astype(str) + "]"
)

# Stratified 90/10 split; seeded so the held-out set is the same on every run.
train_df, test_df = train_test_split(
    df_clean,
    test_size=0.1,
    stratify=df_clean["labels"],
    random_state=SEED,
)


        user_id               name                                gmap_id  \
0  1.130049e+20              Ben A  0x8626a5074449db49:0x79fa4fa238746342   
1  1.088190e+20       michele vess  0x8626baf396726b1f:0xd33145fb706bec2d   
2  1.047214e+20  Danyelle Williams  0x8631c517e81069d3:0x720670c8e8f84fc3   
3  1.140741e+20           CK Moody  0x86249e700409f4af:0x204bb6ea088067c2   
4  1.013455e+20              Ant01   0x889de6c064ee6423:0x5b69e22b1279b2b   

   rating                                           text_raw  \
0       1  It's a nice enough McDonalds, but I can't stan...   
1       4  Really great street tacos tons of sauces for t...   
2       4               Went to visit someone place was nicw   
3       5  Great store to visit with a large variety of p...   
4       5  Excellent! Very knowledgeable sales manager.  ...   

                                          text_clean           time  \
0  nice mcdonalds cant stand self order kiosk obv...  1547600291921   
1  great 

In [5]:
# Oversampling: resample every class (with replacement) up to the size of the
# largest class, then shuffle. Only the training split is oversampled; the
# held-out split keeps its natural class distribution.
dfs = [train_df[train_df['labels'] == i] for i in train_df['labels'].unique()]
max_size = max(len(df) for df in dfs)

# Seeded so the balanced training set is identical across kernel restarts.
dfs_upsampled = [
    resample(df, replace=True, n_samples=max_size, random_state=SEED)
    for df in dfs
]
train_df = pd.concat(dfs_upsampled).sample(frac=1, random_state=SEED)

# Hugging Face datasets. preserve_index=False keeps the (shuffled, duplicated)
# pandas index from being stored as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(
    train_df[["text_for_model", "labels"]], preserve_index=False
)  # over-sampled
test_dataset = Dataset.from_pandas(
    test_df[["text_for_model", "labels"]], preserve_index=False
)

# Tokenize
model_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize(batch):
    """Tokenize the `text_for_model` field of a batch.

    Every sequence is truncated and padded to exactly 256 tokens.
    """
    texts = batch["text_for_model"]
    return tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=256,
    )





In [8]:
# Compute class weights: inverse class frequency over df_clean, normalised to
# sum to 1 and placed on the GPU when one is available.
# NOTE(review): these weights are not consumed by the FocalLoss(gamma=2)
# configured later in the notebook; they only matter if the weighted
# CrossEntropyLoss alternative is re-enabled.
labels = df_clean["labels"]
class_counts = labels.value_counts().sort_index()
class_weights = 1.0 / class_counts
class_weights = torch.tensor(class_weights.values, dtype=torch.float32).to("cuda" if torch.cuda.is_available() else "cpu")
class_weights /= class_weights.sum()

# Load Model. The number of classes and the id<->name mappings come from
# label_map so the classification head and the saved config stay in sync with
# the labels instead of relying on a hard-coded 4.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_map),
    id2label=inv_label_map,
    label2id=label_map,
    ignore_mismatched_sizes=True,
    hidden_dropout_prob = 0.1,
    attention_probs_dropout_prob = 0.2
)

# Tokenize both splits in batches.
train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/33424 [00:00<?, ? examples/s]

Map:   0%|          | 0/1058 [00:00<?, ? examples/s]

In [10]:
# Loss used for fine-tuning: focal loss with gamma=2 and no class weighting.
# (Alternative that was tried: nn.CrossEntropyLoss(weight=class_weights).)
loss_fct = FocalLoss(gamma=2)


def compute_loss(model, inputs, return_outputs=False):
    """Run `model` on `inputs` and score its logits with `loss_fct`.

    The "labels" entry is removed from `inputs` (the dict is mutated) before
    the forward pass. Returns the loss, or `(loss, outputs)` when
    `return_outputs` is truthy.
    """
    targets = inputs.pop("labels")
    model_out = model(**inputs)
    focal = loss_fct(model_out.logits, targets)
    if return_outputs:
        return (focal, model_out)
    return focal

# Training Setup
# Evaluation and checkpointing both happen every 25 steps. Because that would
# otherwise leave one checkpoint on disk per 25 steps for the whole run, only
# the most recent checkpoints are kept, and the checkpoint with the best
# macro-F1 is tracked and restored into `model` when training finishes.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    save_strategy="steps",
    eval_steps=25, save_steps=25,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",  # key returned by compute_metrics
    greater_is_better=True,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,  # <-- Increased for better learning
    weight_decay=1e-2,
    logging_dir="./logs",
    logging_steps=50,
    warmup_ratio=0.01,
    seed=SEED,  # make the Trainer's own seeding explicit
)

# Metrics
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 from a `(logits, labels)` pair.

    The predicted class of each row is the argmax of its logits.
    """
    logits, labels = eval_pred
    y_pred = torch.argmax(torch.tensor(logits), axis=1)
    y_true = torch.tensor(labels)

    scores = {}
    scores["accuracy"] = accuracy_score(y_true, y_pred)
    scores["f1_macro"] = f1_score(y_true, y_pred, average="macro")
    return scores

# Trainer subclass that swaps the model's built-in loss for `loss_fct`.
class WeightedTrainer(Trainer):
    """`Trainer` whose loss is `loss_fct` applied to the model's logits."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the custom loss, plus the model outputs if requested.

        "labels" is popped out of `inputs` before the forward pass, so the
        model itself never receives it. Extra keyword arguments are accepted
        and ignored.
        """
        targets = inputs.pop("labels")
        model_out = model(**inputs)
        focal = loss_fct(model_out.logits, targets)  # Focal loss
        if return_outputs:
            return (focal, model_out)
        return focal

# Initialize trainer using the custom-loss subclass.
# `processing_class` replaces the deprecated `tokenizer` keyword (the old name
# is what triggers the warning printed when this cell runs).
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,  # optional, for monitoring metrics
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = WeightedTrainer(


In [None]:
# Train the model: fine-tunes `model` on `train_dataset` with the settings in
# `training_args`, using `test_dataset` for the periodic evaluations.
trainer.train()

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = WeightedTrainer(
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mestelle-sim[0m ([33mestelle-sim-htx[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss,Validation Loss,Accuracy,F1 Macro
25,No log,0.773462,0.05482,0.025986
50,0.761300,0.580902,0.753308,0.314117
75,0.761300,0.493919,0.747637,0.377855
100,0.564400,0.457019,0.675803,0.37781
125,0.564400,0.44614,0.613422,0.398451
150,0.400300,0.385412,0.68242,0.442362
175,0.400300,0.383598,0.689981,0.456425
200,0.288400,0.318394,0.721172,0.47444
225,0.288400,0.196361,0.8431,0.554689
250,0.213300,0.29709,0.76087,0.492031


KeyboardInterrupt: 

In [11]:
# Reload the saved "best" checkpoint and evaluate it on the held-out split.
# NOTE(review): nothing visible in this notebook writes results/best; it is
# presumably a checkpoint copied there by hand - confirm before re-running.
# The path is built with os.path.join so it is not tied to Windows separators,
# and the checkpoint is loaded through the Auto class rather than through an
# existing model instance.
model = AutoModelForSequenceClassification.from_pretrained(os.path.join("results", "best"))
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,  # optional, for monitoring metrics
    processing_class=tokenizer,  # replaces the deprecated `tokenizer=` keyword
    compute_metrics=compute_metrics
)
trainer.evaluate()

  trainer = WeightedTrainer(


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mestelle-sim[0m ([33mestelle-sim-htx[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


{'eval_loss': 0.15624478459358215,
 'eval_model_preparation_time': 0.001,
 'eval_accuracy': 0.8610586011342155,
 'eval_f1_macro': 0.5893328661136195,
 'eval_runtime': 3.7836,
 'eval_samples_per_second': 279.63,
 'eval_steps_per_second': 8.986}

In [None]:
import optuna
from sklearn.metrics import f1_score

def objective(trial, probs_val, y_val):
    """Optuna objective: macro-F1 of thresholded predictions.

    Three per-class confidence thresholds (``t0``, ``t1``, ``t2``, each sampled
    from [0.01, 0.6]) are applied to classes 0..2. A row keeps its argmax class
    only if that class is not 3 and its probability reaches the class
    threshold; otherwise the row is assigned class 3.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the thresholds.
    probs_val : array-like of shape (n_samples, n_classes)
        Class probabilities for the validation rows.
    y_val : array-like of shape (n_samples,)
        True class ids.

    Returns
    -------
    float
        Macro-averaged F1 of the thresholded predictions.

    Raises
    ------
    ValueError
        If ``probs_val`` is not two-dimensional.
    """
    none_class = 3
    # Threshold for the fallback class is +inf so it can never "pass" the
    # confidence test; rows whose argmax is class 3 therefore stay class 3.
    thresholds = np.array([
        trial.suggest_float("t0", 0.01, 0.6),
        trial.suggest_float("t1", 0.01, 0.6),
        trial.suggest_float("t2", 0.01, 0.6),
        np.inf,
    ])

    probs_val = np.asarray(probs_val, dtype=float)
    if probs_val.ndim != 2:
        raise ValueError(
            f"probs_val must be 2-D (n_samples, n_classes), got shape {probs_val.shape}"
        )

    top = probs_val.argmax(axis=1)
    confident = probs_val.max(axis=1) >= thresholds[top]
    preds = np.where(confident, top, none_class)
    return f1_score(y_val, preds, average="macro")

# Class probabilities for the held-out split, in the same row order as
# test_df. Computed here so this cell no longer depends on a `probs` variable
# left over from an earlier kernel session.
# NOTE(review): the thresholds are tuned on the same split that is used for
# evaluation, so the reported "best" score is optimistic.
probs = torch.softmax(
    torch.as_tensor(trainer.predict(test_dataset).predictions), dim=1
).numpy()

# Seeded sampler so the search returns the same thresholds on every run.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(lambda t: objective(t, probs, test_df['labels'].values), n_trials=200)
print("best", study.best_value, study.best_params)

[I 2025-08-31 03:03:00,694] A new study created in memory with name: no-name-989bccbf-44ae-42ea-96c4-b2c5983ed097
[I 2025-08-31 03:03:00,699] Trial 0 finished with value: 0.5912624805615444 and parameters: {'t0': 0.25960178271694334, 't1': 0.5078362753600324, 't2': 0.5843226584624087}. Best is trial 0 with value: 0.5912624805615444.
[I 2025-08-31 03:03:00,702] Trial 1 finished with value: 0.5893328661136195 and parameters: {'t0': 0.07164433886849858, 't1': 0.24845626230199702, 't2': 0.19542456053521193}. Best is trial 0 with value: 0.5912624805615444.
[I 2025-08-31 03:03:00,705] Trial 2 finished with value: 0.600787583787466 and parameters: {'t0': 0.06284495528368667, 't1': 0.15048693408747016, 't2': 0.5356792394683623}. Best is trial 2 with value: 0.600787583787466.
[I 2025-08-31 03:03:00,709] Trial 3 finished with value: 0.5893328661136195 and parameters: {'t0': 0.04395693575425268, 't1': 0.157123254223161, 't2': 0.2906453789333608}. Best is trial 2 with value: 0.600787583787466.
[I 

best 0.6146054388302443 {'t0': 0.5316590285922207, 't1': 0.5266510967355174, 't2': 0.5370496628602927}


In [32]:
def predict_with_thresholds(
    model,
    tokenizer,
    texts,
    device='cuda',
    ts=None,
    batch_size=32
):
    """
    Predict class labels using per-class confidence thresholds.

    A text keeps its argmax class only if that class is not the fallback
    class (3) and its softmax probability is at least that class's threshold;
    otherwise it is assigned class 3.

    Parameters:
    - model: Hugging Face AutoModelForSequenceClassification
    - tokenizer: corresponding tokenizer
    - texts: a string or a sequence of strings
    - device: 'cuda' or 'cpu'
    - ts: iterable with the thresholds for classes 0, 1, 2 (in that order).
      When None, the values of the global `study.best_params` are read at
      call time, so the function can be defined before the study has run.
    - batch_size: number of samples per forward pass (must be >= 1)

    Returns:
    - preds: integer np.array of predicted class indices, one per text

    Raises:
    - ValueError: if fewer than three thresholds are given or batch_size < 1
    """
    none_class = 3

    if ts is None:
        # Looked up lazily instead of in the signature: a default evaluated at
        # definition time would freeze whatever study existed back then.
        ts = study.best_params.values()
    thresholds = [float(t) for t in ts]
    if len(thresholds) < none_class:
        raise ValueError(
            f"expected {none_class} thresholds (classes 0..{none_class - 1}), "
            f"got {len(thresholds)}"
        )
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # A bare string is one text, not a sequence of characters; anything else
    # (tuple, pandas Series, generator) is materialised as a plain list so that
    # slicing yields what the tokenizer expects.
    texts = [texts] if isinstance(texts, str) else list(texts)

    model.eval()
    model.to(device)
    preds = []

    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            enc = tokenizer(
                batch_texts,
                return_tensors='pt',
                padding=True,
                truncation=True,
                max_length=256
            ).to(device)
            logits = model(**enc).logits  # [batch, num_classes]
            probs = torch.softmax(logits, dim=1).cpu().numpy()

            for p in probs:
                top_idx = int(np.argmax(p))
                if top_idx != none_class and p[top_idx] >= thresholds[top_idx]:
                    preds.append(top_idx)
                else:
                    preds.append(none_class)

    # Explicit dtype so an empty input still yields an integer array.
    return np.array(preds, dtype=int)

# Testing


In [48]:
from pprint import pprint

# Quick sanity check: run a handful of hand-written reviews through the
# thresholded predictor and show each text next to its decoded label.
test_texts = [
    "come my shop cheap phone",
    "place suck never come back",
    "had coffee here yesterday nice staff good service",
    "place terrible should go yanhuis restaurant instead food way better",
    "from outside can tell food kinda suck so just left 1 star"
]
preds = predict_with_thresholds(
    model,
    tokenizer,
    test_texts,
    device='cuda' if torch.cuda.is_available() else 'cpu',
)
zipped = zip(test_texts, (inv_label_map[class_id] for class_id in preds))
pprint(list(zipped))

[('come my shop cheap phone', 'Advertisement'),
 ('place suck never come back', 'Rant without visiting'),
 ('had coffee here yesterday nice staff good service', 'None'),
 ('place terrible should go yanhuis restaurant instead food way better',
  'None'),
 ('from outside can tell food kinda suck so just left 1 star', 'None')]


In [49]:
# Inspect the raw (un-thresholded) class probabilities for the sample texts.

# 1. Pick the device, move the model there and switch to inference mode so
#    dropout is disabled.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# 2. Tokenize the texts with the same 256-token cap used for training.
encodings = tokenizer(
    test_texts, truncation=True, padding=True, max_length=256, return_tensors="pt"
)

# 3. Move input tensors to the same device as the model.
encodings = {k: v.to(device) for k, v in encodings.items()}

# 4. Forward pass without building an autograd graph.
with torch.no_grad():
    outputs = model(**encodings)

# 5. Softmax over classes, then back to CPU for numpy.
probs = torch.softmax(outputs.logits, dim=1).cpu().numpy()

# 6. Decode the argmax class through inv_label_map (id -> name) rather than
#    relying on the key order of label_map.
for text, prob in zip(test_texts, probs):
    pred_label = inv_label_map[int(prob.argmax())]
    print(f"Review: {text}")
    print(f"Predicted label: {pred_label}, probabilities: {prob}\n")

Review: come my shop cheap phone
Predicted label: Advertisement, probabilities: [0.84301347 0.06525601 0.00726939 0.08446104]

Review: place suck never come back
Predicted label: Rant without visiting, probabilities: [0.0170664  0.16881512 0.68056786 0.1335506 ]

Review: had coffee here yesterday nice staff good service
Predicted label: None, probabilities: [0.04042015 0.10484689 0.00880323 0.8459298 ]

Review: place terrible should go yanhuis restaurant instead food way better
Predicted label: None, probabilities: [0.18432072 0.1197008  0.00792617 0.68805236]

Review: from outside can tell food kinda suck so just left 1 star
Predicted label: Rant without visiting, probabilities: [0.00744062 0.20828566 0.44916686 0.33510688]

