In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import torch
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the source CSV into a pandas frame, then wrap that frame as a
# Hugging Face Dataset so it can be mapped and split in the cells that follow.
df = pd.read_csv(
    filepath_or_buffer='../data/5a_stage0_notsure_final_dataset_no_definition_drilldown_availability.csv',
)
dataset = Dataset.from_pandas(df)


In [3]:

# Preprocessing setup: choose the pretrained checkpoint and load its tokenizer.
# `model_name` is reused further down when the classification model is loaded,
# so the tokenizer and the model are taken from the same checkpoint.
model_name = "microsoft/deberta-v3-small"
tokenizer  = AutoTokenizer.from_pretrained(model_name)




In [7]:
def preprocess(examples):
    """Tokenize a batch of queries and attach their labels.

    Args:
        examples: Batch mapping that provides an "NL_Query" column (the text
            passed to the module-level ``tokenizer``) and a "Label" column.

    Returns:
        The tokenizer output for the batch, with the batch's "Label" values
        stored under the key "labels".
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    encoded = tokenizer(examples["NL_Query"], **tokenizer_options)

    # The key must be spelled 'labels' (the CSV column is 'Label').
    encoded["labels"] = examples["Label"]
    return encoded
# Tokenize the whole dataset in batches.
tokenized_datasets = dataset.map(preprocess, batched=True)

# Train/test split (80/20). The shuffle is seeded so the same rows land in
# train and eval on every Restart & Run All; without a seed the reported
# metrics are measured on a different held-out set each run.
split = tokenized_datasets.train_test_split(test_size=0.2, seed=42)
train_dataset = split["train"]
eval_dataset = split["test"]

Map: 100%|██████████| 11254/11254 [00:00<00:00, 23025.14 examples/s]


In [5]:
def compute_metrics(pred):
    """Score one evaluation pass with accuracy and macro-averaged F1.

    Args:
        pred: Object exposing ``label_ids`` (the reference labels) and
            ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted class).

    Returns:
        dict with the keys "accuracy" and "f1".
    """
    from sklearn.metrics import accuracy_score, f1_score

    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    scores = {}
    scores["accuracy"] = accuracy_score(y_true, y_pred)
    scores["f1"] = f1_score(y_true, y_pred, average="macro")
    return scores

In [8]:
# Model: load the checkpoint with a 4-way sequence-classification head
# (the load message saved with this notebook reports the classifier/pooler
# weights as newly initialised, i.e. they are learned during fine-tuning).
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)

# Training setup.
# The batch size and learning rate below are the values that produced the
# training log saved with this notebook: 1689 optimizer steps over 3 epochs
# (563 per epoch for the 80% split of 11254 rows) means 16 examples per step,
# and the logged learning rates (~1.41e-05 at epoch 0.89, ~2.24e-06 at epoch
# 2.66) fit a 2e-5 peak rate assuming the default linear decay. The cell
# previously said 32 / 2e-4, which does not reproduce the recorded metrics.
training_args = TrainingArguments(
    output_dir="out",
    # NOTE(review): newer transformers releases spell this `eval_strategy`;
    # confirm against the installed version before upgrading.
    evaluation_strategy="epoch",
    save_strategy="epoch",            # must match the evaluation schedule
    load_best_model_at_end=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                        
  0%|          | 0/1689 [08:38<?, ?it/s]          

{'loss': 0.1755, 'grad_norm': 0.018828876316547394, 'learning_rate': 1.4079336885731204e-05, 'epoch': 0.89}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                        

[A[A                                           
  0%|          | 0/1689 [09:22<?, ?it/s]          
[A
[A

{'eval_loss': 0.008903995156288147, 'eval_accuracy': 0.9986672589960017, 'eval_f1': 0.9961248259079569, 'eval_runtime': 18.0162, 'eval_samples_per_second': 124.943, 'eval_steps_per_second': 7.826, 'epoch': 1.0}


                                        
  0%|          | 0/1689 [12:30<?, ?it/s]           

{'loss': 0.0013, 'grad_norm': 0.004671952221542597, 'learning_rate': 8.158673771462404e-06, 'epoch': 1.78}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                        

[A[A                                           
  0%|          | 0/1689 [13:41<?, ?it/s]           
[A
[A

{'eval_loss': 0.0033373613841831684, 'eval_accuracy': 0.9995557529986673, 'eval_f1': 0.9986983624659553, 'eval_runtime': 17.789, 'eval_samples_per_second': 126.539, 'eval_steps_per_second': 7.926, 'epoch': 2.0}


                                        
  0%|          | 0/1689 [16:19<?, ?it/s]           

{'loss': 0.0003, 'grad_norm': 0.0051642595790326595, 'learning_rate': 2.238010657193606e-06, 'epoch': 2.66}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                        

[A[A                                           
  0%|          | 0/1689 [17:52<?, ?it/s]           
[A
[A

{'eval_loss': 0.0033935019746422768, 'eval_accuracy': 0.9995557529986673, 'eval_f1': 0.9986983624659553, 'eval_runtime': 17.6709, 'eval_samples_per_second': 127.385, 'eval_steps_per_second': 7.979, 'epoch': 3.0}


                                        
100%|██████████| 1689/1689 [12:53<00:00,  2.18it/s]

{'train_runtime': 773.0142, 'train_samples_per_second': 34.94, 'train_steps_per_second': 2.185, 'train_loss': 0.0524296138905399, 'epoch': 3.0}





TrainOutput(global_step=1689, training_loss=0.0524296138905399, metrics={'train_runtime': 773.0142, 'train_samples_per_second': 34.94, 'train_steps_per_second': 2.185, 'total_flos': 894516756286464.0, 'train_loss': 0.0524296138905399, 'epoch': 3.0})

In [16]:
from transformers import pipeline

# Wrap the fine-tuned model in a text-classification pipeline for spot checks.
clf = pipeline(
    task="text-classification",
    model=trainer.model,   # a saved checkpoint path could be passed instead
    tokenizer=tokenizer,
    device=-1,             # -1 selects the CPU
)

samples = [
    "Why did revenue drop in Q2 2024?",
    "Show me the EBITDA trend over the last 4 quarters.",
    "What is the Cost of Goods Sold last year?",
    "Can you detect anomalies in our debt-to-equity ratio?",
]

# Classify the queries one at a time and print each next to its prediction.
for query in samples:
    prediction = clf(query)
    print(query, "→", prediction)


Why did revenue drop in Q2 2024? → [{'label': 'LABEL_2', 'score': 0.9996572732925415}]
Show me the EBITDA trend over the last 4 quarters. → [{'label': 'LABEL_3', 'score': 0.9980450868606567}]
What is the Cost of Goods Sold last year? → [{'label': 'LABEL_0', 'score': 0.9999179840087891}]
Can you detect anomalies in our debt-to-equity ratio? → [{'label': 'LABEL_2', 'score': 0.9998264908790588}]


In [None]:
# Label ids: 0 -> Query, 1 -> Comparison, 2 -> Anomaly Detection, 3 -> Others
# (the pipeline output above reports class n as LABEL_n)