In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import torch
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [47]:
# Load the labelled queries from CSV.
df = pd.read_csv('../data/5b_stage0_refined.csv')

# Fail fast with a readable message: `preprocess` reads exactly these two
# columns, and a missing column or a NaN cell would otherwise only surface
# later as an obscure tokenizer / loss error.
required_columns = ["NL_Query", "Label"]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"CSV is missing required column(s): {missing_columns}")
if df[required_columns].isna().any().any():
    raise ValueError(
        "Columns 'NL_Query' / 'Label' contain missing values; clean the CSV before training."
    )

# Wrap the frame in a `datasets.Dataset` so it can be tokenized with `.map`.
dataset = Dataset.from_pandas(df)


In [48]:

# Checkpoint identifier; the tokenizer is loaded from it here and the
# name is reused further down the notebook.
model_name = "microsoft/deberta-v3-small"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)




In [49]:
def preprocess(examples):
    """Tokenize a batch of queries and attach their class labels.

    Args:
        examples: Batch mapping handed over by ``Dataset.map``; the
            ``"NL_Query"`` and ``"Label"`` columns are read.

    Returns:
        The tokenizer output for the queries (padded / truncated to 128
        tokens) with the batch's ``"Label"`` values stored under ``"labels"``.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    encoded = tokenizer(examples["NL_Query"], **tokenizer_options)

    # The key must be the plural "labels" (not the CSV's "Label").
    encoded["labels"] = examples["Label"]
    return encoded
# Tokenize the whole dataset in batches; `preprocess` adds the model inputs
# and the "labels" column.
tokenized_datasets = dataset.map(preprocess, batched=True)

# Train/test split: hold out 20% for evaluation.
# A fixed seed keeps the split identical across kernel restarts, so metrics
# from different runs are computed on the same held-out examples.
split = tokenized_datasets.train_test_split(test_size=0.2, seed=42)
train_dataset = split["train"]
eval_dataset = split["test"]

Map: 100%|██████████| 12064/12064 [00:00<00:00, 18519.57 examples/s]


In [50]:
def compute_metrics(pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores); the arg-max over the last
            axis of ``predictions`` is used as the predicted class.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    from sklearn.metrics import accuracy_score, f1_score

    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(axis=-1)

    accuracy = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    return {"accuracy": accuracy, "f1": macro_f1}

In [51]:
# Model
# The classification head is not part of the pretrained checkpoint and is
# randomly initialised when the model is built (see the "newly initialized"
# warning this cell prints). Seed torch's global RNG first so that the
# initialisation, and therefore the whole run, is repeatable after a
# kernel restart.
SEED = 42
torch.manual_seed(SEED)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)

# Training setup
training_args = TrainingArguments(
    output_dir="out",
    evaluation_strategy="epoch",      # evaluate once per epoch
    save_strategy="epoch",            # must match the evaluation strategy
    load_best_model_at_end=True,      # needs matching eval / save strategies
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=2e-4,
    num_train_epochs=3,
    weight_decay=0.01,
    seed=SEED,                        # same seed for the training loop itself
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model; the returned TrainOutput is displayed as the cell result.
trainer.train()


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 33%|███▎      | 302/906 [03:45<18:51,  1.87s/it]
 33%|███▎      | 302/906 [04:03<18:51,  1.87s/it]

{'eval_loss': 0.011976677924394608, 'eval_accuracy': 0.9979278905926233, 'eval_f1': 0.9940236434489308, 'eval_runtime': 18.0698, 'eval_samples_per_second': 133.538, 'eval_steps_per_second': 4.206, 'epoch': 1.0}


 55%|█████▌    | 500/906 [06:32<05:05,  1.33it/s]  

{'loss': 0.0612, 'grad_norm': 0.0019127613632008433, 'learning_rate': 8.962472406181016e-05, 'epoch': 1.66}


 67%|██████▋   | 604/906 [07:52<03:53,  1.29it/s]
 67%|██████▋   | 604/906 [08:11<03:53,  1.29it/s]

{'eval_loss': 2.9139500838937238e-05, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_runtime': 19.2028, 'eval_samples_per_second': 125.659, 'eval_steps_per_second': 3.958, 'epoch': 2.0}


100%|██████████| 906/906 [12:45<00:00,  1.15it/s]
100%|██████████| 906/906 [13:08<00:00,  1.15it/s]

{'eval_loss': 1.9513141523930244e-05, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_runtime': 22.9107, 'eval_samples_per_second': 105.322, 'eval_steps_per_second': 3.317, 'epoch': 3.0}


100%|██████████| 906/906 [13:13<00:00,  1.14it/s]

{'train_runtime': 793.2481, 'train_samples_per_second': 36.499, 'train_steps_per_second': 1.142, 'train_loss': 0.03379114517822829, 'epoch': 3.0}





TrainOutput(global_step=906, training_loss=0.03379114517822829, metrics={'train_runtime': 793.2481, 'train_samples_per_second': 36.499, 'train_steps_per_second': 1.142, 'total_flos': 958900501490688.0, 'train_loss': 0.03379114517822829, 'epoch': 3.0})

In [52]:
from transformers import pipeline

# Class ids predicted by the model:
# 0 -> Query, 1 -> Comparison, 2 -> Anomaly Detection, 3 -> Others

# The model instance comes straight from training. Put it in eval mode
# explicitly so dropout is disabled for inference, instead of relying on
# whatever mode the training loop happened to leave it in.
trainer.model.eval()

clf = pipeline(
    "text-classification",
    model=trainer.model,        # or your checkpoint path
    tokenizer=tokenizer,
    device=-1,                  # -1 = CPU; use a GPU index (e.g. 0) to run on CUDA
)

# Quick smoke test on a few hand-written queries.
samples = [
    "Why did revenue drop in Q2 2024?",
    "Show me the EBITDA over the last 4 quarters.",
    "When was the EBITDA higher - this year or last",
    "Can you detect anomalies?",
    "Is a Price-to-Book ratio of 1.8 considered healthy",
    "Does the Gross Profit margin look abnormal this quarter?",
    "Please provide the value of Current Assets last year.",
    "PAT"
]

for q in samples:
    print(q, "→", clf(q))


Why did revenue drop in Q2 2024? → [{'label': 'LABEL_2', 'score': 0.9999641180038452}]
Show me the EBITDA over the last 4 quarters. → [{'label': 'LABEL_0', 'score': 0.9999899864196777}]
When was the EBITDA higher - this year or last → [{'label': 'LABEL_1', 'score': 0.9999946355819702}]
Can you detect anomalies? → [{'label': 'LABEL_2', 'score': 0.9999668598175049}]
Is a Price-to-Book ratio of 1.8 considered healthy → [{'label': 'LABEL_3', 'score': 0.9998692274093628}]
Does the Gross Profit margin look abnormal this quarter? → [{'label': 'LABEL_2', 'score': 0.9999681711196899}]
Please provide the value of Current Assets last year. → [{'label': 'LABEL_0', 'score': 0.9999895095825195}]
PAT → [{'label': 'LABEL_0', 'score': 0.9999803304672241}]


In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Class ids: 0 -> Query, 1 -> Comparison, 2 -> Anomaly Detection, 3 -> Others
# Display names for the report, indexed by class id.
CLASS_NAMES = [
    "Querying (0)",
    "Comparison (1)",
    "Anomaly (2)",
    "Other (3)",
]

# 1) Define your test cases (query, true_label)
test_cases = [
    ("Can you tell me the Revenue for Q3 2024?", 0),
    ("Revenue", 0),
    ("EBIDTA", 0),
    ("Ebidta - how has it changed this year vs last", 1),
    ("When was the EBITDA higher - this year or last", 1),
    ("I would like to receive the Net Profit (PAT) figure for 2023.", 0),
    ("What was the EBITDA amount in Q1 2024?", 0),
    ("Give me the Cost of Goods Sold (COGS) for the previous quarter.", 0),
    ("Compare Operating Profit (EBIT) in Q2 2023 vs Q2 2024.", 1),
    ("How does ROE for 2022 and 2023 fare against each other?", 1),
    ("Is the Current Ratio higher in Q4 2023 or Q1 2024?", 1),
    ("Please contrast Free Cash Flow across the last three years.", 1),
    ("Which is better: Debt-to-Equity Ratio or Interest Coverage Ratio?", 1),
    ("Is there any anomaly in Receivables Turnover for Q3 2023?", 2),
    ("Why did the Working Capital suddenly spike in Q1 2024?", 2),
    ("Does the Gross Profit margin look abnormal this quarter?", 2),
    ("There was a sharp drop in Inventories in 2023—what caused it?", 2),
    ("Detect any irregularity in Operating Cash Flow over the past four quarters.", 2),
    ("What is the trend in EPS over the past five years?", 3),
    ("Forecast the Revenue for Q1 2025 based on historical data.", 3),
    ("Is a Price-to-Book ratio of 1.8 considered healthy?", 3),
    ("Why did Gross Profit fall despite higher Revenue?", 3),
    ("Should we be concerned about the low Quick Ratio this quarter?", 3),
]

# 2) Run the classifier and collect preds/trues
# Map label strings back to ids through the model config instead of parsing
# the "LABEL_X" text, so this keeps working if the model is given
# human-readable label names.
label2id = clf.model.config.label2id

preds = []
trues = []

for query, true_label in test_cases:
    # With top_k=None the pipeline returns one {"label", "score"} dict per
    # class; pick the highest score explicitly rather than relying on order.
    scores = clf(query, top_k=None)
    best = max(scores, key=lambda item: item["score"])
    pred_label = int(label2id[best["label"]])
    preds.append(pred_label)
    trues.append(true_label)
    print(f"{query!r} → Predicted: {pred_label}, True: {true_label}")

# 3) Print class-wise precision/recall/F1
# `labels` is passed explicitly so the report always has one row per class
# and `target_names` stays aligned even if a class never shows up in
# `trues` / `preds` (otherwise classification_report raises on the mismatch).
print("\n" + "="*60)
print(classification_report(
    trues,
    preds,
    labels=list(range(len(CLASS_NAMES))),
    target_names=CLASS_NAMES,
    zero_division=0,
))


'Can you tell me the Revenue for Q3 2024?' → Predicted: 0, True: 0
'Revenue' → Predicted: 0, True: 0
'EBIDTA' → Predicted: 0, True: 0
'Ebidta - how has it changed this year vs last' → Predicted: 1, True: 1
'When was the EBITDA higher - this year or last' → Predicted: 1, True: 1
'I would like to receive the Net Profit (PAT) figure for 2023.' → Predicted: 0, True: 0
'What was the EBITDA amount in Q1 2024?' → Predicted: 0, True: 0
'Give me the Cost of Goods Sold (COGS) for the previous quarter.' → Predicted: 0, True: 0
'Compare Operating Profit (EBIT) in Q2 2023 vs Q2 2024.' → Predicted: 1, True: 1
'How does ROE for 2022 and 2023 fare against each other?' → Predicted: 1, True: 1
'Is the Current Ratio higher in Q4 2023 or Q1 2024?' → Predicted: 1, True: 1
'Please contrast Free Cash Flow across the last three years.' → Predicted: 1, True: 1
'Which is better: Debt-to-Equity Ratio or Interest Coverage Ratio?' → Predicted: 1, True: 1
'Is there any anomaly in Receivables Turnover for Q3 2023?' 

In [None]:
## 0 -> Query, 1 -> Comparison, 2 -> Anomaly Detection, 3 -> Others

In [56]:
# Persist the fine-tuned artefacts under ../results/15jul_stage0/.

# 1. Save the model through the Trainer API.
trainer.save_model("../results/15jul_stage0/trained_model")  
# NOTE(review): presumably equivalent to model.save_pretrained (weights plus
# config.json) — confirm against the Trainer documentation.

# 2. Save the tokenizer (into its own directory, separate from the model).
tokenizer.save_pretrained("../results/15jul_stage0/trained_tokeniser")

# 3. (Optional) Save the trainer state.
# Despite the "trained_args.json" file name, this serialises `trainer.state`,
# not the TrainingArguments object.
trainer.state.save_to_json("../results/15jul_stage0/trained_args.json")
