In [1]:
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from torch.utils.data import Dataset
import torch
import os
import json
import re
from tqdm import tqdm
tqdm.pandas()
from transformers import Trainer, TrainingArguments
import numpy as np
import evaluate

from sklearn.model_selection import train_test_split
import ast

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Select the GPU when torch reports one as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [14]:
# Path of the CSV holding the test split, resolved relative to the working directory.
TEST_CSV_PATH = "test_all_ranked.csv"
test_df = pd.read_csv(TEST_CSV_PATH)

In [16]:
class LegalDataset(Dataset):
    """Torch dataset that tokenizes the leading ranked sentences of each row.

    Each row of ``df`` is expected to provide:

    * ``"ranked-sentences"`` -- a string containing a Python list literal of
      sentences, e.g. ``"['first sentence', 'second sentence']"``.
    * ``"label"`` -- the class id returned by ``__getitem__``. It must already
      exist in the frame; an earlier revision derived it as ``1`` when
      ``decision == "granted"`` and ``0`` otherwise.

    Args:
        df: Source DataFrame. The dataset works on the frame returned by
            ``reset_index(drop=True)``, so rows are addressable as 0..len-1.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``input_ids`` and ``attention_mask`` entries.
        max_length: Length every sequence is padded/truncated to (default 512).
        top_k: Number of leading ranked sentences joined into the model input
            (default 10).

    Raises:
        ValueError / SyntaxError: from ``ast.literal_eval`` when a
            ``"ranked-sentences"`` cell is not a valid Python literal.
        KeyError: when ``"ranked-sentences"`` (at construction) or ``"label"``
            (at item access) is missing from the frame.
    """

    def __init__(self, df, tokenizer, max_length=512, top_k=10):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.top_k = top_k

        ranked = self.df["ranked-sentences"]
        # ``progress_apply`` only exists after ``tqdm.pandas()`` has been called;
        # fall back to plain ``apply`` so the class does not rely on that kernel state.
        apply_fn = getattr(ranked, "progress_apply", ranked.apply)
        self.df["text"] = apply_fn(self._join_top_sentences)

    def _join_top_sentences(self, raw):
        """Parse one ``ranked-sentences`` cell and join its first ``top_k`` entries with spaces."""
        # literal_eval accepts only Python literals, so text read from the CSV
        # cannot execute arbitrary code the way eval() would.
        return " ".join(ast.literal_eval(raw)[:self.top_k])

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the tokenized text and label for row ``idx`` as tensors.

        Returns:
            dict with ``input_ids``, ``attention_mask`` and ``label`` tensors.
        """
        # Samplers may hand over a tensor index; convert it to plain Python first.
        if torch.is_tensor(idx):
            idx = idx.tolist()

        model_input = self.df['text'][idx]
        encoded_sent = self.tokenizer.encode_plus(
            text=model_input,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_attention_mask=True,
            truncation=True
            )

        input_ids = torch.tensor(encoded_sent.get('input_ids'))
        attention_mask = torch.tensor(encoded_sent.get('attention_mask'))
        label = torch.tensor(self.df['label'][idx])

        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'label': label}

In [17]:
from transformers import AlbertTokenizer

# Pretrained identifier passed to AlbertTokenizer.from_pretrained.
TOKENIZER_CHECKPOINT = "ai4bharat/indic-bert"
tokenizer = AlbertTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)


In [18]:
# Wrap the test frame so each row is tokenized on access.
test_dataset = LegalDataset(df=test_df, tokenizer=tokenizer)

100%|██████████| 35400/35400 [00:01<00:00, 25503.27it/s]


In [19]:
# Metric objects are created once at module level and reused by compute_metrics.
metric1 = evaluate.load("accuracy")
metric2 = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into accuracy and macro-averaged F1.

    Args:
        eval_pred: Two-element iterable unpacked as ``(logits, labels)``; the
            predicted class is the argmax over the last axis of ``logits``.

    Returns:
        dict with the keys ``'accuracy'`` and ``'f1-score'``.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    shared_kwargs = dict(predictions=predicted_classes, references=labels)

    accuracy_result = metric1.compute(**shared_kwargs)
    f1_result = metric2.compute(**shared_kwargs, average="macro")
    return {
        'accuracy': accuracy_result["accuracy"],
        'f1-score': f1_result["f1"],
    }

In [20]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Step 1: restore the sequence-classification model saved under "./model".
# The "model" folder produced by the training notebook must sit next to this notebook.
model_path = "./model"
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Step 2: evaluation-only arguments -- training is switched off, evaluation is on.
eval_args = TrainingArguments(
    output_dir="eval_output",
    do_train=False,
    do_eval=True,
    per_device_eval_batch_size=8,  # adjust to fit available memory
)

# Step 3: wire the loaded model, the test dataset and the metric function into a Trainer.
trainer_kwargs = {
    "model": model,
    "args": eval_args,
    "eval_dataset": test_dataset,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

# Step 4: run the evaluation loop and show the resulting metrics dictionary.
results = trainer.evaluate()
print(results)

{'eval_loss': 0.8779400587081909, 'eval_model_preparation_time': 0.0076, 'eval_accuracy': 0.8450282485875706, 'eval_f1-score': 0.8330939009867592, 'eval_runtime': 804.789, 'eval_samples_per_second': 43.987, 'eval_steps_per_second': 5.498}
