In [None]:
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from torch.utils.data import Dataset
import torch
import os
import json
import re
from tqdm import tqdm
tqdm.pandas()
from transformers import Trainer, TrainingArguments
import numpy as np
import evaluate

from sklearn.model_selection import train_test_split
import ast

In [None]:
from huggingface_hub import login

# Authenticate against the Hugging Face Hub without committing a secret to the
# notebook: the access token is read from the HF_TOKEN environment variable.
# When the variable is unset, token=None is passed and login() is left to
# obtain the token itself (it prompts interactively, per the huggingface_hub docs).
login(token=os.environ.get("HF_TOKEN"))

In [None]:
from transformers import AlbertTokenizer

# Tokenizer shared by every dataset and Trainer below; loaded from the same
# "ai4bharat/indic-bert" checkpoint that model_init() loads the model from.
tokenizer = AlbertTokenizer.from_pretrained("ai4bharat/indic-bert")


In [None]:
def model_init():
    """Build a new indic-bert sequence classifier configured for two labels.

    Returns:
        The model returned by ``AutoModelForSequenceClassification.from_pretrained``
        for the "ai4bharat/indic-bert" checkpoint with ``num_labels=2``.
    """
    checkpoint = "ai4bharat/indic-bert"
    fresh_model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint,
        num_labels=2,
    )
    return fresh_model

In [None]:
# Pick the compute device name: 'cuda' when a CUDA device is visible to torch,
# otherwise 'cpu'. The value is kept as a plain string.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

In [None]:
# Load the ranked-sentence splits: train for fitting, dev for evaluation.
train_df = pd.read_csv("train_all_ranked.csv")
test_df = pd.read_csv("dev_all_ranked.csv")
# Uncomment to iterate quickly on a small slice of each split.
#train_df = train_df.head(500)
#test_df = test_df.head(500)
# 10% subsamples (fixed seed) used only for the hyperparameter search.
# drop=True discards the shuffled original row labels instead of keeping them
# as an extra "index" column in the sampled frames.
hp_train_df = train_df.sample(frac = 0.1, random_state=42).reset_index(drop=True)
hp_test_df = test_df.sample(frac = 0.1, random_state=42).reset_index(drop=True)

In [None]:
# Peek at the first training row to check which columns were loaded.
train_df.head(1)

In [None]:
class LegalDataset(Dataset):
    """Torch dataset that tokenizes the top-ranked sentences of each row.

    Every cell of the ``ranked-sentences`` column is parsed as a Python list
    literal; its first ``top_k`` entries are joined with single spaces into a
    new ``text`` column. Tokenization happens lazily in ``__getitem__``.

    Args:
        df: Frame with a ``ranked-sentences`` column (string form of a list of
            sentences) and a ``label`` column. It is copied via
            ``reset_index(drop=True)``, so the caller's frame is not modified.
            # assumes ``label`` already holds numeric class ids — TODO confirm
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``input_ids`` and ``attention_mask``.
        top_k: Number of leading ranked sentences to keep per row.
        max_length: Length every encoded sequence is padded/truncated to.

    Raises:
        ValueError, SyntaxError: If a ``ranked-sentences`` cell is not a valid
            Python literal.
    """

    def __init__(self, df, tokenizer, top_k=10, max_length=512):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.top_k = top_k
        self.max_length = max_length

        ranked = self.df["ranked-sentences"]
        # progress_apply only exists once tqdm.pandas() has been called; fall
        # back to plain apply so the dataset also works without a progress bar.
        apply_fn = getattr(ranked, "progress_apply", ranked.apply)
        self.df["text"] = apply_fn(self._join_top_sentences)

    def _join_top_sentences(self, raw):
        """Parse one ``ranked-sentences`` cell and join its first ``top_k`` items."""
        # literal_eval only accepts literals, so file contents are never
        # executed as code (the cells come from a CSV on disk).
        sentences = ast.literal_eval(raw)
        return " ".join(sentences[:self.top_k])

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.df)

    def __getitem__(self, idx):
        """Encode row ``idx`` and return its tensors.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` tensors built from
            the tokenizer output, and ``label`` built from the ``label`` column.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        model_input = self.df['text'][idx]
        encoded_sent = self.tokenizer.encode_plus(
            text=model_input,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_attention_mask=True,
            truncation=True
            )

        input_ids = torch.tensor(encoded_sent.get('input_ids'))
        attention_mask = torch.tensor(encoded_sent.get('attention_mask'))
        label = torch.tensor(self.df['label'][idx])

        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'label': label}

In [None]:
# Wrap the full splits (final training run) and the sampled splits
# (hyperparameter search) in LegalDataset, all sharing the same tokenizer.
train_dataset, test_dataset = (
    LegalDataset(frame, tokenizer) for frame in (train_df, test_df)
)
hp_train_dataset, hp_test_dataset = (
    LegalDataset(frame, tokenizer) for frame in (hp_train_df, hp_test_df)
)

In [None]:
# Metric objects consumed by compute_metrics: metric1 is accuracy, metric2 is F1.
metric1, metric2 = evaluate.load("accuracy"), evaluate.load("f1")

In [None]:
def compute_metrics(eval_pred):
    """Turn an evaluation result into accuracy and macro-F1 scores.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class for each
            example is the argmax of the logits over the last axis.

    Returns:
        dict with keys ``'accuracy'`` and ``'f1-score'``.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    shared_inputs = {"predictions": predicted_classes, "references": labels}

    accuracy_result = metric1.compute(**shared_inputs)
    f1_result = metric2.compute(average="macro", **shared_inputs)

    return {
        'accuracy': accuracy_result["accuracy"],
        'f1-score': f1_result["f1"],
    }

In [None]:
def my_hp_space(trial):
    """Sample one set of optimizer hyperparameters from ``trial``.

    Args:
        trial: Object exposing ``suggest_float(name, low, high, log=...)``.

    Returns:
        dict mapping ``learning_rate``, ``weight_decay``, ``adam_beta1``,
        ``adam_beta2`` and ``adam_epsilon`` to the sampled values.
    """
    sampled = {}
    # Log-scale sampling for the two quantities spanning orders of magnitude.
    sampled["learning_rate"] = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
    sampled["weight_decay"] = trial.suggest_float("weight_decay", 0.005, 0.05)
    sampled["adam_beta1"] = trial.suggest_float("adam_beta1", 0.75, 0.95)
    sampled["adam_beta2"] = trial.suggest_float("adam_beta2", 0.99, 0.9999)
    sampled["adam_epsilon"] = trial.suggest_float("adam_epsilon", 1e-9, 1e-7, log=True)
    return sampled

In [None]:
# Baseline arguments for the hyperparameter-search Trainer.
# NOTE(review): learning_rate / weight_decay here are presumably replaced per
# trial by the values sampled in my_hp_space — confirm against the Trainer docs.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='htf3_results',
    logging_dir='htf3_logs',
    logging_steps=250,
    # Training length and batch sizes.
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Optimizer / scheduler settings.
    learning_rate=1e-5,
    weight_decay=0.01,
    warmup_steps=500,
    # Evaluate and checkpoint once per epoch, keeping a single checkpoint and
    # reloading the one with the best eval F1 when training ends.
    eval_strategy="epoch",
    save_strategy='epoch',
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1-score",
    # bfloat16 mixed precision.
    bf16=True,
)

In [None]:
# Create a list to store trial results
trial_results = []

class HPSearchCallback:
    """Callable that records and prints the outcome of each finished trial.

    Each call appends a dict with keys ``'trial'``, ``'hyperparameters'`` and
    ``'objective_value'`` to the module-level ``trial_results`` list and prints
    a short report.

    NOTE(review): the ``(study, trial)`` signature looks like an Optuna study
    callback, but nothing visible in this notebook registers an instance —
    confirm how it is meant to be wired in.
    """

    def __init__(self):
        # Running count of calls; used as a 1-based trial number.
        self.trial_number = 0

    def __call__(self, study, trial):
        """Record ``trial.params`` / ``trial.value`` and print them.

        Args:
            study: Unused; accepted to match the callback signature.
            trial: Object exposing ``params`` (dict) and ``value`` (number or
                ``None`` when the trial produced no objective value).
        """
        self.trial_number += 1
        params = trial.params
        value = trial.value

        trial_results.append({
            'trial': self.trial_number,
            'hyperparameters': params,
            'objective_value': value
        })

        # A trial without a value cannot be formatted with ':.4f'; show a
        # placeholder instead of raising after the result was already stored.
        value_text = "n/a" if value is None else f"{value:.4f}"

        # Print results for this trial
        print(f"\n{'='*80}")
        print(f"Trial {self.trial_number} Results:")
        print(f"{'='*80}")
        print(f"Hyperparameters:")
        for key, val in params.items():
            print(f"  {key}: {val}")
        print(f"\nObjective Value (eval metric): {value_text}")
        print(f"{'='*80}\n")

# Trainer used only for the hyperparameter search on the sampled splits.
trainer = Trainer(
    model_init=model_init,               # factory called to build the model (not an instantiated model)
    args=training_args,                  # training arguments, defined above
    train_dataset=hp_train_dataset,      # sampled training split
    eval_dataset=hp_test_dataset,        # sampled evaluation split
    compute_metrics=compute_metrics,
    # `tokenizer=` is the deprecated Trainer keyword; `processing_class=` is
    # its replacement in the Transformers Trainer API.
    processing_class=tokenizer,
)

In [None]:
def optuna_hp_space(trial):
    """Sample hyperparameters via ``my_hp_space`` and tag the trial.

    Stores a 1-based ``trial_number`` (``trial.number + 1``) on the trial as a
    user attribute, then returns the sampled hyperparameter dict unchanged.
    """
    sampled = my_hp_space(trial)

    one_based_number = trial.number + 1
    trial.set_user_attr('trial_number', one_based_number)

    return sampled

# Run the search: 4 trials, keeping the trial with the highest objective.
# NOTE(review): no compute_objective is passed, so the objective is whatever
# Trainer computes by default — confirm it is the metric meant to be maximized.
best_run = trainer.hyperparameter_search(
    hp_space=optuna_hp_space,
    direction="maximize",
    n_trials=4,
)

# Report the winning trial once the search has finished.
banner = '=' * 80
print(f"\n{banner}")
print("HYPERPARAMETER SEARCH COMPLETED")
print(f"{banner}")
print(f"\nBest trial:")
print(f"  Value: {best_run.objective:.4f}")
print(f"  Hyperparameters:")
for hp_name, hp_value in best_run.hyperparameters.items():
    print(f"    {hp_name}: {hp_value}")
print(f"{banner}\n")

In [None]:
# Section marker in the notebook output ahead of the trial summary.
print("Best HyperParameters")

In [None]:
# Summarise every recorded trial; when trial_results is empty, fall back to
# the best_run object returned by the hyperparameter search.
print(best_run)

print("\n" + "="*80)
print("SUMMARY OF ALL TRIALS")
print("="*80)


def _format_objective(value):
    """Render an objective value with 4 decimals, or 'n/a' when it is None."""
    return "n/a" if value is None else f"{value:.4f}"


if trial_results:
    for result in trial_results:
        print(f"\nTrial {result['trial']}:")
        print(f"  Hyperparameters: {result['hyperparameters']}")
        print(f"  Objective Value: {_format_objective(result['objective_value'])}")

    # One row per trial: trial number, objective, then one column per hyperparameter.
    summary_data = [
        {
            'Trial': result['trial'],
            'Objective_Value': result['objective_value'],
            **result['hyperparameters'],
        }
        for result in trial_results
    ]

    summary_df = pd.DataFrame(summary_data)
    print("\n" + "="*80)
    print("Detailed Results Table:")
    print("="*80)
    print(summary_df.to_string(index=False))

    # Show best trial. Trials without an objective value cannot be compared,
    # so they are excluded before taking the maximum.
    scored_trials = [r for r in trial_results if r['objective_value'] is not None]
    print("\n" + "="*80)
    print("BEST TRIAL")
    print("="*80)
    if scored_trials:
        best_trial = max(scored_trials, key=lambda r: r['objective_value'])
        print(f"Trial Number: {best_trial['trial']}")
        print(f"Objective Value: {best_trial['objective_value']:.4f}")
        print(f"Hyperparameters: {best_trial['hyperparameters']}")
    else:
        print("No recorded trial has an objective value.")
else:
    print("No trial results were captured. Using best_run object instead.")
    print(f"\nBest Run ID: {best_run.run_id}")
    print(f"Objective: {best_run.objective}")
    print(f"Hyperparameters: {best_run.hyperparameters}")

In [None]:
import gc

# Drop the search-phase Trainer and its arguments, then force a collection
# pass so the objects they referenced can be reclaimed before the final run.
del trainer, training_args
gc.collect()

In [76]:
# Section marker in the notebook output: the final training phase begins here.
print("Starting Training...")

Starting Training...


In [77]:
# Hyperparameters copied from a previous search run. The keys match the
# TrainingArguments keyword names, so the dict is unpacked into the call below.
best_hyperparameters = {
    'learning_rate': 3.395690349549142e-05,
    'weight_decay': 0.03261220631726104,
    'adam_beta1': 0.8895833438683564,
    'adam_beta2': 0.9930199195181635,
    'adam_epsilon': 6.448086662178787e-09
}

print("Using best hyperparameters from previous search:")
for hp_name, hp_value in best_hyperparameters.items():
    print(f"  {hp_name}: {hp_value}")

# Arguments for the final run on the full dataset.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='tf3_results',
    logging_dir='tf3_logs',
    logging_steps=250,
    report_to="none",  # Disable wandb/tensorboard if not needed
    # Training length and batch sizes.
    num_train_epochs=15,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    # Evaluate and checkpoint once per epoch, keeping a single checkpoint and
    # reloading the one with the best eval F1 when training ends.
    eval_strategy="epoch",
    save_strategy='epoch',
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1-score",
    # bfloat16 mixed precision.
    bf16=True,
    # learning_rate, weight_decay, adam_beta1, adam_beta2, adam_epsilon
    **best_hyperparameters,
)

Using best hyperparameters from previous search:
  learning_rate: 3.395690349549142e-05
  weight_decay: 0.03261220631726104
  adam_beta1: 0.8895833438683564
  adam_beta2: 0.9930199195181635
  adam_epsilon: 6.448086662178787e-09


In [78]:
# Create trainer without custom callback: final run on the full train/dev splits.
trainer = Trainer(
    model_init=model_init,            # factory called to build the model
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    # `tokenizer=` is the deprecated Trainer keyword; `processing_class=` is
    # its replacement in the Transformers Trainer API.
    processing_class=tokenizer,
)

  trainer = Trainer(
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# The Trainer was built with the best hyperparameters already baked into its
# arguments, so this cell only echoes the effective configuration and trains.
print("Training Arguments:")
print(trainer.args)
print("\nStarting training with full dataset...")
trainer.train()

Training Arguments:
TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.8895833438683564,
adam_beta2=0.9930199195181635,
adam_epsilon=6.448086662178787e-09,
auto_find_batch_size=False,
average_tokens_across_devices=True,
batch_eval_metrics=False,
bf16=True,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1-score
1,0.6215,0.607988,0.704072,0.619837
2,0.6111,0.606761,0.704072,0.619837
3,0.5986,0.605344,0.704072,0.619837
4,0.6536,0.644446,0.649404,0.461761


In [None]:
# Save the model; the destination is named once so the save call and the
# confirmation message cannot drift apart.
model_output_dir = "/home2/username/legal-tech/tfidf_sum+indic-ad"
trainer.save_model(model_output_dir)
print(f"Model saved to: {model_output_dir}")