In [2]:

import pandas as pd
from torch.utils.data import Dataset
import torch
import ast
from tqdm import tqdm
tqdm.pandas()


In [3]:
def is_running_on_colab():
    """Report whether the notebook is executing inside Google Colab.

    Detection works by importing ``google.colab``. Note the side effect: when
    the import succeeds, Google Drive is mounted at ``/content/drive`` before
    the function returns.

    Returns:
        bool: True if ``google.colab`` could be imported (and Drive was
        mounted), False if the import raised ``ImportError``.
    """
    on_colab = False
    try:
        from google.colab import drive as colab_drive
        colab_drive.mount('/content/drive')
        on_colab = True
    except ImportError:
        # Not a Colab runtime: the package does not exist locally.
        pass
    return on_colab

# Notebook-wide flag, computed once; the path selection below branches on it.
IN_COLAB = is_running_on_colab()
print('In colab?', IN_COLAB)


In colab? False


In [4]:

# Pick the CSV locations: the mounted Drive folder on Colab, the local
# csv_datasets/ folder otherwise.
if IN_COLAB:
    load_path_train_df = "/content/drive/MyDrive/bail_prediction_datasets/train_all_ranked.csv"
    load_path_val_df = "/content/drive/MyDrive/bail_prediction_datasets/val_all_ranked.csv"
else:
    load_path_train_df = 'csv_datasets/train_all_ranked.csv'
    load_path_val_df = 'csv_datasets/val_all_ranked.csv'

# Load both splits into DataFrames.
train_df = pd.read_csv(load_path_train_df)
val_df = pd.read_csv(load_path_val_df)


In [5]:
# Swap the raw "text" column for the "ranked-sentences" column (renamed to
# "text") and discard "id" and "district".
# The calls are chained and non-inplace, so each frame is rebuilt from its input
# in a single expression instead of being mutated step by step.
# NOTE: the cell still cannot be re-run on its own output (the dropped columns
# no longer exist after the first run) -- re-execute the loading cell first.
COLUMNS_TO_DROP = ["id", "text", "district"]
train_df = train_df.drop(columns=COLUMNS_TO_DROP).rename(columns={"ranked-sentences": "text"})
val_df = val_df.drop(columns=COLUMNS_TO_DROP).rename(columns={"ranked-sentences": "text"})


In [6]:
# Parse the stringified sentence lists once, up front, so no 'ast.literal_eval'
# work is left for later stages.
def pre_process_df(df, max_sentences=10):
    """Collapse each row's sentence list into a single space-joined string.

    Every value in ``df["text"]`` is parsed with ``ast.literal_eval``, sliced to
    its first ``max_sentences`` items, and joined with single spaces. The
    result overwrites ``df["text"]``.

    Requires ``tqdm.pandas()`` to have been called beforehand, because the
    transformation goes through ``Series.progress_apply`` to show a progress bar.

    Args:
        df: DataFrame with a ``text`` column whose values are string literals
            of sequences of strings (e.g. ``"['a', 'b']"``).
        max_sentences: How many leading items of each parsed list to keep.
            Defaults to 10, the value that was previously hard-coded.

    Returns:
        The same ``df`` object that was passed in; it is modified in place.

    Raises:
        ValueError / SyntaxError: propagated from ``ast.literal_eval`` when a
            cell is not a valid Python literal (for example if the frame has
            already been pre-processed).
    """
    def _join_leading_sentences(raw_value):
        # Slice before joining so that only the leading entries are kept.
        return " ".join(ast.literal_eval(raw_value)[:max_sentences])

    # progress_apply behaves like apply but renders a tqdm progress bar.
    df["text"] = df["text"].progress_apply(_join_leading_sentences)
    return df

print("Pre-processing text...")
# Same helper for both splits; train goes first, then validation.
train_df, val_df = [pre_process_df(split_df) for split_df in (train_df, val_df)]

Pre-processing text...


100%|██████████| 123742/123742 [00:03<00:00, 32473.14it/s]
100%|██████████| 17707/17707 [00:00<00:00, 30266.83it/s]


In [7]:
# Smaller subsets of each split: a 10% sample drawn with a fixed seed.
# reset_index() is called without drop=True, so the previous index is kept as
# an extra column in the sampled frames.
HP_SAMPLE_KWARGS = dict(frac=0.1, random_state=42)
hp_train_df = train_df.sample(**HP_SAMPLE_KWARGS).reset_index()
hp_val_df = val_df.sample(**HP_SAMPLE_KWARGS).reset_index()

In [8]:
# class LegalDataset(Dataset):
#     def __init__(self, df, tokenizer):
#         self.labels = torch.tensor(df['label'].values, dtype=torch.long)

#         print("Batch tokenizing... (this will be much faster)")
#         # Tokenize everything at once
#         self.encodings = tokenizer(
#             df['text'].tolist(),
#             add_special_tokens=True,
#             max_length=512,
#             padding=False, #'max_length',
#             truncation=True,
#             return_tensors='pt'
#         )

#     def __len__(self):
#         return len(self.labels)

#     def __getitem__(self, idx):
#         return {
#             'input_ids': self.encodings['input_ids'][idx],
#             'attention_mask': self.encodings['attention_mask'][idx],
#             'label': self.labels[idx]
#         }
class LegalDataset(Dataset):
    """Map-style dataset of pre-tokenized texts and their labels.

    The whole ``text`` column is tokenized in one batched call when the dataset
    is constructed. Sequences are truncated to ``max_length`` but are NOT padded
    here (``padding=False``), so the stored encodings are plain Python lists of
    varying length. Padding to a common length has to happen later, when
    batches are collated.

    ``return_tensors='pt'`` is deliberately not passed to the tokenizer: the
    original author's note records that it crashed, presumably because unpadded
    sequences of different lengths cannot be stacked into one tensor.

    Args:
        df: DataFrame with a ``text`` column (strings) and a ``label`` column
            (values convertible to ``torch.long``).
        tokenizer: Callable taking a list of strings plus the keyword
            arguments ``add_special_tokens``, ``max_length``, ``padding`` and
            ``truncation``, and returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` entries indexable by row.
        max_length: Truncation length handed to the tokenizer. Defaults to 512,
            the value that was previously hard-coded.
    """

    def __init__(self, df, tokenizer, max_length=512):
        # Keep labels as a plain list; they become tensors per item in __getitem__.
        self.labels = df['label'].tolist()

        print("Batch tokenizing... (this will be much faster)")
        # One batched tokenizer call for the whole column.
        self.encodings = tokenizer(
            df['text'].tolist(),
            add_special_tokens=True,
            max_length=max_length,
            padding=False,     # no padding here; left to batch collation
            truncation=True,
        )

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        Keys are ``input_ids``, ``attention_mask`` and ``labels``; the tensors
        are created here, for this single item only.
        """
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            # Key is 'labels' (plural): per the original note, that is what Trainer expects.
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }

In [9]:
from transformers import AlbertTokenizer

# Name of the pretrained checkpoint whose tokenizer is loaded.
TOKENIZER_CHECKPOINT = "ai4bharat/indic-bert"
tokenizer = AlbertTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Tokenize the two full splits and the two sampled subsets with the same
# tokenizer, in the order train, val, hp_train, hp_val.
train_dataset, val_dataset, hp_train_dataset, hp_val_dataset = [
    LegalDataset(source_df, tokenizer)
    for source_df in (train_df, val_df, hp_train_df, hp_val_df)
]

Batch tokenizing... (this will be much faster)
Batch tokenizing... (this will be much faster)
Batch tokenizing... (this will be much faster)
Batch tokenizing... (this will be much faster)


In [11]:
import os

# Output folder for the serialized datasets: the mounted Drive folder on Colab,
# a local pt_datasets/ folder otherwise. It is created if missing.
if IN_COLAB:
    save_dir = "/content/drive/MyDrive/bail_prediction_datasets/"
else:
    save_dir = 'pt_datasets/'
os.makedirs(save_dir, exist_ok=True)

# Split key -> full output path of that split's .pt file.
_dataset_file_names = (
    ("train", "train_dataset.pt"),
    ("val", "val_dataset.pt"),
    ("hp_train", "hp_train_dataset.pt"),
    ("hp_val", "hp_val_dataset.pt"),
)
paths = {
    split_key: os.path.join(save_dir, file_name)
    for split_key, file_name in _dataset_file_names
}

# Helper that writes one object to disk (it only saves; nothing is loaded or created here)
def save_dataset(df, file_path):
    """Serialize ``df`` to ``file_path`` with ``torch.save``.

    Despite the parameter name, the calls in this notebook pass ``LegalDataset``
    instances rather than DataFrames.

    NOTE(review): torch.save pickles the object it is given, so reading these
    files back presumably requires the ``LegalDataset`` class to be defined in
    the loading environment -- confirm in the notebook that loads them.
    """
    torch.save(df, file_path)


# Write every dataset to the file registered under its key in `paths`.
for split_key, split_dataset in (
    ("train", train_dataset),
    ("val", val_dataset),
    ("hp_train", hp_train_dataset),
    ("hp_val", hp_val_dataset),
):
    save_dataset(split_dataset, paths[split_key])

print("\nAll datasets ready!")


All datasets ready!
