In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
import torch
import json
import os

In [4]:
# Report whether PyTorch can see a CUDA device and, if it can, which one.
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device name:", torch.cuda.get_device_name(0))
else:
    print("Device name:", "CPU")


CUDA available: True
Device name: NVIDIA GeForce GTX 1650 Ti


In [5]:
# Load the raw case table and show its columns so the available fields are
# visible before any selection happens.
df = pd.read_csv(
    "D:/GitUploads/Court_case_classification/dataset/legal_text_classification.csv"  # Replace with your file path
)
print(df.columns)

Index(['case_id', 'case_outcome', 'case_title', 'case_text'], dtype='object')


In [6]:
# Keep only the model input (case_text) and the target (case_outcome).
# .copy() makes the result an independent frame: a `label` column is assigned
# to it afterwards, and writing into a bare column slice is the pattern that
# triggers pandas' SettingWithCopyWarning.
df = df[['case_text', 'case_outcome']].copy()

In [7]:
# Convert the outcome column to a categorical once, then derive both the
# integer label column and the code -> outcome-name lookup from it.
outcome_categorical = df['case_outcome'].astype('category')
df['label'] = outcome_categorical.cat.codes
label_mapping = {
    code: outcome
    for code, outcome in enumerate(outcome_categorical.cat.categories)
}

In [8]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split reproducible between runs.
case_texts = df['case_text'].tolist()
case_labels = df['label'].tolist()

train_texts, val_texts, train_labels, val_labels = train_test_split(
    case_texts,
    case_labels,
    random_state=42,
    test_size=0.2,
)

In [9]:
# Sanity check on the training texts: how many entries are literally None,
# and how many are not Python strings at all.
none_count = len([entry for entry in train_texts if entry is None])
print(f"Number of None entries in train_texts: {none_count}")

non_string_count = sum(not isinstance(entry, str) for entry in train_texts)
print(f"Number of non-string entries in train_texts: {non_string_count}")


Number of None entries in train_texts: 0
Number of non-string entries in train_texts: 150


In [10]:
# Pair each training text with its label so that filtering keeps them aligned.
train_data = list(zip(train_texts, train_labels))

# Keep only pairs whose text is a real string with non-whitespace content.
clean_train_data = [
    pair
    for pair in train_data
    if isinstance(pair[0], str) and pair[0].strip()
]

# Split the surviving pairs back into parallel tuples of texts and labels.
train_texts, train_labels = zip(*clean_train_data)


In [None]:
# Pair each validation text with its label so that filtering keeps them aligned.
val_data = list(zip(val_texts, val_labels))

# Drop pairs whose text is not a string or is empty / whitespace only.
clean_val_data = list(
    filter(
        lambda pair: isinstance(pair[0], str) and pair[0].strip() != "",
        val_data,
    )
)

# Split the surviving pairs back into parallel tuples of texts and labels.
val_texts, val_labels = zip(*clean_val_data)


In [12]:
# Load the DistilBERT tokenizer and encode both splits with identical
# settings (truncation and padding enabled).
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

encode_options = {"truncation": True, "padding": True}
train_encodings = tokenizer([*train_texts], **encode_options)
val_encodings = tokenizer([*val_texts], **encode_options)

In [13]:
class LegalDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with their labels.

    Args:
        encodings: Mapping-like object exposing ``.items()``; each value must
            be indexable by example position.
        labels: Sequence of labels, one per example, in the same order as the
            encodings.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, defined by the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict has one tensor per encoding field plus a ``"labels"`` tensor
        holding the example's label.
        """
        item = {}
        for field, values in self.encodings.items():
            item[field] = torch.tensor(values[idx])
        item["labels"] = torch.tensor(self.labels[idx])
        return item

# Wrap each split's encodings and labels in a LegalDataset.
train_dataset, val_dataset = (
    LegalDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
    )
)

In [14]:

# Size the classification head from the full label vocabulary, not from the
# labels that happen to be present in the training split. `label_mapping`
# covers every category of `case_outcome`, whereas `train_labels` is what is
# left after a random split and after dropping rows with unusable text; if a
# class were missing there, the head would be too small for the label codes
# and would disagree with `label_mapping` at prediction time.
num_labels = len(label_mapping)

# Store the code <-> outcome-name mapping in the model config as well, so the
# saved checkpoint carries its own label names.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
    id2label=label_mapping,
    label2id={outcome: code for code, outcome in label_mapping.items()},
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Hyper-parameters for a single fine-tuning epoch. Checkpoints are written to
# ./results and logs to ./logs, with the loss logged every 100 steps.
training_args = TrainingArguments(
    output_dir="./results",
    do_train=True,
    do_eval=False,
    per_device_train_batch_size=8,
    num_train_epochs=1,
    logging_dir="./logs",
    logging_steps=100,
    # Enable mixed precision only when a CUDA device is present. The rest of
    # the notebook already falls back to CPU, and requesting fp16 without a
    # GPU makes TrainingArguments fail instead of training in full precision.
    fp16=torch.cuda.is_available(),
)


# Bundle the model, the training arguments and both datasets into a Trainer.
# The tokenizer is passed as `processing_class`: the older `tokenizer=`
# keyword is deprecated in current transformers releases and emits a warning
# when used.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
)

  trainer = Trainer(


In [17]:
# Run the fine-tuning loop with the trainer built earlier. As the last
# expression of the cell, the returned TrainOutput is displayed by the notebook.
trainer.train()

Step,Training Loss
100,1.6592
200,1.6294
300,1.6035
400,1.6209
500,1.5867
600,1.5674
700,1.5535
800,1.5796
900,1.5571
1000,1.5577


TrainOutput(global_step=2480, training_loss=1.557844392714962, metrics={'train_runtime': 7157.2428, 'train_samples_per_second': 2.772, 'train_steps_per_second': 0.347, 'total_flos': 2628263171788800.0, 'train_loss': 1.557844392714962, 'epoch': 1.0})

In [18]:
# Write the fine-tuned model and its tokenizer to the same directory so they
# can be reloaded together later.
save_dir = "./court_case_classifier_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./court_case_classifier_model\\tokenizer_config.json',
 './court_case_classifier_model\\special_tokens_map.json',
 './court_case_classifier_model\\vocab.txt',
 './court_case_classifier_model\\added_tokens.json',
 './court_case_classifier_model\\tokenizer.json')

In [9]:
# Re-read the raw CSV and rebuild the code -> outcome-name lookup from the
# categories of the case_outcome column.
df = pd.read_csv("D:/GitUploads/Court_case_classification/dataset/legal_text_classification.csv")
outcome_categories = df['case_outcome'].astype('category').cat.categories
label_mapping = {code: outcome for code, outcome in enumerate(outcome_categories)}

In [10]:
# Persist the label lookup next to the notebook. The integer codes are turned
# into strings first so the mapping is written with string keys.
serializable_mapping = {str(code): outcome for code, outcome in label_mapping.items()}
with open("label_mapping.json", "w") as f:
    json.dump(serializable_mapping, f)

In [None]:
# Reload the fine-tuned model and tokenizer from the saved directory.
model_path = "court_case_classifier_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Restore the label lookup, converting the string keys stored in the JSON
# file back to integer class indices.
with open("label_mapping.json", "r") as f:
    label_mapping = {int(code): outcome for code, outcome in json.load(f).items()}

# Use the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)
model.eval()

def classify_case(text):
    """Predict the outcome label for one piece of case text.

    The text is tokenized with the module-level ``tokenizer``, moved to
    ``device``, run through ``model`` without gradient tracking, and the
    index of the highest logit is looked up in ``label_mapping``.

    Args:
        text: The case text to classify.

    Returns:
        The ``label_mapping`` entry for the predicted class index.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        logits = model(**encoded).logits
    best_index = logits.argmax(dim=1).item()
    return label_mapping[best_index]


# Smoke-test the classifier on a single hand-written example.
sample_text = "The plaintiff filed a petition for divorce under Section 13 of the Hindu Marriage Act."
predicted_case_type = classify_case(sample_text)
print("Predicted Case Type:", predicted_case_type)

Predicted Case Type: cited
