In [None]:
# pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124

In [None]:
# pip install transformers datasets trl peft bitsandbytes scikit-learn

In [None]:
import os
import transformers
import torch
from trl import SFTTrainer
from datasets import load_dataset,Dataset,DatasetDict
from peft import LoraConfig,get_peft_model
from trl import SFTTrainer
from sklearn.model_selection import train_test_split
from transformers import (
    DataCollatorWithPadding,
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,)
import pandas as pd
import evaluate
import numpy as np


In [None]:
# Access tokens must never be committed in a notebook. The token is taken from
# the HF_TOKEN environment variable when it is already set; otherwise the user
# is prompted for it (the input is not echoed and is not stored in the file).
# NOTE(review): the token that used to be hard-coded on this line has been
# exposed and should be revoked in the Hugging Face account settings.
from getpass import getpass

if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Hugging Face access token: ")

In [None]:
# Show whether PyTorch can see a CUDA device in this kernel.
torch.cuda.is_available()

In [None]:
# Hugging Face Hub identifier of the base checkpoint; passed to both
# `from_pretrained` calls further down when loading the tokenizer and model.
model_id = "google/gemma-2b"

Load the dataset and define the input/output columns for the model to process

In [None]:

# Read the raw CSV, discard the id/outcome columns, drop rows that lack a title
# or a text, and rename the two remaining text columns to 'input' / 'output'.
df = (
    pd.read_csv('dataset.csv')
    .drop(columns=['case_id', 'case_outcome'])
    .dropna(subset=['case_title', 'case_text'])
    .rename(columns={'case_title': 'input', 'case_text': 'output'})
)


Remove any remaining null values and convert the frame to a Dataset

In [None]:

# Drop every row that still has a missing value in any column ...
df_cleaned = df.dropna(axis="index", how="any")

# ... and wrap the cleaned frame as a Hugging Face Dataset.
dataset = Dataset.from_pandas(df_cleaned)


Split the data into train and test datasets

In [None]:
# Split the cleaned frame (not the intermediate `df`) so rows with missing
# values can never reach training; the fixed random_state keeps the 80/20
# split reproducible across runs.
train_df, test_df = train_test_split(df_cleaned, test_size=0.2, random_state=42)

# preserve_index=False keeps the shuffled pandas index out of the datasets, so
# no '__index_level_0__' column is created and none has to be removed later.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

dataset = DatasetDict({"train": train_dataset, "test": test_dataset})
print(dataset)

In [None]:
# bitsandbytes quantization settings: load weights in 4-bit using the "nf4"
# quantization type, with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

Load model and tokenizer

In [None]:
# Both loaders are given the access token stored in the HF_TOKEN environment
# variable.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    token=os.environ["HF_TOKEN"],
)

# The model is loaded with the quantization settings from `bnb_config`;
# device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=os.environ["HF_TOKEN"],
    device_map="auto",
    quantization_config=bnb_config,
)

In [None]:
# Sanity check: let the base model complete one prompt before any fine-tuning.
input_text = "Case Title: Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Ltd (No 2) [2002] FCA 224 ; (2002) 190 ALR 121 \nDetails:"

# generate() needs the encoded prompt on the same device as the model, so the
# inputs are moved there up front. Moving only the generated output afterwards
# is too late: the call itself would already see CPU inputs.
input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)

# max_length counts the prompt tokens as well as the newly generated ones.
outputs = model.generate(**input_ids, max_length=100)
print(tokenizer.decode(outputs[0]))

In [None]:
def tokenize_function(examples):
    """Tokenize the 'input' and 'output' entries of ``examples`` as text pairs.

    Sets ``truncation_side`` of the module-level ``tokenizer`` to "left" and
    then encodes ``examples['input']`` together with ``examples['output']``
    with truncation enabled and ``max_length=256``.

    Args:
        examples: Mapping with 'input' and 'output' entries. Presumably lists
            of strings when called through a batched ``map`` — confirm against
            the caller.

    Returns:
        The object returned by the tokenizer call, unchanged.
    """
    tokenizer.truncation_side = "left"
    first_segment = examples['input']
    second_segment = examples['output']
    return tokenizer(first_segment, second_segment, truncation=True, max_length=256)

In [None]:
# Run tokenize_function over the dataset in batches and show the result.
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)
print(tokenized_dataset)


In [None]:
# define an evaluation function to pass into trainer later
def compute_metrics(p):
    """Compute the accuracy of arg-max predictions against reference labels.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds one row of
            scores per example (the arg-max is taken along axis 1) and
            ``labels`` holds the reference index for each example.

    Returns:
        ``{"accuracy": value}`` where ``value`` is a plain float in [0, 1];
        0.0 is returned when there are no labels.
    """
    predictions, labels = p
    predicted_ids = np.argmax(predictions, axis=1)
    reference_ids = np.asarray(labels)

    # Guard the empty case so the mean below never produces NaN.
    if reference_ids.size == 0:
        return {"accuracy": 0.0}

    # Accuracy is computed directly with numpy: no `accuracy` metric object is
    # ever created in this notebook, and returning a flat float avoids nesting
    # a metric dict inside the result dict.
    return {"accuracy": float(np.mean(predicted_ids == reference_ids))}

In [None]:
def formatting_func(example):
    """Render one example as a "Case Title / Details" prompt.

    Args:
        example: Mapping with 'input' (case title) and 'output' (case text).

    Returns:
        A dict with a single key, 'input_text', holding the formatted string.

    NOTE(review): trl's SFTTrainer may expect ``formatting_func`` to return
    the formatted string (or a list of strings) rather than a dict — confirm
    against the installed trl version.
    """
    title = example['input']
    details = example['output']
    return {"input_text": f"Case Title: {title}\nDetails: {details}"}

In [None]:
# LoRA settings for the causal-LM task: r=8, alpha=32, dropout 0.01, applied to
# the projection modules listed below.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=32,
    lora_dropout=0.01,
    target_modules=[
        "q_proj",
        "o_proj",
        "k_proj",
        "v_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

In [None]:
# The quantization config computes in bfloat16, so mixed-precision training
# should use bf16 as well whenever the GPU supports it. fp16 is kept only as a
# fallback for hardware without bf16 support.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # gradients are accumulated over 8 batches of 1
    warmup_steps=2,
    max_steps=100,
    learning_rate=2e-4,
    bf16=use_bf16,
    fp16=not use_bf16,
    logging_steps=1,
    output_dir="outputs",
    optim="paged_adamw_8bit",
)

In [None]:
# Build the supervised fine-tuning trainer from the quantized model, the LoRA
# config and the training arguments defined in the cells above.
# NOTE(review): the splits passed here are already tokenized while a
# formatting_func is supplied too — confirm which of the two the installed trl
# version actually uses.
# NOTE(review): compute_metrics is defined earlier but is not passed here.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    peft_config=peft_config,
    formatting_func=formatting_func,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

In [None]:
# Start fine-tuning with the trainer built above; training_args sets
# max_steps=100, so the run is bounded by that step count.
trainer.train()

In [None]:
def generate_text(prompt, max_length=100):
    """Generate a completion for ``prompt`` with the fine-tuned model.

    Args:
        prompt: Text to complete.
        max_length: Upper bound on the total sequence length (prompt tokens
            plus generated tokens) passed to ``model.generate``.

    Returns:
        The decoded text of the first generated sequence.
    """
    # Training leaves the model in train mode; switch to eval so dropout
    # (including the LoRA dropout) is disabled while sampling completions.
    model.eval()
    # The encoded prompt has to live on the same device as the model,
    # otherwise generate() is called with CPU inputs on a GPU model.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_length=max_length)
    return tokenizer.decode(outputs[0])


# Prompts used to spot-check the fine-tuned model. The trailing comment on each
# entry is the annotation that accompanied the prompt in the original cells.
# NOTE(review): the last prompt ends with "Outcome:" whereas the training
# template in formatting_func uses "Details:" — confirm this is intentional.
sample_prompts = [
    "Case Title: Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Ltd (No 2) [2002] FCA 224 ; (2002) 190 ALR 121 \nDetails:",  # cited
    "Case Title: TCN Channel Nine Pty Ltd v Australian Broadcasting Tribunal (1992) 28 ALD 829 \nDetails:",  # cited
    "Case Title: Waterford v Commonwealth [1987] HCA 25 \nDetails:",  # referred to
    "Case Title: Australian Securities and Investments Commission v Pegasus Leveraged Options Group Pty Ltd (2002) 41 ACSR 561 \nDetails:",  # considered
    "Case Title: Heinrich v Commonwealth Bank of Australia [2003] FCAFC 315 \nDetails:",  # applied
    "Case Title: X v Australian Crime Commission [2004] FCA 1475 \nDetails:",  # followed
    "Case Title: Commissioner for Australian Capital Territory Revenue v Alphaone Pty Ltd (1994) 49 FCR 576 \nOutcome:",  # followed
]

for input_text in sample_prompts:
    print(generate_text(input_text))