In [1]:
# pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124

In [2]:
# pip install transformers datasets trl peft bitsandbytes scikit-learn

In [3]:
import os
import transformers
import torch
from datasets import load_dataset,Dataset,DatasetDict
from peft import LoraConfig,get_peft_model
from trl import SFTTrainer
from sklearn.model_selection import train_test_split
from transformers import (
    DataCollatorWithPadding,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer)
import pandas as pd
import evaluate
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
torch. cuda. is_available()

True

In [45]:
model_id = "distilbert-base-uncased"

Define Classes labels for the model to process

In [39]:
label2id = {
    "cited": 0,
    "applied": 1,
    "followed": 2,
    "referred to": 3,
    "related": 4,
    "considered": 5,
    "discussed": 6,
    "distinguished": 7,
    "affirmed": 8,
    "approved": 9
}
id2label = {v: k for k, v in label2id.items()}

Load the dataset and rename accordingly

In [40]:

df = pd.read_csv('dataset.csv')

df = df.drop(columns=['case_id','case_text'])

df = df.dropna(subset=['case_title', 'case_outcome'])

df = df.rename(columns={'case_title': 'text', 'case_outcome': 'label'})


Replace the labels and remove null valise

In [41]:

# Function to replace labels with IDs in DataFrame
def replace_labels(df, label_map):
    df['label'] = df['label'].replace(label_map)
    return df

# Replace labels with IDs in DataFrame
df_train = replace_labels(df, label2id)

df_cleaned = df.dropna()

dataset = Dataset.from_pandas(df_cleaned)

print(df_train)  

       label                                               text
0          0  Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...
1          0  Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...
2          0  Colgate Palmolive Co v Cussons Pty Ltd (1993) ...
3          0  Dais Studio Pty Ltd v Bullett Creative Pty Ltd...
4          0  Dr Martens Australia Pty Ltd v Figgins Holding...
...      ...                                                ...
24980      0  Reches Pty Ltd v Tadiran Pty Ltd (1998) 85 FCR...
24981      0  Sir Lindsay Parkinson &amp; Co Ltd v Triplan L...
24982      0  Spiel v Commodity Brokers Australia Pty Ltd (I...
24983      7  Tullock Ltd v Walker (Unreported, Supreme Cour...
24984      7  Yandil Holdings Pty Ltd v Insurance Co of Nort...

[24985 rows x 2 columns]


  df['label'] = df['label'].replace(label_map)


In [42]:
print(df['label'].unique().tolist())

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


Split train and test dataset

In [43]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

dataset = dataset.remove_columns(['__index_level_0__'])
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 19988
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 4997
    })
})


Load model and tokinizer

In [46]:
tokenizer = AutoTokenizer.from_pretrained(model_id, add_prefix_space=True)
model = AutoModelForSequenceClassification.from_pretrained(model_id,
                                             num_labels=id2label.__len__(), 
                                             id2label=id2label,
                                             label2id=label2id,)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [47]:
def check_model_accutacy():
    text_list = ["Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Ltd (No 2) [2002] FCA 224 ; (2002) 190 ALR 121",
                "TCN Channel Nine Pty Ltd v Australian Broadcasting Tribunal (1992) 28 ALD 829",
                "Australian Securities and Investments Commission v Pegasus Leveraged Options Group Pty Ltd (2002) 41 ACSR 561"
                "Waterford v Commonwealth [1987] HCA 25",
                "Heinrich v Commonwealth Bank of Australia [2003] FCAFC 315",
                "X v Australian Crime Commission [2004] FCA 1475",
                "Commissioner for Australian Capital Territory Revenue v Alphaone Pty Ltd (1994) 49 FCR 576 "
                ]
    print("----------------------------")
    for text in text_list:
        # tokenize text
        inputs = tokenizer.encode(text, return_tensors="pt").to('cuda')
        # compute logits
        logits = model(inputs).logits
        # convert logits to label
        predictions = torch.argmax(logits)

        print(text + " - " + id2label[predictions.tolist()])

In [13]:
check_model_accutacy()

----------------------------
Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Ltd (No 2) [2002] FCA 224 ; (2002) 190 ALR 121 - cited
TCN Channel Nine Pty Ltd v Australian Broadcasting Tribunal (1992) 28 ALD 829 - approved
Australian Securities and Investments Commission v Pegasus Leveraged Options Group Pty Ltd (2002) 41 ACSR 561Waterford v Commonwealth [1987] HCA 25 - approved
Heinrich v Commonwealth Bank of Australia [2003] FCAFC 315 - approved
X v Australian Crime Commission [2004] FCA 1475 - approved
Commissioner for Australian Capital Territory Revenue v Alphaone Pty Ltd (1994) 49 FCR 576  - approved


In [14]:
# # add pad token if none exists
# if tokenizer.pad_token is None:
#     tokenizer.add_special_tokens({'pad_token': '[PAD]'})
#     model.resize_token_embeddings(len(tokenizer))
# def tokenize_function(examples):
#     #tokenize and truncate text
#     tokenizer.truncation_side = "left"
#     tokenized_inputs = tokenizer(
#         examples['text'],
#         return_tensors="np",
#         truncation=True,
#         max_length=512
#     )

#     return tokenized_inputs

In [16]:
def tokenize_function(example):
    return tokenizer(example['text'], padding="max_length", truncation=True)


In [17]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)
print(tokenized_dataset)


Map: 100%|██████████| 19988/19988 [00:03<00:00, 5425.96 examples/s]
Map: 100%|██████████| 4997/4997 [00:00<00:00, 5753.90 examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 19988
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 4997
    })
})





In [18]:
accuracy = evaluate.load("accuracy")

In [19]:
# define an evaluation function to pass into trainer later
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

In [20]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [21]:
peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=4,
                        lora_alpha=32,
                        lora_dropout=0.01,
                        target_modules = ['q_lin'])

In [22]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

Could not find the bitsandbytes CUDA binary at WindowsPath('c:/Users/awais/Desktop/finetubeLLm/.venv/Lib/site-packages/bitsandbytes/libbitsandbytes_cuda124_nocublaslt.dll')
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


trainable params: 635,146 || all params: 67,596,308 || trainable%: 0.9396


In [23]:
# define training arguments
training_args = TrainingArguments(
    output_dir= "model/save",
    learning_rate=1e-3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # max_steps=50,
    load_best_model_at_end=True,
)



In [24]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


max_steps is given, it will override any value given in num_train_epochs


In [25]:
trainer.train()

                                               
100%|██████████| 50/50 [03:02<00:00,  2.10it/s]    

{'eval_loss': 1.5998820066452026, 'eval_accuracy': {'accuracy': 0.4916950170102061}, 'eval_runtime': 157.8437, 'eval_samples_per_second': 31.658, 'eval_steps_per_second': 7.919, 'epoch': 0.02}


100%|██████████| 50/50 [03:12<00:00,  3.85s/it]

{'train_runtime': 192.6054, 'train_samples_per_second': 2.077, 'train_steps_per_second': 0.26, 'train_loss': 1.7433152770996094, 'epoch': 0.02}





TrainOutput(global_step=50, training_loss=1.7433152770996094, metrics={'train_runtime': 192.6054, 'train_samples_per_second': 2.077, 'train_steps_per_second': 0.26, 'total_flos': 53774986444800.0, 'train_loss': 1.7433152770996094, 'epoch': 0.020008003201280513})

In [26]:
eval_results = trainer.evaluate()
print(eval_results)


100%|██████████| 1250/1250 [02:37<00:00,  7.95it/s]

{'eval_loss': 1.5998820066452026, 'eval_accuracy': {'accuracy': 0.4916950170102061}, 'eval_runtime': 157.5429, 'eval_samples_per_second': 31.718, 'eval_steps_per_second': 7.934, 'epoch': 0.020008003201280513}





In [30]:
check_model_accutacy()

----------------------------
Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Ltd (No 2) [2002] FCA 224 ; (2002) 190 ALR 121 - cited
TCN Channel Nine Pty Ltd v Australian Broadcasting Tribunal (1992) 28 ALD 829 - cited
Australian Securities and Investments Commission v Pegasus Leveraged Options Group Pty Ltd (2002) 41 ACSR 561Waterford v Commonwealth [1987] HCA 25 - cited
Heinrich v Commonwealth Bank of Australia [2003] FCAFC 315 - cited
X v Australian Crime Commission [2004] FCA 1475 - cited
Commissioner for Australian Capital Territory Revenue v Alphaone Pty Ltd (1994) 49 FCR 576  - cited


In [None]:
# model.save_pretrained("model/save/fine-tuned-model")
# tokenizer.save_pretrained("model/save/fine-tuned-model")