In [1]:
# Import necessary modules
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
import os

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


In [2]:
# Load GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Add padding token (GPT-2 doesn't have one by default)
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

# Move model to GPU
model.to(device)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [6]:
from datasets import load_dataset, DatasetDict

# Define paths to your JSON files (adjust paths as per your setup)
json_files = {
    "train": [
        "archive/constitution_qa.json",
        "archive/crpc_qa_normalized.json",
        "archive/ipc_qa.json"
    ]
}

# Load the datasets
dataset = load_dataset("json", data_files=json_files)

# Combine into a single dataset
combined_dataset = DatasetDict({
    "train": dataset["train"].flatten()  # Flatten if nested, adjust if needed
})

# Display a sample to verify
print(combined_dataset["train"][0])

Generating train split: 0 examples [00:00, ? examples/s]

{'question': 'What is India according to the Union and its Territory?', 'answer': 'India, that is Bharat, shall be a Union of States.'}


In [10]:
from transformers import GPT2Tokenizer

# Load tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Preprocess function to combine question and answer and set labels
def preprocess_function(examples):
    # Combine question and answer with a separator
    texts = [f"question: {q} [SEP] answer: {a}" for q, a in zip(examples["question"], examples["answer"])]
    
    # Tokenize with truncation and padding
    tokenized_inputs = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt"
    )
    
    # Set labels to be the same as input_ids (GPT-2 will shift them internally)
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].clone()
    
    return tokenized_inputs

# Apply preprocessing
tokenized_dataset = combined_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=combined_dataset["train"].column_names  # Remove original columns
)

# Split into train and validation sets
train_dataset = tokenized_dataset["train"].select(range(int(len(tokenized_dataset["train"]) * 0.9)))
eval_dataset = tokenized_dataset["train"].select(range(int(len(tokenized_dataset["train"]) * 0.9), len(tokenized_dataset["train"])))

# Verify the dataset
print(tokenized_dataset["train"][0])

Map:   0%|          | 0/14543 [00:00<?, ? examples/s]

{'input_ids': [25652, 25, 1867, 318, 3794, 1864, 284, 262, 4479, 290, 663, 25219, 30, 685, 5188, 47, 60, 3280, 25, 3794, 11, 326, 318, 33653, 265, 11, 2236, 307, 257, 4479, 286, 1829, 13, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 

In [11]:
# Decode a sample to verify
sample = tokenized_dataset["train"][0]
decoded_text = tokenizer.decode(sample["input_ids"], skip_special_tokens=False)
print(decoded_text)

question: What is India according to the Union and its Territory? [SEP] answer: India, that is Bharat, shall be a Union of States.<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|

In [12]:
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
import torch

# Load model
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.config.pad_token_id = model.config.eos_token_id
model.to(device)

# Define LoRA configuration (same as before)
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["c_attn", "c_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

# Apply LoRA
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Define training arguments (same as before, optimized for 4GB GPU)
training_args = TrainingArguments(
    output_dir="./gpt2-indian-legal",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=500,
    learning_rate=5e-5,
    fp16=True,
    load_best_model_at_end=True,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./gpt2-indian-legal-final")
tokenizer.save_pretrained("./gpt2-indian-legal-final")

trainable params: 811,008 || all params: 125,250,816 || trainable%: 0.6475


  trainer = Trainer(


Step,Training Loss,Validation Loss
500,0.257,0.270943
1000,0.2514,0.26333
1500,0.2394,0.261205
2000,0.247,0.25976
2500,0.2367,0.259248
3000,0.2333,0.258408
3500,0.2339,0.257805
4000,0.232,0.257064
4500,0.2387,0.257199


('./gpt2-indian-legal-final\\tokenizer_config.json',
 './gpt2-indian-legal-final\\special_tokens_map.json',
 './gpt2-indian-legal-final\\vocab.json',
 './gpt2-indian-legal-final\\merges.txt',
 './gpt2-indian-legal-final\\added_tokens.json')

In [13]:
# Load the fine-tuned model
model = GPT2LMHeadModel.from_pretrained("./gpt2-indian-legal-final").to(device)
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-indian-legal-final")

# Test inference
prompt = "What is the title and extent of operation of the Indian Penal Code?"
inputs = tokenizer(f"question: {prompt} [SEP] answer:", return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_length=100, num_return_sequences=1)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


question: What is the title and extent of operation of the Indian Penal Code? [SEP] answer: The Indian Penal Code is the law of India.


In [14]:
from datasets import DatasetDict

# Revisit preprocessing to retain original question and answer for evaluation
def preprocess_with_references(examples):
    texts = [f"question: {q} [SEP] answer: {a}" for q, a in zip(examples["question"], examples["answer"])]
    tokenized_inputs = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt"
    )
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].clone()
    # Retain original question and answer for evaluation
    tokenized_inputs["original_question"] = examples["question"]
    tokenized_inputs["reference_answer"] = examples["answer"]
    return tokenized_inputs

# Apply updated preprocessing
tokenized_dataset = combined_dataset.map(
    preprocess_with_references,
    batched=True,
    remove_columns=["question", "answer"]  # Remove only the original columns, keep new ones
)

# Re-split into train and validation sets
train_dataset = tokenized_dataset["train"].select(range(int(len(tokenized_dataset["train"]) * 0.9)))
eval_dataset = tokenized_dataset["train"].select(range(int(len(tokenized_dataset["train"]) * 0.9), len(tokenized_dataset["train"])))

# Verify the dataset
print(eval_dataset[0].keys())  # Should include 'original_question' and 'reference_answer'

Map:   0%|          | 0/14543 [00:00<?, ? examples/s]

dict_keys(['input_ids', 'attention_mask', 'labels', 'original_question', 'reference_answer'])


In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Load the fine-tuned model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GPT2LMHeadModel.from_pretrained("./gpt2-indian-legal-final").to(device)
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-indian-legal-final")

# Function to generate answers
def generate_answer(question):
    prompt = f"question: {question} [SEP] answer:"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
    outputs = model.generate(
        **inputs,
        max_length=150,  # Adjust based on expected answer length
        num_return_sequences=1,
        do_sample=False,  # Use greedy decoding for consistency
        pad_token_id=tokenizer.eos_token_id
    )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extract the answer part (after "answer:")
    generated_answer = generated_text.split("[SEP] answer:")[-1].strip()
    return generated_answer

# Generate answers for the evaluation dataset
generated_answers = []
reference_answers = []
questions = []

for example in eval_dataset:
    question = example["original_question"]
    reference = example["reference_answer"]
    generated = generate_answer(question)
    
    questions.append(question)
    reference_answers.append(reference)
    generated_answers.append(generated)

# Print a few samples to inspect
for q, ref, gen in zip(questions[:3], reference_answers[:3], generated_answers[:3]):
    print(f"Question: {q}")
    print(f"Reference: {ref}")
    print(f"Generated: {gen}")
    print("-" * 50)

ValueError: Input length of input_ids is 103, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.

In [None]:
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
from sentence_transformers import SentenceTransformer, util
import nltk
nltk.download("punkt")

# Initialize ROUGE scorer and SentenceTransformer for semantic similarity
rouge = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)
sentence_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# Lists to store scores
rouge1_scores = []
rougeL_scores = []
bleu_scores = []
exact_matches = []
semantic_similarities = []

# Compute metrics for each question-answer pair
for gen, ref in zip(generated_answers, reference_answers):
    # ROUGE scores
    rouge_scores = rouge.score(ref, gen)
    rouge1_scores.append(rouge_scores["rouge1"].fmeasure)
    rougeL_scores.append(rouge_scores["rougeL"].fmeasure)
    
    # BLEU score
    ref_tokens = nltk.word_tokenize(ref)
    gen_tokens = nltk.word_tokenize(gen)
    bleu = sentence_bleu([ref_tokens], gen_tokens, weights=(0.25, 0.25, 0.25, 0.25))  # 4-gram BLEU
    bleu_scores.append(bleu)
    
    # Exact Match
    exact_match = 1 if gen.strip() == ref.strip() else 0
    exact_matches.append(exact_match)
    
    # Semantic Similarity
    ref_embedding = sentence_model.encode(ref, convert_to_tensor=True)
    gen_embedding = sentence_model.encode(gen, convert_to_tensor=True)
    similarity = util.cos_sim(ref_embedding, gen_embedding).item()
    semantic_similarities.append(similarity)

# Compute averages
avg_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
avg_rougeL = sum(rougeL_scores) / len(rougeL_scores)
avg_bleu = sum(bleu_scores) / len(bleu_scores)
avg_exact_match = sum(exact_matches) / len(exact_matches)
avg_semantic_similarity = sum(semantic_similarities) / len(semantic_similarities)

# Print results
print(f"Average ROUGE-1 F1: {avg_rouge1:.4f}")
print(f"Average ROUGE-L F1: {avg_rougeL:.4f}")
print(f"Average BLEU Score: {avg_bleu:.4f}")
print(f"Exact Match Rate: {avg_exact_match:.4f}")
print(f"Average Semantic Similarity: {avg_semantic_similarity:.4f}")