In [7]:
from google.colab import drive
# Project root on Google Drive; later cells build every path from this value.
base_dir = '/content/drive/MyDrive/LawBot'

# Attach Drive to the Colab filesystem, forcing a fresh mount if one already exists.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [15]:
import os

# --- Project folder bootstrap -------------------------------------------------
print("Setting up directory structure...")

# base_dir comes from the Drive-mount cell and must already be defined
print("Using base directory:", base_dir)

# Every folder the project writes to, parents listed before their children
dirs_to_create = [base_dir] + [
    f'{base_dir}/{subdir}'
    for subdir in (
        'datasets',
        'data',
        'data/processed',
        'models',
        'models/adapters',
        'vectorstore',
        'vectorstore/faiss_index',
    )
]

print("\nCreating directory structure...")
for target_dir in dirs_to_create:
    # exist_ok=True keeps this cell safe to re-run
    os.makedirs(target_dir, exist_ok=True)
    print(f"  ✅ {target_dir}")

# Folder that must hold the raw Q&A JSON files; used by the loading cell
datasets_path = f'{base_dir}/datasets'

# --- Input file check ---------------------------------------------------------
required_files = ['constitution_qa.json', 'crpc_qa.json', 'ipc_qa.json']
print("\nChecking for required files:")
all_present = True
for required_name in required_files:
    candidate_path = f'{datasets_path}/{required_name}'
    if not os.path.exists(candidate_path):
        print(f"  ❌ {required_name} - MISSING!")
        all_present = False
        continue
    size_mb = os.path.getsize(candidate_path) / (1024*1024)  # bytes -> MB
    print(f"  ✅ {required_name} ({size_mb:.2f} MB)")

if all_present:
    print(f"\n✅ All files present. Ready to proceed!")
else:
    print(f"\n❌ Some files are missing. Please upload them to: {datasets_path}/")


Setting up directory structure...
Using base directory: /content/drive/MyDrive/LawBot

Creating directory structure...
  ✅ /content/drive/MyDrive/LawBot
  ✅ /content/drive/MyDrive/LawBot/datasets
  ✅ /content/drive/MyDrive/LawBot/data
  ✅ /content/drive/MyDrive/LawBot/data/processed
  ✅ /content/drive/MyDrive/LawBot/models
  ✅ /content/drive/MyDrive/LawBot/models/adapters
  ✅ /content/drive/MyDrive/LawBot/vectorstore
  ✅ /content/drive/MyDrive/LawBot/vectorstore/faiss_index

Checking for required files:
  ✅ constitution_qa.json (1.22 MB)
  ✅ crpc_qa.json (2.08 MB)
  ✅ ipc_qa.json (0.64 MB)

✅ All files present. Ready to proceed!


# Phase 1: Dataset Preparation for LawBot

## Objectives:
1. Load and merge constitution_qa.json, crpc_qa.json, and ipc_qa.json
2. Transform to instruction format: {instruction, output, source}
3. Clean and deduplicate data (duplicates are matched on lowercased, whitespace-normalized instruction + output)
4. Split 80:20 into train/validation sets
5. Generate preprocessing report


In [3]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
import re


## Step 1: Load Datasets


In [8]:
# Read the three raw Q&A files from the datasets folder set up by the setup cell
print(f"Loading datasets from: {datasets_path}/")


def _read_json(path):
    """Parse the UTF-8 encoded JSON file at ``path`` and return its contents."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


constitution_data = _read_json(f'{datasets_path}/constitution_qa.json')
crpc_data = _read_json(f'{datasets_path}/crpc_qa.json')
ipc_data = _read_json(f'{datasets_path}/ipc_qa.json')

# Quick size check per source
print(f"Constitution Q&A: {len(constitution_data)} pairs")
print(f"CrPC Q&A: {len(crpc_data)} pairs")
print(f"IPC Q&A: {len(ipc_data)} pairs")


Loading datasets from: /content/drive/MyDrive/LawBot/datasets/
Constitution Q&A: 4082 pairs
CrPC Q&A: 8194 pairs
IPC Q&A: 2267 pairs


## Step 2: Transform to Instruction Format


In [9]:
def transform_to_instruction_format(data, source_name):
    """Convert question/answer records into instruction-style records.

    Args:
        data: Iterable of mappings, each providing "question" and "answer".
        source_name: Label stored under "source" on every produced record.

    Returns:
        A new list of dicts with the keys "instruction", "output" and
        "source", in the same order as ``data``.

    Raises:
        KeyError: If a record has no "question" or no "answer" key.
    """
    return [
        {
            "instruction": record["question"],
            "output": record["answer"],
            "source": source_name,
        }
        for record in data
    ]

# Convert every raw dataset in one pass, tagging each record with its source label
constitution_formatted, crpc_formatted, ipc_formatted = (
    transform_to_instruction_format(raw_records, label)
    for raw_records, label in (
        (constitution_data, "Constitution"),
        (crpc_data, "CrPC"),
        (ipc_data, "IPC"),
    )
)

# Record counts should match the raw counts printed by the loading cell
print(f"Constitution formatted: {len(constitution_formatted)}")
print(f"CrPC formatted: {len(crpc_formatted)}")
print(f"IPC formatted: {len(ipc_formatted)}")


Constitution formatted: 4082
CrPC formatted: 8194
IPC formatted: 2267


## Step 3: Merge Datasets


In [10]:
# Concatenate the three formatted lists, keeping Constitution -> CrPC -> IPC order
combined_data = [*constitution_formatted, *crpc_formatted, *ipc_formatted]
print(f"Total combined data: {len(combined_data)} pairs")

# Show the first merged record as a format check
first_record = combined_data[0]
print("\nSample data:")
print(json.dumps(first_record, indent=2, ensure_ascii=False))


Total combined data: 14543 pairs

Sample data:
{
  "instruction": "What is India according to the Union and its Territory?",
  "output": "India, that is Bharat, shall be a Union of States.",
  "source": "Constitution"
}


## Step 4: Clean and Deduplicate


In [11]:
def normalize_text(text):
    """Return a comparison-friendly form of ``text``.

    The string is lowercased, stripped of leading/trailing whitespace, and
    every run of whitespace is collapsed to a single space.
    """
    lowered = text.lower()
    trimmed = lowered.strip()
    # Collapse tabs, newlines and repeated spaces into one space
    return re.sub(r'\s+', ' ', trimmed)

def clean_data(data):
    """Drop records that duplicate an earlier one and report the counts.

    Two records are duplicates when their normalized "instruction" and
    normalized "output" both match; the first occurrence is kept and the
    original (un-normalized) record is what gets returned.

    Args:
        data: Iterable of dicts with "instruction" and "output" keys.

    Returns:
        List of the retained records, in their original order.
    """
    known_keys = set()
    kept = []
    dropped = 0

    for record in data:
        # Normalized (instruction, output) pair identifies a record
        fingerprint = (
            normalize_text(record["instruction"]),
            normalize_text(record["output"]),
        )

        if fingerprint in known_keys:
            dropped += 1
            continue

        known_keys.add(fingerprint)
        kept.append(record)

    print(f"Removed {dropped} duplicates")
    print(f"Clean data: {len(kept)} pairs")

    return kept

# Deduplicate the merged dataset
cleaned_data = clean_data(combined_data)

# Per-source record counts after deduplication
source_counts = Counter(record["source"] for record in cleaned_data)
print("\nData by source:")
for source, count in source_counts.items():
    print(f"  {source}: {count}")


Removed 21 duplicates
Clean data: 14522 pairs

Data by source:
  Constitution: 4074
  CrPC: 8181
  IPC: 2267


## Step 5: Train/Validation Split (80:20)


In [12]:
# 80/20 shuffled split; the fixed seed makes the partition reproducible
train_data, val_data = train_test_split(
    cleaned_data, test_size=0.2, random_state=42, shuffle=True
)

print(f"Training data: {len(train_data)} pairs")
print(f"Validation data: {len(val_data)} pairs")

# Tally records per source in each split so the proportions can be compared by eye
# (the split above is not stratified by source)
train_sources = Counter(record["source"] for record in train_data)
val_sources = Counter(record["source"] for record in val_data)

for heading, per_source in (
    ("\nTraining data by source:", train_sources),
    ("\nValidation data by source:", val_sources),
):
    print(heading)
    for source, count in per_source.items():
        print(f"  {source}: {count}")


Training data: 11617 pairs
Validation data: 2905 pairs

Training data by source:
  Constitution: 3241
  CrPC: 6553
  IPC: 1823

Validation data by source:
  IPC: 444
  CrPC: 1628
  Constitution: 833


## Step 6: Save Processed Data


In [13]:
def save_jsonl(data, filename):
    """Write records to ``filename`` in JSON Lines format (one object per line).

    Args:
        data: Sized collection of JSON-serializable items.
        filename: Destination path. Missing parent directories are created.
            A bare filename with no directory part is written to the current
            working directory.

    An existing file at ``filename`` is overwritten. Non-ASCII characters are
    written as-is in UTF-8 rather than as ``\\uXXXX`` escapes.
    """
    parent_dir = os.path.dirname(filename)
    # dirname is '' for a bare filename, and os.makedirs('') raises
    # FileNotFoundError, so only create the parent when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, 'w', encoding='utf-8') as f:
        for item in data:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')
    print(f"✅ Saved {len(data)} items to {filename}")

# Write the full cleaned set and both splits under the processed-data folder
output_dir = f'{base_dir}/data/processed'
print(f"\nSaving outputs to: {output_dir}/")

for records, target_name in (
    (cleaned_data, 'lawbot_cleaned.jsonl'),
    (train_data, 'train.jsonl'),
    (val_data, 'val.jsonl'),
):
    save_jsonl(records, f'{output_dir}/{target_name}')



Saving outputs to: /content/drive/MyDrive/LawBot/data/processed/
✅ Saved 14522 items to /content/drive/MyDrive/LawBot/data/processed/lawbot_cleaned.jsonl
✅ Saved 11617 items to /content/drive/MyDrive/LawBot/data/processed/train.jsonl
✅ Saved 2905 items to /content/drive/MyDrive/LawBot/data/processed/val.jsonl


## Step 7: Generate Preprocessing Report


In [14]:
def generate_report(data, train_data, val_data, filename):
    """Build, save and print summary statistics for the processed dataset.

    Args:
        data: Full list of records; each is a dict with "instruction",
            "output" and "source" keys.
        train_data: Records assigned to the training split.
        val_data: Records assigned to the validation split.
        filename: Path of the JSON file the report is written to. The parent
            directory must already exist.

    Returns:
        The report dict with three sections: "dataset_statistics" (sample
        counts and split ratios), "source_distribution" (per-source counts
        overall and per split) and "text_statistics" (average and maximum
        character lengths of instructions and outputs).

    If ``data`` is empty, ratios, averages and maxima are reported as 0
    instead of raising ZeroDivisionError / ValueError.
    """
    total = len(data)

    def _fraction(numerator):
        # Avoid dividing by zero when the dataset is empty
        return numerator / total if total else 0.0

    def _source_counts(records):
        return dict(Counter(item["source"] for item in records))

    # Character lengths, computed once and reused for both average and maximum
    instruction_lengths = [len(item["instruction"]) for item in data]
    output_lengths = [len(item["output"]) for item in data]

    report = {
        "dataset_statistics": {
            "total_samples": total,
            "train_samples": len(train_data),
            "val_samples": len(val_data),
            "train_ratio": _fraction(len(train_data)),
            "val_ratio": _fraction(len(val_data))
        },
        "source_distribution": {
            "overall": _source_counts(data),
            "train": _source_counts(train_data),
            "val": _source_counts(val_data)
        },
        "text_statistics": {
            "avg_instruction_length": _fraction(sum(instruction_lengths)),
            "avg_output_length": _fraction(sum(output_lengths)),
            # default=0 keeps max() from raising on an empty dataset
            "max_instruction_length": max(instruction_lengths, default=0),
            "max_output_length": max(output_lengths, default=0)
        }
    }

    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)

    print("Preprocessing Report:")
    print(json.dumps(report, indent=2, ensure_ascii=False))

    return report

# Store the statistics report alongside the processed JSONL files
report_path = f'{base_dir}/data/processed/preprocessing_report.json'
report = generate_report(cleaned_data, train_data, val_data, report_path)


Preprocessing Report:
{
  "dataset_statistics": {
    "total_samples": 14522,
    "train_samples": 11617,
    "val_samples": 2905,
    "train_ratio": 0.7999586833769453,
    "val_ratio": 0.2000413166230547
  },
  "source_distribution": {
    "overall": {
      "Constitution": 4074,
      "CrPC": 8181,
      "IPC": 2267
    },
    "train": {
      "Constitution": 3241,
      "CrPC": 6553,
      "IPC": 1823
    },
    "val": {
      "IPC": 444,
      "CrPC": 1628,
      "Constitution": 833
    }
  },
  "text_statistics": {
    "avg_instruction_length": 100.9557912133315,
    "avg_output_length": 121.56404076573475,
    "max_instruction_length": 1025,
    "max_output_length": 1398
  }
}


## Summary

Phase 1 completed successfully! The dataset has been:
1. ✅ Loaded from three JSON files
2. ✅ Transformed to instruction format
3. ✅ Cleaned and deduplicated (duplicates matched on lowercased, whitespace-normalized instruction + output)
4. ✅ Split into train/validation sets (80:20)
5. ✅ Saved as JSONL files
6. ✅ Preprocessing report generated

**Deliverables:**
- `data/processed/lawbot_cleaned.jsonl` - Complete cleaned dataset
- `data/processed/train.jsonl` - Training set
- `data/processed/val.jsonl` - Validation set
- `data/processed/preprocessing_report.json` - Statistics report
