This script is the refined version of your json_convert_chunk.py. It reads combined.json, maps its fields to our standard format, splits each description into chunks, and saves the result with exactly the same final columns as the Nyaaya script.

In [1]:
import pandas as pd
import re

# --- CONFIGURATION ---
# Absolute path of the raw JSON file handed to process_kaggle_json as its input.
INPUT_JSON_PATH = '/home/aditya/0Legal_Agent_Project/raw_data_and_scrap_codes/combined.json'
# Absolute path of the CSV file that process_kaggle_json writes the chunks to.
OUTPUT_CSV_PATH = '/home/aditya/0Legal_Agent_Project/cleaned_final_csv_data/kaggle_cleaned_chunks.csv'

# Paragraph boundary: a newline, optional whitespace, then another newline.
# Compiled once so the per-document loop does not look the pattern up again.
_PARAGRAPH_BREAK = re.compile(r'\n\s*\n')

# Column order of the output CSV (kept identical to the other script's output).
_FINAL_COLUMNS = ['chunk_id', 'chunk_text', 'source', 'category', 'title']

# Fields that identify the "Name / Section / Section_Title" record layout.
_NAMED_LAYOUT_KEYS = ('Name', 'Section', 'Section_Title')


def _field(row, key, default=''):
    """Return ``row[key]``, or *default* when the key is absent or null.

    pandas fills fields that a record lacks with NaN (and keeps explicit JSON
    nulls as None), so a plain ``row.get(key, default)`` never falls back to
    the default for them and the text "nan" leaks into the output.
    """
    value = row.get(key, default)
    # is_scalar guard: pd.isna on a list-like returns an array, not a bool.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return default
    return value


def _harmonize_row(row):
    """Map one raw record to a ``(title, text, category)`` tuple.

    Two layouts are recognised:

    * ``Name`` / ``Section`` / ``Section_Title`` / ``Description``
    * ``title`` / ``description``

    The layout is decided from the values the row actually holds, not only
    from the column names: every row of a DataFrame exposes every column, so
    a column-name test alone would send all rows of a mixed file down the
    first branch.
    """
    has_named_columns = all(key in row for key in _NAMED_LAYOUT_KEYS)
    has_named_values = any(
        _field(row, key, None) is not None
        for key in (*_NAMED_LAYOUT_KEYS, 'Description')
    )

    if has_named_columns and has_named_values:
        title = (
            f"{_field(row, 'Name')} - Section {_field(row, 'Section')}: "
            f"{_field(row, 'Section_Title')}"
        )
        text = _field(row, 'Description')
        category = _field(row, 'Name', 'Uncategorized Law')
        return title, text, category

    title = _field(row, 'title', 'Untitled Section')
    text = _field(row, 'description')
    # Default category assigned to records of the title/description layout.
    return title, text, 'Motor Vehicles Act, 1988'


def process_kaggle_json(input_path, output_path):
    """
    Read the raw Kaggle JSON file, harmonize its structure, chunk the
    content, and save it to a standardized, clean CSV file.

    Each record's text is split on blank lines; every non-empty paragraph
    becomes one output row with the columns ``chunk_id``, ``chunk_text``,
    ``source``, ``category`` and ``title``. Chunk ids keep the paragraph's
    position in the split, so ids may skip numbers where a paragraph was
    empty. Records with missing or null text contribute no rows.

    Args:
        input_path: Path of a JSON file holding an array of objects.
        output_path: Path of the CSV file to write.

    Returns:
        None. If the input file is missing or cannot be parsed, an error
        message is printed and nothing is written.
    """
    print(f"--- Processing Kaggle Data: {input_path} ---")
    try:
        # For the 'combined.json' which is a JSON array of objects
        df_raw = pd.read_json(input_path)
    except FileNotFoundError:
        print(f"Error: Input file not found at '{input_path}'.")
        return
    except ValueError:  # If JSON is not a simple array
        print(f"Error: Could not parse JSON. Ensure '{input_path}' is a valid JSON array of objects.")
        return

    print(f"Starting standardization and chunking for {len(df_raw)} documents...")

    processed_chunks = []
    for index, row in df_raw.iterrows():
        # --- Harmonization Step ---
        original_title, long_text, category = _harmonize_row(row)
        source = f"Kaggle Dataset - Doc {index}"

        # --- Chunking Step ---
        for i, chunk in enumerate(_PARAGRAPH_BREAK.split(str(long_text))):
            chunk_text = chunk.strip()
            if not chunk_text:
                continue
            processed_chunks.append({
                'chunk_id': f"kaggle_{index}_chunk_{i}",
                'chunk_text': chunk_text,
                'source': source,
                'category': category,
                'title': original_title,
            })

    # Passing the columns explicitly fixes their order and keeps the frame
    # well-formed (header only) when no chunks were produced at all.
    df_final = pd.DataFrame(processed_chunks, columns=_FINAL_COLUMNS)

    print(f"Processing complete. Created {len(df_final)} final chunks.")

    df_final.to_csv(output_path, index=False)
    print(f"Success! Standardized data saved to {output_path}")


# --- RUN THE SCRIPT ---
# Run the conversion only when this file is executed directly, using the
# configured input and output paths.
if __name__ == "__main__":
    process_kaggle_json(
        input_path=INPUT_JSON_PATH,
        output_path=OUTPUT_CSV_PATH,
    )


--- Processing Kaggle Data: /home/aditya/0Legal_Agent_Project/raw_data_and_scrap_codes/combined.json ---
Starting standardization and chunking for 1931 documents...
Processing complete. Created 4655 final chunks.
Success! Standardized data saved to /home/aditya/0Legal_Agent_Project/cleaned_final_csv_data/kaggle_cleaned_chunks.csv
