This script processes the nyaaya_data.csv file. It reads the raw scraped data, splits each article's content into paragraph-level chunks, and standardizes the output columns to our agreed-upon format (chunk_id, chunk_text, source, category, title). It also tries to infer a category from the URL path (the segment that follows /legal-explainers/), falling back to "General" when the URL does not have that structure.

In [5]:
# Sanity-check cell: prints a fixed marker string.
print("ok1")

ok1


In [7]:
import pandas as pd
import re
from urllib.parse import urlparse

# --- CONFIGURATION ---
# Both values are absolute paths; adjust them before running on another machine.
# Raw scraped Nyaaya CSV passed to process_nyaaya_csv() as the input.
INPUT_CSV_PATH = '/home/aditya/0Legal_Agent_Project/raw_data_and_scrap_codes/nyaaya_data.csv'
# Destination CSV passed to process_nyaaya_csv() for the standardized chunks.
OUTPUT_CSV_PATH = '/home/aditya/0Legal_Agent_Project/cleaned_final_csv_data/nyaaya_cleaned_chunks.csv'

def _cell_or_default(value, default):
    """Return *value* as a string, or *default* when the cell is missing (NaN/None)."""
    return default if pd.isna(value) else str(value)


def _infer_category(source_url):
    """Return the title-cased URL path segment that follows 'legal-explainers'.

    Falls back to 'General' when the URL cannot be parsed, has no
    'legal-explainers' segment, or has nothing (or only an empty segment)
    after it.
    """
    try:
        path_parts = urlparse(source_url).path.split('/')
        slug = path_parts[path_parts.index('legal-explainers') + 1]
    except (ValueError, IndexError):
        return 'General'
    # A trailing slash right after 'legal-explainers' yields an empty slug.
    return slug.replace('-', ' ').strip().title() or 'General'


def process_nyaaya_csv(input_path, output_path):
    """
    Reads the raw scraped CSV from Nyaaya, standardizes its columns,
    chunks the content, and saves it to a clean CSV file.

    Args:
        input_path: Path of the raw CSV. It must contain a 'content' column;
            'title' and 'url' are optional and fall back to 'Untitled' and
            'Source Not Found' when the column or the cell value is missing.
        output_path: Path the standardized CSV is written to, with columns
            chunk_id, chunk_text, source, category, title.

    Returns:
        None. Problems with the input (missing file, empty file, missing
        'content' column) are reported on stdout and no output is written.
    """
    print(f"--- Processing Nyaaya Data: {input_path} ---")
    try:
        df_raw = pd.read_csv(input_path)
    except FileNotFoundError:
        print(f"Error: Input file not found at '{input_path}'.")
        return
    except pd.errors.EmptyDataError:
        print(f"Error: Input file is empty: '{input_path}'.")
        return

    if 'content' not in df_raw.columns:
        print("Error: A required column is missing from the CSV: 'content'")
        return

    final_columns = ['chunk_id', 'chunk_text', 'source', 'category', 'title']
    # Paragraph boundary: a blank line, possibly containing whitespace.
    paragraph_break = re.compile(r'\n\s*\n')

    # This list will hold our final, processed chunk data
    processed_chunks = []

    print(f"Starting standardization and chunking for {len(df_raw)} documents...")

    # Iterate through each row (each scraped article)
    for index, row in df_raw.iterrows():
        # --- Harmonization Step ---
        # Missing cells arrive as NaN (a float); normalise them so they are
        # neither stringified to "nan" nor passed to urlparse().
        original_title = _cell_or_default(row.get('title'), 'Untitled')
        source_url = _cell_or_default(row.get('url'), 'Source Not Found')
        long_text = _cell_or_default(row['content'], '')

        category = _infer_category(source_url)

        # --- Chunking Step ---
        # The enumerate index is kept even for skipped empty chunks so that
        # chunk ids reflect the paragraph position inside the article.
        for i, chunk in enumerate(paragraph_break.split(long_text)):
            chunk_text = chunk.strip()
            if not chunk_text:
                continue
            processed_chunks.append({
                'chunk_id': f"nyaaya_{index}_chunk_{i}",
                'chunk_text': chunk_text,
                'source': source_url,
                'category': category,
                'title': original_title,
            })

    # Passing `columns` fixes the column order and keeps the header intact
    # even when no chunks were produced.
    df_final = pd.DataFrame(processed_chunks, columns=final_columns)

    print(f"Processing complete. Created {len(df_final)} final chunks.")

    # Save the final, clean data
    df_final.to_csv(output_path, index=False)
    print(f"Success! Standardized data saved to {output_path}")


# --- SCRIPT ENTRY POINT ---
if __name__ == "__main__":
    # Run the pipeline only when executed directly, using the configured paths.
    process_nyaaya_csv(input_path=INPUT_CSV_PATH, output_path=OUTPUT_CSV_PATH)


--- Processing Nyaaya Data: /home/aditya/0Legal_Agent_Project/raw_data_and_scrap_codes/nyaaya_data.csv ---
Starting standardization and chunking for 68 documents...
Processing complete. Created 68 final chunks.
Success! Standardized data saved to /home/aditya/0Legal_Agent_Project/cleaned_final_csv_data/nyaaya_cleaned_chunks.csv
