In [1]:
import os
import json
from pathlib import Path

base_path = Path("../data")

In [12]:
def load_json_chunks_with_rbac(base_path):
    all_chunks = []

    role_mapping = {
        "finance": ["Finance_Team",  "God_Tier_Admins"],
        "marketing": ["Marketing_Team", "God_Tier_Admins"],
        "hr": ["HR_Team", "God_Tier_Admins"],
        "engineering": ["Engineering_Team", "God_Tier_Admins"],
        "general": ["Employee", "Finance_Team", "Marketing_Team", "HR_Team", "Engineering_Team", "God_Tier_Admins"]
    }

    for role in role_mapping.keys():
        role_path = Path(base_path / role / "chunked_reports")

        if os.path.exists(role_path):
            for json_file in os.listdir(role_path):
                if json_file.endswith(".json"):
                    with open(role_path / json_file, "r") as f:
                        data = json.load(f)

                        for item in data:
                            clean_content = " ".join([line.strip() for line in item['content'] if line.strip() and "---" not in line])
                            print(clean_content)

                        print(clean_content)


In [13]:
load_json_chunks_with_rbac(base_path)

2024 marked a year of both opportunity and challenge for FinSolve Technologies. Despite a robust revenue increase, we saw significant pressure in certain expense categories, notably vendor-related costs and software subscriptions. However, these pressures were balanced by cost-saving measures in operational efficiency, strong gross margin performance, and strategic investment in growth areas. The company is well-positioned to continue scaling its core offerings, but focused attention on cost optimization will be essential for maintaining profitability in the coming years.
FinSolve Technologies's revenue grew by 25% in 2024, driven largely by the global expansion of its services, especially in Asia and Europe. This was accompanied by a 10% increase in vendor-related expenses, impacting overall profit margins. While gross profit increased by 25%, reflecting higher operational efficiency, **net income** saw a more modest increase of 12%. This suggests that while revenue growth is strong, 

In [15]:
import os
import json

def batch_process_all_data(root_dir):
    all_processed_chunks = []

    # Define roles based on the Problem Statement [cite: 1, 9-15]
    role_permissions = {
        "finance": "Finance_Team, C_Level_Executives",
        "marketing": "Marketing_Team, C_Level_Executives",
        "hr": "HR_Team, C_Level_Executives",
        "engineering": "Engineering_Department, C_Level_Executives",
        "general": "Employee_Level, Finance_Team, Marketing_Team, HR_Team, Engineering_Department, C_Level_Executives"
    }

    # Walk through the data directory
    for dept_folder in os.listdir(root_dir):
        dept_path = os.path.join(root_dir, dept_folder)

        # Only process if it's a directory we have roles for
        if os.path.isdir(dept_path) and dept_folder in role_permissions:
            chunked_reports_path = os.path.join(dept_path, "chunked_reports")

            if os.path.exists(chunked_reports_path):
                print(f"Processing {dept_folder} data...")

                for filename in os.listdir(chunked_reports_path):
                    if filename.endswith(".json"):
                        file_path = os.path.join(chunked_reports_path, filename)

                        with open(file_path, 'r') as f:
                            # Your JSON files contain a list of objects or a single object
                            # This handles both cases
                            data = json.load(f)
                            if not isinstance(data, list):
                                data = [data]

                            for i, item in enumerate(data):
                                # Combine content array into a clean string
                                content_string = " ".join([line.strip() for line in item.get("content", []) if line.strip()])

                                # Create unique chunk_id using hierarchy [cite: 20]
                                # Format: dept_filename_index
                                chunk_id = f"{dept_folder}_{filename.replace('.json', '')}_{i}"

                                # Build the chunk object
                                processed_chunk = {
                                    "id": chunk_id,
                                    "text": content_string,
                                    "metadata": {
                                        "chunk_id": chunk_id,
                                        "source": filename,
                                        "section": item.get("section", "N/A"),
                                        "subsection": item.get("subsection", "N/A"),
                                        "subsubsection": item.get("subsubsection", "N/A"),
                                        "allowed_roles": role_permissions[dept_folder],
                                        "department": dept_folder
                                    }
                                }
                                all_processed_chunks.append(processed_chunk)

    return all_processed_chunks

# Execute the loader
# Replace 'data' with the actual path to your main data folder
final_chunks = batch_process_all_data(base_path)
print(f"\nSuccess! Total secure chunks prepared: {len(final_chunks)}")

Processing engineering data...
Processing finance data...
Processing general data...
Processing marketing data...

Success! Total secure chunks prepared: 206


In [27]:
for i in range (0, 206):
    print(final_chunks[i]['metadata']['subsubsection'] == None)

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
False
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
False
False
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
Tr

In [31]:
# 1. Check total count
print(f"Total chunks created: {len(final_chunks)}")

# 2. Check Department Distribution
# This ensures no department was left out
from collections import Counter
dept_counts = Counter([c['metadata']['department'] for c in final_chunks])
print("\n--- Chunks per Department ---")
for dept, count in dept_counts.items():
    print(f"{dept.capitalize()}: {count} chunks")

# 3. Verify a "Deep Sample"
# Let's look at one chunk to see if the string joining worked
if len(final_chunks) > 0:
    sample = final_chunks[0]
    print("\n--- Sample Content Verification ---")
    print(f"ID: {sample['id']}")
    print(f"Metadata Roles: {sample['metadata']['allowed_roles']}")
    print(f"Text Preview (First 200 chars): {sample['text'][:200]}...")

    # Check if 'content' was actually joined or if it's empty
    if len(sample['text']) < 10:
        print("⚠️ WARNING: Content for this chunk seems too short. Check your JSON 'content' key.")

Total chunks created: 206

--- Chunks per Department ---
Engineering: 81 chunks
Finance: 27 chunks
General: 53 chunks
Marketing: 45 chunks

--- Sample Content Verification ---
ID: engineering_chunked_master_0
Metadata Roles: Engineering_Department, C_Level_Executives
Text Preview (First 200 chars): FinSolve Technologies is a leading FinTech company headquartered in Bangalore, India, with operations across North America, Europe, and Asia-Pacific. Founded in 2018, FinSolve provides innovative fina...


In [30]:
def verify_parsing_manifest(all_processed_chunks):
    manifest = {}

    for chunk in all_processed_chunks:
        source = chunk['metadata']['source']
        dept = chunk['metadata']['department']

        if dept not in manifest:
            manifest[dept] = {}

        if source not in manifest[dept]:
            manifest[dept][source] = 0

        manifest[dept][source] += 1

    print(f"{'Department':<15} | {'Filename':<35} | {'Chunks'}")
    print("-" * 65)

    for dept, files in manifest.items():
        for filename, count in files.items():
            print(f"{dept:<15} | {filename:<35} | {count}")

# Run this to see the breakdown
verify_parsing_manifest(final_chunks)

Department      | Filename                            | Chunks
-----------------------------------------------------------------
engineering     | chunked_master.json                 | 81
finance         | chunked_summary.json                | 8
finance         | quarterly_financial_report.json     | 19
general         | employee_handbook.json              | 53
marketing       | chunked_report_annual.json          | 9
marketing       | chunked_report_q1.json              | 9
marketing       | chunked_report_q2.json              | 9
marketing       | chunked_report_q3.json              | 9
marketing       | chunked_report_q4.json              | 9
