In [4]:
import os
import json
from dotenv import load_dotenv 

# Load environment variables from the .env file into the process environment
load_dotenv()
DATASET_PATH = os.getenv('DATASET_PATH')  # Root data directory, as configured in .env
if DATASET_PATH is None:
    # Fail fast with an actionable message; otherwise os.path.join(None, ...)
    # below raises an opaque TypeError. An empty string is still accepted and
    # yields a path relative to the current working directory, as before.
    raise RuntimeError(
        "DATASET_PATH is not set; define it in the environment or in the .env file"
    )

# Location of the original dataset: <DATASET_PATH>/source_data/<source_data_file>
source_data_file = 'mentalhealth_data_ORIGINAL.json'
source_data = os.path.join(DATASET_PATH, 'source_data', source_data_file)

# Splitting parameters: the (approximate) number of part files expected and
# the approximate line budget for each part.
# NOTE(review): NUM_FILES is not referenced by the code visible in this cell.
NUM_FILES = 10
TARGET_LINES_PER_FILE = 380_000

# Topics that should have "Mental Health Dataset" as the source.
# The numbered tags run fact-1 .. fact-32 with fact-4 left out, exactly as in
# the hand-written list this replaces.
# NOTE(review): confirm that the absence of "fact-4" is intentional.
mental_health_fact_topics = ["mental-health-fact"] + [
    f"fact-{number}" for number in range(1, 33) if number != 4
]

# Load JSON data
def load_json_data(filepath):
    """Read the file at *filepath* and return its parsed JSON content.

    The file is decoded explicitly as UTF-8 rather than with the platform's
    locale encoding, so non-ASCII text is read the same way on every OS.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file's content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(filepath, 'r', encoding='utf-8') as infile:
        return json.load(infile)

# Rename the assistant from "Pandora" to "AskTheraRAGBuddy", leaving the phrase "Pandora's box" intact
def replace_pandora(text):
    """Return *text* with "Pandora" renamed, except inside "Pandora's box".

    Works in two passes: first every "Pandora" is renamed, then any resulting
    "AskTheraRAGBuddy's box" is turned back into "Pandora's box". Matching is
    case-sensitive in both passes.
    """
    old_name = "Pandora"
    new_name = "AskTheraRAGBuddy"

    renamed = text.replace(old_name, new_name)
    # Undo the rename where it hit the phrase that must be kept
    restored = renamed.replace(new_name + "'s box", old_name + "'s box")
    return restored

# Build one output record from a single pattern and the responses of its intent
def transform_entry(pattern, topic, source, responses, line_counter):
    """Return the output dictionary for one pattern.

    The pattern (after ``replace_pandora``) is used for both the question
    title and the full question text. Every response becomes an answer
    dictionary holding the rewritten response text and the given *source*.
    The record id is ``mh_<line_counter>``.
    """
    question_text = replace_pandora(pattern)

    # Each answer carries the source field alongside its text
    answer_list = []
    for response in responses:
        answer_list.append({"answer": replace_pandora(response), "source": source})

    record = {}
    record["question_id"] = f"mh_{line_counter}"
    record["topic"] = topic
    record["question_title"] = question_text
    record["question_full"] = question_text
    record["answers"] = answer_list
    return record

# Save one chunk of records as a numbered part file
def write_file(data_chunk, file_counter, output_dir):
    """Dump *data_chunk* as indented JSON into a numbered part file.

    The file is named ``mentalhealth_data_part<file_counter>.json`` and is
    created inside *output_dir*. The path is printed once the file has been
    written.
    """
    part_name = f"mentalhealth_data_part{file_counter}.json"
    target_path = os.path.join(output_dir, part_name)

    with open(target_path, 'w') as handle:
        json.dump(data_chunk, handle, indent=4)

    print(f"Saved {target_path}")

# Measure how many lines a record takes up when pretty-printed
def count_lines(entry):
    """Return the number of lines in *entry* serialised as JSON with indent=4."""
    rendered = json.dumps(entry, indent=4)
    return sum(1 for _ in rendered.splitlines())

# Main function to split the data into multiple parts based on line count
def process_and_split_data(intents, output_dir, target_lines_per_file, mental_health_fact_topics):
    """Transform every intent pattern into an entry and write the entries out in parts.

    Each pattern of each intent becomes one entry (via ``transform_entry``).
    Entries are accumulated until adding the next one would push the current
    part past *target_lines_per_file* pretty-printed lines; the part is then
    written with ``write_file`` and a new one is started. Whatever is left at
    the end is written as the final part.

    Args:
        intents: Iterable of intent dictionaries. The keys read are ``tag``,
            ``patterns`` and ``responses`` (falling back to ``response``).
        output_dir: Directory handed to ``write_file`` for the part files.
        target_lines_per_file: Approximate maximum number of lines per part,
            measured per entry with ``count_lines``.
        mental_health_fact_topics: Collection of tags whose answers get the
            source "Mental Health Dataset"; all other tags get
            "AskTheraRAGBuddy".

    Returns:
        None. The results are the files written by ``write_file``.
    """
    file_counter = 1
    global_line_counter = 0  # Running entry index across all parts; keeps question_ids unique
    current_file_lines = 0   # Lines accumulated in the part currently being built
    output_data = []

    for entry in intents:
        topic = entry.get('tag', 'Unknown')
        source = "Mental Health Dataset" if topic in mental_health_fact_topics else "AskTheraRAGBuddy"

        # The responses are the same for every pattern of an intent, so they
        # are resolved once per intent rather than once per pattern.
        responses = entry.get('responses') or entry.get('response') or []
        if isinstance(responses, str):
            # A bare string would otherwise be iterated character by character
            responses = [responses]

        patterns = entry.get('patterns') or []
        if isinstance(patterns, str):
            patterns = [patterns]

        for pattern in patterns:
            # Use the global line counter to ensure unique question_ids
            transformed_entry = transform_entry(pattern, topic, source, responses, global_line_counter)

            # Count the lines of this entry
            entry_lines = count_lines(transformed_entry)

            # Start a new part when this entry would exceed the budget. The
            # `output_data` guard prevents writing an empty part when a single
            # entry is larger than the whole budget.
            if output_data and current_file_lines + entry_lines > target_lines_per_file:
                write_file(output_data, file_counter, output_dir)
                file_counter += 1
                output_data = []  # Reset the data chunk
                current_file_lines = 0  # Reset the line counter for the new file

            # Add the entry to the current part
            output_data.append(transformed_entry)
            current_file_lines += entry_lines
            global_line_counter += 1

    # Write any remaining data to the final file
    if output_data:
        write_file(output_data, file_counter, output_dir)


# Read the source JSON and pull out the list stored under its "intents" key
data = load_json_data(source_data)
intents = data["intents"]

# Transform the intents and write them out as part files under DATASET_PATH
process_and_split_data(
    intents=intents,
    output_dir=DATASET_PATH,
    target_lines_per_file=TARGET_LINES_PER_FILE,
    mental_health_fact_topics=mental_health_fact_topics,
)

print(f"Completed splitting the files into parts with approximately {TARGET_LINES_PER_FILE} lines each.")


Saved C:\\Users\\matth\\My_Projects\\2024_Zoom_Camp_RAG_Project\\data\mentalhealth_data_part1.json
Saved C:\\Users\\matth\\My_Projects\\2024_Zoom_Camp_RAG_Project\\data\mentalhealth_data_part2.json
Saved C:\\Users\\matth\\My_Projects\\2024_Zoom_Camp_RAG_Project\\data\mentalhealth_data_part3.json
Saved C:\\Users\\matth\\My_Projects\\2024_Zoom_Camp_RAG_Project\\data\mentalhealth_data_part4.json
Saved C:\\Users\\matth\\My_Projects\\2024_Zoom_Camp_RAG_Project\\data\mentalhealth_data_part5.json
Saved C:\\Users\\matth\\My_Projects\\2024_Zoom_Camp_RAG_Project\\data\mentalhealth_data_part6.json
Saved C:\\Users\\matth\\My_Projects\\2024_Zoom_Camp_RAG_Project\\data\mentalhealth_data_part7.json
Saved C:\\Users\\matth\\My_Projects\\2024_Zoom_Camp_RAG_Project\\data\mentalhealth_data_part8.json
Saved C:\\Users\\matth\\My_Projects\\2024_Zoom_Camp_RAG_Project\\data\mentalhealth_data_part9.json
Saved C:\\Users\\matth\\My_Projects\\2024_Zoom_Camp_RAG_Project\\data\mentalhealth_data_part10.json
Completed