In [7]:
import os
import json
from dotenv import load_dotenv 

# Load environment variables from .env file
load_dotenv() 
# Base directory for the dataset, read from the DATASET_PATH environment variable.
# NOTE(review): os.getenv returns None when the variable is unset, in which case
# the os.path.join call below raises TypeError - confirm .env always defines it.
DATASET_PATH = os.getenv('DATASET_PATH')  # Get the DATASET_PATH stored in .env file

# Define file paths
# The original (untransformed) dataset lives under <DATASET_PATH>/source_data/.
source_data_file = 'mentalhealth_data_ORIGINAL.json'
source_data = os.path.join(DATASET_PATH, 'source_data', source_data_file)

# Define the total number of parts (files) to split into
# Used only to derive a target line count per file, so the real number of
# files written can differ slightly from this value.
NUM_FILES = 10

# List of topics that should have "Mental Health Dataset" as the source
# Any tag not listed here is attributed to "AskTheraRAGBuddy" instead.
# NOTE(review): "fact-4" is absent from this list while fact-1..fact-32 are
# otherwise contiguous - confirm whether that omission is intentional.
mental_health_fact_topics = [
    "mental-health-fact", "fact-1", "fact-2", "fact-3", "fact-5", "fact-6", "fact-7", 
    "fact-8", "fact-9", "fact-10", "fact-11", "fact-12", "fact-13", "fact-14", 
    "fact-15", "fact-16", "fact-17", "fact-18", "fact-19", "fact-20", "fact-21", 
    "fact-22", "fact-23", "fact-24", "fact-25", "fact-26", "fact-27", "fact-28", 
    "fact-29", "fact-30", "fact-31", "fact-32"
]

# Load JSON data
def load_json_data(filepath):
    """Read the JSON document stored at *filepath* and return the parsed object.

    The file is decoded explicitly as UTF-8 rather than with the platform's
    locale encoding, so non-ASCII text loads the same way on every operating
    system.

    Args:
        filepath: Path to the JSON file to read.

    Returns:
        The Python object produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(filepath, 'r', encoding='utf-8') as infile:
        return json.load(infile)

# Function to replace "Pandora" with "AskTheraRAGBuddy", but keep "Pandora's box"
def replace_pandora(text):
    """Return *text* with the name "Pandora" swapped for "AskTheraRAGBuddy".

    The swap is done in two passes: every "Pandora" is renamed first, then any
    resulting "AskTheraRAGBuddy's box" is turned back into "Pandora's box" so
    that the idiom survives.
    """
    # Pass 1: rename every occurrence, including those inside the idiom.
    renamed = text.replace("Pandora", "AskTheraRAGBuddy")
    # Pass 2: restore the idiom that pass 1 rewrote.
    restored = renamed.replace("AskTheraRAGBuddy's box", "Pandora's box")
    return restored

# Function to transform an entry with patterns and responses
def transform_entry(pattern, topic, source, responses, line_counter):
    """Build one output record from a single pattern and its responses.

    Args:
        pattern: The user-facing question text; used for both the title and
            the full question after the assistant name is rewritten.
        topic: Value stored under "topic".
        source: Value stored under "source".
        responses: Either a list of response strings or a single response
            string. A single string is treated as a one-element list.
        line_counter: Number embedded in the record's "question_id".

    Returns:
        A dict with the keys "question_id", "topic", "question_title",
        "question_full", "source" and "answers".
    """
    # A bare string would otherwise be iterated character by character,
    # producing one "answer" per character.
    if isinstance(responses, str):
        responses = [responses]

    # Title and full question are the same text, so sanitise it only once.
    question = replace_pandora(pattern)
    return {
        "question_id": f"mh_{line_counter}",
        "topic": topic,
        "question_title": question,
        "question_full": question,
        "source": source,
        "answers": [{"answer": replace_pandora(resp)} for resp in responses],
    }

# Function to write a chunk of data to a file
def write_file(data_chunk, file_counter, output_dir):
    """Serialise *data_chunk* as indented JSON into a numbered part file.

    The file is named ``mentalhealth_data_part<file_counter>.json`` and is
    created inside *output_dir*. The path written is printed afterwards.
    """
    part_name = f"mentalhealth_data_part{file_counter}.json"
    file_name = os.path.join(output_dir, part_name)

    with open(file_name, 'w') as outfile:
        json.dump(data_chunk, outfile, indent=4)

    print(f"Saved {file_name}")

# Function to count total lines
def count_total_lines(intents):
    """Return how many lines the transformed dataset occupies as indented JSON.

    Every pattern of every intent is turned into an output record with
    ``transform_entry`` and the number of lines in its ``indent=4`` JSON dump
    is added to the total. The responses are looked up the same way as in
    ``process_and_split_data`` ('responses', falling back to 'response') so
    that the total matches the records that are actually written.

    Args:
        intents: Iterable of intent dicts; the keys 'tag', 'patterns',
            'responses' and 'response' are read and all are optional.

    Returns:
        The summed line count as an int (0 for an empty input).
    """
    total_lines = 0
    for entry in intents:
        topic = entry.get('tag', 'Unknown')
        source = "Mental Health Dataset" if topic in mental_health_fact_topics else "AskTheraRAGBuddy"
        # Fall back to the singular key when 'responses' is missing or empty.
        responses = entry.get('responses', []) or entry.get('response', [])

        for pattern in entry.get('patterns', []):
            transformed_entry = transform_entry(pattern, topic, source, responses, total_lines)
            total_lines += len(json.dumps(transformed_entry, indent=4).splitlines())
    return total_lines

# Main function to split the data into multiple parts
def process_and_split_data(intents, output_dir, lines_per_file, mental_health_fact_topics):
    """Transform every intent pattern and write the records out in chunks.

    Records are accumulated until their combined indented-JSON line count
    reaches ``lines_per_file``; the chunk is then written with ``write_file``
    and a new chunk is started. Any records left over at the end are written
    to a final file.

    Args:
        intents: Iterable of intent dicts; the keys 'tag', 'patterns',
            'responses' and 'response' are read and all are optional.
        output_dir: Directory handed to ``write_file`` for each chunk.
        lines_per_file: Line count at which the current chunk is flushed.
        mental_health_fact_topics: Collection of tags whose records get
            "Mental Health Dataset" as their source; all other tags get
            "AskTheraRAGBuddy".
    """
    file_counter = 1
    lines_in_chunk = 0   # lines accumulated in the current file; reset on flush
    lines_emitted = 0    # lines across all files; never reset, feeds question_id
    output_data = []

    for entry in intents:
        topic = entry.get('tag', 'Unknown')
        source = "Mental Health Dataset" if topic in mental_health_fact_topics else "AskTheraRAGBuddy"

        # The responses are shared by every pattern of the entry, so look
        # them up once, falling back to the singular 'response' key.
        responses = entry.get('responses', [])
        if not responses:
            responses = entry.get('response', [])

        for pattern in entry.get('patterns', []):
            # Use the running total for the ID: the per-file counter restarts
            # at 0 for every file and would repeat IDs across part files.
            transformed_entry = transform_entry(pattern, topic, source, responses, lines_emitted)
            output_data.append(transformed_entry)

            # Count the lines for this entry and add to both totals
            entry_lines = len(json.dumps(transformed_entry, indent=4).splitlines())
            lines_in_chunk += entry_lines
            lines_emitted += entry_lines

            # If we've hit the target lines per file, write out the file and reset
            if lines_in_chunk >= lines_per_file:
                write_file(output_data, file_counter, output_dir)
                file_counter += 1
                output_data = []    # Reset the data chunk
                lines_in_chunk = 0  # Reset the per-file line counter only

    # Write any remaining data to the final file
    if output_data:
        write_file(output_data, file_counter, output_dir)

# Main Execution Flow
if __name__ == "__main__":
    # Load the data
    # The source document is expected to be a JSON object with an "intents" key.
    data = load_json_data(source_data)
    intents = data["intents"]  # Access the "intents" key

    # Count the total lines
    # (lines of the indent=4 JSON dump of every transformed record)
    total_lines = count_total_lines(intents)

    # Calculate lines per file
    # Integer division: this is the line count at which a part file is flushed.
    lines_per_file = total_lines // NUM_FILES

    # Process and split the data
    # Part files are written directly into DATASET_PATH.
    process_and_split_data(intents, DATASET_PATH, lines_per_file, mental_health_fact_topics)

    # "or so": a file is flushed once it reaches the threshold, so the number
    # of files produced is only approximately NUM_FILES.
    print(f"Completed splitting the files into {NUM_FILES} or so parts.")


Saved C:\\Users\\matth\\My_Projects\\2024_Zoom_Camp_RAG_Project\\data\mentalhealth_data_CLEAN_part1.json
Saved C:\\Users\\matth\\My_Projects\\2024_Zoom_Camp_RAG_Project\\data\mentalhealth_data_CLEAN_part2.json
Saved C:\\Users\\matth\\My_Projects\\2024_Zoom_Camp_RAG_Project\\data\mentalhealth_data_CLEAN_part3.json
Saved C:\\Users\\matth\\My_Projects\\2024_Zoom_Camp_RAG_Project\\data\mentalhealth_data_CLEAN_part4.json
Saved C:\\Users\\matth\\My_Projects\\2024_Zoom_Camp_RAG_Project\\data\mentalhealth_data_CLEAN_part5.json
Saved C:\\Users\\matth\\My_Projects\\2024_Zoom_Camp_RAG_Project\\data\mentalhealth_data_CLEAN_part6.json
Saved C:\\Users\\matth\\My_Projects\\2024_Zoom_Camp_RAG_Project\\data\mentalhealth_data_CLEAN_part7.json
Saved C:\\Users\\matth\\My_Projects\\2024_Zoom_Camp_RAG_Project\\data\mentalhealth_data_CLEAN_part8.json
Saved C:\\Users\\matth\\My_Projects\\2024_Zoom_Camp_RAG_Project\\data\mentalhealth_data_CLEAN_part9.json
Saved C:\\Users\\matth\\My_Projects\\2024_Zoom_Camp_RAG