In [11]:
import json
import random
import os

def _convert_record(record: dict, num_options: int) -> dict:
    """
    Build one converted multiple-choice item from a parsed JSONL record.

    Args:
        record (dict): Parsed JSON object; read via the keys "question",
            "options" (a mapping of option key -> option text) and "answer_idx"
            (the key of the correct option inside "options").
        num_options (int): Maximum number of options in the converted item
            (the correct answer plus up to ``num_options - 1`` distractors).

    Returns:
        dict: An item with the keys "instruction", "target_score" and "dataset".

    Raises:
        ValueError: If the record cannot be converted. The message is the
            reason fragment that the caller splices into its "Skipping line"
            log message.
    """
    question = record.get("question")
    options = record.get("options", {})
    answer_key = record.get("answer_idx")

    if not (question and options and answer_key):
        raise ValueError("due to missing data")

    correct_text = options.get(answer_key)
    if not correct_text:
        raise ValueError(f"due to missing correct answer text for key {answer_key}")

    distractor_pool = [text for key, text in options.items() if key != answer_key]
    if not distractor_pool:
        raise ValueError("as it does not have enough incorrect options to proceed")

    # Keep at most num_options - 1 distractors. random.sample returns them in
    # random order, so records with fewer options than requested keep all of
    # their distractors instead of always losing one.
    choices = random.sample(distractor_pool, min(num_options - 1, len(distractor_pool)))

    # Insert the answer at a random position and remember that position, so the
    # answer letter stays right even if a distractor has the same text.
    answer_position = random.randrange(len(choices) + 1)
    choices.insert(answer_position, correct_text)

    labels = [chr(ord("A") + i) for i in range(len(choices))]
    options_block = "".join(
        f"\t\t\t{label}. {text}\n" for label, text in zip(labels, choices)
    )
    instruction = (
        f"{question}\n"
        f"{options_block}"
        f"\t\t\tAnswer with the letter of the correct answer.\n"
        f"\t\t\tAnswer:"
    )

    return {
        "instruction": instruction,
        "target_score": labels[answer_position],
        "dataset": "clinical_knowledge",
    }


def convert_jsonl_format(input_file_path: str, output_file_path: str, sample_size: int,
                         num_options: int = 4) -> None:
    """
    Converts a JSONL file from the initial structure to the desired format,
    using a random sample of the questions.

    Each non-blank input line is expected to be a JSON object with the keys
    "question", "options" and "answer_idx". Every converted question keeps the
    correct answer plus up to ``num_options - 1`` randomly chosen incorrect
    options, relabelled "A", "B", ... in random order. Lines that cannot be
    converted are reported on stdout and skipped.

    Args:
        input_file_path (str): The path to the input train.jsonl file.
        output_file_path (str): The path to save the converted JSON file.
        sample_size (int): The number of random questions to select.
        num_options (int): Maximum number of answer options per converted
            question. Must be between 2 and 26. Defaults to 4.

    Raises:
        ValueError: If ``num_options`` is outside the range 2..26.
    """
    if not 2 <= num_options <= 26:
        raise ValueError(f"num_options must be between 2 and 26, got {num_options}")

    converted_data = []

    # Ensure the output directory exists before trying to write to it
    output_dir = os.path.dirname(output_file_path)
    if output_dir:  # Empty when the path has no directory component
        os.makedirs(output_dir, exist_ok=True)

    try:
        with open(input_file_path, 'r', encoding='utf-8') as infile:
            # Drop blank lines up front so they cannot take up sample slots.
            stripped_lines = (raw_line.strip() for raw_line in infile)
            all_lines = [line for line in stripped_lines if line]

        if len(all_lines) < sample_size:
            print(f"Warning: Input file has only {len(all_lines)} lines, which is less than the requested sample of {sample_size}. Processing all lines.")
            selected_lines = all_lines
        else:
            selected_lines = random.sample(all_lines, sample_size)

        print(f"Processing {len(selected_lines)} randomly selected questions...")

        for line in selected_lines:
            try:
                converted_data.append(_convert_record(json.loads(line), num_options))
            except json.JSONDecodeError:
                # Must come before ValueError: JSONDecodeError is a subclass of it.
                print(f"Skipping line due to invalid JSON: {line}")
            except ValueError as skip_reason:
                print(f"Skipping line {skip_reason}: {line}")
            except Exception as e:
                # Best effort: one malformed record must not abort the whole run.
                print(f"An error occurred while processing a line: {e}")

        # Write the converted data to the output file
        with open(output_file_path, 'w', encoding='utf-8') as outfile:
            json.dump(converted_data, outfile, indent=4)

        print(f"Conversion successful! Output saved to '{output_file_path}'")

    except FileNotFoundError:
        print(f"Error: The file '{input_file_path}' was not found.")
        print(f"Please ensure the file exists at this location and that the path is correct.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")


if __name__ == "__main__":
    # Script entry point: convert a random sample of raw/train.jsonl into
    # splits/target_test_new_medicine.json.

    # Number of questions to draw from the input file.
    SAMPLE_COUNT = 500

    # Both paths are relative, so they resolve against the current working
    # directory; os.path.join keeps the separators OS-appropriate.
    INPUT_FILE = os.path.join("raw", "train.jsonl")
    OUTPUT_FILE = os.path.join("splits", "target_test_new_medicine.json")

    # Echo the working directory and the resolved absolute paths so a wrong
    # launch location is easy to spot.
    print(f"Current working directory: {os.getcwd()}")
    print(f"Attempting to read from: {os.path.abspath(INPUT_FILE)}")
    print(f"Will write to: {os.path.abspath(OUTPUT_FILE)}")

    convert_jsonl_format(INPUT_FILE, OUTPUT_FILE, sample_size=SAMPLE_COUNT)


Current working directory: c:\Users\user\Documents\GitHub\rolevectors\rolevectors\target_direction\pipeline\dataset
Attempting to read from: c:\Users\user\Documents\GitHub\rolevectors\rolevectors\target_direction\pipeline\dataset\raw\train.jsonl
Will write to: c:\Users\user\Documents\GitHub\rolevectors\rolevectors\target_direction\pipeline\dataset\splits\target_test_new_medicine.json
Processing 500 randomly selected questions...
Conversion successful! Output saved to 'splits\target_test_new_medicine.json'
