In [1]:
import requests
import json
import os

os.makedirs('pubmedqa', exist_ok=True)

files = {
    'ori_pqal.json': 'https://github.com/pubmedqa/pubmedqa/raw/master/data/ori_pqal.json',
    'ori_pqaa.json': 'https://github.com/pubmedqa/pubmedqa/raw/master/data/ori_pqaa.json',
    'ori_pqau.json': 'https://github.com/pubmedqa/pubmedqa/raw/master/data/ori_pqau.json'
}

for filename, url in files.items():
    print(f"Downloading {filename}...")
    response = requests.get(url)
    with open(f'pubmedqa/{filename}', 'wb') as f:
        f.write(response.content)
    print(f"✓ {filename} downloaded")

Downloading ori_pqal.json...
✓ ori_pqal.json downloaded
Downloading ori_pqaa.json...
✓ ori_pqaa.json downloaded
Downloading ori_pqau.json...
✓ ori_pqau.json downloaded


In [2]:
df = pd.read_json('pubmedqa/ori_pqal.json')
df.head()

NameError: name 'pd' is not defined

In [3]:
import json
import os
from typing import Dict, Any
import logging


logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def clean_text(text: str) -> str:
    if not isinstance(text, str):
        return str(text)

    text = ' '.join(text.split())
    return text.strip()

def format_context(contexts: list) -> str:
    if not contexts:
        return "No context provided."

    formatted_contexts = []
    for i, context in enumerate(contexts, 1):
        clean_context = clean_text(context)
        if clean_context:
            formatted_contexts.append(f"[{i}] {clean_context}")

    return "\n\n".join(formatted_contexts)

def convert_answer(answer: str) -> str:
    answer = answer.lower().strip()


    answer_mapping = {
        'yes': 'Yes',
        'no': 'No',
        'maybe': 'Maybe/Uncertain'
    }

    return answer_mapping.get(answer, 'Maybe/Uncertain')

def create_reasoning(question: str, contexts: list, answer: str, long_answer: str = None) -> str:
    reasoning_parts = []

    if long_answer and long_answer.strip():
        reasoning_parts.append(clean_text(long_answer))
    else:
        if answer.lower() == 'yes':
            reasoning_parts.append("The provided research context supports an affirmative answer to the question.")
        elif answer.lower() == 'no':
            reasoning_parts.append("The provided research context does not support the proposed relationship or claim.")
        else:
            reasoning_parts.append("The available evidence is insufficient or conflicting to provide a definitive answer.")


    if contexts and len(contexts) > 0:
        reasoning_parts.append(f"This conclusion is based on analysis of {len(contexts)} research context provided.")

    return " ".join(reasoning_parts)

def process_pubmedqa_entry(key: str, entry: Dict[str, Any]) -> Dict[str, str]:
    try:
        question = clean_text(entry.get('QUESTION', ''))
        contexts = entry.get('CONTEXTS', [])
        answer = entry.get('final_decision', entry.get('ANSWER', ''))
        long_answer = entry.get('LONG_ANSWER', '')

        if not question:
            raise ValueError("Missing question")

        if not answer:
            raise ValueError("Missing answer")


        formatted_contexts = format_context(contexts)
        standardized_answer = convert_answer(answer)
        reasoning = create_reasoning(question, contexts, answer, long_answer)


        instruction = "Based on the provided PubMed research context, answer whether the research question can be answered with 'Yes', 'No', or 'Maybe/Uncertain'. Provide your reasoning."

        # Input.
        input_text = f"Research Question: {question}\n\nContext from PubMed:\n{formatted_contexts}"

        # Output.
        output_text = f"{standardized_answer}\n\nReasoning: {reasoning}"

        return {
            "instruction": instruction,
            "input": input_text,
            "output": output_text
        }

    except Exception as e:
        logger.error(f"Error processing entry {key}: {str(e)}")
        return None

def convert_pubmedqa_to_jsonl(input_file: str, output_file: str, max_entries: int = None):


    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input file not found: {input_file}")

    logger.info(f"Loading PubMedQA dataset from: {input_file}")

    # Original dataset.
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    logger.info(f"Loaded {len(data)} entries from dataset")


    processed_count = 0
    error_count = 0

    with open(output_file, 'w', encoding='utf-8') as f:
        for key, entry in data.items():
            if max_entries and processed_count >= max_entries:
                break

            processed_entry = process_pubmedqa_entry(key, entry)

            if processed_entry:
                json_line = json.dumps(processed_entry, ensure_ascii=False)
                f.write(json_line + '\n')
                processed_count += 1


                if processed_count % 1000 == 0:
                    logger.info(f"Processed {processed_count} entries.")
            else:
                error_count += 1

    logger.info(f"Conversion done")
    logger.info(f"Successfully processed: {processed_count} entries")
    logger.info(f"Errors encountered: {error_count} entries")
    logger.info(f"Output saved to: {output_file}")


    total_chars = 0
    max_chars = 0

    with open(output_file, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            try:
                entry = json.loads(line)
                entry_length = len(entry['instruction'] + entry['input'] + entry['output'])
                total_chars += entry_length
                max_chars = max(max_chars, entry_length)
            except:
                continue

    if processed_count > 0:
        avg_chars = total_chars / processed_count
        avg_tokens = avg_chars / 4
        max_tokens = max_chars / 4

        logger.info(f"\n Dataset Statistics:")
        logger.info(f"Average characters per entry: {avg_chars:.0f}")
        logger.info(f"Average tokens per entry: {avg_tokens:.0f}")
        logger.info(f"Maximum tokens per entry: {max_tokens:.0f}")

    return processed_count, error_count

def validate_jsonl_output(output_file: str, sample_size: int = 3):
    logger.info(f"\n Validating output file: {output_file}")

    valid_lines = 0
    total_lines = 0

    with open(output_file, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            total_lines += 1
            try:
                entry = json.loads(line)


                required_fields = ['instruction', 'input', 'output']
                if all(field in entry and isinstance(entry[field], str) and entry[field].strip()
                       for field in required_fields):
                    valid_lines += 1


                    if valid_lines <= sample_size:
                        logger.info(f"\n-- Sample Entry {valid_lines} ---")
                        logger.info(f"Instruction: {entry['instruction'][:100]}...")
                        logger.info(f"Input: {entry['input'][:200]}...")
                        logger.info(f"Output: {entry['output'][:150]}...")

            except json.JSONDecodeError as e:
                logger.error(f"Invalid JSON on line {line_num}: {e}")

    logger.info(f"\n Validation Results:")
    logger.info(f"Total lines: {total_lines}")
    logger.info(f"Valid JSONL entries: {valid_lines}")
    logger.info(f"Success rate: {(valid_lines/total_lines)*100:.1f}%" if total_lines > 0 else "No entries found.")

    return valid_lines, total_lines


if __name__ == "__main__":
    INPUT_FILE = "/content/pubmedqa/ori_pqal.json"
    OUTPUT_FILE = "pubmedqa_training.jsonl"
    MAX_ENTRIES = None
    VALIDATE_OUTPUT = True

    print("-" * 60)
    print("PubMedQA to JSONL Converter")
    print("-" * 60)

    try:
        if not os.path.exists(INPUT_FILE):
            print(f"Error: Input file not found at {INPUT_FILE}")
            print("Check the PubMedQA dataset is uploaded.")
        else:
            print(f"Input file: {INPUT_FILE}")
            print(f"Output file: {OUTPUT_FILE}")
            if MAX_ENTRIES:
                print(f"  Max entries: {MAX_ENTRIES}")

            processed, errors = convert_pubmedqa_to_jsonl(INPUT_FILE, OUTPUT_FILE, MAX_ENTRIES)


            if VALIDATE_OUTPUT and processed > 0:
                valid, total = validate_jsonl_output(OUTPUT_FILE)
                print(f"\n Conversion done.")
                print(f" Final Stats: {valid}/{total} valid entries.")
            else:
                print(f"\n Conversion completed: {processed} entries processed, {errors} errors.")

    except Exception as e:
        print(f"Conversion failed: {str(e)}")
        import traceback
        traceback.print_exc()

------------------------------------------------------------
PubMedQA to JSONL Converter
------------------------------------------------------------
Input file: /content/pubmedqa/ori_pqal.json
Output file: pubmedqa_training.jsonl

 Conversion done.
 Final Stats: 1000/1000 valid entries.


In [4]:
import json

file_path = 'pubmedqa_training.jsonl'
num_rows_to_show = 5

print(f"First {num_rows_to_show} rows of {file_path}:")
try:
    with open(file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i >= num_rows_to_show:
                break
            try:
                json_obj = json.loads(line)
                print(f"--- Row {i+1} ---")
                print(json.dumps(json_obj, indent=2))
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON on line {i+1}: {e}")
except FileNotFoundError:
    print(f"File not found at {file_path}")

First 5 rows of pubmedqa_training.jsonl:
--- Row 1 ---
{
  "instruction": "Based on the provided PubMed research context, answer whether the research question can be answered with 'Yes', 'No', or 'Maybe/Uncertain'. Provide your reasoning.",
  "input": "Research Question: Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?\n\nContext from PubMed:\n[1] Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants.\n\n[2] The following paper elucidates the role of mitochondrial dynam