# Data Preparation for Medical Study Assistant Finetuning

This notebook helps you organize and prepare your lecture transcripts, curriculum, and study guides into a dataset suitable for finetuning a language model as a medical study assistant.

## 1. Import Required Libraries
Import Python libraries such as os, pandas, and json for file handling and data processing.

In [None]:
# Import Required Libraries
import os
import pandas as pd
import json
from pathlib import Path

## 2. Load and Preview Lecture Transcripts
Read the lecture transcript files (`*.txt`) from the 'lectures/' directory and display a preview (the first 1,000 characters) of each.

In [None]:
# List and preview lecture transcript files.
# Path.glob yields entries in arbitrary, filesystem-dependent order, so the
# listing is sorted: the dataset-preparation cell picks lecture_files[0] and
# should select the same transcript on every run and every machine.
lecture_dir = Path('lectures')
lecture_files = sorted(lecture_dir.glob('*.txt'))

for file in lecture_files:
    print(f"\n--- {file.name} ---\n")
    with open(file, 'r', encoding='utf-8') as f:
        # Read only the preview rather than loading the whole transcript
        # into memory just to slice it.
        print(f.read(1000))  # Preview first 1000 characters


## 3. Load Curriculum Descriptions
Read the curriculum or program description files from the 'curriculum/' directory and display a preview (the first 1,000 characters) of each.

In [None]:
# List and preview curriculum files.
# glob('*') also matches sub-directories, and open() on a directory raises
# IsADirectoryError (PermissionError on Windows), so keep regular files only.
# The listing is sorted because Path.glob order is arbitrary and the
# dataset-preparation cell relies on curriculum_files[0].
curriculum_dir = Path('curriculum')
curriculum_files = sorted(p for p in curriculum_dir.glob('*') if p.is_file())

for file in curriculum_files:
    print(f"\n--- {file.name} ---\n")
    # errors='ignore' drops undecodable bytes, since any file type may be
    # present in this directory.
    with open(file, 'r', encoding='utf-8', errors='ignore') as f:
        # Read only the preview rather than loading the whole file.
        print(f.read(1000))  # Preview first 1000 characters


## 4. Load Study Guides
Read the study guide files (`*.md`) from the 'study_guides/' directory and display a preview (the first 1,000 characters) of each.

In [None]:
# List and preview study guide files.
# Path.glob yields entries in arbitrary, filesystem-dependent order, so the
# listing is sorted: the dataset-preparation cell picks study_guide_files[0]
# and should select the same guide on every run and every machine.
study_guide_dir = Path('study_guides')
study_guide_files = sorted(study_guide_dir.glob('*.md'))

for file in study_guide_files:
    print(f"\n--- {file.name} ---\n")
    with open(file, 'r', encoding='utf-8') as f:
        # Read only the preview rather than loading the whole guide
        # into memory just to slice it.
        print(f.read(1000))  # Preview first 1000 characters


## 5. Prepare Training Dataset
Combine and preprocess the loaded data into a structured format suitable for model finetuning, such as Q&A pairs or summarization tasks.

In [None]:
# Example: build a single dataset entry from the first lecture transcript,
# the first curriculum file, and the first study guide.
# (Adapt this pairing logic to your real data layout.)

dataset = []

# An entry is only produced when every source directory yielded a file.
if all((lecture_files, curriculum_files, study_guide_files)):
    with open(lecture_files[0], 'r', encoding='utf-8') as lecture_fh, \
         open(curriculum_files[0], 'r', encoding='utf-8', errors='ignore') as curriculum_fh, \
         open(study_guide_files[0], 'r', encoding='utf-8') as guide_fh:
        lecture_text = lecture_fh.read()
        curriculum_text = curriculum_fh.read()
        study_guide_text = guide_fh.read()

    # The prompt carries the class name (lecture file stem), a 500-character
    # curriculum excerpt, a fixed "Clinical" approach, and a 1000-character
    # transcript excerpt; the target is the study guide, capped at 2000
    # characters for preview purposes.
    entry = {}
    entry["input"] = f"Class: {lecture_files[0].stem}\nCurriculum: {curriculum_text[:500]}...\nApproach: Clinical\nLecture Transcript: {lecture_text[:1000]}..."
    entry["output"] = study_guide_text[:2000]
    dataset.append(entry)

# Show the entry as pretty-printed JSON, or a notice when nothing was built.
if dataset:
    preview = json.dumps(dataset[0], indent=2)
else:
    preview = "No data found."
print("Sample dataset entry:\n", preview)


## 6. Export Dataset for Finetuning
Export the processed dataset to the 'data/' directory in JSONL format for use in Hugging Face Transformers/TRL finetuning.

In [None]:
# Export dataset to JSONL for finetuning (one JSON object per line).
output_path = Path('data/finetune_dataset.jsonl')
# open(..., 'w') does not create missing parent directories, so make sure
# 'data/' exists; otherwise a fresh checkout fails with FileNotFoundError.
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
    for entry in dataset:
        # ensure_ascii=False writes non-ASCII characters as-is rather than
        # as \uXXXX escapes.
        f.write(json.dumps(entry, ensure_ascii=False) + '\n')
print(f"Exported {len(dataset)} entries to {output_path}")
