In [13]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Academic Advisor RAG Dataset Builder

This notebook converts raw university catalog files into a **clean, structured, RAG-ready JSON dataset**.

**Input sources:**
- `2025-2026_CSUN_Catalog.epub` (California State University, Northridge)
- `UG-Catalog-AY2024-2025-Updates.pdf` (University of the People)

**Main steps:**
1. Load and parse EPUB (CSUN) and PDF (UoPeople).
2. Clean raw text and preserve paragraph structure.
3. Chunk text into semantically meaningful segments.
4. Classify each chunk into a useful category (admissions, courses, graduation, etc.).
5. Infer academic metadata (program, degree, level, college).
6. Deduplicate repeated chunks across both catalogs.
7. Save final dataset as `academic_advisor_rag_dataset.json`.


In [3]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m225.3/232.6 kB[0m [31m7.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [5]:
"""
Step 1: Imports and configuration

This cell sets up all required imports and defines file paths
for the input catalogs and the output JSON dataset.
"""

import os
import re
import json
from zipfile import ZipFile

from typing import List, Dict, Tuple, Set

# PDF reader library
from PyPDF2 import PdfReader

# -------------------------------------------------------------------
# File path configuration
# -------------------------------------------------------------------
# NOTE: Adjust these paths if your files are in a different directory.
CSUN_EPUB_PATH = "/content/drive/MyDrive/DAB_RAG_ZakyProject/data/raw/2025-2026_CSUN_Catalog.epub"
UOPEOPLE_PDF_PATH = "/content/drive/MyDrive/DAB_RAG_ZakyProject/data/raw/UG-Catalog-AY2024-2025-Updates.pdf"
OUTPUT_JSON_PATH = "/content/drive/MyDrive/DAB_RAG_ZakyProject/data/processed/academic_advisor_rag_dataset.json"

print("Configured paths:")
print(f" - CSUN EPUB:     {CSUN_EPUB_PATH}")
print(f" - UoPeople PDF:  {UOPEOPLE_PDF_PATH}")
print(f" - Output JSON:   {OUTPUT_JSON_PATH}")

Configured paths:
 - CSUN EPUB:     /content/drive/MyDrive/DAB_RAG_ZakyProject/data/2025-2026_CSUN_Catalog.epub
 - UoPeople PDF:  /content/drive/MyDrive/DAB_RAG_ZakyProject/data/UG-Catalog-AY2024-2025-Updates.pdf
 - Output JSON:   /content/drive/MyDrive/DAB_RAG_ZakyProject/data/academic_advisor_rag_dataset.json


In [6]:
"""
Step 2: Text cleaning and chunking helpers

These functions:
- Clean HTML content from the EPUB while preserving paragraph breaks.
- Chunk text into paragraph-based segments suitable for embeddings.
"""

def clean_html_content(html: str) -> str:
    """Clean HTML content while preserving paragraph and line breaks.

    Operations:
    - Remove <script> and <style> blocks.
    - Convert <p> and <br> tags to newline characters.
    - Strip all other HTML tags.
    - Normalize whitespace and collapse excess blank lines.

    Parameters
    ----------
    html : str
        Raw HTML string extracted from the EPUB.

    Returns
    -------
    str
        Cleaned text with paragraph breaks maintained.
    """
    # Remove <script> and <style> blocks completely
    html = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", " ", html)

    # Convert <p> and <br> tags into newlines
    html = re.sub(r"(?i)<\s*(p|br)[^>]*>", "\n", html)

    # Remove all other HTML tags
    html = re.sub(r"(?s)<.*?>", " ", html)

    # Normalize newlines and spaces
    html = html.replace("\r\n", "\n").replace("\r", "\n")
    html = re.sub(r"[ \t]+", " ", html)       # collapse multiple spaces
    html = re.sub(r"\n{3,}", "\n\n", html)  # limit blank lines

    return html.strip()


def chunk_text_by_paragraph(text: str, max_length: int = 1400) -> List[str]:
    """Chunk text into semantically coherent segments by grouping paragraphs.

    Strategy:
    - Split on newline characters → get paragraphs.
    - Accumulate paragraphs into a buffer until max_length is reached.
    - Start a new chunk when the buffer would exceed max_length.

    Parameters
    ----------
    text : str
        Clean input text containing newline-separated paragraphs.
    max_length : int, optional
        Maximum number of characters per chunk, by default 1400.

    Returns
    -------
    List[str]
        List of chunk strings, each suitable for embedding.
    """
    if not text:
        return []

    # Split into non-empty paragraphs
    paragraphs = [p.strip() for p in text.split("\n") if p.strip()]
    chunks: List[str] = []
    buffer = ""

    for paragraph in paragraphs:
        if buffer and len(buffer) + 1 + len(paragraph) > max_length:
            # Flush current buffer as a chunk
            chunks.append(buffer.strip())
            buffer = paragraph
        else:
            # Append paragraph to buffer (with newline if not empty)
            buffer = (buffer + "\n" + paragraph).strip() if buffer else paragraph

    # Append any remaining text in the buffer
    if buffer:
        chunks.append(buffer.strip())

    return chunks


print("Text cleaning and chunking helpers defined.")

Text cleaning and chunking helpers defined.


In [8]:
"""
Step 3: Classification and academic metadata inference

These functions:
- Assign a useful category to each chunk (e.g., admissions, course_description).
- Infer program, college, degree, and level from filenames and content.
"""

def classify_chunk_category(text: str) -> str:
    """Classify a chunk of catalog text into a high-level category using
    simple keyword-based rules.
    """
    t = text.lower()

    # Admissions
    if "undergraduate admissions" in t or "admission requirements" in t or "apply for admission" in t:
        return "admissions_general"
    if "admitted to the accountancy program" in t or "admission to the accountancy major" in t:
        return "admissions_program_specific"

    # ESL / language-related content
    if "english proficiency" in t or "english as a second language" in t or "esl program" in t:
        return "esl_language"

    # Academic calendar / dates
    if "academic calendar" in t or "first day of the term" in t or "final exam period" in t:
        return "academic_calendar"

    # Financial / fees / scholarships
    if (
        "tuition" in t
        or "processing fee" in t
        or "assessment fee" in t
        or "financial aid" in t
        or "scholarship" in t
    ):
        return "financial_fees"

    # Conduct and policy
    if "code of conduct" in t or "sexual harassment" in t or "non-discrimination policy" in t:
        return "conduct_policy"
    if "grievance" in t and "formal complaint" in t:
        return "grievance_policy"
    if "satisfactory academic progress" in t or "academic probation" in t or "dismissal" in t:
        return "sap_probation_policy"

    # Registration / enrollment timelines
    if (
        "course registration opens" in t
        or "late course registration" in t
        or "last day - course drop" in t
    ):
        return "registration_calendar"
    if "course registration" in t and "portal" in t:
        return "registration_procedure"

    # Graduation requirements
    if (
        "total units required for the" in t
        or "total units required for the b.s." in t
        or "total units required for the degree" in t
    ):
        return "graduation_requirements"
    if "must obtain a grade of" in t and "order to graduate" in t:
        return "graduation_academic_rules"

    # Course descriptions and curriculum
    if (
        "course description" in t
        or "prerequisite:" in t
        or "this course aims to" in t
        or "this course examines" in t
    ):
        return "course_description"
    if "program learning outcomes" in t or "students receiving a bachelor of science in" in t:
        return "program_learning_outcomes"
    if "overview" in t and ("program" in t or "major" in t):
        return "program_overview"
    if (
        "upper division business core" in t
        or "general education units" in t
        or "requirements business majors" in t
    ):
        return "curriculum_structure"

    # Fallback category
    return "general_academic"


def infer_academic_metadata(section_name: str, first_chunk_text: str) -> Dict[str, str]:
    """Infer program, college, degree, and level from a section name and
    the first chunk of text from that section.

    This uses heuristics based on:
    - File names (e.g., accounting.xhtml → "Accounting").
    - Key phrases in the text (e.g., "Bachelor of Science", "Master of Science").

    Parameters
    ----------
    section_name : str
        Filename or logical section identifier.
    first_chunk_text : str
        First chunk of text from the section, used for context.

    Returns
    -------
    Dict[str, str]
        Dictionary with keys: 'program', 'college', 'degree', 'level'
        Values may be None if not inferable.
    """
    metadata: Dict[str, str] = {
        "program": None,
        "college": None,
        "degree": None,
        "level": None,
    }

    base = os.path.basename(section_name).lower()

    # Infer program from filename (EPUB sections like accounting.xhtml, biology.xhtml, etc.)
    if base.endswith((".xhtml", ".html", ".htm")):
        base_no_ext = re.sub(r"\.x?html?$", "", base, flags=re.I)
        if base_no_ext not in ("index", "toc", "frontmatter"):
            program_name = re.sub(r"[_\-]+", " ", base_no_ext).strip()
            if program_name:
                metadata["program"] = program_name.title()

    text_lower = (first_chunk_text or "").lower()

    # Degree and level detection
    if "bachelor of science" in text_lower or "b.s." in text_lower:
        metadata["degree"] = "B.S."
        metadata["level"] = "undergraduate"
    elif "bachelor of arts" in text_lower or "b.a." in text_lower:
        metadata["degree"] = "B.A."
        metadata["level"] = "undergraduate"
    elif "master of science" in text_lower or "m.s." in text_lower:
        metadata["degree"] = "M.S."
        metadata["level"] = "graduate"
    elif "master of professional accountancy" in text_lower or "mpacc" in text_lower:
        metadata["degree"] = "M.P.Acc."
        metadata["level"] = "graduate"

    # Example of college detection (CSUN Nazarian College)
    if "david nazarian college of business and economics" in text_lower:
        metadata["college"] = "David Nazarian College of Business and Economics"

    return metadata


print("Classification and metadata inference helpers defined.")

Classification and metadata inference helpers defined.


In [9]:
"""
Step 4: Process CSUN EPUB catalog

This function:
- Opens the CSUN catalog EPUB.
- Extracts and cleans HTML files.
- Chunks text by paragraphs.
- Infers academic metadata.
- Classifies each chunk.
- Deduplicates content across the global dataset.
"""

def extract_csun_catalog_chunks(
    epub_path: str,
    seen_normalized_text: Set[str],
    next_record_id: int
) -> Tuple[List[Dict], int]:
    """Extract cleaned, chunked, and annotated records from the CSUN EPUB catalog.

    Parameters
    ----------
    epub_path : str
        Path to the CSUN EPUB file.
    seen_normalized_text : Set[str]
        Set of normalized text strings used to deduplicate chunks globally.
    next_record_id : int
        Starting ID value for generated records.

    Returns
    -------
    Tuple[List[Dict], int]
        - List of record dictionaries for this catalog.
        - The next available record ID after processing.
    """
    records: List[Dict] = []

    if not os.path.exists(epub_path):
        print(f"[WARN] CSUN EPUB not found at: {epub_path}")
        return records, next_record_id

    with ZipFile(epub_path, "r") as zf:
        html_files = [
            name for name in zf.namelist()
            if name.lower().endswith((".xhtml", ".html", ".htm"))
        ]

        for section_name in sorted(html_files):
            try:
                raw_html = zf.read(section_name).decode("utf-8", errors="ignore")
            except Exception as exc:
                print(f"[WARN] Failed to read {section_name}: {exc}")
                continue

            cleaned_text = clean_html_content(raw_html)

            # Skip very small or navigation-like fragments
            if len(cleaned_text) < 300:
                continue

            chunks = chunk_text_by_paragraph(cleaned_text, max_length=1400)
            if not chunks:
                continue

            # Infer program metadata from the first chunk in this section
            section_metadata = infer_academic_metadata(section_name, chunks[0])

            for chunk_index, chunk_text in enumerate(chunks):
                normalized = re.sub(r"\s+", " ", chunk_text.lower()).strip()

                # Skip tiny or noisy chunks
                if len(normalized) < 100:
                    continue

                # Deduplicate across entire dataset
                if normalized in seen_normalized_text:
                    continue

                seen_normalized_text.add(normalized)

                record = {
                    "id": next_record_id,
                    "source_file": "csun_2025_2026_catalog",
                    "university": "California State University, Northridge",
                    "catalog_label": "2025-2026 University Catalog",
                    "section": section_name,
                    "section_chunk_index": chunk_index,
                    "category": classify_chunk_category(chunk_text),
                    "program": section_metadata.get("program"),
                    "college": section_metadata.get("college"),
                    "degree": section_metadata.get("degree"),
                    "level": section_metadata.get("level"),
                    "text": chunk_text,
                }

                records.append(record)
                next_record_id += 1

    print(f"Extracted {len(records)} chunks from CSUN catalog.")
    return records, next_record_id


print("CSUN EPUB processing function defined.")

CSUN EPUB processing function defined.


In [10]:
"""
Step 5: Process University of the People PDF catalog

This function:
- Reads each page of the UoPeople catalog updates PDF.
- Normalizes text and preserves paragraph structure.
- Chunks text by paragraphs.
- Infers metadata where possible.
- Classifies each chunk.
- Deduplicates content across the global dataset.
"""

def extract_uopeople_catalog_chunks(
    pdf_path: str,
    seen_normalized_text: Set[str],
    next_record_id: int
) -> Tuple[List[Dict], int]:
    """Extract cleaned, chunked, and annotated records from the UoPeople PDF catalog.

    Parameters
    ----------
    pdf_path : str
        Path to the UoPeople PDF file.
    seen_normalized_text : Set[str]
        Set of normalized text strings used to deduplicate chunks globally.
    next_record_id : int
        Starting ID value for generated records.

    Returns
    -------
    Tuple[List[Dict], int]
        - List of record dictionaries for this catalog.
        - The next available record ID after processing.
    """
    records: List[Dict] = []

    if not os.path.exists(pdf_path):
        print(f"[WARN] UoPeople PDF not found at: {pdf_path}")
        return records, next_record_id

    reader = PdfReader(pdf_path)

    for page_index, page in enumerate(reader.pages):
        try:
            page_text = page.extract_text() or ""
        except Exception as exc:
            print(f"[WARN] Failed to extract text from page {page_index + 1}: {exc}")
            continue

        # Normalize whitespace and newlines
        page_text = page_text.replace("\r\n", "\n").replace("\r", "\n")
        page_text = re.sub(r"[ \t]+", " ", page_text)
        page_text = re.sub(r"\n{3,}", "\n\n", page_text)

        if len(page_text) < 300:
            # Skip pages with very little content
            continue

        chunks = chunk_text_by_paragraph(page_text, max_length=1400)
        if not chunks:
            continue

        section_name = f"page_{page_index + 1}"
        section_metadata = infer_academic_metadata(section_name, chunks[0])

        for chunk_index, chunk_text in enumerate(chunks):
            normalized = re.sub(r"\s+", " ", chunk_text.lower()).strip()

            if len(normalized) < 100:
                continue

            if normalized in seen_normalized_text:
                continue

            seen_normalized_text.add(normalized)

            record = {
                "id": next_record_id,
                "source_file": "uopeople_ug_catalog_updates_2024_2025",
                "university": "University of the People",
                "catalog_label": "2025-26 Undergraduate Catalog (Updates)",
                "section": section_name,
                "section_chunk_index": chunk_index,
                "category": classify_chunk_category(chunk_text),
                "program": section_metadata.get("program"),
                "college": section_metadata.get("college"),
                "degree": section_metadata.get("degree"),
                "level": section_metadata.get("level"),
                "text": chunk_text,
            }

            records.append(record)
            next_record_id += 1

    print(f"Extracted {len(records)} chunks from UoPeople catalog.")
    return records, next_record_id


print("UoPeople PDF processing function defined.")

UoPeople PDF processing function defined.


In [11]:
"""
Step 6: Build the full academic advisor RAG dataset

This function:
- Initializes a global deduplication set.
- Runs the CSUN EPUB and UoPeople PDF extractors.
- Combines all records into a single list.
- Saves the result as a JSON file.

We refer to this as the *dataset build pipeline*.
"""

def run_dataset_build_pipeline() -> List[Dict]:
    """Run the dataset build pipeline for the academic advisor RAG dataset.

    Returns
    -------
    List[Dict]
        List of all records that were generated and saved to JSON.
    """
    all_records: List[Dict] = []
    seen_normalized_text: Set[str] = set()
    next_record_id: int = 1

    # Process CSUN catalog (EPUB)
    csun_records, next_record_id = extract_csun_catalog_chunks(
        epub_path=CSUN_EPUB_PATH,
        seen_normalized_text=seen_normalized_text,
        next_record_id=next_record_id,
    )
    all_records.extend(csun_records)

    # Process UoPeople catalog (PDF)
    uopeople_records, next_record_id = extract_uopeople_catalog_chunks(
        pdf_path=UOPEOPLE_PDF_PATH,
        seen_normalized_text=seen_normalized_text,
        next_record_id=next_record_id,
    )
    all_records.extend(uopeople_records)

    print(f"Total combined chunks: {len(all_records)}")

    # Save to JSON file
    with open(OUTPUT_JSON_PATH, "w", encoding="utf-8") as output_file:
        json.dump(all_records, output_file, ensure_ascii=False, indent=2)

    print(f"Dataset written to: {OUTPUT_JSON_PATH}")
    return all_records


# Run the pipeline
academic_advisor_records = run_dataset_build_pipeline()

Extracted 4597 chunks from CSUN catalog.
Extracted 454 chunks from UoPeople catalog.
Total combined chunks: 5051
Dataset written to: /content/drive/MyDrive/DAB_RAG_ZakyProject/data/academic_advisor_rag_dataset.json


In [12]:
"""
Step 7: Inspect a few sample records

This is helpful in the project report to demonstrate
what the final RAG-ready entries look like.
"""

print(f"Number of records: {len(academic_advisor_records)}")

# Show a few sample records
for record in academic_advisor_records[:5]:
    print("-" * 80)
    print(f"ID:          {record['id']}")
    print(f"University:  {record['university']}")
    print(f"Source file: {record['source_file']}")
    print(f"Section:     {record['section']}")
    print(f"Chunk idx:   {record['section_chunk_index']}")
    print(f"Category:    {record['category']}")
    print(f"Program:     {record['program']}")
    print(f"Degree:      {record['degree']}")
    print(f"Level:       {record['level']}")
    print("Text preview:")
    print(record["text"][:400], "...")

Number of records: 5051
--------------------------------------------------------------------------------
ID:          1
University:  California State University, Northridge
Source file: csun_2025_2026_catalog
Section:     OEBPS/accounting.xhtml
Chunk idx:   0
Category:    general_academic
Program:     Accounting
Degree:      B.S.
Level:       undergraduate
Text preview:
Accounting
Accounting
David Nazarian College of Business and Economics
Department of Accounting
Chair: Rishma Vedd
Bookstein Hall (BB) 3123
(818) 677-2461
Master of Professional Accountancy
Director: Rafael Efrat
Bookstein Hall (BB) 3123
(818) 677-2461
Master of Science in Taxation
Bookstein Chair in Taxation: Rafael Efrat
Bookstein Hall (BB) 3123
(818) 677-5488
EY Center for Careers in Accounting ...
--------------------------------------------------------------------------------
ID:          2
University:  California State University, Northridge
Source file: csun_2025_2026_catalog
Section:     OEBPS/accounting.xhtml
C