In [1]:
import os
import re
import json
import fitz  # PyMuPDF
import openai
import pandas as pd
import tkinter as tk
from tkinter import filedialog

# Set your OpenAI API key
openai.api_key = "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"

# Paths to the CSV files containing course information
graduate_csv_path = '/Users/buluttok/UNCW_Core_Courses.csv'
undergraduate_csv_path = '/Users/buluttok/uncw_cs_courses.csv'

# Initialize context management
context_history = []

# Function to add context
def add_to_context(role, content):
    context_history.append({"role": role, "content": content})

# Function to get the context for the prompt
def get_context():
    return context_history[-10:]  # Last 10 interactions to keep the context relevant

# Function to clear context
def clear_context():
    global context_history
    context_history = []

# Input sanitization function
def sanitize_input(user_input):
    """
    Sanitize user input to remove potentially dangerous characters.
    This simple approach allows alphanumerics, whitespace, and certain punctuation.
    """
    allowed_pattern = re.compile(r'[^\w\s\-\.,:;\/\\\(\)!?]')
    sanitized = allowed_pattern.sub('', user_input)
    return sanitized.strip()

class SammySeaHawk:
    def __init__(self):
        self.core_classes_info = {}

    def load_all_courses(self, csv_paths):
        for csv_path in csv_paths:
            course_data = self.read_course_info_from_csv(csv_path)
            self.core_classes_info.update(course_data)

    def read_course_info_from_csv(self, csv_path):
        try:
            df = pd.read_csv(csv_path)
            print("CSV Columns:", df.columns)
            # Identify column names dynamically
            course_code_col = 'Course Code'
            name_col = 'Course Title' if 'Course Title' in df.columns else 'Class Name'
            credits_col = 'Credit Hours' if 'Credit Hours' in df.columns else 'Class Credits'
            prerequisites_col = 'Prerequisite Courses' if 'Prerequisite Courses' in df.columns else 'Prerequisites'
            max_repeatability_col = 'Course Repeatability' if 'Course Repeatability' in df.columns else 'Max Repeatability'
            
            course_dict = {}
            for _, row in df.iterrows():
                course_code = row[course_code_col]
                course_dict[course_code] = {
                    'course_code': course_code,
                    'name': row[name_col],
                    'credits': row[credits_col],
                    'prerequisites': row[prerequisites_col],
                    'max_repeatability': row.get(max_repeatability_col, 'Not specified'),
                    'description': row.get('Course Description', 'Description not available')
                }
            print(f"Loaded {len(course_dict)} courses from CSV: {csv_path}")
            return course_dict
        except KeyError as e:
            print(f"Error reading CSV file: {e}")
            return {}
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            return {}

    def pdf_to_json(self, pdf_path):
        pdf_document = fitz.open(pdf_path)
        pdf_content = {}
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            text = page.get_text("text")
            pdf_content[f"Page_{page_num + 1}"] = text
        return json.dumps(pdf_content, ensure_ascii=False, indent=4)

    def extract_student_info(self, json_obj):
        student_info = {'name': 'N/A', 'department': 'N/A', 'advisor': 'N/A', 'gpa': 'N/A'}
        content = " ".join(json_obj.values())
        patterns = {
            'name': re.compile(r"Student name[:\s]*([^\n]+)", re.IGNORECASE),
            'department': re.compile(r"(?:Plan description|Department)[:\s]*([^\n]+)", re.IGNORECASE),
            'advisor': re.compile(r"Primary Advisor[:\s]*([^\n]+)", re.IGNORECASE),
            'gpa': re.compile(r"\bGPA[:\s]*([\d\.]+)", re.IGNORECASE)
        }
        for key, pattern in patterns.items():
            match = pattern.search(content)
            if match:
                student_info[key] = match.group(1).strip()
        return student_info

    def extract_credits(self, json_obj):
        completed_classes, in_progress_classes = [], []
        completed_credits, in_progress_credits = 0, 0
        current_status = None
        for page, content in json_obj.items():
            lines = content.split('\n')
            for line in lines:
                if 'COMPLETE' in line and not re.search(r'\b[A-Z]{3}\s+\d{3}\b', line):
                    current_status = 'complete'
                elif 'IN-PROGRESS' in line:
                    current_status = 'in-progress'
                elif re.search(r'\b[A-Z]{3}\s+\d{3}\b', line):
                    course_info = re.split(r'\s{2,}', line.strip())
                    if len(course_info) >= 4:
                        course_code, course_title, grade, credits = course_info[:4]
                        try:
                            credits = float(credits.replace('(', '').replace(')', ''))
                        except ValueError:
                            continue
                        if current_status == 'complete':
                            completed_classes.append((course_code, course_title, grade, credits))
                            completed_credits += credits
                        elif current_status == 'in-progress':
                            in_progress_classes.append((course_code, course_title, credits))
                            in_progress_credits += credits
        add_to_context('completed_classes', completed_classes)
        return completed_classes, in_progress_classes, completed_credits, in_progress_credits

    def handle_query(self, student_info, json_data, user_query):
        # Normalize and sanitize the user query
        normalized_query = sanitize_input(user_query).lower()
        add_to_context("user", normalized_query)

        # Check for course codes in the query
        course_code_match = re.search(r'\b[A-Z]{3}\s*\d{3}\b', normalized_query)
        if course_code_match:
            course_code = course_code_match.group().replace(' ', '')
            course_info = self.core_classes_info.get(course_code)
            if course_info:
                response_context = self.check_prerequisites(course_info)
                add_to_context('assistant', response_context)
                return response_context

        if "completed courses" in normalized_query:
            # Attempt to retrieve completed classes from context
            completed_classes = None
            for entry in context_history:
                if entry.get("role") == "assistant" and isinstance(entry.get("content"), list):
                    completed_classes = entry.get("content")
                    break
            if completed_classes:
                response = "Here are the courses you have completed:\n"
                for course_code, course_title, grade, credits in completed_classes:
                    response += f"- {course_title} ({course_code}), Grade: {grade}, Credits: {credits}\n"
                add_to_context('assistant', response)
                return response

        if "elective courses" in normalized_query:
            completed_classes = None
            for entry in context_history:
                if entry.get("role") == "assistant" and isinstance(entry.get("content"), list):
                    completed_classes = entry.get("content")
                    break
            if completed_classes:
                electives = [
                    course for course in self.core_classes_info.values()
                    if 'elective' in course.get('description', '').lower()
                ]
                completed_electives = [
                    course for course in electives
                    if course['course_code'] in [c[0] for c in completed_classes]
                ]
                response = "Here are the elective courses you have completed:\n"
                for course in completed_electives:
                    response += f"- {course['name']} ({course['course_code']})\n"
                add_to_context('assistant', response)
                return response

        # Fallback to using OpenAI API if no relevant course code is found
        csv_data_str = "\n".join([
            f"{code}: {info['name']} ({info['credits']} credits), Prerequisites: {info['prerequisites']}"
            for code, info in self.core_classes_info.items()
        ])
        prompt = (
            f"Student info:\n\n{json_data}\n\n"
            f"Course information:\n\n{csv_data_str}\n\n"
            f"Please provide the relevant information for the following query: {user_query}"
        )
        full_response = ""
        while True:
            try:
                response = openai.ChatCompletion.create(
                    model="gpt-4o",
                    messages=[
                        {"role": "system", "content": "You are Sammy C. Hawk, a friendly, concise, and skillful academic advisor. You help students with course information, prerequisites, and evaluations based on provided data. You are super friendly and always give thoughtful course recommendations. You also answer follow-up questions and switch context if needed."},
                        {"role": "system", "content": "Always start your responses by acknowledging the student's concern or question in a friendly manner."},
                        {"role": "system", "content": "Manage context effectively by summarizing key points from previous talks when appropriate."},
                        {"role": "user", "content": prompt}
                    ],
                    max_tokens=1000
                )
            except Exception as e:
                full_response = f"An error occurred while processing your request: {e}"
                break

            response_text = response.choices[0].message['content'].strip()
            full_response += response_text
            # Check if response might be incomplete
            if not response_text.endswith(('and', ',', 'of', 'or', 'with')):
                break
            prompt = "Continue the previous response."

        add_to_context('assistant', full_response)
        return full_response

    def check_prerequisites(self, course_info):
        course_name = course_info.get('name')
        prerequisites = course_info.get('prerequisites')
        response = f"The course {course_name} has the following prerequisites: {prerequisites}."
        return response

    def generate_report(self, completed_classes, in_progress_classes, completed_credits, in_progress_credits):
        report = []
        if completed_classes:
            report.append("Completed Classes and Credits:")
            for cls, title, grade, credits in completed_classes:
                report.append(f"{cls} ({title}) - Grade: {grade}, Credits: {credits}")
            report.append(f"Total Completed Credits: {completed_credits}\n")
        if in_progress_classes:
            report.append("In-Progress Classes and Credits:")
            for cls, title, credits in in_progress_classes:
                report.append(f"{cls} ({title}) - Credits: {credits}")
            report.append(f"Total In-Progress Credits: {in_progress_credits}")
        return "\n".join(report)


#########################################################################
# MAIN CODE - Using a "Upload PDF" button (via Tkinter) to pick PDF file
#########################################################################

def main():
    chatbot = SammySeaHawk()
    print("Hey, I am Sammy SeaHawk! How can I help you today?")

    # Load CSV data
    csv_paths = [graduate_csv_path, undergraduate_csv_path]
    chatbot.load_all_courses(csv_paths)

    # We'll store the PDF path in a mutable dict for convenience
    # so the callback can set it.
    pdf_state = {"path": None}

    # --- Create a simple Tkinter window with "Upload PDF" button ---
    def on_upload_click():
        # Open a file dialog to select a PDF
        pdf_path = filedialog.askopenfilename(
            title="Select a PDF File",
            filetypes=[("PDF files", "*.pdf")]
        )
        if pdf_path:
            pdf_state["path"] = pdf_path
        root.destroy()  # Close the Tkinter window

    # Initialize Tkinter
    root = tk.Tk()
    root.title("Upload PDF")

    # Create and pack a button
    upload_btn = tk.Button(root, text="Upload PDF", command=on_upload_click)
    upload_btn.pack(padx=20, pady=20)

    # Run the Tkinter event loop (blocks until window is closed)
    root.mainloop()

    # After the window closes, check if we got a PDF path
    if not pdf_state["path"]:
        print("No PDF file selected. Exiting.")
        return

    # Sanitize and check the PDF file
    pdf_path = sanitize_input(pdf_state["path"])
    if not pdf_path.lower().endswith('.pdf'):
        print("Error: The selected file must be a PDF.")
        return
    if not os.path.isfile(pdf_path):
        print("Error: The specified file does not exist.")
        return

    # --- Now proceed in console/terminal after file selection ---
    json_data = chatbot.pdf_to_json(pdf_path)
    json_obj = json.loads(json_data)

    student_info = chatbot.extract_student_info(json_obj)
    print(f"\nHey {student_info['name'] if student_info['name'] != 'N/A' else 'there'}, here's what I found for you:")

    if student_info['department'] != 'N/A':
        print(f"Department: {student_info['department']}")
    if student_info['advisor'] != 'N/A':
        print(f"Advisor: {student_info['advisor']}")
    if student_info['gpa'] != 'N/A':
        print(f"GPA: {student_info['gpa']}\n")

    # Extract and display completed/in-progress classes
    completed_classes, in_progress_classes, completed_credits, in_progress_credits = chatbot.extract_credits(json_obj)
    report = chatbot.generate_report(completed_classes, in_progress_classes, completed_credits, in_progress_credits)
    if report:
        print(report)

    # Chat interaction loop (in console)
    while True:
        user_input = input("\nHow can I assist you further? (or type 'exit' to quit): ")
        if user_input.lower() == 'exit':
            print("Goodbye! Have a great day!")
            break
        elif user_input.lower() in ["clear context", "reset"]:
            clear_context()
            print("Context cleared.")
            continue

        response = chatbot.handle_query(student_info, json_data, user_input)
        add_to_context("user", user_input)
        add_to_context("assistant", response)
        print("\nResponse from Sammy SeaHawk:")
        print(response)

if __name__ == "__main__":
    main()


Hey, I am Sammy SeaHawk! How can I help you today?
CSV Columns: Index(['Course Code', 'Class Name', 'Class Credits', 'Prerequisites'], dtype='object')
Loaded 25 courses from CSV: /Users/buluttok/UNCW_Core_Courses.csv
CSV Columns: Index(['Course Code', 'Course Title', 'Credit Hours', 'Prerequisite Courses',
       'Corequisite Courses', 'Additional Restrictions/Requirements',
       'Course Repeatability', 'Maximum Repeatable Hours'],
      dtype='object')
Loaded 31 courses from CSV: /Users/buluttok/uncw_cs_courses.csv

Hey Tok, Bulut, here's what I found for you:
Advisor: Cem Canel
GPA: 3.784




How can I assist you further? (or type 'exit' to quit):  can you give me my elective courses from previous semesters?



Response from Sammy SeaHawk:
Hi there! Let's take a look at your elective courses from previous semesters. Based on your degree audit, here are the electives you've completed:

1. **CSC 502: Machine Learning Fundamentals**
   - Grade: A
   - Credits: 3
   - Term: Spring 2024

2. **CSC 592: Topics in Computing**
   - Grade: A
   - Credits: 3
   - Term: Fall 2023

3. **MIS 505: Data Visualization**
   - Grade: A
   - Credits: 3
   - Term: Fall 2023

4. **MIS 506: Text and Unstructured Data Analysis**
   - Grade: A
   - Credits: 3
   - Term: Spring 2024

These courses seem to be enriching your skills in various advanced topics. If you need help selecting future electives or have any other questions, feel free to ask!



How can I assist you further? (or type 'exit' to quit):  exit


Goodbye! Have a great day!


In [1]:
import os
import re
import json
import fitz  # PyMuPDF
import openai
import pandas as pd
import tkinter as tk
from tkinter import filedialog

# Set your OpenAI API key
openai.api_key = "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"

# Paths to the CSV files containing course information
graduate_csv_path = '/Users/buluttok/UNCW_Core_Courses.csv'
undergraduate_csv_path = '/Users/buluttok/uncw_cs_courses.csv'

# Initialize context management
context_history = []

# Function to add context
def add_to_context(role, content):
    context_history.append({"role": role, "content": content})

# Function to get the context for the prompt
def get_context():
    return context_history[-10:]  # Last 10 interactions to keep the context relevant

# Function to clear context
def clear_context():
    global context_history
    context_history = []

# Input sanitization function
def sanitize_input(user_input):
    """
    Sanitize user input to remove potentially dangerous characters.
    This simple approach allows alphanumerics, whitespace, and common punctuation.
    """
    allowed_pattern = re.compile(r'[^\w\s\-\.,:;\/\\\(\)!?]')
    sanitized = allowed_pattern.sub('', user_input)
    return sanitized.strip()

# Note: When implementing database queries in the future,
# always use parameterized queries or an ORM to protect against SQL injection.
# For example, with sqlite3:
#     cursor.execute("SELECT * FROM courses WHERE course_code = ?", (user_input,))

class SammySeaHawk:
    def __init__(self):
        self.core_classes_info = {}

    def load_all_courses(self, csv_paths):
        for csv_path in csv_paths:
            course_data = self.read_course_info_from_csv(csv_path)
            self.core_classes_info.update(course_data)

    def read_course_info_from_csv(self, csv_path):
        try:
            df = pd.read_csv(csv_path)
            print("CSV Columns:", df.columns)
            # Identify column names dynamically
            course_code_col = 'Course Code'
            name_col = 'Course Title' if 'Course Title' in df.columns else 'Class Name'
            credits_col = 'Credit Hours' if 'Credit Hours' in df.columns else 'Class Credits'
            prerequisites_col = 'Prerequisite Courses' if 'Prerequisite Courses' in df.columns else 'Prerequisites'
            max_repeatability_col = 'Course Repeatability' if 'Course Repeatability' in df.columns else 'Max Repeatability'
            
            course_dict = {}
            for _, row in df.iterrows():
                course_code = row[course_code_col]
                course_dict[course_code] = {
                    'course_code': course_code,
                    'name': row[name_col],
                    'credits': row[credits_col],
                    'prerequisites': row[prerequisites_col],
                    'max_repeatability': row.get(max_repeatability_col, 'Not specified'),
                    'description': row.get('Course Description', 'Description not available')
                }
            print(f"Loaded {len(course_dict)} courses from CSV: {csv_path}")
            return course_dict
        except KeyError as e:
            print(f"Error reading CSV file: {e}")
            return {}
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            return {}

    def pdf_to_json(self, pdf_path):
        pdf_document = fitz.open(pdf_path)
        pdf_content = {}
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            text = page.get_text("text")
            pdf_content[f"Page_{page_num + 1}"] = text
        return json.dumps(pdf_content, ensure_ascii=False, indent=4)

    def extract_student_info(self, json_obj):
        student_info = {'name': 'N/A', 'department': 'N/A', 'advisor': 'N/A', 'gpa': 'N/A'}
        content = " ".join(json_obj.values())
        patterns = {
            'name': re.compile(r"Student name[:\s]*([^\n]+)", re.IGNORECASE),
            'department': re.compile(r"(?:Plan description|Department)[:\s]*([^\n]+)", re.IGNORECASE),
            'advisor': re.compile(r"Primary Advisor[:\s]*([^\n]+)", re.IGNORECASE),
            'gpa': re.compile(r"\bGPA[:\s]*([\d\.]+)", re.IGNORECASE)
        }
        for key, pattern in patterns.items():
            match = pattern.search(content)
            if match:
                student_info[key] = match.group(1).strip()
        return student_info

    def extract_credits(self, json_obj):
        completed_classes, in_progress_classes = [], []
        completed_credits, in_progress_credits = 0, 0
        current_status = None
        for page, content in json_obj.items():
            lines = content.split('\n')
            for line in lines:
                if 'COMPLETE' in line and not re.search(r'\b[A-Z]{3}\s+\d{3}\b', line):
                    current_status = 'complete'
                elif 'IN-PROGRESS' in line:
                    current_status = 'in-progress'
                elif re.search(r'\b[A-Z]{3}\s+\d{3}\b', line):
                    course_info = re.split(r'\s{2,}', line.strip())
                    if len(course_info) >= 4:
                        course_code, course_title, grade, credits = course_info[:4]
                        try:
                            credits = float(credits.replace('(', '').replace(')', ''))
                        except ValueError:
                            continue
                        if current_status == 'complete':
                            completed_classes.append((course_code, course_title, grade, credits))
                            completed_credits += credits
                        elif current_status == 'in-progress':
                            in_progress_classes.append((course_code, course_title, credits))
                            in_progress_credits += credits
        add_to_context('completed_classes', completed_classes)
        return completed_classes, in_progress_classes, completed_credits, in_progress_credits

    def handle_query(self, student_info, json_data, user_query):
        # Normalize and sanitize the user query
        normalized_query = sanitize_input(user_query).lower()
        add_to_context("user", normalized_query)

        # Check for course codes in the query (existing logic)
        course_code_match = re.search(r'\b[A-Z]{3}\s*\d{3}\b', normalized_query)
        if course_code_match:
            course_code = course_code_match.group().replace(' ', '')
            course_info = self.core_classes_info.get(course_code)
            if course_info:
                response_context = self.check_prerequisites(course_info)
                add_to_context('assistant', response_context)
                return response_context

        if "completed courses" in normalized_query:
            completed_classes = None
            for entry in context_history:
                if entry.get("role") == "assistant" and isinstance(entry.get("content"), list):
                    completed_classes = entry.get("content")
                    break
            if completed_classes:
                response = "Here are the courses you have completed:\n"
                for course_code, course_title, grade, credits in completed_classes:
                    response += f"- {course_title} ({course_code}), Grade: {grade}, Credits: {credits}\n"
                add_to_context('assistant', response)
                return response

        if "elective courses" in normalized_query:
            completed_classes = None
            for entry in context_history:
                if entry.get("role") == "assistant" and isinstance(entry.get("content"), list):
                    completed_classes = entry.get("content")
                    break
            if completed_classes:
                electives = [
                    course for course in self.core_classes_info.values()
                    if 'elective' in course.get('description', '').lower()
                ]
                completed_electives = [
                    course for course in electives
                    if course['course_code'] in [c[0] for c in completed_classes]
                ]
                response = "Here are the elective courses you have completed:\n"
                for course in completed_electives:
                    response += f"- {course['name']} ({course['course_code']})\n"
                add_to_context('assistant', response)
                return response

        # Process CSV data from courses for inclusion in the prompt.
        csv_data_str = "\n".join([
            f"{code}: {info['name']} ({info['credits']} credits), Prerequisites: {info['prerequisites']}"
            for code, info in self.core_classes_info.items()
        ])

        # Build a prompt that includes the JSON (PDF) data and CSV data.
        prompt = (
            f"Student info:\n\n{json_data}\n\n"
            f"Course information:\n\n{csv_data_str}\n\n"
            f"Please provide the relevant information for the following query: {user_query}"
        )

        # Build the full list of messages including your system instructions.
        system_messages = [
            {"role": "system", "content": "You are Sammy C. Hawk, a friendly, concise, and skillful academic advisor. You help students with course information, prerequisites, and evaluations based on provided data. You are super friendly and always give thoughtful course recommendations to students. You also are able to answer follow-up questions and switch the context if needed."},
            {"role": "system", "content": "Always start your responses by acknowledging the student's concern or question in a friendly manner."},
            {"role": "system", "content": "Provide clear and concise information about course prerequisites, ensuring the student understands what is required before taking the course."},
            {"role": "system", "content": "When giving course recommendations, consider the student's academic history, future goals, and any provided preferences."},
            {"role": "system", "content": "If a student asks about workload, provide an honest assessment and suggest ways to manage their time effectively."},
            {"role": "system", "content": "Encourage students to reach out if they have more questions or need further clarification, promoting an open line of communication."},
            {"role": "system", "content": "If a student asks for advice on balancing multiple commitments, offer practical suggestions and prioritize their well-being."},
            {"role": "system", "content": "If a student is unsure about their major or course selection, help them explore their interests and suggest related courses or resources."},
            {"role": "system", "content": "Always be patient and supportive, especially when students are stressed or confused about their academic path."},
            {"role": "system", "content": "When a student asks about switching majors or courses, provide a step-by-step guide on how to do so and any implications this may have."},
            {"role": "system", "content": "If the student is excelling in their courses, suggest advanced courses or opportunities like internships or research projects that align with their goals."},
            {"role": "system", "content": "Always provide relevant answers based on the student's current question, and make sure to consider the context of previous conversations to maintain coherence and continuity."},
            {"role": "system", "content": "Manage context effectively by summarizing key points from previous talks when appropriate, and use this context to provide tailored advice and recommendations."},
            {"role": "system", "content": "In cases where the conversation might shift topics, smoothly transition by acknowledging the change and addressing the new topic while maintaining awareness of the previous context."},
            {"role": "system", "content": "You excel at recognizing and responding to follow-up questions, seamlessly maintaining context from previous interactions. If a conversation shifts topics, you skillfully adjust by acknowledging the change and providing relevant information for the new topic, while still considering earlier discussions. This ensures continuity and coherence in all your responses."},
            {"role": "system", "content": "If the student has had previous interactions, briefly refer to them to show continuity and deepen the sense of a personalized advising experience."}
        ]
        messages = system_messages + [{"role": "user", "content": prompt}]

        full_response = ""
        while True:
            try:
                response = openai.ChatCompletion.create(
                    model="gpt-4o",
                    messages=messages,
                    max_tokens=1000
                )
            except Exception as e:
                full_response = f"An error occurred while processing your request: {e}"
                break

            response_text = response.choices[0].message['content'].strip()
            full_response += response_text
            # If the response doesn't seem complete, continue.
            if not response_text.endswith(('and', ',', 'of', 'or', 'with')):
                break
            messages.append({"role": "assistant", "content": response_text})
            messages.append({"role": "user", "content": "Continue the previous response."})

        add_to_context('assistant', full_response)
        return full_response

    def check_prerequisites(self, course_info):
        course_name = course_info.get('name')
        prerequisites = course_info.get('prerequisites')
        response = f"The course {course_name} has the following prerequisites: {prerequisites}."
        return response

    def generate_report(self, completed_classes, in_progress_classes, completed_credits, in_progress_credits):
        report = []
        if completed_classes:
            report.append("Completed Classes and Credits:")
            for cls, title, grade, credits in completed_classes:
                report.append(f"{cls} ({title}) - Grade: {grade}, Credits: {credits}")
            report.append(f"Total Completed Credits: {completed_credits}\n")
        if in_progress_classes:
            report.append("In-Progress Classes and Credits:")
            for cls, title, credits in in_progress_classes:
                report.append(f"{cls} ({title}) - Credits: {credits}")
            report.append(f"Total In-Progress Credits: {in_progress_credits}")
        return "\n".join(report)

#########################################################################
# MAIN CODE: Using a Tkinter "Upload PDF" button to get user input
#########################################################################

def main():
    chatbot = SammySeaHawk()
    print("Hey, I am Sammy SeaHawk! How can I help you today?")

    # Load CSV data
    csv_paths = [graduate_csv_path, undergraduate_csv_path]
    chatbot.load_all_courses(csv_paths)

    # Use Tkinter to create an "Upload PDF" button
    pdf_state = {"path": None}
    def on_upload_click():
        pdf_path = filedialog.askopenfilename(
            title="Select a PDF File",
            filetypes=[("PDF files", "*.pdf")]
        )
        if pdf_path:
            pdf_state["path"] = pdf_path
        root.destroy()  # Close the window after selection

    root = tk.Tk()
    root.title("Upload PDF")
    upload_btn = tk.Button(root, text="Upload PDF", command=on_upload_click)
    upload_btn.pack(padx=20, pady=20)
    root.mainloop()

    if not pdf_state["path"]:
        print("No PDF file selected. Exiting.")
        return

    pdf_path = sanitize_input(pdf_state["path"])
    if not pdf_path.lower().endswith('.pdf'):
        print("Error: The selected file must be a PDF.")
        return
    if not os.path.isfile(pdf_path):
        print("Error: The specified file does not exist.")
        return

    # Process the PDF file
    json_data = chatbot.pdf_to_json(pdf_path)
    json_obj = json.loads(json_data)

    student_info = chatbot.extract_student_info(json_obj)
    print(f"\nHey {student_info['name'] if student_info['name'] != 'N/A' else 'there'}, here's what I found for you:")
    if student_info['department'] != 'N/A':
        print(f"Department: {student_info['department']}")
    if student_info['advisor'] != 'N/A':
        print(f"Advisor: {student_info['advisor']}")
    if student_info['gpa'] != 'N/A':
        print(f"GPA: {student_info['gpa']}\n")

    completed_classes, in_progress_classes, completed_credits, in_progress_credits = chatbot.extract_credits(json_obj)
    report = chatbot.generate_report(completed_classes, in_progress_classes, completed_credits, in_progress_credits)
    if report:
        print(report)

    # Chat interaction loop
    while True:
        user_input = input("\nHow can I assist you further? (or type 'exit' to quit): ")
        if user_input.lower() == 'exit':
            print("Goodbye! Have a great day!")
            break
        elif user_input.lower() in ["clear context", "reset"]:
            clear_context()
            print("Context cleared.")
            continue

        response = chatbot.handle_query(student_info, json_data, user_input)
        add_to_context("user", user_input)
        add_to_context("assistant", response)
        print("\nResponse from Sammy SeaHawk:")
        print(response)

if __name__ == "__main__":
    main()


Hey, I am Sammy SeaHawk! How can I help you today?
CSV Columns: Index(['Course Code', 'Class Name', 'Class Credits', 'Prerequisites'], dtype='object')
Loaded 25 courses from CSV: /Users/buluttok/UNCW_Core_Courses.csv
CSV Columns: Index(['Course Code', 'Course Title', 'Credit Hours', 'Prerequisite Courses',
       'Corequisite Courses', 'Additional Restrictions/Requirements',
       'Course Repeatability', 'Maximum Repeatable Hours'],
      dtype='object')
Loaded 31 courses from CSV: /Users/buluttok/uncw_cs_courses.csv

Hey Tok, Bulut, here's what I found for you:
Advisor: Cem Canel
GPA: 3.784




How can I assist you further? (or type 'exit' to quit):  can you give me my elective courses?



Response from Sammy SeaHawk:
Hi Bulut! It's great to see your progress towards your Master of Science in Computer Science and Information Systems at UNCW. Let's take a look at your elective courses.

You've completed the following electives:

1. **CSC 502: Machine Learning Fundamentals** - Grade: A
2. **CSC 592: Topics in Computing** - Grade: A
3. **MIS 505: Data Visualization** - Grade: A
4. **MIS 506: Text and Unstructured Data Analysis** - Grade: A

You're doing an excellent job in your courses with such strong grades! If you're looking for more electives or advice on future courses, feel free to let me know your areas of interest or your career goals, and I can make further recommendations. Keep up the great work, and don't hesitate to reach out if you have more questions!



How can I assist you further? (or type 'exit' to quit):  exit


Goodbye! Have a great day!


In [1]:
import fitz  # PyMuPDF
import json
import openai
import re
import pandas as pd
import mysql.connector
from mysql.connector import Error

# Set your OpenAI API key
openai.api_key =  "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"
# Paths to the CSV files containing course information
graduate_csv_path = 'UNCW_Core_Courses.csv'
undergraduate_csv_path = 'uncw_cs_courses.csv'

# Initialize session storage for context 
sessions = {}

class SammySeaHawk:
    def __init__(self):
        self.core_classes_info = {}
        # Database connection
        self.connection = self.connect_to_database()

    def connect_to_database(self):
        "Establish connection to the MySQL database."
        try:
            connection = mysql.connector.connect(
                host='127.0.0.1', 
                port=3306,  
                database='Memory', 
                user='root', 
                password='Alexalex98'  
            )
            if connection.is_connected():
                print("Connected to MySQL database")
            return connection
        except Error as e:
            print(f"Error while connecting to MySQL: {e}")
            return None

    def save_session_to_db(self, session_id, user_query, assistant_response):
        """Save the session data to the MySQL database."""
        if self.connection:
            try:
                cursor = self.connection.cursor()
                query = """
                    INSERT INTO memory (session_id, query, response) 
                    VALUES (%s, %s, %s)
                """
                cursor.execute(query, (session_id, user_query, assistant_response))
                self.connection.commit()
                cursor.close()
            except Error as e:
                print(f"Error saving to MySQL: {e}")

    def load_all_courses(self, csv_paths):
        for csv_path in csv_paths:
            course_data = self.read_course_info_from_csv(csv_path)
            self.core_classes_info.update(course_data)

    def read_course_info_from_csv(self, csv_path):
        try:
            df = pd.read_csv(csv_path)
            course_code_col = 'Course Code'
            name_col = 'Course Title' if 'Course Title' in df.columns else 'Class Name'
            credits_col = 'Credit Hours' if 'Credit Hours' in df.columns else 'Class Credits'
            prerequisites_col = 'Prerequisite Courses' if 'Prerequisite Courses' in df.columns else 'Prerequisites'
            max_repeatability_col = 'Course Repeatability' if 'Course Repeatability' in df.columns else 'Max Repeatability'
            
            course_dict = {}
            for _, row in df.iterrows():
                course_code = row[course_code_col]
                course_dict[course_code] = {
                    'course_code': course_code,
                    'name': row[name_col],
                    'credits': row[credits_col],
                    'prerequisites': row[prerequisites_col],
                    'max_repeatability': row.get(max_repeatability_col, 'Not specified'),
                    'description': row.get('Course Description', 'Description not available')
                }
            return course_dict
        except KeyError as e:
            print(f"Error reading CSV file: {e}")
            return {}
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            return {}

    def pdf_to_json(self, pdf_path):
        pdf_document = fitz.open(pdf_path)
        pdf_content = {}
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            text = page.get_text("text")
            pdf_content[f"Page_{page_num + 1}"] = text
        return json.dumps(pdf_content, ensure_ascii=False, indent=4)

    def extract_student_info(self, json_obj):
        student_info = {'name': 'N/A', 'department': 'N/A', 'advisor': 'N/A', 'gpa': 'N/A'}
        content = " ".join(json_obj.values())
        patterns = {
            'name': re.compile(r"Student name[:\s]*([^\n]+)", re.IGNORECASE),
            'department': re.compile(r"(?:Plan description|Department)[:\s]*([^\n]+)", re.IGNORECASE),
            'advisor': re.compile(r"Primary Advisor[:\s]*([^\n]+)", re.IGNORECASE),
            'gpa': re.compile(r"\bGPA[:\s]*([\d\.]+)", re.IGNORECASE)
        }
        for key, pattern in patterns.items():
            match = pattern.search(content)
            if match:
                student_info[key] = match.group(1).strip()
        return student_info

    def extract_credits(self, json_obj):
        completed_classes, in_progress_classes = [], []
        completed_credits, in_progress_credits = 0, 0
        current_status = None
        for page, content in json_obj.items():
            lines = content.split('\n')
            for line in lines:
                if 'COMPLETE' in line and not re.search(r'\b[A-Z]{3}\s+\d{3}\b', line):
                    current_status = 'complete'
                elif 'IN-PROGRESS' in line:
                    current_status = 'in-progress'
                elif re.search(r'\b[A-Z]{3}\s+\d{3}\b', line):
                    course_info = re.split(r'\s{2,}', line.strip())
                    if len(course_info) >= 4:
                        course_code, course_title, grade, credits = course_info[:4]
                        try:
                            credits = float(credits.replace('(', '').replace(')', ''))
                        except ValueError:
                            continue
                        if current_status == 'complete':
                            completed_classes.append((course_code, course_title, grade, credits))
                            completed_credits += credits
                        elif current_status == 'in-progress':
                            in_progress_classes.append((course_code, course_title, credits))
                            in_progress_credits += credits
        return completed_classes, in_progress_classes, completed_credits, in_progress_credits

    def generate_response(self, session_id, user_query, student_info, json_data):
        if session_id not in sessions:
            sessions[session_id] = []

        # Add user query to the conversation history
        sessions[session_id].append({"role": "user", "content": user_query})

        # Prepare the conversation history for the GPT-4 API
        messages = sessions[session_id]

        # Add relevant system messages and prompts
        csv_data_str = "\n".join([
            f"{code}: {info['name']} ({info['credits']} credits), Prerequisites: {info['prerequisites']}"
            for code, info in self.core_classes_info.items()
        ])
        system_messages = [
            {"role": "system", "content": "You are Advisor SeaHawk, a very friendly, concise, and skillful academic advisor. You help students with course information, prerequisites, and evaluations based on provided data. You are super friendly and always give thoughtful course recommendations to students. You also are able to answer follow-up questions and switch the context if needed."},
            {"role": "system", "content": "Always provide relevant answers based on the student's current question."},
            {"role": "system", "content": "If the student asks for advice on balancing multiple commitments, offer practical suggestions and prioritize their well-being. if not, do not include in the answer."},
            {"role": "user", "content": f"Student info:\n\n{json_data}\n\nCourse information:\n\n{csv_data_str}\n\n"}
        ]
        messages = system_messages + messages

        # Generate the response from GPT-4
        response = openai.ChatCompletion.create(
            model="gpt-4o",
            messages=messages,
            max_tokens=1000,
            temperature=0.7,
            n=1,
            stop=None,
            top_p=0.85,
            frequency_penalty=0.5        
        )

        # Extract the assistant's response
        assistant_message = response.choices[0].message['content']

        # Clean the response to remove markdown formatting
        assistant_message = self.clean_text(assistant_message)

        # Add the assistant's response to the conversation history
        sessions[session_id].append({"role": "assistant", "content": assistant_message})

        # Save the session data to the database
        self.save_session_to_db(session_id, user_query, assistant_message)

        return assistant_message

    def clean_text(self, response):
        # Replace markdown formatting symbols like ** with empty strings
        return response.replace("**", "")

    def generate_report(self, completed_classes, in_progress_classes, completed_credits, in_progress_credits):
        report = []
        if completed_classes:
            report.append("Completed Classes and Credits:")
            for cls, title, grade, credits in completed_classes:
                report.append(f"{cls} ({title}) - Grade: {grade}, Credits: {credits}")
            report.append(f"Total Completed Credits: {completed_credits}\n")

        if in_progress_classes:
            report.append("In-Progress Classes and Credits:")
            for cls, title, credits in in_progress_classes:
                report.append(f"{cls} ({title}) - Credits: {credits}")
            report.append(f"Total In-Progress Credits: {in_progress_credits}")

        return "\n".join(report)

def main():
    # Initialize the chatbot and load course data
    chatbot = SammySeaHawk()
    csv_paths = [graduate_csv_path, undergraduate_csv_path]
    chatbot.load_all_courses(csv_paths)

    # Prompt the user for a PDF file path
    pdf_path = input("Enter the path to the PDF file: ")
    try:
        json_data_str = chatbot.pdf_to_json(pdf_path)
        json_obj = json.loads(json_data_str)
    except Exception as e:
        print(f"Error processing PDF: {e}")
        return

    # Extract student information from the PDF content
    student_info = chatbot.extract_student_info(json_obj)
    print("\nExtracted Student Information:")
    for key, value in student_info.items():
        print(f"{key}: {value}")

    # Generate and display a credits report
    completed_classes, in_progress_classes, completed_credits, in_progress_credits = chatbot.extract_credits(json_obj)
    report = chatbot.generate_report(completed_classes, in_progress_classes, completed_credits, in_progress_credits)
    print("\nCredits Report:")
    print(report)

    # Interactive chat session with Advisor SeaHawk
    print("\nEnter your questions to Advisor SeaHawk (type 'exit' to quit):")
    session_id = "unique_session_id"  # Use a constant session ID for this example
    while True:
        user_query = input("\nYour question: ")
        if user_query.lower() == 'exit':
            break
        response = chatbot.generate_response(session_id, user_query, student_info, json_data_str)
        print(f"\nAdvisor SeaHawk: {response}")

if __name__ == "__main__":
    main()


Connected to MySQL database


Enter the path to the PDF file:  exit


Error processing PDF: no such file: 'exit'


In [2]:
import fitz  # PyMuPDF
import json
import openai
import re
import pandas as pd
import mysql.connector
from mysql.connector import Error
import tkinter as tk
from tkinter import filedialog

# Set your OpenAI API key
openai.api_key = "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"

# Paths to the CSV files containing course information
graduate_csv_path = 'UNCW_Core_Courses.csv'
undergraduate_csv_path = 'uncw_cs_courses.csv'

# Initialize session storage for context 
sessions = {}

class SammySeaHawk:
    def __init__(self):
        self.core_classes_info = {}
        # Database connection
        self.connection = self.connect_to_database()

    def connect_to_database(self):
        "Establish connection to the MySQL database."
        try:
            connection = mysql.connector.connect(
                host='127.0.0.1', 
                port=3306,  
                database='Memory', 
                user='root', 
                password='Alexalex98'  
            )
            if connection.is_connected():
                print("Connected to MySQL database")
            return connection
        except Error as e:
            print(f"Error while connecting to MySQL: {e}")
            return None

    def save_session_to_db(self, session_id, user_query, assistant_response):
        """Save the session data to the MySQL database."""
        if self.connection:
            try:
                cursor = self.connection.cursor()
                query = """
                    INSERT INTO memory (session_id, query, response) 
                    VALUES (%s, %s, %s)
                """
                cursor.execute(query, (session_id, user_query, assistant_response))
                self.connection.commit()
                cursor.close()
            except Error as e:
                print(f"Error saving to MySQL: {e}")

    def load_all_courses(self, csv_paths):
        for csv_path in csv_paths:
            course_data = self.read_course_info_from_csv(csv_path)
            self.core_classes_info.update(course_data)

    def read_course_info_from_csv(self, csv_path):
        try:
            df = pd.read_csv(csv_path)
            course_code_col = 'Course Code'
            name_col = 'Course Title' if 'Course Title' in df.columns else 'Class Name'
            credits_col = 'Credit Hours' if 'Credit Hours' in df.columns else 'Class Credits'
            prerequisites_col = 'Prerequisite Courses' if 'Prerequisite Courses' in df.columns else 'Prerequisites'
            max_repeatability_col = 'Course Repeatability' if 'Course Repeatability' in df.columns else 'Max Repeatability'
            
            course_dict = {}
            for _, row in df.iterrows():
                course_code = row[course_code_col]
                course_dict[course_code] = {
                    'course_code': course_code,
                    'name': row[name_col],
                    'credits': row[credits_col],
                    'prerequisites': row[prerequisites_col],
                    'max_repeatability': row.get(max_repeatability_col, 'Not specified'),
                    'description': row.get('Course Description', 'Description not available')
                }
            return course_dict
        except KeyError as e:
            print(f"Error reading CSV file: {e}")
            return {}
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            return {}

    def pdf_to_json(self, pdf_path):
        pdf_document = fitz.open(pdf_path)
        pdf_content = {}
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            text = page.get_text("text")
            pdf_content[f"Page_{page_num + 1}"] = text
        return json.dumps(pdf_content, ensure_ascii=False, indent=4)

    def extract_student_info(self, json_obj):
        student_info = {'name': 'N/A', 'department': 'N/A', 'advisor': 'N/A', 'gpa': 'N/A'}
        content = " ".join(json_obj.values())
        patterns = {
            'name': re.compile(r"Student name[:\s]*([^\n]+)", re.IGNORECASE),
            'department': re.compile(r"(?:Plan description|Department)[:\s]*([^\n]+)", re.IGNORECASE),
            'advisor': re.compile(r"Primary Advisor[:\s]*([^\n]+)", re.IGNORECASE),
            'gpa': re.compile(r"\bGPA[:\s]*([\d\.]+)", re.IGNORECASE)
        }
        for key, pattern in patterns.items():
            match = pattern.search(content)
            if match:
                student_info[key] = match.group(1).strip()
        return student_info

    def extract_credits(self, json_obj):
        completed_classes, in_progress_classes = [], []
        completed_credits, in_progress_credits = 0, 0
        current_status = None
        for page, content in json_obj.items():
            lines = content.split('\n')
            for line in lines:
                if 'COMPLETE' in line and not re.search(r'\b[A-Z]{3}\s+\d{3}\b', line):
                    current_status = 'complete'
                elif 'IN-PROGRESS' in line:
                    current_status = 'in-progress'
                elif re.search(r'\b[A-Z]{3}\s+\d{3}\b', line):
                    course_info = re.split(r'\s{2,}', line.strip())
                    if len(course_info) >= 4:
                        course_code, course_title, grade, credits = course_info[:4]
                        try:
                            credits = float(credits.replace('(', '').replace(')', ''))
                        except ValueError:
                            continue
                        if current_status == 'complete':
                            completed_classes.append((course_code, course_title, grade, credits))
                            completed_credits += credits
                        elif current_status == 'in-progress':
                            in_progress_classes.append((course_code, course_title, credits))
                            in_progress_credits += credits
        return completed_classes, in_progress_classes, completed_credits, in_progress_credits

    def generate_response(self, session_id, user_query, student_info, json_data):
        if session_id not in sessions:
            sessions[session_id] = []

        # Add user query to the conversation history
        sessions[session_id].append({"role": "user", "content": user_query})

        # Prepare the conversation history for the GPT-4 API
        messages = sessions[session_id]

        # Add relevant system messages and prompts
        csv_data_str = "\n".join([
            f"{code}: {info['name']} ({info['credits']} credits), Prerequisites: {info['prerequisites']}"
            for code, info in self.core_classes_info.items()
        ])
        system_messages = [
            {"role": "system", "content": "You are Advisor SeaHawk, a very friendly, concise, and skillful academic advisor. You help students with course information, prerequisites, and evaluations based on provided data. You are super friendly and always give thoughtful course recommendations to students. You also are able to answer follow-up questions and switch the context if needed."},
            {"role": "system", "content": "Always provide relevant answers based on the student's current question."},
            {"role": "system", "content": "If the student asks for advice on balancing multiple commitments, offer practical suggestions and prioritize their well-being. if not, do not include in the answer."},
            {"role": "user", "content": f"Student info:\n\n{json_data}\n\nCourse information:\n\n{csv_data_str}\n\n"}
        ]
        messages = system_messages + messages

        # Generate the response from GPT-4
        response = openai.ChatCompletion.create(
            model="gpt-4o",
            messages=messages,
            max_tokens=1000,
            temperature=0.7,
            n=1,
            stop=None,
            top_p=0.85,
            frequency_penalty=0.5        
        )

        # Extract the assistant's response
        assistant_message = response.choices[0].message['content']

        # Clean the response to remove markdown formatting
        assistant_message = self.clean_text(assistant_message)

        # Add the assistant's response to the conversation history
        sessions[session_id].append({"role": "assistant", "content": assistant_message})

        # Save the session data to the database
        self.save_session_to_db(session_id, user_query, assistant_message)

        return assistant_message

    def clean_text(self, response):
        # Replace markdown formatting symbols like ** with empty strings
        return response.replace("**", "")

    def generate_report(self, completed_classes, in_progress_classes, completed_credits, in_progress_credits):
        report = []
        if completed_classes:
            report.append("Completed Classes and Credits:")
            for cls, title, grade, credits in completed_classes:
                report.append(f"{cls} ({title}) - Grade: {grade}, Credits: {credits}")
            report.append(f"Total Completed Credits: {completed_credits}\n")

        if in_progress_classes:
            report.append("In-Progress Classes and Credits:")
            for cls, title, credits in in_progress_classes:
                report.append(f"{cls} ({title}) - Credits: {credits}")
            report.append(f"Total In-Progress Credits: {in_progress_credits}")

        return "\n".join(report)

def main():
    # Initialize the chatbot and load course data
    chatbot = SammySeaHawk()
    csv_paths = [graduate_csv_path, undergraduate_csv_path]
    chatbot.load_all_courses(csv_paths)

    # Open a file chooser dialog to select the PDF file
    root = tk.Tk()
    root.withdraw()  # Hide the main Tk window
    pdf_path = filedialog.askopenfilename(
        title="Select PDF File",
        filetypes=[("PDF Files", "*.pdf")]
    )

    if not pdf_path:
        print("No file selected. Exiting.")
        return

    try:
        json_data_str = chatbot.pdf_to_json(pdf_path)
        json_obj = json.loads(json_data_str)
    except Exception as e:
        print(f"Error processing PDF: {e}")
        return

    # Extract student information from the PDF content
    student_info = chatbot.extract_student_info(json_obj)
    print("\nExtracted Student Information:")
    for key, value in student_info.items():
        print(f"{key}: {value}")

    # Generate and display a credits report
    completed_classes, in_progress_classes, completed_credits, in_progress_credits = chatbot.extract_credits(json_obj)
    report = chatbot.generate_report(completed_classes, in_progress_classes, completed_credits, in_progress_credits)
    print("\nCredits Report:")
    print(report)

    # Interactive chat session with Advisor SeaHawk
    print("\nEnter your questions to Advisor SeaHawk (type 'exit' to quit):")
    session_id = "unique_session_id"  # Use a constant session ID for this example
    while True:
        user_query = input("\nYour question: ")
        if user_query.lower() == 'exit':
            break
        response = chatbot.generate_response(session_id, user_query, student_info, json_data_str)
        print(f"\nAdvisor SeaHawk: {response}")

if __name__ == "__main__":
    main()


Connected to MySQL database

Extracted Student Information:
name: Tok, Bulut
department: N/A
advisor: Cem Canel
gpa: 3.784

Credits Report:


Enter your questions to Advisor SeaHawk (type 'exit' to quit):



Your question:  can you give me my electives and the sum of tmy credits only from lectives as a list



Advisor SeaHawk: Sure! Based on your academic record, here are the electives you have taken along with their respective credits:

1. CSC 502: Machine Learning Fundamentals - 3 credits
2. CSC 592: Topics in Computing - 3 credits
3. MIS 505: Data Visualization - 3 credits
4. MIS 506: Text and Unstructured Data Analysis - 3 credits

The total sum of elective credits is 12 credits.

If you need further assistance or have more questions, feel free to ask! 😊



Your question:  exit 



Advisor SeaHawk: If you have any more questions in the future, don't hesitate to reach out. Have a great day! 😊



Your question:  exit


In [8]:
pip install selenium bs4


    torch (>=1.7torchvision)
          ~~~~~~^[0m[33m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [12]:
import re
import time
import json
import urllib.parse
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# ---------------------------
# Setup Selenium (headless)
# ---------------------------
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options)
main_window = driver.current_window_handle

# ---------------------------
# Base URL and constant parameters
# ---------------------------
base_url = "https://catalogue.uncw.edu/search_advanced.php"
base_params = {
    "cur_cat_oid": "74",
    "search_database": "Search",
    "search_db": "Search",
    # These page parameters will be overridden.
    "cpage": "1",
    "ecpage": "1",
    "ppage": "1",
    "spage": "1",
    "tpage": "1",
    "location": "3",
    # "filter[keyword]" will be set per keyword.
}

# ---------------------------
# Settings
# ---------------------------
# Process courses whose title begins with one of these codes.
search_keywords = ["MIS", "CSC", "CYBR", "BAN"]

# Define sections:
#   - "prefix" uses the "cpage" parameter.
#   - "location" uses the "ecpage" parameter.
sections = [("prefix", "cpage"), ("location", "ecpage")]

# Set to track processed detail URLs to avoid duplicates.
processed_details = set()
all_courses = []

# ---------------------------
# Helper: Get maximum page number
# ---------------------------
def get_max_page(soup):
    nav_elem = soup.find(lambda tag: tag.name in ["nav", "div", "p", "td"] and "Page:" in tag.get_text())
    if nav_elem:
        nav_text = nav_elem.get_text(separator=" ", strip=True)
        m = re.search(r"Page:\s*(.*)", nav_text)
        if m:
            pages_str = m.group(1)
            nums = re.findall(r"\d+", pages_str)
            if nums:
                return int(nums[-1])
    return None

# ---------------------------
# Helper: Clean and structure the schedule block
# ---------------------------
def parse_schedule_block(raw_text):
    """
    Cleans the raw schedule text by:
      - Splitting into lines and removing extraneous navigation lines.
      - Removing any line containing "Ellucian Company L.P. and its affiliates."
      - Returning the block starting at "Scheduled Meeting Times" as a list of lines.
      - If the resulting block is empty or contains a "No classes" message, returns a fallback message.
    """
    lines = raw_text.splitlines()
    removal_patterns = [
        r"Class Schedule Listing",
        r"Go to Main Content",
        r"SeaNet",
        r"HELP",
        r"\|",
        r"EXIT",
        r"Print[- ]?Friendly Page",
        r"Return to Previous",
        r"Skip to top of page",
        r"Release:",
        r"Powered by",
        r"Ellucian Company L\.P\. and its affiliates\."
    ]
    # Remove unwanted lines.
    clean_lines = [line.strip() for line in lines if line.strip() and not any(re.search(pat, line, re.IGNORECASE) for pat in removal_patterns)]
    try:
        header_index = clean_lines.index("Scheduled Meeting Times")
    except ValueError:
        clean_text = "\n".join(clean_lines)
        if re.search(r"No classes were found", clean_text, re.IGNORECASE):
            return "No information is available"
        return clean_text
    schedule_block = clean_lines[header_index:]
    schedule_str = "\n".join(schedule_block)
    if re.search(r"No classes were found", schedule_str, re.IGNORECASE):
        return "No information is available"
    return schedule_block

# ---------------------------
# Helper: Parse additional info from description block
# ---------------------------
def parse_additional_info(text):
    """
    From the text block (typically after Credit Hours:),
    extract key-value pairs for known fields.
    If a field line contains only the field name, then use the next non-empty line as its value.
    Returns a dictionary mapping field names to values.
    """
    additional = {}
    expected_fields = [
        "Credit Hours",
        "Prerequisite Courses",
        "Additional Restrictions/ Requirements",
        "Course Repeatability",
        "Maximum Repeatable Hours",
        "Equivalent Courses",
        "Undergraduate Crosslisting",
        "Additional Course Fees",
        "Course Attribute"
    ]
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    for i, line in enumerate(lines):
        for field in expected_fields:
            # Match if the line starts with the field name (optionally followed by a colon)
            if re.match(rf"^{re.escape(field)}\s*:?", line, re.IGNORECASE):
                # Try to split on colon.
                parts = line.split(":", 1)
                if len(parts) == 2 and parts[1].strip():
                    additional[field] = parts[1].strip()
                elif i + 1 < len(lines):
                    additional[field] = lines[i + 1]
                else:
                    additional[field] = None
    # Ensure all expected fields exist.
    for field in expected_fields:
        if field not in additional:
            additional[field] = None
    return additional

# ---------------------------
# Helper: Extract course details from a detail URL.
# ---------------------------
def extract_course_details(detail_url):
    """
    Loads the course detail page and extracts:
      - description: text after "Course Description:" (optionally up to "Credit Hours:")
      - additional_info: dictionary of extra fields extracted from the description block.
      - schedule: clicks the schedule link (if available), obtains raw schedule text, and passes it through parse_schedule_block().
    Returns a tuple (description, additional_info, schedule)
    """
    driver.get(detail_url)
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
    time.sleep(2)
    
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")
    container = soup.find("div", id="coursecontent")
    full_text = container.get_text(separator="\n", strip=True) if container else soup.get_text(separator="\n", strip=True)
    
    # Extract course description and additional info.
    description = ""
    additional_info = {}
    if "Course Description:" in full_text:
        parts = full_text.split("Course Description:", 1)
        desc_part = parts[1]
        if "Credit Hours:" in desc_part:
            description = desc_part.split("Credit Hours:", 1)[0].strip()
            additional_text = "Credit Hours:" + desc_part.split("Credit Hours:", 1)[1]
        else:
            description = desc_part.strip()
            additional_text = ""
        additional_info = parse_additional_info(additional_text)
    else:
        description = full_text
    
    # Now attempt to click the schedule link.
    schedule = ""
    try:
        schedule_link = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, "//*[contains(translate(text(),'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'),'click here for the spring 2025 class schedule')]"))
        )
        print("        Found schedule link; clicking it...")
        schedule_link.click()
        time.sleep(2)
        current_windows = driver.window_handles
        if len(current_windows) > 1:
            for win in current_windows:
                if win != main_window:
                    driver.switch_to.window(win)
                    break
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
            time.sleep(2)
            html_sched = driver.page_source
            soup_sched = BeautifulSoup(html_sched, "html.parser")
            raw_schedule = soup_sched.get_text(separator="\n", strip=True)
            driver.close()
            driver.switch_to.window(main_window)
        else:
            html_sched = driver.page_source
            soup_sched = BeautifulSoup(html_sched, "html.parser")
            raw_schedule = soup_sched.get_text(separator="\n", strip=True)
        schedule = parse_schedule_block(raw_schedule)
    except Exception as e:
        print("        Schedule link not found or error:", e)
        schedule = "No information is available"
    
    return description, additional_info, schedule

# ---------------------------
# Main Loop: Process each keyword and section
# ---------------------------
for keyword in search_keywords:
    for section_name, page_param in sections:
        page = 1
        max_page_found = None
        consecutive_empty_pages = 0
        while True:
            params = base_params.copy()
            params["filter[keyword]"] = keyword
            if page_param == "cpage":
                params["cpage"] = str(page)
                params["ecpage"] = "1"
            else:
                params["ecpage"] = str(page)
                params["cpage"] = "1"
            query = urllib.parse.urlencode(params)
            page_url = f"{base_url}?{query}"
            print(f"\nKeyword '{keyword}', section '{section_name}', page {page}:")
            print("  URL:", page_url)
            driver.get(page_url)
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.XPATH, "//a[contains(@href, 'preview_course_nopop.php')]"))
                )
            except Exception as e:
                print("  Timeout waiting for course links:", e)
                break
            time.sleep(2)
            
            html = driver.page_source
            soup = BeautifulSoup(html, "html.parser")
            if max_page_found is None:
                max_page_found = get_max_page(soup)
                if max_page_found:
                    print(f"  Determined maximum page = {max_page_found} for this section.")
                else:
                    print("  No maximum page info found; will break when no courses are returned.")
            
            course_link_elements = soup.find_all("a", href=lambda href: href and "preview_course_nopop.php" in href)
            if not course_link_elements:
                print("  No course links found; ending pagination for this section.")
                break
            
            course_links = []
            for elem in course_link_elements:
                text = elem.get_text(strip=True)
                href_val = elem.get("href")
                if href_val and text and text.upper().startswith(keyword.upper()):
                    if not href_val.startswith("http"):
                        href_val = "https://catalogue.uncw.edu/" + href_val.lstrip("/")
                    course_links.append((href_val, text))
            print(f"  Found {len(course_links)} courses on this page after filtering by prefix '{keyword}'.")
            
            new_courses_count = 0
            for orig_href, course_text in course_links:
                detail_url = orig_href.replace("preview_course_nopop.php", "preview_course.php")
                if detail_url in processed_details:
                    print(f"    Skipping duplicate course: {course_text}")
                    continue
                processed_details.add(detail_url)
                new_courses_count += 1
                print(f"    Processing course: {course_text}")
                print("      Detail URL:", detail_url)
                desc, add_info, sched = extract_course_details(detail_url)
                print("        Description length:", len(desc))
                if isinstance(sched, list):
                    print("        Schedule entries:", len(sched))
                else:
                    print("        Schedule:", sched)
                all_courses.append({
                    "course": course_text,
                    "detail_url": detail_url,
                    "keyword": keyword,
                    "section": section_name,
                    "description": desc,
                    "additional_info": add_info,
                    "schedule": sched
                })
                time.sleep(1)
            
            if new_courses_count == 0:
                consecutive_empty_pages += 1
                print(f"  No new courses found on this page (consecutive empty pages: {consecutive_empty_pages}).")
            else:
                consecutive_empty_pages = 0
            
            if consecutive_empty_pages >= 2:
                print("  Two consecutive pages with no new courses; ending pagination for this section.")
                break
            if max_page_found is not None and page >= max_page_found:
                print(f"  Reached maximum page ({max_page_found}); ending section '{section_name}' for keyword '{keyword}'.")
                break
            
            page += 1
            time.sleep(1)

driver.quit()

# ---------------------------
# Output results as JSON
# ---------------------------
json_data = json.dumps({"courses": all_courses}, indent=4)
print("\nJSON output:")
print(json_data)



Keyword 'MIS', section 'prefix', page 1:
  URL: https://catalogue.uncw.edu/search_advanced.php?cur_cat_oid=74&search_database=Search&search_db=Search&cpage=1&ecpage=1&ppage=1&spage=1&tpage=1&location=3&filter%5Bkeyword%5D=MIS
  Determined maximum page = 3 for this section.
  Found 10 courses on this page after filtering by prefix 'MIS'.
    Processing course: MIS503 - Programming For Analytics
      Detail URL: https://catalogue.uncw.edu/preview_course.php?catoid=74&coid=277198
        Found schedule link; clicking it...
        Description length: 674
        Schedule entries: 109
    Processing course: MIS504 - Databases for Analytics
      Detail URL: https://catalogue.uncw.edu/preview_course.php?catoid=74&coid=277199
        Found schedule link; clicking it...
        Description length: 447
        Schedule entries: 109
    Processing course: MIS505 - Data Visualization
      Detail URL: https://catalogue.uncw.edu/preview_course.php?catoid=74&coid=277200
        Found schedule li

In [13]:
import re
import time
import json
import urllib.parse
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# ---------------------------
# Setup Selenium (headless)
# ---------------------------
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options)
main_window = driver.current_window_handle

# ---------------------------
# Base URL and constant parameters
# ---------------------------
base_url = "https://catalogue.uncw.edu/search_advanced.php"
base_params = {
    "cur_cat_oid": "74",
    "search_database": "Search",
    "search_db": "Search",
    # These page parameters will be overridden.
    "cpage": "1",
    "ecpage": "1",
    "ppage": "1",
    "spage": "1",
    "tpage": "1",
    "location": "3",
    # "filter[keyword]" will be set per keyword.
}

# ---------------------------
# Settings
# ---------------------------
# Process courses whose title begins with one of these codes.
search_keywords = ["MIS", "CSC", "CYBR", "BAN"]

# Define sections:
#   - "prefix" uses the "cpage" parameter.
#   - "location" uses the "ecpage" parameter.
sections = [("prefix", "cpage"), ("location", "ecpage")]

# Set to track processed detail URLs to avoid duplicates.
processed_details = set()
all_courses = []

# ---------------------------
# Helper: Get maximum page number from navigation
# ---------------------------
def get_max_page(soup):
    nav_elem = soup.find(lambda tag: tag.name in ["nav", "div", "p", "td"] and "Page:" in tag.get_text())
    if nav_elem:
        nav_text = nav_elem.get_text(separator=" ", strip=True)
        m = re.search(r"Page:\s*(.*)", nav_text)
        if m:
            pages_str = m.group(1)
            nums = re.findall(r"\d+", pages_str)
            if nums:
                return int(nums[-1])
    return None

# ---------------------------
# Helper: Clean and structure the schedule block
# ---------------------------
def parse_schedule_block(raw_text):
    """
    Cleans the raw schedule text by:
      - Splitting into lines and removing extraneous navigation lines.
      - Removing lines containing "Ellucian Company L.P. and its affiliates."
      - Returning the block starting at "Scheduled Meeting Times" as a list of lines.
      - If the cleaned schedule indicates "No classes..." or is empty, returns "No information is available".
    """
    lines = raw_text.splitlines()
    removal_patterns = [
        r"Class Schedule Listing",
        r"Go to Main Content",
        r"SeaNet",
        r"HELP",
        r"\|",
        r"EXIT",
        r"Print[- ]?Friendly Page",
        r"Return to Previous",
        r"Skip to top of page",
        r"Release:",
        r"Powered by",
        r"Ellucian Company L\.P\. and its affiliates\."
    ]
    clean_lines = [line.strip() for line in lines if line.strip() and not any(re.search(pat, line, re.IGNORECASE) for pat in removal_patterns)]
    try:
        header_index = clean_lines.index("Scheduled Meeting Times")
    except ValueError:
        clean_text = "\n".join(clean_lines)
        if re.search(r"No classes were found", clean_text, re.IGNORECASE):
            return "No information is available"
        return clean_text
    schedule_block = clean_lines[header_index:]
    schedule_str = "\n".join(schedule_block)
    if re.search(r"No classes were found", schedule_str, re.IGNORECASE):
        return "No information is available"
    return schedule_block

# ---------------------------
# Helper: Parse additional info from the description block
# ---------------------------
def parse_additional_info(text):
    """
    From the text block (typically after Credit Hours:), extract key-value pairs for extra fields.
    Looks for lines like "Field Name: Value" for known fields.
    Returns a dictionary.
    """
    additional = {}
    expected_fields = [
        "Credit Hours",
        "Prerequisite Courses",
        "Additional Restrictions/ Requirements",
        "Course Repeatability",
        "Maximum Repeatable Hours",
        "Equivalent Courses",
        "Undergraduate Crosslisting",
        "Additional Course Fees",
        "Course Attribute"
    ]
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    for i, line in enumerate(lines):
        for field in expected_fields:
            # Match if the line starts with the field name (optionally followed by a colon)
            if re.match(rf"^{re.escape(field)}\s*:?", line, re.IGNORECASE):
                parts = line.split(":", 1)
                if len(parts) == 2 and parts[1].strip():
                    additional[field] = parts[1].strip()
                elif i + 1 < len(lines):
                    additional[field] = lines[i + 1]
                else:
                    additional[field] = None
    # Ensure all expected fields exist.
    for field in expected_fields:
        if field not in additional:
            additional[field] = None
    return additional

# ---------------------------
# Helper: Extract course details (description, additional_info, schedule)
# ---------------------------
def extract_course_details(detail_url):
    """
    Loads the course detail page and extracts:
      - description: text after "Course Description:" (optionally until "Credit Hours:")
      - additional_info: a dictionary parsed from the text after "Credit Hours:"
      - schedule: clicks the schedule link (if available), extracts the raw schedule text, and cleans it.
    If no schedule link is found, returns "No information is available" for schedule.
    Returns a tuple (description, additional_info, schedule)
    """
    driver.get(detail_url)
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
    time.sleep(2)
    
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")
    container = soup.find("div", id="coursecontent")
    full_text = container.get_text(separator="\n", strip=True) if container else soup.get_text(separator="\n", strip=True)
    
    description = ""
    additional_info = {}
    if "Course Description:" in full_text:
        parts = full_text.split("Course Description:", 1)
        desc_part = parts[1]
        if "Credit Hours:" in desc_part:
            description = desc_part.split("Credit Hours:", 1)[0].strip()
            additional_text = "Credit Hours:" + desc_part.split("Credit Hours:", 1)[1]
        else:
            description = desc_part.strip()
            additional_text = ""
        additional_info = parse_additional_info(additional_text)
    else:
        description = full_text
    
    schedule = ""
    try:
        schedule_link = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, "//*[contains(translate(text(),'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'),'click here for the spring 2025 class schedule')]"))
        )
        print("        Found schedule link; clicking it...")
        schedule_link.click()
        time.sleep(2)
        current_windows = driver.window_handles
        if len(current_windows) > 1:
            for win in current_windows:
                if win != main_window:
                    driver.switch_to.window(win)
                    break
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
            time.sleep(2)
            html_sched = driver.page_source
            soup_sched = BeautifulSoup(html_sched, "html.parser")
            raw_schedule = soup_sched.get_text(separator="\n", strip=True)
            driver.close()
            driver.switch_to.window(main_window)
        else:
            html_sched = driver.page_source
            soup_sched = BeautifulSoup(html_sched, "html.parser")
            raw_schedule = soup_sched.get_text(separator="\n", strip=True)
        schedule = parse_schedule_block(raw_schedule)
    except Exception as e:
        print("        Schedule link not found or error:", e)
        schedule = "No information is available"
    
    return description, additional_info, schedule

# ---------------------------
# Main Loop: Process each keyword and section
# ---------------------------
for keyword in search_keywords:
    for section_name, page_param in sections:
        page = 1
        max_page_found = None
        consecutive_empty_pages = 0
        while True:
            params = base_params.copy()
            params["filter[keyword]"] = keyword
            if page_param == "cpage":
                params["cpage"] = str(page)
                params["ecpage"] = "1"
            else:
                params["ecpage"] = str(page)
                params["cpage"] = "1"
            query = urllib.parse.urlencode(params)
            page_url = f"{base_url}?{query}"
            print(f"\nKeyword '{keyword}', section '{section_name}', page {page}:")
            print("  URL:", page_url)
            driver.get(page_url)
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.XPATH, "//a[contains(@href, 'preview_course_nopop.php')]"))
                )
            except Exception as e:
                print("  Timeout waiting for course links:", e)
                break
            time.sleep(2)
            
            html = driver.page_source
            soup = BeautifulSoup(html, "html.parser")
            if max_page_found is None:
                max_page_found = get_max_page(soup)
                if max_page_found:
                    print(f"  Determined maximum page = {max_page_found} for this section.")
                else:
                    print("  No maximum page info found; will break when no courses are returned.")
            
            course_link_elements = soup.find_all("a", href=lambda href: href and "preview_course_nopop.php" in href)
            if not course_link_elements:
                print("  No course links found; ending pagination for this section.")
                break
            
            course_links = []
            for elem in course_link_elements:
                text = elem.get_text(strip=True)
                href_val = elem.get("href")
                if href_val and text and text.upper().startswith(keyword.upper()):
                    if not href_val.startswith("http"):
                        href_val = "https://catalogue.uncw.edu/" + href_val.lstrip("/")
                    course_links.append((href_val, text))
            print(f"  Found {len(course_links)} courses on this page after filtering by prefix '{keyword}'.")
            
            new_courses_count = 0
            for orig_href, course_text in course_links:
                detail_url = orig_href.replace("preview_course_nopop.php", "preview_course.php")
                if detail_url in processed_details:
                    print(f"    Skipping duplicate course: {course_text}")
                    continue
                processed_details.add(detail_url)
                new_courses_count += 1
                print(f"    Processing course: {course_text}")
                print("      Detail URL:", detail_url)
                desc, add_info, sched = extract_course_details(detail_url)
                print("        Description length:", len(desc))
                if isinstance(sched, list):
                    print("        Schedule entries:", len(sched))
                else:
                    print("        Schedule:", sched)
                all_courses.append({
                    "course": course_text,
                    "detail_url": detail_url,
                    "keyword": keyword,
                    "section": section_name,
                    "description": desc,
                    "additional_info": add_info,
                    "schedule": sched
                })
                time.sleep(1)
            
            if new_courses_count == 0:
                consecutive_empty_pages += 1
                print(f"  No new courses found on this page (consecutive empty pages: {consecutive_empty_pages}).")
            else:
                consecutive_empty_pages = 0
            
            if consecutive_empty_pages >= 2:
                print("  Two consecutive pages with no new courses; ending pagination for this section.")
                break
            
            if max_page_found is not None and page >= max_page_found:
                print(f"  Reached maximum page ({max_page_found}); ending section '{section_name}' for keyword '{keyword}'.")
                break
            
            page += 1
            time.sleep(1)

driver.quit()

# ---------------------------
# Write JSON output to file
# ---------------------------
output_filename = "2024_2025_spring_courses.json"
with open(output_filename, "w", encoding="utf-8") as f:
    json.dump({"courses": all_courses}, f, indent=4)

print("\nJSON output written to", output_filename)



Keyword 'MIS', section 'prefix', page 1:
  URL: https://catalogue.uncw.edu/search_advanced.php?cur_cat_oid=74&search_database=Search&search_db=Search&cpage=1&ecpage=1&ppage=1&spage=1&tpage=1&location=3&filter%5Bkeyword%5D=MIS
  Determined maximum page = 3 for this section.
  Found 10 courses on this page after filtering by prefix 'MIS'.
    Processing course: MIS503 - Programming For Analytics
      Detail URL: https://catalogue.uncw.edu/preview_course.php?catoid=74&coid=277198
        Found schedule link; clicking it...
        Description length: 674
        Schedule entries: 109
    Processing course: MIS504 - Databases for Analytics
      Detail URL: https://catalogue.uncw.edu/preview_course.php?catoid=74&coid=277199
        Found schedule link; clicking it...
        Description length: 447
        Schedule entries: 109
    Processing course: MIS505 - Data Visualization
      Detail URL: https://catalogue.uncw.edu/preview_course.php?catoid=74&coid=277200
        Found schedule li

In [14]:
import re
import time
import json
import urllib.parse
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# ---------------------------
# Setup Selenium (headless)
# ---------------------------
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options)
main_window = driver.current_window_handle

# ---------------------------
# Base URL and constant parameters
# ---------------------------
base_url = "https://catalogue.uncw.edu/search_advanced.php"
base_params = {
    "cur_cat_oid": "74",
    "search_database": "Search",
    "search_db": "Search",
    # These page parameters will be overridden.
    "cpage": "1",
    "ecpage": "1",
    "ppage": "1",
    "spage": "1",
    "tpage": "1",
    "location": "3",
    # "filter[keyword]" will be set per keyword.
}

# ---------------------------
# Settings
# ---------------------------
# Process courses whose title begins with one of these codes.
search_keywords = ["MIS", "CSC", "CYBR", "BAN"]

# Define sections:
#   - "prefix" uses the "cpage" parameter.
#   - "location" uses the "ecpage" parameter.
sections = [("prefix", "cpage"), ("location", "ecpage")]

# Set to track processed detail URLs to avoid duplicates.
processed_details = set()
all_courses = []

# ---------------------------
# Helper: Get maximum page number
# ---------------------------
def get_max_page(soup):
    nav_elem = soup.find(lambda tag: tag.name in ["nav", "div", "p", "td"] and "Page:" in tag.get_text())
    if nav_elem:
        nav_text = nav_elem.get_text(separator=" ", strip=True)
        m = re.search(r"Page:\s*(.*)", nav_text)
        if m:
            pages_str = m.group(1)
            nums = re.findall(r"\d+", pages_str)
            if nums:
                return int(nums[-1])
    return None

# ---------------------------
# Helper: Clean and structure the schedule block
# ---------------------------
def parse_schedule_block(raw_text):
    """
    Cleans the raw schedule text by:
      - Splitting into lines and removing extraneous navigation lines.
      - Removing any line containing "Ellucian Company L.P. and its affiliates."
      - Returning the block starting at "Scheduled Meeting Times" as a list of lines.
      - If the resulting block is empty or contains a "No classes" message, returns a fallback message.
    """
    lines = raw_text.splitlines()
    removal_patterns = [
        r"Class Schedule Listing",
        r"Go to Main Content",
        r"SeaNet",
        r"HELP",
        r"\|",
        r"EXIT",
        r"Print[- ]?Friendly Page",
        r"Return to Previous",
        r"Skip to top of page",
        r"Release:",
        r"Powered by",
        r"Ellucian Company L\.P\. and its affiliates\."
    ]
    # Remove unwanted lines.
    clean_lines = [line.strip() for line in lines if line.strip() and not any(re.search(pat, line, re.IGNORECASE) for pat in removal_patterns)]
    try:
        header_index = clean_lines.index("Scheduled Meeting Times")
    except ValueError:
        clean_text = "\n".join(clean_lines)
        if re.search(r"No classes were found", clean_text, re.IGNORECASE):
            return "No information is available"
        return clean_text
    schedule_block = clean_lines[header_index:]
    schedule_str = "\n".join(schedule_block)
    if re.search(r"No classes were found", schedule_str, re.IGNORECASE):
        return "No information is available"
    return schedule_block

# ---------------------------
# Helper: Parse additional info from description block
# ---------------------------
def parse_additional_info(text):
    """
    From the text block (typically after Credit Hours:),
    extract key-value pairs for known fields.
    If a field line contains only the field name, then use the next non-empty line as its value.
    Returns a dictionary mapping field names to values. Fields with no available value are omitted.
    """
    additional = {}
    expected_fields = [
        "Credit Hours",
        "Prerequisite Courses",
        "Additional Restrictions/ Requirements",
        "Course Repeatability",
        "Maximum Repeatable Hours",
        "Equivalent Courses",
        "Undergraduate Crosslisting",
        "Additional Course Fees",
        "Course Attribute"
    ]
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    for i, line in enumerate(lines):
        for field in expected_fields:
            # Match if the line starts with the field name (optionally followed by a colon)
            if re.match(rf"^{re.escape(field)}\s*:?", line, re.IGNORECASE):
                parts = line.split(":", 1)
                if len(parts) == 2 and parts[1].strip():
                    additional[field] = parts[1].strip()
                elif i + 1 < len(lines):
                    additional[field] = lines[i + 1]
                # If no valid value is found, do not add the key.
    # Filter out any keys that ended up with None or empty values.
    additional = {k: v for k, v in additional.items() if v is not None and v != ""}
    return additional

# ---------------------------
# Helper: Extract course details from a detail URL.
# ---------------------------
def extract_course_details(detail_url):
    """
    Loads the course detail page and extracts:
      - description: text after "Course Description:" (optionally up to "Credit Hours:")
      - additional_info: dictionary of extra fields extracted from the description block.
      - schedule: clicks the schedule link (if available), obtains raw schedule text, and passes it through parse_schedule_block().
    Returns a tuple (description, additional_info, schedule)
    """
    driver.get(detail_url)
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
    time.sleep(2)
    
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")
    container = soup.find("div", id="coursecontent")
    full_text = container.get_text(separator="\n", strip=True) if container else soup.get_text(separator="\n", strip=True)
    
    # Extract course description and additional info.
    description = ""
    additional_info = {}
    if "Course Description:" in full_text:
        parts = full_text.split("Course Description:", 1)
        desc_part = parts[1]
        if "Credit Hours:" in desc_part:
            description = desc_part.split("Credit Hours:", 1)[0].strip()
            additional_text = "Credit Hours:" + desc_part.split("Credit Hours:", 1)[1]
        else:
            description = desc_part.strip()
            additional_text = ""
        additional_info = parse_additional_info(additional_text)
    else:
        description = full_text
    
    # Now attempt to click the schedule link.
    schedule = ""
    try:
        schedule_link = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, "//*[contains(translate(text(),'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'),'click here for the spring 2025 class schedule')]"))
        )
        print("        Found schedule link; clicking it...")
        schedule_link.click()
        time.sleep(2)
        current_windows = driver.window_handles
        if len(current_windows) > 1:
            for win in current_windows:
                if win != main_window:
                    driver.switch_to.window(win)
                    break
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
            time.sleep(2)
            html_sched = driver.page_source
            soup_sched = BeautifulSoup(html_sched, "html.parser")
            raw_schedule = soup_sched.get_text(separator="\n", strip=True)
            driver.close()
            driver.switch_to.window(main_window)
        else:
            html_sched = driver.page_source
            soup_sched = BeautifulSoup(html_sched, "html.parser")
            raw_schedule = soup_sched.get_text(separator="\n", strip=True)
        schedule = parse_schedule_block(raw_schedule)
    except Exception as e:
        print("        Schedule link not found or error:", e)
        schedule = "No information is available"
    
    return description, additional_info, schedule

# ---------------------------
# Main Loop: Process each keyword and section
# ---------------------------
for keyword in search_keywords:
    for section_name, page_param in sections:
        page = 1
        max_page_found = None
        consecutive_empty_pages = 0
        while True:
            params = base_params.copy()
            params["filter[keyword]"] = keyword
            if page_param == "cpage":
                params["cpage"] = str(page)
                params["ecpage"] = "1"
            else:
                params["ecpage"] = str(page)
                params["cpage"] = "1"
            query = urllib.parse.urlencode(params)
            page_url = f"{base_url}?{query}"
            print(f"\nKeyword '{keyword}', section '{section_name}', page {page}:")
            print("  URL:", page_url)
            driver.get(page_url)
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.XPATH, "//a[contains(@href, 'preview_course_nopop.php')]"))
                )
            except Exception as e:
                print("  Timeout waiting for course links:", e)
                break
            time.sleep(2)
            
            html = driver.page_source
            soup = BeautifulSoup(html, "html.parser")
            if max_page_found is None:
                max_page_found = get_max_page(soup)
                if max_page_found:
                    print(f"  Determined maximum page = {max_page_found} for this section.")
                else:
                    print("  No maximum page info found; will break when no courses are returned.")
            
            course_link_elements = soup.find_all("a", href=lambda href: href and "preview_course_nopop.php" in href)
            if not course_link_elements:
                print("  No course links found; ending pagination for this section.")
                break
            
            course_links = []
            for elem in course_link_elements:
                text = elem.get_text(strip=True)
                href_val = elem.get("href")
                if href_val and text and text.upper().startswith(keyword.upper()):
                    if not href_val.startswith("http"):
                        href_val = "https://catalogue.uncw.edu/" + href_val.lstrip("/")
                    course_links.append((href_val, text))
            print(f"  Found {len(course_links)} courses on this page after filtering by prefix '{keyword}'.")
            
            new_courses_count = 0
            for orig_href, course_text in course_links:
                detail_url = orig_href.replace("preview_course_nopop.php", "preview_course.php")
                if detail_url in processed_details:
                    print(f"    Skipping duplicate course: {course_text}")
                    continue
                processed_details.add(detail_url)
                new_courses_count += 1
                print(f"    Processing course: {course_text}")
                print("      Detail URL:", detail_url)
                desc, add_info, sched = extract_course_details(detail_url)
                print("        Description length:", len(desc))
                if isinstance(sched, list):
                    print("        Schedule entries:", len(sched))
                else:
                    print("        Schedule:", sched)
                
                # Build the course dictionary and filter out any keys with None (or empty) values.
                course_data = {
                    "course": course_text,
                    "detail_url": detail_url,
                    "keyword": keyword,
                    "section": section_name,
                    "description": desc,
                    "additional_info": add_info,
                    "schedule": sched
                }
                filtered_course_data = {k: v for k, v in course_data.items() if v is not None and v != ""}
                all_courses.append(filtered_course_data)
                time.sleep(1)
            
            if new_courses_count == 0:
                consecutive_empty_pages += 1
                print(f"  No new courses found on this page (consecutive empty pages: {consecutive_empty_pages}).")
            else:
                consecutive_empty_pages = 0
            
            if consecutive_empty_pages >= 2:
                print("  Two consecutive pages with no new courses; ending pagination for this section.")
                break
            if max_page_found is not None and page >= max_page_found:
                print(f"  Reached maximum page ({max_page_found}); ending section '{section_name}' for keyword '{keyword}'.")
                break
            
            page += 1
            time.sleep(1)

driver.quit()

# ---------------------------
# Output results as JSON
# ---------------------------
json_data = json.dumps({"courses": all_courses}, indent=4)
print("\nJSON output:")
print(json_data)



Keyword 'MIS', section 'prefix', page 1:
  URL: https://catalogue.uncw.edu/search_advanced.php?cur_cat_oid=74&search_database=Search&search_db=Search&cpage=1&ecpage=1&ppage=1&spage=1&tpage=1&location=3&filter%5Bkeyword%5D=MIS
  Determined maximum page = 3 for this section.
  Found 10 courses on this page after filtering by prefix 'MIS'.
    Processing course: MIS503 - Programming For Analytics
      Detail URL: https://catalogue.uncw.edu/preview_course.php?catoid=74&coid=277198
        Found schedule link; clicking it...
        Description length: 674
        Schedule entries: 109
    Processing course: MIS504 - Databases for Analytics
      Detail URL: https://catalogue.uncw.edu/preview_course.php?catoid=74&coid=277199
        Found schedule link; clicking it...
        Description length: 447
        Schedule entries: 109
    Processing course: MIS505 - Data Visualization
      Detail URL: https://catalogue.uncw.edu/preview_course.php?catoid=74&coid=277200
        Found schedule li

In [15]:
import re
import time
import json
import urllib.parse
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# ---------------------------
# Setup Selenium (headless)
# ---------------------------
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options)
main_window = driver.current_window_handle

# ---------------------------
# Base URL and constant parameters
# ---------------------------
base_url = "https://catalogue.uncw.edu/search_advanced.php"
base_params = {
    "cur_cat_oid": "74",
    "search_database": "Search",
    "search_db": "Search",
    # These page parameters will be overridden.
    "cpage": "1",
    "ecpage": "1",
    "ppage": "1",
    "spage": "1",
    "tpage": "1",
    "location": "3",
    # "filter[keyword]" will be set per keyword.
}

# ---------------------------
# Settings
# ---------------------------
# Process courses whose title begins with one of these codes.
search_keywords = ["MIS", "CSC", "CYBR", "BAN"]

# Define sections:
#   - "prefix" uses the "cpage" parameter.
#   - "location" uses the "ecpage" parameter.
sections = [("prefix", "cpage"), ("location", "ecpage")]

# Set to track processed detail URLs to avoid duplicates.
processed_details = set()
all_courses = []

# ---------------------------
# Helper: Get maximum page number
# ---------------------------
def get_max_page(soup):
    nav_elem = soup.find(lambda tag: tag.name in ["nav", "div", "p", "td"] and "Page:" in tag.get_text())
    if nav_elem:
        nav_text = nav_elem.get_text(separator=" ", strip=True)
        m = re.search(r"Page:\s*(.*)", nav_text)
        if m:
            pages_str = m.group(1)
            nums = re.findall(r"\d+", pages_str)
            if nums:
                return int(nums[-1])
    return None

# ---------------------------
# Helper: Clean and structure the schedule block (text‑based fallback)
# ---------------------------
def parse_schedule_block(raw_text):
    """
    Cleans the raw schedule text by:
      - Splitting into lines and removing extraneous navigation lines.
      - Removing any line containing "Ellucian Company L.P. and its affiliates."
      - Returning the block starting at "Scheduled Meeting Times" as a list of lines.
      - If the resulting block is empty or contains a "No classes" message, returns a fallback message.
    """
    lines = raw_text.splitlines()
    removal_patterns = [
        r"Class Schedule Listing",
        r"Go to Main Content",
        r"SeaNet",
        r"HELP",
        r"\|",
        r"EXIT",
        r"Print[- ]?Friendly Page",
        r"Return to Previous",
        r"Skip to top of page",
        r"Release:",
        r"Powered by",
        r"Ellucian Company L\.P\. and its affiliates\."
    ]
    # Remove unwanted lines.
    clean_lines = [line.strip() for line in lines if line.strip() and not any(re.search(pat, line, re.IGNORECASE) for pat in removal_patterns)]
    try:
        header_index = clean_lines.index("Scheduled Meeting Times")
    except ValueError:
        clean_text = "\n".join(clean_lines)
        if re.search(r"No classes were found", clean_text, re.IGNORECASE):
            return "No information is available"
        return clean_text
    schedule_block = clean_lines[header_index:]
    schedule_str = "\n".join(schedule_block)
    if re.search(r"No classes were found", schedule_str, re.IGNORECASE):
        return "No information is available"
    return schedule_block

# ---------------------------
# New Helper: Parse schedule using structural (table) information if available.
# ---------------------------
def parse_schedule(soup_sched):
    """
    Attempts structure-aware extraction from the schedule page.
    
    1. If a <table> exists, assume it contains schedule data:
       - Extract header row text.
       - Extract each row into a dictionary mapping header -> value.
    2. If no table is found or if table parsing fails, fallback to text‑based parsing.
    """
    schedule_table = soup_sched.find("table")
    if schedule_table:
        schedule_entries = []
        # Attempt to get header row (looking in <th> or first <tr>)
        header_row = schedule_table.find("tr")
        headers = []
        if header_row:
            header_cells = header_row.find_all(["th", "td"])
            headers = [cell.get_text(strip=True) for cell in header_cells]
        # Process remaining rows:
        for row in schedule_table.find_all("tr")[1:]:
            cells = row.find_all("td")
            if cells:
                entry = {}
                for header, cell in zip(headers, cells):
                    value = cell.get_text(strip=True)
                    if value and value.lower() not in ["", "none"]:
                        entry[header] = value
                if entry:
                    schedule_entries.append(entry)
        if schedule_entries:
            return schedule_entries
    # Fallback: use text‑based parsing.
    raw_text = soup_sched.get_text(separator="\n", strip=True)
    return parse_schedule_block(raw_text)

# ---------------------------
# Helper: Parse additional info from description block
# ---------------------------
def parse_additional_info(text):
    """
    From the text block (typically after Credit Hours:),
    extract key-value pairs for known fields.
    If a field line contains only the field name, then use the next non-empty line as its value.
    Returns a dictionary mapping field names to values. Fields with no available value are omitted.
    """
    additional = {}
    expected_fields = [
        "Credit Hours",
        "Prerequisite Courses",
        "Additional Restrictions/ Requirements",
        "Course Repeatability",
        "Maximum Repeatable Hours",
        "Equivalent Courses",
        "Undergraduate Crosslisting",
        "Additional Course Fees",
        "Course Attribute"
    ]
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    for i, line in enumerate(lines):
        for field in expected_fields:
            if re.match(rf"^{re.escape(field)}\s*:?", line, re.IGNORECASE):
                parts = line.split(":", 1)
                if len(parts) == 2 and parts[1].strip():
                    additional[field] = parts[1].strip()
                elif i + 1 < len(lines):
                    additional[field] = lines[i + 1]
    additional = {k: v for k, v in additional.items() if v is not None and v != ""}
    return additional

# ---------------------------
# Helper: Extract course details from a detail URL.
# ---------------------------
def extract_course_details(detail_url):
    """
    Loads the course detail page and extracts:
      - description: text after "Course Description:" (optionally up to "Credit Hours:")
      - additional_info: dictionary of extra fields extracted from the description block.
      - schedule: attempts structure-aware schedule extraction by checking for a table in the schedule page.
                  Falls back to text‑based cleaning if necessary.
    Returns a tuple (description, additional_info, schedule)
    """
    driver.get(detail_url)
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
    time.sleep(2)
    
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")
    container = soup.find("div", id="coursecontent")
    full_text = container.get_text(separator="\n", strip=True) if container else soup.get_text(separator="\n", strip=True)
    
    # Extract course description and additional info.
    description = ""
    additional_info = {}
    if "Course Description:" in full_text:
        parts = full_text.split("Course Description:", 1)
        desc_part = parts[1]
        if "Credit Hours:" in desc_part:
            description = desc_part.split("Credit Hours:", 1)[0].strip()
            additional_text = "Credit Hours:" + desc_part.split("Credit Hours:", 1)[1]
        else:
            description = desc_part.strip()
            additional_text = ""
        additional_info = parse_additional_info(additional_text)
    else:
        description = full_text
    
    # Attempt to click the schedule link and process the schedule page.
    schedule = ""
    try:
        schedule_link = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, "//*[contains(translate(text(),'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'),'click here for the spring 2025 class schedule')]"))
        )
        print("        Found schedule link; clicking it...")
        schedule_link.click()
        time.sleep(2)
        current_windows = driver.window_handles
        if len(current_windows) > 1:
            for win in current_windows:
                if win != main_window:
                    driver.switch_to.window(win)
                    break
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
            time.sleep(2)
            html_sched = driver.page_source
            soup_sched = BeautifulSoup(html_sched, "html.parser")
            # Use the new structure-aware schedule parser.
            schedule = parse_schedule(soup_sched)
            driver.close()
            driver.switch_to.window(main_window)
        else:
            html_sched = driver.page_source
            soup_sched = BeautifulSoup(html_sched, "html.parser")
            schedule = parse_schedule(soup_sched)
    except Exception as e:
        print("        Schedule link not found or error:", e)
        schedule = "No information is available"
    
    return description, additional_info, schedule

# ---------------------------
# Main Loop: Process each keyword and section
# ---------------------------
for keyword in search_keywords:
    for section_name, page_param in sections:
        page = 1
        max_page_found = None
        consecutive_empty_pages = 0
        while True:
            params = base_params.copy()
            params["filter[keyword]"] = keyword
            if page_param == "cpage":
                params["cpage"] = str(page)
                params["ecpage"] = "1"
            else:
                params["ecpage"] = str(page)
                params["cpage"] = "1"
            query = urllib.parse.urlencode(params)
            page_url = f"{base_url}?{query}"
            print(f"\nKeyword '{keyword}', section '{section_name}', page {page}:")
            print("  URL:", page_url)
            driver.get(page_url)
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.XPATH, "//a[contains(@href, 'preview_course_nopop.php')]"))
                )
            except Exception as e:
                print("  Timeout waiting for course links:", e)
                break
            time.sleep(2)
            
            html = driver.page_source
            soup = BeautifulSoup(html, "html.parser")
            if max_page_found is None:
                max_page_found = get_max_page(soup)
                if max_page_found:
                    print(f"  Determined maximum page = {max_page_found} for this section.")
                else:
                    print("  No maximum page info found; will break when no courses are returned.")
            
            course_link_elements = soup.find_all("a", href=lambda href: href and "preview_course_nopop.php" in href)
            if not course_link_elements:
                print("  No course links found; ending pagination for this section.")
                break
            
            course_links = []
            for elem in course_link_elements:
                text = elem.get_text(strip=True)
                href_val = elem.get("href")
                if href_val and text and text.upper().startswith(keyword.upper()):
                    if not href_val.startswith("http"):
                        href_val = "https://catalogue.uncw.edu/" + href_val.lstrip("/")
                    course_links.append((href_val, text))
            print(f"  Found {len(course_links)} courses on this page after filtering by prefix '{keyword}'.")
            
            new_courses_count = 0
            for orig_href, course_text in course_links:
                detail_url = orig_href.replace("preview_course_nopop.php", "preview_course.php")
                if detail_url in processed_details:
                    print(f"    Skipping duplicate course: {course_text}")
                    continue
                processed_details.add(detail_url)
                new_courses_count += 1
                print(f"    Processing course: {course_text}")
                print("      Detail URL:", detail_url)
                desc, add_info, sched = extract_course_details(detail_url)
                print("        Description length:", len(desc))
                if isinstance(sched, list):
                    print("        Schedule entries:", len(sched))
                else:
                    print("        Schedule:", sched)
                
                course_data = {
                    "course": course_text,
                    "detail_url": detail_url,
                    "keyword": keyword,
                    "section": section_name,
                    "description": desc,
                    "additional_info": add_info,
                    "schedule": sched
                }
                filtered_course_data = {k: v for k, v in course_data.items() if v is not None and v != ""}
                all_courses.append(filtered_course_data)
                time.sleep(1)
            
            if new_courses_count == 0:
                consecutive_empty_pages += 1
                print(f"  No new courses found on this page (consecutive empty pages: {consecutive_empty_pages}).")
            else:
                consecutive_empty_pages = 0
            
            if consecutive_empty_pages >= 2:
                print("  Two consecutive pages with no new courses; ending pagination for this section.")
                break
            if max_page_found is not None and page >= max_page_found:
                print(f"  Reached maximum page ({max_page_found}); ending section '{section_name}' for keyword '{keyword}'.")
                break
            
            page += 1
            time.sleep(1)

driver.quit()

# ---------------------------
# Output results as JSON
# ---------------------------
json_data = json.dumps({"courses": all_courses}, indent=4)
print("\nJSON output:")
print(json_data)



Keyword 'MIS', section 'prefix', page 1:
  URL: https://catalogue.uncw.edu/search_advanced.php?cur_cat_oid=74&search_database=Search&search_db=Search&cpage=1&ecpage=1&ppage=1&spage=1&tpage=1&location=3&filter%5Bkeyword%5D=MIS
  Determined maximum page = 3 for this section.
  Found 10 courses on this page after filtering by prefix 'MIS'.
    Processing course: MIS503 - Programming For Analytics
      Detail URL: https://catalogue.uncw.edu/preview_course.php?catoid=74&coid=277198
        Found schedule link; clicking it...
        Description length: 674
        Schedule entries: 109
    Processing course: MIS504 - Databases for Analytics
      Detail URL: https://catalogue.uncw.edu/preview_course.php?catoid=74&coid=277199
        Found schedule link; clicking it...
        Description length: 447
        Schedule entries: 109
    Processing course: MIS505 - Data Visualization
      Detail URL: https://catalogue.uncw.edu/preview_course.php?catoid=74&coid=277200
        Found schedule li

In [16]:
import re
import time
import json
import urllib.parse
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# ---------------------------
# Setup Selenium (headless)
# ---------------------------
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options)
main_window = driver.current_window_handle

# ---------------------------
# Base URL and constant parameters
# ---------------------------
base_url = "https://catalogue.uncw.edu/search_advanced.php"
base_params = {
    "cur_cat_oid": "74",
    "search_database": "Search",
    "search_db": "Search",
    # These page parameters will be overridden.
    "cpage": "1",
    "ecpage": "1",
    "ppage": "1",
    "spage": "1",
    "tpage": "1",
    "location": "3",
    # "filter[keyword]" will be set per keyword.
}

# ---------------------------
# Settings
# ---------------------------
# Process courses whose title begins with one of these codes.
search_keywords = ["MIS", "CSC", "CYBR", "BAN"]

# Define sections:
#   - "prefix" uses the "cpage" parameter.
#   - "location" uses the "ecpage" parameter.
sections = [("prefix", "cpage"), ("location", "ecpage")]

# Set to track processed detail URLs to avoid duplicates.
processed_details = set()
all_courses = []

# ---------------------------
# Helper: Get maximum page number
# ---------------------------
def get_max_page(soup):
    nav_elem = soup.find(lambda tag: tag.name in ["nav", "div", "p", "td"] and "Page:" in tag.get_text())
    if nav_elem:
        nav_text = nav_elem.get_text(separator=" ", strip=True)
        m = re.search(r"Page:\s*(.*)", nav_text)
        if m:
            pages_str = m.group(1)
            nums = re.findall(r"\d+", pages_str)
            if nums:
                return int(nums[-1])
    return None

# ---------------------------
# Helper: Clean and structure the schedule block (text‑based fallback)
# ---------------------------
def parse_schedule_block(raw_text):
    """
    Cleans the raw schedule text by:
      - Splitting into lines and removing extraneous navigation lines.
      - Removing any line containing "Ellucian Company L.P. and its affiliates."
      - Returning the block starting at "Scheduled Meeting Times" as a list of lines.
      - If the resulting block is empty or contains a "No classes" message, returns a fallback message.
    """
    lines = raw_text.splitlines()
    removal_patterns = [
        r"Class Schedule Listing",
        r"Go to Main Content",
        r"SeaNet",
        r"HELP",
        r"\|",
        r"EXIT",
        r"Print[- ]?Friendly Page",
        r"Return to Previous",
        r"Skip to top of page",
        r"Release:",
        r"Powered by",
        r"Ellucian Company L\.P\. and its affiliates\."
    ]
    # Remove unwanted lines.
    clean_lines = [line.strip() for line in lines if line.strip() and not any(re.search(pat, line, re.IGNORECASE) for pat in removal_patterns)]
    try:
        header_index = clean_lines.index("Scheduled Meeting Times")
    except ValueError:
        clean_text = "\n".join(clean_lines)
        if re.search(r"No classes were found", clean_text, re.IGNORECASE):
            return "No information is available"
        return clean_text
    schedule_block = clean_lines[header_index:]
    schedule_str = "\n".join(schedule_block)
    if re.search(r"No classes were found", schedule_str, re.IGNORECASE):
        return "No information is available"
    return schedule_block

# ---------------------------
# New Helper: Parse schedule using structural (table) information if available.
# ---------------------------
def parse_schedule(soup_sched):
    """
    Attempts structure-aware extraction from the schedule page.
    
    1. If a <table> exists, assume it contains schedule data:
       - Extract header row text.
       - Extract each row into a dictionary mapping header -> value.
    2. If no table is found or if table parsing fails, fallback to text‑based parsing.
    """
    schedule_table = soup_sched.find("table")
    if schedule_table:
        schedule_entries = []
        # Attempt to get header row (looking in <th> or first <tr>)
        header_row = schedule_table.find("tr")
        headers = []
        if header_row:
            header_cells = header_row.find_all(["th", "td"])
            headers = [cell.get_text(strip=True) for cell in header_cells]
        # Process remaining rows:
        for row in schedule_table.find_all("tr")[1:]:
            cells = row.find_all("td")
            if cells:
                entry = {}
                for header, cell in zip(headers, cells):
                    value = cell.get_text(strip=True)
                    if value and value.lower() not in ["", "none"]:
                        entry[header] = value
                if entry:
                    schedule_entries.append(entry)
        if schedule_entries:
            return schedule_entries
    # Fallback: use text‑based parsing.
    raw_text = soup_sched.get_text(separator="\n", strip=True)
    return parse_schedule_block(raw_text)

# ---------------------------
# Helper: Parse additional info from description block
# ---------------------------
def parse_additional_info(text):
    """
    From the text block (typically after Credit Hours:),
    extract key-value pairs for known fields.
    If a field line contains only the field name, then use the next non-empty line as its value.
    Returns a dictionary mapping field names to values. Fields with no available value are omitted.
    """
    additional = {}
    expected_fields = [
        "Credit Hours",
        "Prerequisite Courses",
        "Additional Restrictions/ Requirements",
        "Course Repeatability",
        "Maximum Repeatable Hours",
        "Equivalent Courses",
        "Undergraduate Crosslisting",
        "Additional Course Fees",
        "Course Attribute"
    ]
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    for i, line in enumerate(lines):
        for field in expected_fields:
            if re.match(rf"^{re.escape(field)}\s*:?", line, re.IGNORECASE):
                parts = line.split(":", 1)
                if len(parts) == 2 and parts[1].strip():
                    additional[field] = parts[1].strip()
                elif i + 1 < len(lines):
                    additional[field] = lines[i + 1]
    additional = {k: v for k, v in additional.items() if v is not None and v != ""}
    return additional

# ---------------------------
# New Helper: Summarize course description using custom heuristics.
# ---------------------------
def summarize_description(desc):
    """
    Uses simple rule-based heuristics to capture the core elements of a course description:
      - The introductory sentence (assumed to be the first sentence).
      - A sentence containing a colon (assumed to list techniques/examples).
      - The concluding sentence (the last sentence).
      
    The summary is constructed by concatenating these parts.
    """
    # Split into sentences using punctuation followed by whitespace.
    sentences = re.split(r'(?<=[.!?])\s+', desc.strip())
    summary_parts = []
    if sentences:
        # Add introductory sentence.
        intro = sentences[0].strip()
        if intro:
            summary_parts.append(intro)
        # Look for a sentence with a colon that may list techniques/examples.
        colon_sentences = [s.strip() for s in sentences if ':' in s and s.strip() != intro]
        if colon_sentences:
            summary_parts.append(colon_sentences[0])
        # Add the concluding sentence if it's not the same as the intro.
        if len(sentences) > 1:
            conclusion = sentences[-1].strip()
            if conclusion and conclusion != intro:
                summary_parts.append(conclusion)
    return " ".join(summary_parts)

# ---------------------------
# Helper: Extract course details from a detail URL.
# ---------------------------
def extract_course_details(detail_url):
    """
    Loads the course detail page and extracts:
      - description: text after "Course Description:" (optionally up to "Credit Hours:").
        It is then summarized using rule-based heuristics.
      - additional_info: dictionary of extra fields extracted from the description block.
      - schedule: attempts structure-aware schedule extraction by checking for a table in the schedule page.
                  Falls back to text‑based cleaning if necessary.
    Returns a tuple (description, additional_info, schedule)
    """
    driver.get(detail_url)
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
    time.sleep(2)
    
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")
    container = soup.find("div", id="coursecontent")
    full_text = container.get_text(separator="\n", strip=True) if container else soup.get_text(separator="\n", strip=True)
    
    # Extract course description and additional info.
    description = ""
    additional_info = {}
    if "Course Description:" in full_text:
        parts = full_text.split("Course Description:", 1)
        desc_part = parts[1]
        if "Credit Hours:" in desc_part:
            description = desc_part.split("Credit Hours:", 1)[0].strip()
            additional_text = "Credit Hours:" + desc_part.split("Credit Hours:", 1)[1]
        else:
            description = desc_part.strip()
            additional_text = ""
        additional_info = parse_additional_info(additional_text)
    else:
        description = full_text

    # Apply rule-based summarization to the description.
    description = summarize_description(description)
    
    # Attempt to click the schedule link and process the schedule page.
    schedule = ""
    try:
        schedule_link = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, "//*[contains(translate(text(),'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'),'click here for the spring 2025 class schedule')]"))
        )
        print("        Found schedule link; clicking it...")
        schedule_link.click()
        time.sleep(2)
        current_windows = driver.window_handles
        if len(current_windows) > 1:
            for win in current_windows:
                if win != main_window:
                    driver.switch_to.window(win)
                    break
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
            time.sleep(2)
            html_sched = driver.page_source
            soup_sched = BeautifulSoup(html_sched, "html.parser")
            schedule = parse_schedule(soup_sched)
            driver.close()
            driver.switch_to.window(main_window)
        else:
            html_sched = driver.page_source
            soup_sched = BeautifulSoup(html_sched, "html.parser")
            schedule = parse_schedule(soup_sched)
    except Exception as e:
        print("        Schedule link not found or error:", e)
        schedule = "No information is available"
    
    return description, additional_info, schedule

# ---------------------------
# Main Loop: Process each keyword and section
# ---------------------------
for keyword in search_keywords:
    for section_name, page_param in sections:
        page = 1
        max_page_found = None
        consecutive_empty_pages = 0
        while True:
            params = base_params.copy()
            params["filter[keyword]"] = keyword
            if page_param == "cpage":
                params["cpage"] = str(page)
                params["ecpage"] = "1"
            else:
                params["ecpage"] = str(page)
                params["cpage"] = "1"
            query = urllib.parse.urlencode(params)
            page_url = f"{base_url}?{query}"
            print(f"\nKeyword '{keyword}', section '{section_name}', page {page}:")
            print("  URL:", page_url)
            driver.get(page_url)
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.XPATH, "//a[contains(@href, 'preview_course_nopop.php')]"))
                )
            except Exception as e:
                print("  Timeout waiting for course links:", e)
                break
            time.sleep(2)
            
            html = driver.page_source
            soup = BeautifulSoup(html, "html.parser")
            if max_page_found is None:
                max_page_found = get_max_page(soup)
                if max_page_found:
                    print(f"  Determined maximum page = {max_page_found} for this section.")
                else:
                    print("  No maximum page info found; will break when no courses are returned.")
            
            course_link_elements = soup.find_all("a", href=lambda href: href and "preview_course_nopop.php" in href)
            if not course_link_elements:
                print("  No course links found; ending pagination for this section.")
                break
            
            course_links = []
            for elem in course_link_elements:
                text = elem.get_text(strip=True)
                href_val = elem.get("href")
                if href_val and text and text.upper().startswith(keyword.upper()):
                    if not href_val.startswith("http"):
                        href_val = "https://catalogue.uncw.edu/" + href_val.lstrip("/")
                    course_links.append((href_val, text))
            print(f"  Found {len(course_links)} courses on this page after filtering by prefix '{keyword}'.")
            
            new_courses_count = 0
            for orig_href, course_text in course_links:
                detail_url = orig_href.replace("preview_course_nopop.php", "preview_course.php")
                if detail_url in processed_details:
                    print(f"    Skipping duplicate course: {course_text}")
                    continue
                processed_details.add(detail_url)
                new_courses_count += 1
                print(f"    Processing course: {course_text}")
                print("      Detail URL:", detail_url)
                desc, add_info, sched = extract_course_details(detail_url)
                print("        Summarized description:", desc)
                if isinstance(sched, list):
                    print("        Schedule entries:", len(sched))
                else:
                    print("        Schedule:", sched)
                
                course_data = {
                    "course": course_text,
                    "detail_url": detail_url,
                    "keyword": keyword,
                    "section": section_name,
                    "description": desc,
                    "additional_info": add_info,
                    "schedule": sched
                }
                filtered_course_data = {k: v for k, v in course_data.items() if v is not None and v != ""}
                all_courses.append(filtered_course_data)
                time.sleep(1)
            
            if new_courses_count == 0:
                consecutive_empty_pages += 1
                print(f"  No new courses found on this page (consecutive empty pages: {consecutive_empty_pages}).")
            else:
                consecutive_empty_pages = 0
            
            if consecutive_empty_pages >= 2:
                print("  Two consecutive pages with no new courses; ending pagination for this section.")
                break
            if max_page_found is not None and page >= max_page_found:
                print(f"  Reached maximum page ({max_page_found}); ending section '{section_name}' for keyword '{keyword}'.")
                break
            
            page += 1
            time.sleep(1)

driver.quit()

# ---------------------------
# Output results as JSON
# ---------------------------
json_data = json.dumps({"courses": all_courses}, indent=4)
print("\nJSON output:")
print(json_data)



Keyword 'MIS', section 'prefix', page 1:
  URL: https://catalogue.uncw.edu/search_advanced.php?cur_cat_oid=74&search_database=Search&search_db=Search&cpage=1&ecpage=1&ppage=1&spage=1&tpage=1&location=3&filter%5Bkeyword%5D=MIS
  Determined maximum page = 3 for this section.
  Found 10 courses on this page after filtering by prefix 'MIS'.
    Processing course: MIS503 - Programming For Analytics
      Detail URL: https://catalogue.uncw.edu/preview_course.php?catoid=74&coid=277198
        Found schedule link; clicking it...
        Summarized description: This course introduces the essential general programming concepts and techniques to a data analytics audience with limited or no prior programming experience. The course introduces software techniques to write functions, debug, and organize and comment code.
        Schedule entries: 109
    Processing course: MIS504 - Databases for Analytics
      Detail URL: https://catalogue.uncw.edu/preview_course.php?catoid=74&coid=277199
        F

In [17]:
import re
import time
import json
import urllib.parse
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# ---------------------------
# Setup Selenium (headless)
# ---------------------------
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options)
main_window = driver.current_window_handle

# ---------------------------
# Base URL and constant parameters
# ---------------------------
base_url = "https://catalogue.uncw.edu/search_advanced.php"
base_params = {
    "cur_cat_oid": "74",
    "search_database": "Search",
    "search_db": "Search",
    # These page parameters will be overridden.
    "cpage": "1",
    "ecpage": "1",
    "ppage": "1",
    "spage": "1",
    "tpage": "1",
    "location": "3",
    # "filter[keyword]" will be set per keyword.
}

# ---------------------------
# Settings
# ---------------------------
# Process courses whose title begins with one of these codes.
search_keywords = ["MIS", "CSC", "CYBR", "BAN"]

# Define sections:
#   - "prefix" uses the "cpage" parameter.
#   - "location" uses the "ecpage" parameter.
sections = [("prefix", "cpage"), ("location", "ecpage")]

# Set to track processed detail URLs to avoid duplicates.
processed_details = set()
all_courses = []

# ---------------------------
# Helper: Get maximum page number
# ---------------------------
def get_max_page(soup):
    nav_elem = soup.find(lambda tag: tag.name in ["nav", "div", "p", "td"] and "Page:" in tag.get_text())
    if nav_elem:
        nav_text = nav_elem.get_text(separator=" ", strip=True)
        m = re.search(r"Page:\s*(.*)", nav_text)
        if m:
            pages_str = m.group(1)
            nums = re.findall(r"\d+", pages_str)
            if nums:
                return int(nums[-1])
    return None

# ---------------------------
# Helper: Clean and structure the schedule block
# ---------------------------
def parse_schedule_block(raw_text):
    """
    Cleans the raw schedule text by:
      - Splitting into lines and removing extraneous navigation lines.
      - Removing any line containing "Ellucian Company L.P. and its affiliates."
      - Returning the block starting at "Scheduled Meeting Times" as a list of lines.
      - If the resulting block is empty or contains a "No classes" message, returns a fallback message.
    """
    lines = raw_text.splitlines()
    removal_patterns = [
        r"Class Schedule Listing",
        r"Go to Main Content",
        r"SeaNet",
        r"HELP",
        r"\|",
        r"EXIT",
        r"Print[- ]?Friendly Page",
        r"Return to Previous",
        r"Skip to top of page",
        r"Release:",
        r"Powered by",
        r"Ellucian Company L\.P\. and its affiliates\."
    ]
    # Remove unwanted lines.
    clean_lines = [line.strip() for line in lines if line.strip() and not any(re.search(pat, line, re.IGNORECASE) for pat in removal_patterns)]
    try:
        header_index = clean_lines.index("Scheduled Meeting Times")
    except ValueError:
        clean_text = "\n".join(clean_lines)
        if re.search(r"No classes were found", clean_text, re.IGNORECASE):
            return "No information is available"
        return clean_text
    schedule_block = clean_lines[header_index:]
    schedule_str = "\n".join(schedule_block)
    if re.search(r"No classes were found", schedule_str, re.IGNORECASE):
        return "No information is available"
    return schedule_block

# ---------------------------
# Helper: Parse additional info from description block
# ---------------------------
def parse_additional_info(text):
    """
    From the text block (typically after Credit Hours:),
    extract key-value pairs for known fields.
    If a field line contains only the field name, then use the next non-empty line as its value.
    Returns a dictionary mapping field names to values.
    """
    additional = {}
    expected_fields = [
        "Credit Hours",
        "Prerequisite Courses",
        "Additional Restrictions/ Requirements",
        "Course Repeatability",
        "Maximum Repeatable Hours",
        "Equivalent Courses",
        "Undergraduate Crosslisting",
        "Additional Course Fees",
        "Course Attribute"
    ]
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    for i, line in enumerate(lines):
        for field in expected_fields:
            # Match if the line starts with the field name (optionally followed by a colon)
            if re.match(rf"^{re.escape(field)}\s*:?", line, re.IGNORECASE):
                # Try to split on colon.
                parts = line.split(":", 1)
                if len(parts) == 2 and parts[1].strip():
                    additional[field] = parts[1].strip()
                elif i + 1 < len(lines):
                    additional[field] = lines[i + 1]
                else:
                    additional[field] = None
    # Ensure all expected fields exist.
    for field in expected_fields:
        if field not in additional:
            additional[field] = None
    return additional

# ---------------------------
# Helper: Extract course details from a detail URL.
# ---------------------------
def extract_course_details(detail_url):
    """
    Loads the course detail page and extracts:
      - description: text after "Course Description:" (optionally up to "Credit Hours:")
      - additional_info: dictionary of extra fields extracted from the description block.
      - schedule: clicks the schedule link (if available), obtains raw schedule text, and passes it through parse_schedule_block().
    Returns a tuple (description, additional_info, schedule)
    """
    driver.get(detail_url)
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
    time.sleep(2)
    
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")
    container = soup.find("div", id="coursecontent")
    full_text = container.get_text(separator="\n", strip=True) if container else soup.get_text(separator="\n", strip=True)
    
    # Extract course description and additional info.
    description = ""
    additional_info = {}
    if "Course Description:" in full_text:
        parts = full_text.split("Course Description:", 1)
        desc_part = parts[1]
        if "Credit Hours:" in desc_part:
            description = desc_part.split("Credit Hours:", 1)[0].strip()
            additional_text = "Credit Hours:" + desc_part.split("Credit Hours:", 1)[1]
        else:
            description = desc_part.strip()
            additional_text = ""
        additional_info = parse_additional_info(additional_text)
    else:
        description = full_text
    
    # Now attempt to click the schedule link.
    schedule = ""
    try:
        schedule_link = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, "//*[contains(translate(text(),'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'),'click here for the spring 2025 class schedule')]"))
        )
        print("        Found schedule link; clicking it...")
        schedule_link.click()
        time.sleep(2)
        current_windows = driver.window_handles
        if len(current_windows) > 1:
            for win in current_windows:
                if win != main_window:
                    driver.switch_to.window(win)
                    break
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
            time.sleep(2)
            html_sched = driver.page_source
            soup_sched = BeautifulSoup(html_sched, "html.parser")
            raw_schedule = soup_sched.get_text(separator="\n", strip=True)
            driver.close()
            driver.switch_to.window(main_window)
        else:
            html_sched = driver.page_source
            soup_sched = BeautifulSoup(html_sched, "html.parser")
            raw_schedule = soup_sched.get_text(separator="\n", strip=True)
        schedule = parse_schedule_block(raw_schedule)
    except Exception as e:
        print("        Schedule link not found or error:", e)
        schedule = "No information is available"
    
    return description, additional_info, schedule

# ---------------------------
# Main Loop: Process each keyword and section
# ---------------------------
for keyword in search_keywords:
    for section_name, page_param in sections:
        page = 1
        max_page_found = None
        consecutive_empty_pages = 0
        while True:
            params = base_params.copy()
            params["filter[keyword]"] = keyword
            if page_param == "cpage":
                params["cpage"] = str(page)
                params["ecpage"] = "1"
            else:
                params["ecpage"] = str(page)
                params["cpage"] = "1"
            query = urllib.parse.urlencode(params)
            page_url = f"{base_url}?{query}"
            print(f"\nKeyword '{keyword}', section '{section_name}', page {page}:")
            print("  URL:", page_url)
            driver.get(page_url)
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.XPATH, "//a[contains(@href, 'preview_course_nopop.php')]"))
                )
            except Exception as e:
                print("  Timeout waiting for course links:", e)
                break
            time.sleep(2)
            
            html = driver.page_source
            soup = BeautifulSoup(html, "html.parser")
            if max_page_found is None:
                max_page_found = get_max_page(soup)
                if max_page_found:
                    print(f"  Determined maximum page = {max_page_found} for this section.")
                else:
                    print("  No maximum page info found; will break when no courses are returned.")
            
            course_link_elements = soup.find_all("a", href=lambda href: href and "preview_course_nopop.php" in href)
            if not course_link_elements:
                print("  No course links found; ending pagination for this section.")
                break
            
            course_links = []
            for elem in course_link_elements:
                text = elem.get_text(strip=True)
                href_val = elem.get("href")
                if href_val and text and text.upper().startswith(keyword.upper()):
                    if not href_val.startswith("http"):
                        href_val = "https://catalogue.uncw.edu/" + href_val.lstrip("/")
                    course_links.append((href_val, text))
            print(f"  Found {len(course_links)} courses on this page after filtering by prefix '{keyword}'.")
            
            new_courses_count = 0
            for orig_href, course_text in course_links:
                detail_url = orig_href.replace("preview_course_nopop.php", "preview_course.php")
                if detail_url in processed_details:
                    print(f"    Skipping duplicate course: {course_text}")
                    continue
                processed_details.add(detail_url)
                new_courses_count += 1
                print(f"    Processing course: {course_text}")
                print("      Detail URL:", detail_url)
                desc, add_info, sched = extract_course_details(detail_url)
                print("        Description length:", len(desc))
                if isinstance(sched, list):
                    print("        Schedule entries:", len(sched))
                else:
                    print("        Schedule:", sched)
                all_courses.append({
                    "course": course_text,
                    "detail_url": detail_url,
                    "keyword": keyword,
                    "section": section_name,
                    "description": desc,
                    "additional_info": add_info,
                    "schedule": sched
                })
                time.sleep(1)
            
            if new_courses_count == 0:
                consecutive_empty_pages += 1
                print(f"  No new courses found on this page (consecutive empty pages: {consecutive_empty_pages}).")
            else:
                consecutive_empty_pages = 0
            
            if consecutive_empty_pages >= 2:
                print("  Two consecutive pages with no new courses; ending pagination for this section.")
                break
            if max_page_found is not None and page >= max_page_found:
                print(f"  Reached maximum page ({max_page_found}); ending section '{section_name}' for keyword '{keyword}'.")
                break
            
            page += 1
            time.sleep(1)

driver.quit()

# ---------------------------
# Output results as JSON to a file
# ---------------------------
with open("2025SpringCatalogue.json", "w") as f:
    json.dump({"courses": all_courses}, f, indent=4)

print("\nJSON file '2025SpringCatalogue.json' created successfully.")



Keyword 'MIS', section 'prefix', page 1:
  URL: https://catalogue.uncw.edu/search_advanced.php?cur_cat_oid=74&search_database=Search&search_db=Search&cpage=1&ecpage=1&ppage=1&spage=1&tpage=1&location=3&filter%5Bkeyword%5D=MIS
  Determined maximum page = 3 for this section.
  Found 10 courses on this page after filtering by prefix 'MIS'.
    Processing course: MIS503 - Programming For Analytics
      Detail URL: https://catalogue.uncw.edu/preview_course.php?catoid=74&coid=277198
        Found schedule link; clicking it...
        Description length: 674
        Schedule entries: 109
    Processing course: MIS504 - Databases for Analytics
      Detail URL: https://catalogue.uncw.edu/preview_course.php?catoid=74&coid=277199
        Found schedule link; clicking it...
        Description length: 447
        Schedule entries: 109
    Processing course: MIS505 - Data Visualization
      Detail URL: https://catalogue.uncw.edu/preview_course.php?catoid=74&coid=277200
        Found schedule li

In [18]:
import json

def transform_course_data(old_data):
    """
    Takes the old JSON structure:
    {
      "MIS503": {
        "title": "...",
        "description": "...",
        "additional_Info": {
          "Program Restrictions": "None",
          "Additional Restrictions/ Requirements": "None",
          ...
        },
        "schedule": {
          "Fall": {...},
          "Spring": {...}
        }
      },
      ...
    }

    Returns a new JSON structure:
    {
      "courses": [
        {
          "code": "MIS503",
          "title": "...",
          "description": "...",
          "additionalInfo": {
            "programRestrictions": "None",
            "additionalRequirements": "None"
          },
          "schedule": {
            "fall": {...},
            "spring": {...}
          }
        },
        ...
      ]
    }
    """
    new_data = {"courses": []}

    for course_code, course_info in old_data.items():
        new_course = {}

        # Keep the course code in a 'code' field
        new_course["code"] = course_code

        # Copy over title and description directly (if they exist)
        new_course["title"] = course_info.get("title", "")
        new_course["description"] = course_info.get("description", "")

        # Handle "additional_Info" → "additionalInfo"
        old_additional_info = course_info.get("additional_Info", {})
        new_additional_info = {}

        # Example of renaming nested keys:
        if "Program Restrictions" in old_additional_info:
            new_additional_info["programRestrictions"] = old_additional_info["Program Restrictions"]
        if "Additional Restrictions/ Requirements" in old_additional_info:
            new_additional_info["additionalRequirements"] = old_additional_info["Additional Restrictions/ Requirements"]
        if "Undergraduate Courses Required" in old_additional_info:
            new_additional_info["undergraduateCoursesRequired"] = old_additional_info["Undergraduate Courses Required"]
        if "Maximum Transfer Credits" in old_additional_info:
            new_additional_info["maximumTransferCredits"] = old_additional_info["Maximum Transfer Credits"]

        # Add the transformed additional info
        if new_additional_info:
            new_course["additionalInfo"] = new_additional_info

        # Handle schedule
        # If you want to rename "Fall" to "fall" or "Spring" to "spring", do it here:
        old_schedule = course_info.get("schedule", {})
        new_schedule = {}
        if "Fall" in old_schedule:
            new_schedule["fall"] = old_schedule["Fall"]
        if "Spring" in old_schedule:
            new_schedule["spring"] = old_schedule["Spring"]

        if new_schedule:
            new_course["schedule"] = new_schedule

        # Append to the courses list
        new_data["courses"].append(new_course)

    return new_data


def main():
    # 1. Load the old JSON data (the "current format")
    with open("course_index_cache.json", "r") as f:
        old_data = json.load(f)

    # 2. Transform the old data into the new structure
    new_data = transform_course_data(old_data)

    # 3. Write the new data out in a nicely formatted (indented) JSON file
    with open("converted_course_index.json", "w") as f:
        json.dump(new_data, f, indent=4)

    print("Conversion complete! Check 'converted_course_index.json' for the result.")

if __name__ == "__main__":
    main()


Conversion complete! Check 'converted_course_index.json' for the result.
