# Webscraper

### Import libraries

In [None]:
import asyncio
import aiohttp
import json
import hashlib
import os
import time
from datetime import datetime
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

In [None]:

# Directory that receives the JSON output and its timestamped backups.
OUTPUT_DIR = "data_backups"
# Default file name used by save_to_json when no filename is given.
OUTPUT_FILE = "hierarchy.json"
# Root URL that relative paths are resolved against in get_soup.
BASE = "https://ksu.edu.sa"
# Default request headers passed to the aiohttp session.
UA = {"User-Agent": "Mozilla/5.0"}

# Module-wide flag read and written by get_soup: it starts False and is set to
# True after the first successful switch from an Arabic page to its English
# version; once True, get_soup no longer checks the page language.
# NOTE(review): the name suggests "page is Arabic", but the code uses it as
# "already switched to English once" - confirm intent before renaming.
is_arabic = False

# Ensure output directory exists
def ensure_output_dir():
    """Create OUTPUT_DIR when it is missing and report the creation on stdout."""
    if os.path.exists(OUTPUT_DIR):
        # Nothing to do (and nothing to print) when the path is already there.
        return
    os.makedirs(OUTPUT_DIR)
    print(f"📁 Created directory: {OUTPUT_DIR}")

def save_to_json(data, filename=None):
    """Write *data* as pretty-printed UTF-8 JSON into OUTPUT_DIR plus a timestamped backup.

    Args:
        data: Any JSON-serializable object.
        filename: Target file name inside OUTPUT_DIR; defaults to OUTPUT_FILE.

    Raises:
        Exception: Any serialization or I/O error is printed and then re-raised.
    """
    ensure_output_dir()

    if filename is None:
        filename = OUTPUT_FILE

    filepath = os.path.join(OUTPUT_DIR, filename)

    try:
        # Serialize once, before any file is opened: mode 'w' truncates the
        # target, so a serialization failure must not wipe a previously saved
        # good file. The same text is reused for the backup copy.
        payload = json.dumps(data, indent=2, ensure_ascii=False)

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(payload)

        print(f"✅ Data successfully saved to: {filepath}")
        print(f"📊 File size: {os.path.getsize(filepath)} bytes")

        # Keep a timestamped copy next to the main file.
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        backup_filename = f"menu_hierarchy_backup_{timestamp}.json"
        backup_filepath = os.path.join(OUTPUT_DIR, backup_filename)

        with open(backup_filepath, 'w', encoding='utf-8') as f:
            f.write(payload)

        print(f"🔄 Backup created: {backup_filepath}")

    except Exception as e:
        print(f"❌ Error saving to JSON: {e}")
        raise

# Session management for connection pooling
async def create_session():
    """Build the shared aiohttp session used by the scrapers.

    The session pools connections (50 overall, 20 per host), applies a
    30-second total timeout per request and sends the UA headers by default.
    """
    return aiohttp.ClientSession(
        connector=aiohttp.TCPConnector(limit=50, limit_per_host=20),
        timeout=aiohttp.ClientTimeout(total=30),
        headers=UA,
    )

# get_soup resolves relative paths against BASE, fetches the page and parses it. If the page is in Arabic (its <html lang> starts with "ar"), it looks for an "English" link and fetches that version instead.
async def get_soup(session, path_or_url):
    """Fetch a page and return it parsed with BeautifulSoup's html.parser.

    Relative paths are resolved against BASE. While the module flag
    ``is_arabic`` is still False, a page whose <html lang> starts with "ar"
    is replaced by the page behind its "English" link (when such a link with
    an href exists), and the flag is then set to True.

    Raises:
        Exception: Any fetch/parse error is printed and re-raised.
    """
    global is_arabic

    if path_or_url.startswith("http"):
        url = path_or_url
    else:
        url = urljoin(BASE, path_or_url)

    try:
        async with session.get(url) as response:
            response.raise_for_status()
            soup = BeautifulSoup(await response.text(), "html.parser")

            # Once the flag is set, pages are returned as fetched.
            if is_arabic:
                return soup

            root = soup.find("html")
            if not (root and root.get("lang", "").startswith("ar")):
                return soup

            def mentions_english(label):
                return label and "English" in label

            switch = soup.find("a", string=mentions_english)
            if not (switch and switch.get("href")):
                return soup

            eng_url = urljoin(url, switch["href"])
            print(f"🔁 Switching to English version: {eng_url}")
            # The English page is requested while the first response is still open.
            async with session.get(eng_url) as eng_response:
                eng_response.raise_for_status()
                soup = BeautifulSoup(await eng_response.text(), "html.parser")
                is_arabic = True

            return soup
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        raise

# Removing all Arabic Unicode blocks: \u0600 to \u06FF, plus extended blocks if needed
def remove_arabic(text):
    """Return *text* with every run of Arabic-block characters deleted.

    Covers U+0600-U+06FF, U+0750-U+077F and U+08A0-U+08FF; surrounding
    whitespace is left untouched.
    """
    return re.sub(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]+', '', text)

async def find_faculty_links(session, dep_url):
    """Collect links on a department page whose text hints at faculty/staff listings.

    Returns a list of ``{"title", "url"}`` dicts with absolute http(s) URLs,
    or an empty list when the page cannot be loaded.
    """
    try:
        page = await get_soup(session, dep_url)
    except Exception:
        return []

    keywords = ("faculty", "staff", "member", "academic team", "people", "employee", "employees")
    found = []
    for anchor in page.select("a[href]"):
        label = anchor.get_text(strip=True)
        # Keywords are matched on the lower-cased text with Arabic removed;
        # the stored title keeps the original text.
        normalized = remove_arabic(label).lower()
        if not any(kw in normalized for kw in keywords):
            continue

        target = anchor["href"]
        if not target:
            continue

        absolute = urljoin(dep_url, target)
        if absolute.startswith("http"):
            found.append({"title": label, "url": absolute})
    return found


# Finding the contact link for the academic departments 
def find_contact_link(soup, base_url):
    """Return the absolute URL of the first link whose text contains "contact", else None.

    Matching is case-insensitive and ignores Arabic characters in the link text.
    """
    matches = (
        urljoin(base_url, anchor["href"])
        for anchor in soup.select("a[href]")
        if "contact" in remove_arabic(anchor.get_text(strip=True)).lower()
    )
    return next(matches, None)


# Scraping the contact info from the contact links
async def extract_contact_info(session, contact_url):
    """Summarise a contact page as one text block.

    Scans the page line by line and keeps the first e-mail address, the first
    phone-like number and the first location-like line; lines mentioning fax,
    hours, social media, etc. are gathered under "Others". Each line feeds at
    most one of these fields. Returns a fixed message when the page cannot be
    loaded.
    """
    try:
        page = await get_soup(session, contact_url)
    except Exception:
        return "Contact page could not be loaded."

    email_re = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
    phone_re = re.compile(r"(\+?\d[\d\s\-()]{7,})")
    location_words = ("ksa", "riyadh", "building", "street", "road", "kingdom", "campus", "p.o", "box", "hall")
    extra_words = ("fax", "hours", "box", "p.o", "linkedin", "facebook", "twitter", "instagram", "ext", "extension", "mobile", "call")

    stripped = (raw.strip() for raw in page.get_text(separator="\n").split("\n"))
    entries = [entry for entry in stripped if entry]

    phone = None
    email = None
    location = None
    extras = []

    for entry in entries:
        folded = entry.lower()

        # Order matters: e-mail wins over phone, phone over location, and a
        # line consumed by one field is not considered for the others.
        if email is None and (hit := email_re.search(entry)):
            email = hit.group(0)
            continue

        if phone is None and (hit := phone_re.search(entry)):
            phone = hit.group(0)
            continue

        if location is None and any(word in folded for word in location_words):
            location = entry
            continue

        if any(word in folded for word in extra_words):
            extras.append(entry)

    phone = "Not found" if phone is None else phone
    email = "Not found" if email is None else email
    location = "Not found" if location is None else location
    extra_text = "\n".join(extras) if extras else "None"
    return f"Phone Number: {phone}, Email: {email}, Location: {location}\nOthers:\n{extra_text}"

# Fetch the "about" text of college categories, colleges and their departments
async def fetch_about_of(session, url):
    """Return the "about" text found at *url*, or "" when nothing suitable is found.

    Paragraphs containing an <img> are always skipped. Four strategies are
    tried in order and the first one that yields text wins:
      1. paragraphs of the Drupal body block (Arabic characters removed),
      2. paragraphs of the views-body block (Arabic characters removed),
      3. paragraphs following an "About ..." heading up to the next heading,
      4. the first paragraph whose text starts with "The".
    A page that cannot be loaded also yields "".
    """
    try:
        page = await get_soup(session, url)
    except Exception:
        return ""

    def cleaned_paragraphs(container):
        # Joined, Arabic-free text of all image-free <p> elements ("" if none).
        kept = []
        for para in container.find_all("p"):
            if para.find("img"):
                continue
            cleaned = remove_arabic(para.get_text(" ", strip=True))
            if cleaned:
                kept.append(cleaned)
        return " ".join(kept)

    # 1) Drupal body block, then 2) views-body fallback
    body_selectors = (
        "div.field--name-body",
        "span.views-field.views-field-body span.field-content",
    )
    for selector in body_selectors:
        container = page.select_one(selector)
        if container:
            joined = cleaned_paragraphs(container)
            if joined:
                return joined

    # 3) Heading-driven
    heading_tags = ("h2", "h3", "h4")
    about_phrases = ("About Department", "About the Department", "About College")

    def is_about_heading(tag):
        return tag.name in heading_tags and any(phrase in tag.text for phrase in about_phrases)

    heading = page.find(is_about_heading)
    if heading:
        pieces = []
        for node in heading.find_all_next():
            if node.name in heading_tags:
                break
            if node.name != "p" or node.find("img"):
                continue
            body = node.get_text(" ", strip=True)
            if body:
                pieces.append(body)
        if pieces:
            return " ".join(pieces)

    # 4) Ultimate fallback: first image-free <p> starting with "The"
    for para in page.find_all("p"):
        if para.find("img"):
            continue
        body = para.get_text(" ", strip=True)
        if body.startswith("The"):
            return body

    return ""

def find_main_menu_ul(soup):
    """Locate the <ul> that holds the site's main menu.

    Prefers the first <ul> of any <nav> that has at least five direct <li>
    children; otherwise falls back to the <ul> with the most direct <li>
    children inside <header> (or the whole document when there is no header).

    Raises:
        RuntimeError: No candidate has at least five direct items.
    """
    def direct_items(ul):
        return len(ul.find_all("li", recursive=False))

    for nav in soup.find_all("nav"):
        candidate = nav.find("ul")
        if candidate and direct_items(candidate) >= 5:
            return candidate

    scope = soup.find("header") or soup
    # max() keeps the first list among equally long ones.
    widest = max(scope.find_all("ul"), key=direct_items, default=None)
    if widest is not None and direct_items(widest) >= 5:
        return widest

    raise RuntimeError("Main menu <ul> not found")

def recurse_menu(ul):
    """Turn a menu <ul> into a nested list of ``{"title", "url", "children"}`` dicts.

    Only direct <li> children are visited at each level; an item's direct <a>
    supplies the title and href, and its direct <ul> (if any) is recursed into.
    """
    def node_for(item):
        link = item.find("a", recursive=False)
        nested = item.find("ul", recursive=False)
        if link:
            title = link.get_text(strip=True)
            href = link["href"] if link.has_attr("href") else None
        else:
            # Items without a direct link fall back to their own text.
            title = item.get_text(strip=True)
            href = None
        return {
            "title":    title,
            "url":      href,
            "children": recurse_menu(nested) if nested else []
        }

    return [node_for(item) for item in ul.find_all("li", recursive=False)]

# Scraping the menu list
async def scrape_menu(session):
    """Fetch "/en/home" and return its main menu as a nested list of dicts."""
    home = await get_soup(session, "/en/home")
    return recurse_menu(find_main_menu_ul(home))

# Scrape the categories from college categories
async def scrape_category(session, path):
    """List the cards of a category page as ``{"title", "url", "children": []}`` dicts.

    Each "views-row" inside the first "view-content" div contributes its first
    link; the href is stored exactly as it appears in the page.
    """
    page = await get_soup(session, path)
    listing = page.find("div", class_="view-content")
    if not listing:
        return []

    first_links = (
        row.find("a", href=True)
        for row in listing.find_all("div", class_="views-row")
    )
    return [
        {
            "title":    link.get_text(strip=True),
            "url":      link["href"],
            "children": []
        }
        for link in first_links
        if link
    ]

# Scrape the department links from a college page (non-JavaScript version)
def get_departments(soup, base_url):
    """Extract department links from a college page as ``{"title", "url"}`` dicts.

    First looks for an h2/h3/h4 heading mentioning departments and reads the
    links of the <ul> that follows it as a sibling. If that yields nothing,
    falls back to any link containing "department" inside nav/aside/div
    elements whose class mentions "menu".
    """
    heading_words = ("academic departments", "departments", "academic")

    def as_entry(anchor):
        return {
            "title": anchor.get_text(strip=True),
            "url": urljoin(base_url, anchor["href"])
        }

    def is_department_heading(tag):
        if tag.name not in ("h2", "h3", "h4"):
            return False
        caption = tag.get_text(strip=True).lower()
        return any(word in caption for word in heading_words)

    found = []

    # Primary: list directly following the departments heading
    heading = soup.find(is_department_heading)
    if heading:
        listing = heading.find_next_sibling(lambda t: t.name == "ul")
        if listing:
            first_links = (item.find("a", href=True) for item in listing.find_all("li"))
            found = [as_entry(link) for link in first_links if link]

    # Fallback: sidebars, navs, menus
    if not found:
        containers = soup.find_all(["nav", "aside", "div"], class_=lambda c: c and "menu" in c.lower())
        for box in containers:
            for link in box.find_all("a", href=True):
                if "department" in link.get_text(strip=True).lower():
                    found.append(as_entry(link))

    return found

# Scrape departments from JavaScript-rendered sections, such as those of the Science departments
def extract_js_departments(college_url):
    """Use headless Chrome to collect department-like links from a JS-rendered page.

    Args:
        college_url: Absolute URL of the college page to render.

    Returns:
        A list of ``{"title", "url"}`` dicts for every link whose text mentions
        "department", "unit", "program" or "academic" and whose href is not "#".

    Raises:
        Exception: Whatever Selenium raises while starting Chrome or loading the page.
    """
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--log-level=3")  # Reduce console logs

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(college_url)
        time.sleep(3)  # Wait for JS to load
        html = driver.page_source
    finally:
        # Always shut the browser down, even when loading the page fails;
        # otherwise the Chrome process is left running.
        driver.quit()

    soup = BeautifulSoup(html, "html.parser")

    departments = []

    # Look for known carousel sections, or any link that looks like a department
    for a in soup.find_all("a", href=True):
        text = a.get_text(strip=True).lower()
        href = a["href"]
        if any(kw in text for kw in ["department", "unit", "program", "academic"]) and href != "#":
            full_url = href if href.startswith("http") else urljoin(college_url, href)
            departments.append({
                "title": a.get_text(strip=True),
                "url": full_url
            })

    return departments

# Scrape the FAQ pages
async def scrape_dar_faqs(session):
    """Scrape question/answer pairs from the DAR FAQ page.

    Walks the <strong>, <p> and <div> elements of the content area in
    document order: non-empty <strong> text becomes the pending question, and
    the next non-empty <p>/<div> text becomes its answer. Returns a list of
    ``{"question", "answer"}`` dicts (empty when no content area is found).
    """
    url = "https://dar.ksu.edu.sa/en/faqs"
    page = await get_soup(session, url)

    container = (
        page.select_one("div.region-content")
        or page.select_one("main")
        or page.select_one("div.main-content")
    )
    if not container:
        return []

    pairs = []
    pending = None

    for node in container.find_all(["strong", "p", "div"]):
        content = node.get_text(strip=True)

        if node.name == "strong":
            if content:
                pending = content
            continue

        if pending and content:
            pairs.append({
                "question": pending,
                "answer": content
            })
            pending = None

    return pairs

# Remove duplicated FAQs after combining from different sources
def deduplicate_faqs(faqs):
    """Return the FAQ entries with duplicate questions dropped, keeping the first of each.

    Entries are compared by their exact "question" string; input order is preserved.
    """
    first_by_question = {}
    for entry in faqs:
        # setdefault keeps the earliest entry for a question and ignores repeats.
        first_by_question.setdefault(entry["question"], entry)
    return list(first_by_question.values())

# Scrape the FAQs from the JavaScript-rendered FAQ page
def scrape_faqs_with_selenium():
    """Render the KSU site in headless Chrome and scrape its FAQ page.

    The FAQ URL is discovered from the first footer link whose href contains
    "faq"; each "div.faq-item" with a <summary> and a "faq-body" div becomes
    one entry.

    Returns:
        A one-element list ``[{"title": "FAQs", "url", "children"}]`` where
        each child is ``{"title", "answer"}``, or an empty list when the FAQ
        link or the FAQ items cannot be found.

    Raises:
        Exception: Whatever Selenium raises while starting Chrome or loading a page.
    """
    base_url = "https://www.ksu.edu.sa/en"

    # Step 1: Launch headless browser
    options = Options()
    # Use the command-line switch rather than `options.headless = True`: the
    # attribute was deprecated and later removed in Selenium 4.x, where setting
    # it silently has no effect and a visible browser window is opened.
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)

    try:
        driver.get(base_url)

        # Step 2: Get the FAQs link from the footer
        faq_link = None
        time.sleep(3)  # Let JS load
        soup = BeautifulSoup(driver.page_source, "html.parser")
        footer = soup.find("footer")
        if footer:
            for a in footer.find_all("a", href=True):
                if "faq" in a["href"].lower():
                    faq_link = urljoin(base_url, a["href"])
                    break

        if not faq_link:
            print("❌ Could not find FAQ link in footer.")
            return []

        # Step 3: Visit FAQs page
        print(f"🔗 Visiting FAQ page: {faq_link}")
        driver.get(faq_link)
        time.sleep(3)
        fsoup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        # Always shut the browser down, including on errors and early returns.
        driver.quit()

    # Step 4: Scrape FAQ content
    faq_items = fsoup.select("div.faq-item")
    if not faq_items:
        print("❌ No FAQ items found after rendering.")
        return []

    faq_children = []
    for item in faq_items:
        summary = item.find("summary")
        answer_div = item.find("div", class_="faq-body")

        # Skip items missing either the question or the answer element.
        if not summary or not answer_div:
            continue

        question = summary.get_text(strip=True)
        answer = answer_div.get_text(separator=" ", strip=True)

        faq_children.append({
            "title": question,
            "answer": answer
        })

    return [{
        "title": "FAQs",
        "url": faq_link,
        "children": faq_children
    }]

# Merge all the FAQs sources
async def merge_all_faqs(session):
    """Combine the KSU (Selenium) and DAR FAQ sources into one "FAQs" section.

    Both sources are scraped concurrently; the blocking Selenium scraper runs
    in a worker thread. Entries are de-duplicated by exact question text,
    keeping the first occurrence (KSU entries come first). DAR entries keep
    their "source" key so their origin stays visible in the output.

    Returns:
        ``[{"title": "FAQs", "url", "children"}]``, or an empty list when the
        KSU FAQ section could not be loaded.
    """
    # Scrape both sources concurrently
    ksu_task = asyncio.create_task(asyncio.to_thread(scrape_faqs_with_selenium))
    dar_task = asyncio.create_task(scrape_dar_faqs(session))

    ksu_faq_sections, dar_faq_items = await asyncio.gather(ksu_task, dar_task)

    # Ensure we have base KSU FAQs section
    if not ksu_faq_sections:
        print("❌ Could not load KSU FAQs.")
        return []

    # Extract the first (and only) section
    ksu_faq_section = ksu_faq_sections[0]

    # Convert DAR items into same structure as KSU's children
    dar_children = [{
        "title": item["question"],
        "answer": item["answer"],
        "source": "https://dar.ksu.edu.sa/en/faqs"
    } for item in dar_faq_items]

    # Deduplicate by question text directly on the child dicts. Converting to
    # {"question", "answer"} and back (as done previously) silently dropped
    # the "source" attribution that was just added to the DAR entries.
    seen_titles = set()
    final_children = []
    for child in ksu_faq_section["children"] + dar_children:
        if child["title"] in seen_titles:
            continue
        seen_titles.add(child["title"])
        final_children.append(child)

    # Update and return unified section
    return [{
        "title": "FAQs",
        "url": ksu_faq_section["url"],
        "children": final_children
    }]

# Scrape plagiarism
async def scrape_plagiarism_content(session):
    """Return the plagiarism page's text, one non-empty p/h2/h3/li element per line.

    Returns "" when no content area is found on the page.
    """
    url = "https://chss.ksu.edu.sa/en/plagiarism-en"
    page = await get_soup(session, url)

    container = (
        page.select_one("div.region-content")
        or page.select_one("main")
        or page.select_one("div.main-content")
    )
    if not container:
        return ""

    fragments = (node.get_text(strip=True) for node in container.find_all(["p", "h2", "h3", "li"]))
    return "\n".join(piece for piece in fragments if piece)

# Scrape the grading system table
async def scrape_grading_system_table(session):
    """Render the first table of the grading-system page as a markdown-style table.

    The first row is used as the header. Returns "" when the page has no
    table or the table has no rows.
    """
    url = "https://dar.ksu.edu.sa/en/node/815"
    page = await get_soup(session, url)

    grid = page.find("table")
    if not grid:
        return ""

    lines = [
        " | ".join(cell.get_text(strip=True) for cell in tr.find_all(["td", "th"]))
        for tr in grid.find_all("tr")
    ]
    if not lines:
        return ""

    header, *body_lines = lines
    # The column count is derived from the already-joined header text.
    column_count = len(header.split(" | "))
    separator = " | ".join("---" for _ in range(column_count))
    body = "\n".join(body_lines)
    return f"**KSU Grading System Table:**\n\n{header}\n{separator}\n{body}"

# Scrape the policies
async def scrape_regulations_and_policies(session):
    """Build the "Regulations and Policies" section.

    The section holds the policies page text, one child per distinct link in
    its content area, and two manually added children ("Plagiarism" and
    "Grading System") whose content is scraped concurrently.
    """
    print("📘 Scraping Regulations and Policies section...")
    base_url = "https://ksu.edu.sa/en/policies"
    page = await get_soup(session, base_url)

    # Use the main content area (not header or footer)
    container = (
        page.select_one("div.region-content")
        or page.select_one("main")
        or page.select_one("div.main-content")
    )

    page_content = ""
    children = []

    if container:
        # Base page text
        texts = (node.get_text(strip=True) for node in container.find_all(["p", "h2", "h3", "li"]))
        page_content = "\n".join(piece for piece in texts if piece)

        # Child links, each absolute URL listed once
        visited = set()
        for anchor in container.select("a[href]"):
            label = anchor.get_text(strip=True)
            target = anchor["href"].strip()

            unusable = (not label) or "javascript" in target.lower() or target.startswith("#")
            if unusable:
                continue

            absolute = urljoin(base_url, target)
            if absolute not in visited:
                visited.add(absolute)
                children.append({
                    "title": label,
                    "url": absolute
                })

    # Plagiarism and grading content are fetched concurrently
    plagiarism_content, grading_content = await asyncio.gather(
        scrape_plagiarism_content(session),
        scrape_grading_system_table(session),
    )

    # Extra children grouped under this section by hand
    children.extend([
        {
            "title": "Plagiarism",
            "url": "https://chss.ksu.edu.sa/en/plagiarism-en",
            "content": plagiarism_content
        },
        {
            "title": "Grading System",
            "url": "https://dar.ksu.edu.sa/en/node/815",
            "content": grading_content,
            "info": "https://engineering.ksu.edu.sa/sites/engineering.ksu.edu.sa/files/imce_images/regulations_of_study_and_examinations_of_ksu.pdf"
        },
    ])

    return {
        "title": "Regulations and Policies",
        "url": base_url,
        "content": page_content,
        "children": children
    }

# Scrape the admission requirements of masters and phd students 
async def scrape_admission_requirements(session):
    """Build the "Admission Requirements" section from the first table on the page.

    Every table row with at least two cells and non-empty text in both of the
    first two becomes a child: first cell is the title, second the content.
    """
    print("📘 Scraping Admission Requirements section...")
    url = "https://graduatestudies.ksu.edu.sa/en/node/859"
    page = await get_soup(session, url)

    children = []
    table = page.find("table")
    for row in (table.find_all("tr") if table else []):
        cells = row.find_all(["td", "th"])
        if len(cells) < 2:
            continue

        heading = cells[0].get_text(strip=True)
        detail = cells[1].get_text(separator="\n", strip=True)
        if heading and detail:
            children.append({
                "title": heading,
                "url": url,
                "content": detail
            })

    return {
        "title": "Admission Requirements",
        "url": url,
        "content": "Graduate admission criteria as listed by the Deanship of Graduate Studies.",
        "children": children
    }

# Scrape research institutes
async def scrape_research_institutes(session):
    """Return the research institutes page as a list of ``{"title": name}`` dicts.

    Each non-empty text line of the content area is treated as one name,
    except lines starting with "do you like" (case-insensitive). Returns an
    empty list when no content area is found.
    """
    url = "https://ksu.edu.sa/en/node/3106"
    page = await get_soup(session, url)

    container = (
        page.select_one("div.region-content")
        or page.select_one("main")
        or page.select_one("div.main-content")
    )
    if not container:
        return []

    # The page lists institute names as plain text - best-effort split by line.
    raw = container.get_text(separator="\n").strip()
    names = (line.strip() for line in raw.split("\n"))
    return [
        {"title": name}
        for name in names
        if name and not name.lower().startswith("do you like")
    ]

# Scrape libraries sections and the libraries in each section and then finally scrape the content inside
async def scrape_library_section(session):
    """Scrape the library site into a three-level "Libraries" section.

    Level 1 is the section itself, level 2 one entry per link found in the
    sub-menu of the "المكتبات" menu item, level 3 one entry per link inside
    each of those pages' main content. Each level-3 entry carries up to three
    children: "Information" (paragraph text), "Contact info" (tables) and
    "Location" (first "click here" link).

    Pages are fetched directly with ``session.get`` rather than through
    get_soup. Failures of individual pages are swallowed: the affected entry
    is simply left out of the result.

    Returns:
        The section dict, or None when the "المكتبات" menu item is not found.

    Raises:
        Exception: Errors while fetching the library home page propagate.
    """
    base_url = "https://library.ksu.edu.sa"
    page_url = f"{base_url}/en"

    async with session.get(page_url) as response:
        response.raise_for_status()
        text = await response.text()
        soup = BeautifulSoup(text, "html.parser")

    # Arabic → English mapping
    # NOTE(review): the second key is already English and maps to itself,
    # unlike the other two entries - confirm which menu label it should match.
    title_map = {
        "مكتبات مشتركة": "Shared libraries",
        "Men libraries": "Men libraries",
        "مكتبات الطالبات": "Female libraries"
    }

    # Find the "المكتبات" menu tab and keep its enclosing <li>
    menu_items = soup.select("li.menu-item--expanded > a")
    libraries_link = None
    for a in menu_items:
        if "المكتبات" in a.get_text(strip=True):
            libraries_link = a.find_parent("li")
            break

    if not libraries_link:
        print("❌ Could not find المكتبات menu.")
        return None

    libraries_section = {
        "title": "Libraries",
        "url": page_url,
        "children": []
    }

    # Process each library type concurrently
    async def process_library_type(a):
        """Build the level-2 entry for one sub-menu link *a*."""
        arabic_title = a.get_text(strip=True)
        url = a["href"]
        full_url = urljoin(base_url, url)
        # Titles without a mapping are kept as they appear in the menu.
        english_title = title_map.get(arabic_title, arabic_title)

        # Scrape children links inside the page's main content
        async with session.get(full_url) as response:
            response.raise_for_status()
            page_text = await response.text()
            page_soup = BeautifulSoup(page_text, "html.parser")

        # First available content container, falling back to <body>.
        main_content = (
            page_soup.select_one("main") or
            page_soup.select_one("div.region-content") or
            page_soup.select_one("div.main-content") or
            page_soup.body
        )

        link_children = []
        if main_content:
            # Process child pages concurrently
            async def process_child_link(link):
                """Build the level-3 entry for one content link, or None to skip it."""
                text = link.get_text(strip=True)
                href = link["href"]

                # Skip links without text, in-page anchors and mail links.
                if not text or href.startswith("#") or "mailto:" in href:
                    return None

                child_url = urljoin(full_url, href)

                try:
                    async with session.get(child_url) as child_resp:
                        child_resp.raise_for_status()
                        child_text = await child_resp.text()
                        child_soup = BeautifulSoup(child_text, "html.parser")

                    child_main = (
                        child_soup.select_one("main") or
                        child_soup.select_one("div.region-content") or
                        child_soup.select_one("div.main-content") or
                        child_soup.body
                    )

                    # Extract paragraphs as "Information"
                    paragraphs = [p.get_text(separator="\n", strip=True) for p in child_main.find_all("p")]
                    info_section = {
                        "title": "Information",
                        "content": "\n\n".join(paragraphs)
                    } if paragraphs else None

                    # Extract tables as "Contact info" - filter out empty tables
                    tables = []
                    for table in child_main.find_all("table"):
                        rows = []
                        for row in table.find_all("tr"):
                            cols = [cell.get_text(strip=True) for cell in row.find_all(["td", "th"])]
                            # Only add row if it has non-empty content
                            if cols and any(col.strip() for col in cols):
                                rows.append(cols)

                        # The first kept row is treated as the header only
                        # when the table contains a <th> somewhere.
                        headers = rows[0] if rows and table.find("th") else []
                        data_rows = rows[1:] if headers else rows

                        # Only add table if it has meaningful content
                        if rows and any(any(cell.strip() for cell in row) for row in rows):
                            tables.append({
                                "headers": headers,
                                "rows": data_rows
                            })

                    contact_info_section = {
                        "title": "Contact info",
                        "tables": tables
                    } if tables else None

                    # Extract location link (first link whose text contains "click here")
                    location_section = None
                    for a_tag in child_main.find_all("a", href=True):
                        if "click here" in a_tag.get_text(strip=True).lower():
                            location_section = {
                                "title": "Location",
                                "url": urljoin(child_url, a_tag["href"])
                            }
                            break

                    # Assemble the sections that were actually found, in fixed order.
                    child_sections = []
                    if info_section:
                        child_sections.append(info_section)
                    if contact_info_section:
                        child_sections.append(contact_info_section)
                    if location_section:
                        child_sections.append(location_section)

                    return {
                        "title": text,
                        "url": child_url,
                        "children": child_sections
                    }

                except Exception as e:
                    # Best effort: any failure on a child page (fetch, parse,
                    # missing container) drops that child without reporting it.
                    return None

            # Process all child links concurrently
            child_tasks = [process_child_link(link) for link in main_content.find_all("a", href=True)]
            child_results = await asyncio.gather(*child_tasks, return_exceptions=True)

            # Filter out None results and exceptions
            link_children = [result for result in child_results if result is not None and not isinstance(result, Exception)]

        return {
            "title": english_title,
            "url": full_url,
            "children": link_children
        }

    # Process all library types concurrently
    library_tasks = [process_library_type(a) for a in libraries_link.select("ul.menu a")]
    library_results = await asyncio.gather(*library_tasks, return_exceptions=True)

    # Filter out exceptions (library types whose page failed are omitted)
    libraries_section["children"] = [result for result in library_results if not isinstance(result, Exception)]

    return libraries_section

# Scrape the academic calendar table and contents
async def scrape_academic_calendar(session):
    """Scrape the current academic calendar page.

    When the page contains a <table>, returns its <thead> header cells and
    all rows; otherwise returns the page's non-empty text lines, excluding
    lines starting with "last updated" (case-insensitive). The page is
    fetched directly with ``session.get``.
    """
    url = "https://dar.ksu.edu.sa/en/CurrentCalendar"

    async with session.get(url) as response:
        response.raise_for_status()
        page = BeautifulSoup(await response.text(), "html.parser")

    table = page.find("table")
    if table:
        # Structured form: header cells come from <thead> only; rows include
        # every <tr>, header rows too.
        header_cells = table.select("thead th") if table.find("thead") else []
        table_content = {
            "headers": [th.get_text(strip=True) for th in header_cells],
            "rows": [
                [cell.get_text(strip=True) for cell in tr.find_all(["td", "th"])]
                for tr in table.find_all("tr")
            ]
        }
    else:
        # Fallback: free text split into lines, footer notices removed
        stripped = (ln.strip() for ln in page.get_text(separator="\n").split("\n"))
        table_content = {
            "text_rows": [
                ln for ln in stripped
                if ln and not ln.lower().startswith("last updated")
            ]
        }

    return {
        "title": "Academic Calendar",
        "url": url,
        "table": table_content
    }

# Scrape housing sections for both faculties and students
async def scrape_housing_section(session):
    """Scrape the housing pages and return the "Housing" hierarchy node.

    The node has two children, "Faculty Housing" and "Student Housing".
    Their links come from the ``#nav-faculty`` / ``#nav-students`` tabs of the
    main housing page and are extended with:

    * the procedural guide for student-housing registration, whose
      ``<article>`` text is stored under ``content``;
    * the female student-housing registration entry and its submenu links,
      when the menu link to ``/en/node/6649`` is present;
    * a de-duplicated "Related Links" node under faculty housing, when the
      housing site exposes a link with that label and the target page has
      at least one usable link.

    Args:
        session: HTTP session handed through to ``get_soup``.

    Returns:
        dict: ``{"title": "Housing", "url": ..., "children": [faculty, student]}``.

    Raises:
        Any exception raised by ``get_soup``; fetch errors are not handled here.
    """
    housing_url = f"{BASE}/en/housing"
    procedural_url = "https://sa.ksu.edu.sa/en/node/1013"
    student_base_url = "https://sa.ksu.edu.sa/en/node/1007"
    faculty_housing_url = "https://housing.ksu.edu.sa/en/"

    def is_followable(title, href):
        """Reject untitled links, in-page anchors and mail links."""
        return bool(title) and not href.startswith("#") and "mailto:" not in href

    def extract_links_from_tab(page, tab_id):
        """Return leaf nodes for every usable link inside the given tab."""
        tab_div = page.select_one(tab_id)
        if not tab_div:
            return []

        tab_links = []
        for link in tab_div.find_all("a", href=True):
            title = link.get_text(strip=True)
            href = link["href"]
            if is_followable(title, href):
                tab_links.append({
                    "title": title,
                    "url": urljoin(BASE, href),
                    "children": []
                })
        return tab_links

    # These four pages do not depend on each other, so fetch them together.
    soup, procedural_soup, student_soup, faculty_soup = await asyncio.gather(
        get_soup(session, housing_url),
        get_soup(session, procedural_url),
        get_soup(session, student_base_url),
        get_soup(session, faculty_housing_url),
    )

    # Tab parsing is pure in-memory work; there is nothing to await here.
    faculty_links = extract_links_from_tab(soup, "#nav-faculty")
    student_links = extract_links_from_tab(soup, "#nav-students")

    # Add Procedural Guide for Registration in Student Housing
    article = procedural_soup.select_one("article")
    procedural_text = article.get_text(separator="\n", strip=True) if article else ""

    student_links.append({
        "title": "Procedural Guide for Registration in Student Housing",
        "url": procedural_url,
        "content": procedural_text,
        "children": []
    })

    # Locate the expanded menu entry that links to /en/node/6649 and collect
    # the links of its sub-menu.
    female_menu_anchor = student_soup.select_one('li.menu-item--expanded > a[href="/en/node/6649"]')
    if female_menu_anchor:
        parent_li = female_menu_anchor.find_parent("li")
        submenu = parent_li.find("ul", class_="menu sub-menu")

        female_children = []
        if submenu:
            female_children = [
                {
                    "title": a.get_text(strip=True),
                    "url": urljoin(student_base_url, a["href"]),
                    "children": []
                }
                for a in submenu.find_all("a", href=True)
            ]

        student_links.append({
            "title": "Registration in Female Student Housing",
            "url": urljoin(student_base_url, "/en/node/6649"),
            "children": female_children
        })

    # Find the first <a> whose text contains "related links" (case-insensitive)
    related_links_url = ""
    for a in faculty_soup.find_all("a", href=True):
        if "related links" in a.get_text(strip=True).lower():
            related_links_url = urljoin(faculty_housing_url, a["href"])
            print(f"[✅] Found Related Links URL: {related_links_url}")
            break

    # This fetch depends on the URL discovered above, so it cannot be gathered
    # with the others.
    if related_links_url:
        related_page = await get_soup(session, related_links_url)
        content_area = related_page.select_one("article")

        if content_area:
            seen = set()
            deduped_links = []
            for p in content_area.find_all("p"):
                a = p.find("a", href=True)
                if not a:
                    continue
                href = a["href"]
                # Prefer the emphasised label when the anchor wraps a <strong>
                strong = a.find("strong")
                title = strong.get_text(strip=True) if strong else a.get_text(strip=True)
                if not is_followable(title, href):
                    continue
                full_url = urljoin(related_links_url, href)

                # Deduplicate links by (title + url), keeping the first one
                key = (title, full_url)
                if key in seen:
                    continue
                seen.add(key)
                deduped_links.append({
                    "title": title,
                    "url": full_url,
                })

            # Add "Related Links" as a sub-section under Faculty Housing
            if deduped_links:
                faculty_links.append({
                    "title": "Related Links",
                    "url": related_links_url,
                    "children": deduped_links
                })

    return {
        "title": "Housing",
        "url": housing_url,
        "children": [
            {
                "title": "Faculty Housing",
                "children": faculty_links
            },
            {
                "title": "Student Housing",
                "children": student_links
            }
        ]
    }

# Adding the scraped excel sheet from the IT Helpdesk, categorizing them as either students or staff and adding them to the hierarchy
def build_it_helpdesk_tree(csv_path):
    """Build the "IT Helpdesk" hierarchy node from a CSV export.

    The CSV must provide ``Audience``, ``Category`` and ``Subcategory``
    columns. Values are stripped and title-cased; rows with a missing or
    blank value in any of the three columns are discarded, as are duplicates.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        dict: ``{"title": "IT Helpdesk", "url": ..., "children": [...]}`` where
        each audience node holds ``"Categories: <name>"`` nodes, which in turn
        hold alphabetically sorted subcategory leaves. Audiences and
        categories keep their order of first appearance in the file.

    Raises:
        FileNotFoundError: if ``csv_path`` does not exist (from ``pd.read_csv``).
        KeyError: if one of the three required columns is missing.
    """
    levels = ["Audience", "Category", "Subcategory"]

    df = pd.read_csv(csv_path)

    # Drop rows with missing cells first so NaN is never turned into "nan"
    df = df.dropna(subset=levels).copy()

    # Normalize strings; astype(str) keeps .str usable on numeric columns
    for column in levels:
        df[column] = df[column].astype(str).str.strip().str.title()

    # Whitespace-only cells are only detectable after stripping: drop them
    # here, otherwise they would become nodes with an empty title.
    df = df[(df[levels] != "").all(axis=1)]

    # Remove rows that became exact duplicates after normalization
    df = df.drop_duplicates(subset=levels)

    # Build tree structure: Audience -> Category -> {Subcategory}
    audiences = {}
    for aud, cat, sub in df[levels].itertuples(index=False, name=None):
        audiences.setdefault(aud, {}).setdefault(cat, set()).add(sub)

    # Construct final JSON
    audience_nodes = [
        {
            "title": aud,
            "children": [
                {
                    "title": f"Categories: {cat}",
                    "children": [{"title": s, "children": []} for s in sorted(subcats)]
                }
                for cat, subcats in cats.items()
            ]
        }
        for aud, cats in audiences.items()
    ]

    return {
        "title": "IT Helpdesk",
        "url": "https://its.ksu.edu.sa/",
        "children": audience_nodes
    }

# Scrape the college details, about colleges, academic departments, url, content, faculty links and their contact info for both non-JScript and JScript
async def scrape_college_details(session, path):
    """
    Scrape one college site and return its modules as a list of dicts.

    For a regular college the list contains, in order:
      - {"section": "About College", "content": <text, possibly "">}
      - {"section": "Academic Departments", "children": [<department>, ...]}
        where each department has title, url, content, faculty_links and
        contact_info
      - {"section": "Service", "items": [{"title", "url"}, ...]}

    When ``path`` contains "sciences.ksu.edu.sa" a dedicated branch runs and
    returns early: the list then holds only an "Academic Departments" module
    (departments carry ``faculty_staff_links`` instead of ``faculty_links``),
    or is empty when no department was found.

    Args:
        session: HTTP session handed through to the fetch helpers.
        path: College URL; joined onto BASE when it does not start with "http".

    Returns:
        list[dict]: the modules described above.
    """
    college_base_url = path if path.startswith("http") else urljoin(BASE, path)
    # NOTE(review): the page is fetched with `path`, not `college_base_url` —
    # confirm get_soup accepts a relative path.
    soup = await get_soup(session, path)
    modules = []

    # — About College —
    # Four strategies are tried in order; each runs only while `about` is empty.
    about = ""
    # 1) paragraphs of the body field
    block = soup.select_one("div.field--name-body")
    if block:
        ps = block.find_all("p")
        about = " ".join(p.get_text(" ", strip=True) for p in ps if p.get_text(strip=True))

    # 2) body field rendered through a view
    if not about:
        span_block = soup.select_one("span.views-field.views-field-body span.field-content")
        if span_block:
            about = span_block.get_text(" ", strip=True)

    # 3) first <p> sibling after an "About College" heading
    if not about:
        hdr = soup.find(lambda t: t.name in ["h2", "h3", "h4"] and "About College" in t.text)
        if hdr:
            p = hdr.find_next_sibling("p")
            if p:
                about = p.get_text(" ", strip=True)

    # 4) first paragraph that starts with "The College of"
    if not about:
        for p in soup.find_all("p"):
            t = p.get_text(" ", strip=True)
            if t.startswith("The College of"):
                about = t
                break

    modules.append({"section": "About College", "content": about})

    # — Academic Departments —
    dept_links = []
    # URLs already processed, shared by the nested coroutines below
    seen_urls = set()
    # First h2/h3/h4 whose text contains any keyword marks the departments area.
    # NOTE(review): "academic" alone is a broad match.
    keywords = ["academic departments", "departments", "department", "academic"]
    hdr = soup.find(lambda t: t.name in ["h2", "h3", "h4"] and any(
        kw in t.get_text(strip=True).lower() for kw in keywords)
    )

    if "sciences.ksu.edu.sa" in path:
        print("🔁 Special handling for College of Sciences")

        # NOTE(review): same call as the fetch at the top of the function;
        # looks redundant.
        soup = await get_soup(session, path)
        base = path if path.endswith("/") else path + "/"
        departments = []

        # Step 1: From main page, find departments from dropdown menu
        # (menu links whose text contains "department" and whose href contains "/en/")
        for a in soup.select("li.menu-item a[href]"):
            text = a.get_text(strip=True).lower()
            if "department" in text and "/en/" in a["href"]:
                dept_url = urljoin(base, a["href"])
                dept_title = a.get_text(strip=True)
                print(f"📁 Found department: {dept_title}")

                try:
                    dept_soup = await get_soup(session, dept_url)
                    # Rebinds `about`; the college-level text was already
                    # appended to `modules` above.
                    about = await fetch_about_of(session, dept_url)
                    contact_link = find_contact_link(dept_soup, dept_url)
                    contact_info = await extract_contact_info(session, contact_link) if contact_link else "Contact page not found."

                    # Step 2: Inside department page, collect menu links whose
                    # text mentions faculty, staff or employee
                    edu_links = []
                    for edu_a in dept_soup.select("li.menu-item a[href]"):
                        edu_text = edu_a.get_text(strip=True).lower()
                        if any(x in edu_text for x in ["faculty", "staff", "employee"]):
                            edu_url = urljoin(dept_url, edu_a["href"])
                            edu_links.append({
                                "title": edu_a.get_text(strip=True),
                                "url": edu_url
                            })
                            print(f"👥 Found faculty/staff link: {edu_url}")

                    departments.append({
                        "title": dept_title,
                        "url": dept_url,
                        "content": about,
                        "contact_info": contact_info,
                        "faculty_staff_links": edu_links
                    })

                except Exception as e:
                    # One failing department is reported and skipped
                    print(f"❌ Error processing {dept_url}: {e}")

        # NOTE(review): this reset discards the "About College" module appended
        # earlier — confirm that is intended for this college.
        modules = []
        if departments:
            modules.append({
                "section": "Academic Departments",
                "children": departments
            })

        # Early return: the "Service" module is not collected on this path
        return modules

    # Statically rendered sites list departments in a grid after the heading
    grid = hdr.find_next("div", class_="views-view-grid") if hdr else None

    if grid:
        # Process departments concurrently
        async def process_department(a):
            # Returns a department dict, or None for untitled links, duplicate
            # URLs and pages that could not be fetched.
            title = a.get_text(strip=True)
            href = a["href"].strip()
            if not title:
                return None
            # Rewrite Arabic-path links to their English path
            if href.startswith("/ar/") and "/en/" not in href:
                href = href.replace("/ar/", "/en/")
            href = href if href.startswith("http") else urljoin(college_base_url, href)
            if href in seen_urls:
                return None
            seen_urls.add(href)

            try:
                content_task = fetch_about_of(session, href)
                dep_soup_task = get_soup(session, href)
                
                content, dep_soup = await asyncio.gather(content_task, dep_soup_task)
            except Exception as e:
                print(f"Skipping {href} due to error: {e}")
                return None

            contact_link = find_contact_link(dep_soup, href)
            
            # Run contact extraction and faculty links concurrently
            # (without a contact link, a zero-delay sleep supplies the
            # placeholder text so gather still receives two awaitables)
            contact_task = extract_contact_info(session, contact_link) if contact_link else asyncio.create_task(asyncio.sleep(0, result="Contact page not found."))
            faculty_task = find_faculty_links(session, href)
            
            contact_info, faculty_links = await asyncio.gather(contact_task, faculty_task)

            return {
                "title":   title,
                "url":     href,
                "content": content,
                "faculty_links": faculty_links,
                "contact_info": contact_info
            }

        # Process all departments concurrently
        dept_tasks = [process_department(a) for a in grid.select(".portfolio-content a[href]")]
        dept_results = await asyncio.gather(*dept_tasks, return_exceptions=True)
        
        # Filter out None results and exceptions
        dept_links = [result for result in dept_results if result is not None and not isinstance(result, Exception)]
    else:
        # No grid found: hand the page to the Selenium-based extractor
        print(f"⚠️ Fallback (JS-rendered): using Selenium on {college_base_url}")
        try:
            # extract_js_departments is run in a worker thread via to_thread
            js_departments = await asyncio.to_thread(extract_js_departments, college_base_url)
            
            async def process_js_department(dep):
                # Same output shape as process_department, fed from the
                # {"title", "url"} entries returned by the extractor.
                title = dep["title"]
                href = dep["url"]
                if href in seen_urls:
                    return None
                seen_urls.add(href)
                print(f"🧲 JS dept found: {title} → {href}")
                
                try:
                    dep_soup = await get_soup(session, href)
                except Exception as e:
                    print(f"❌ Skipping {href} due to error: {e}")
                    return None
                    
                content = await fetch_about_of(session, href)
                contact_link = find_contact_link(dep_soup, href)
                
                # Same placeholder trick as in process_department
                contact_task = extract_contact_info(session, contact_link) if contact_link else asyncio.create_task(asyncio.sleep(0, result="Contact page not found."))
                faculty_task = find_faculty_links(session, href)
                
                contact_info, faculty_links = await asyncio.gather(contact_task, faculty_task)
                
                return {
                    "title":   title,
                    "url":     href,
                    "content": content,
                    "faculty_links": faculty_links,
                    "contact_info": contact_info
                }

            js_dept_tasks = [process_js_department(dep) for dep in js_departments]
            js_dept_results = await asyncio.gather(*js_dept_tasks, return_exceptions=True)
            
            # Filter out None results and exceptions
            dept_links = [result for result in js_dept_results if result is not None and not isinstance(result, Exception)]
            
        except Exception as e:
            # dept_links stays empty when the fallback itself fails
            print(f"⚠️ JS Fallback failed: {e}")

    modules.append({
        "section":  "Academic Departments",
        "children": dept_links
    })

    # — Remaining sections: only "Service" is collected —
    for name in ["Service"]:
        items = []
        hdr2 = soup.find(lambda t: t.name in ["h2", "h3", "h4"] and name in t.text)
        if hdr2:
            view2 = hdr2.find_next_sibling("div", class_="view-content")
            if view2:
                for a in view2.find_all("a", href=True):
                    # hrefs are stored as found, without joining onto a base URL
                    items.append({
                        "title": a.get_text(strip=True),
                        "url":   a["href"].strip()
                    })
        # The module is appended even when no items were found
        modules.append({"section": name, "items": items})

    return modules

# The main for all the methods above and then output it into a json file
async def main():
    """Run the whole scrape and save the resulting hierarchy as JSON.

    Steps:
      1. Scrape the main menu.
      2. Scrape the major sections concurrently and append each one that
         succeeded and is non-empty; a section whose scraper raised is
         reported with its error and skipped.
      3. Append the IT Helpdesk tree built from ``IT_Helpdesk_cleaned.csv``.
      4. Fill "Study at KSU" -> "Colleges" with per-category college details;
         a failing category or college is reported and skipped.
      5. Write the menu with ``save_to_json``.

    The session is always closed, also when an error is re-raised.

    Raises:
        Exception: any error not handled per section is printed and re-raised.
    """

    def section_failed(label, result):
        """Print the error and return True when a gathered scraper raised."""
        if isinstance(result, Exception):
            print(f"{label} section failed: {result!r}")
            return True
        return False

    session = await create_session()

    try:
        print("Starting KSU website scraping...")
        print(f"Output directory: {OUTPUT_DIR}")
        print(f"Output file: {OUTPUT_FILE}")

        # 1) scrape menu
        print("Scraping main menu...")
        menu = await scrape_menu(session)

        # Major sections are independent of each other, so run them together.
        # return_exceptions=True keeps one failing section from cancelling the
        # rest; failures come back as exception objects in the result tuple.
        print("Scraping major sections concurrently...")
        (regulations_section, admission_req_section, faq_section,
         research_institutes, library_section, academic_calendar_section,
         housing_section) = await asyncio.gather(
            scrape_regulations_and_policies(session),
            scrape_admission_requirements(session),
            merge_all_faqs(session),
            scrape_research_institutes(session),
            scrape_library_section(session),
            scrape_academic_calendar(session),
            scrape_housing_section(session),
            return_exceptions=True
        )

        # Add sections to menu
        if not section_failed("Regulations and Policies", regulations_section):
            if regulations_section["children"]:
                menu.append(regulations_section)
                print("Added Regulations and Policies section")
            else:
                print("No policies found under Regulations and Policies.")

        if not section_failed("Admission Requirements", admission_req_section):
            if admission_req_section["children"]:
                menu.append(admission_req_section)
                print("Added Admission Requirements section")
            else:
                print("No admissions found.")

        if not section_failed("FAQs", faq_section):
            if faq_section:
                # The FAQ scraper returns several top-level nodes
                menu.extend(faq_section)
                print("✅ Added FAQs section")
            else:
                print("No FAQs found.")

        if not section_failed("Research", research_institutes):
            labs_children = [
                {"title": inst["title"], "url": "https://ksu.edu.sa/en/node/3106", "content": ""}
                for inst in research_institutes
            ]
            # manually append Central Research Lab
            labs_children.append({
                "title": "Central Research Lab",
                "url": "https://crl.ksu.edu.sa/en",
                "content": ""
            })

            research_section = {
                "title": "Research",
                "url": "https://ksu.edu.sa/en/node/3106",
                "content": "Research institutes at King Saud University.",
                "children": [
                    {
                        "title": "Labs",
                        "url": "https://ksu.edu.sa/en/node/3106",
                        "children": labs_children
                    }
                ]
            }
            menu.append(research_section)
            print("Added Research section")

        # Add other sections
        if not section_failed("Libraries", library_section):
            if library_section:
                menu.append(library_section)
                print("Added Libraries section")
            else:
                print("Library section failed.")

        if not section_failed("Academic Calendar", academic_calendar_section):
            if academic_calendar_section:
                menu.append(academic_calendar_section)
                print("Added Academic Calendar section")

        if not section_failed("Housing", housing_section):
            if housing_section:
                menu.append(housing_section)
                print("Added Housing section")

        # IT Helpdesk (synchronous operation)
        print("Processing IT Helpdesk data...")
        csv_path = "IT_Helpdesk_cleaned.csv"
        try:
            it_helpdesk_section = build_it_helpdesk_tree(csv_path)
            if it_helpdesk_section:
                menu.append(it_helpdesk_section)
                print("Added IT Helpdesk section")
        except FileNotFoundError:
            print(f"CSV file '{csv_path}' not found. Skipping IT Helpdesk section.")
        except Exception as e:
            print(f"Error processing IT Helpdesk: {e}")

        # 2) drill into Study at KSU → Colleges
        print("Processing colleges data...")
        study = next(
            (m for m in menu if m.get("title", "").lower() == "study at ksu"),
            None
        )
        colleges_node = None
        if study is not None:
            colleges_node = next(
                (c for c in study["children"] if c.get("title", "").lower() == "colleges"),
                None
            )

        if colleges_node is None:
            print("Could not find 'Study at KSU' or 'Colleges' section in menu")
        else:
            # 3) build each category → colleges → details
            for cat in colleges_node["children"]:
                print(f"Processing category: {cat['title']}")
                try:
                    cat["children"] = await scrape_category(session, cat["url"])
                except Exception as e:
                    # Keep going: losing one category must not discard
                    # everything scraped so far.
                    print(f"Error processing category {cat['title']}: {e}")
                    continue

                # Process all colleges in this category concurrently
                college_results = await asyncio.gather(
                    *(scrape_college_details(session, coll["url"]) for coll in cat["children"]),
                    return_exceptions=True
                )

                # gather preserves input order, so results pair up with colleges
                for coll, result in zip(cat["children"], college_results):
                    if isinstance(result, Exception):
                        print(f"Error processing college {coll['title']}: {result}")
                        coll["children"] = []
                    else:
                        coll["children"] = result
                        print(f"Processed college: {coll['title']}")

        # 4) Save to JSON file
        print("Saving data to JSON file...")
        save_to_json(menu)

        print("Scraping completed successfully!")
        print(f"Total menu items: {len(menu)}")

    except Exception as e:
        print(f"Critical error in main function: {e}")
        raise
    finally:
        await session.close()
        print("Session closed")

if __name__ == "__main__":
    # Decide how to start main() depending on whether an event loop is
    # already running in this thread.
    try:
        # get_running_loop() raises RuntimeError when no loop is running;
        # the returned loop object itself is not used.
        loop = asyncio.get_running_loop()
        # A loop is already running (e.g. inside a notebook), so asyncio.run()
        # cannot be used here; main() is started by the top-level await below.
        print("Running in Jupyter notebook - use: await main()")
    except RuntimeError:
        # No event loop running - safe to use asyncio.run()
        asyncio.run(main())

# Notebook entry point: runs main() through top-level await.
# NOTE(review): this line is active, not commented out. Top-level await is
# only valid in a notebook/IPython cell; in a plain .py script it is a syntax
# error, so comment it out before running this file as a script.
await main()

Running in Jupyter notebook - use: await main()
🚀 Starting KSU website scraping...
📁 Output directory: data_backups
📄 Output file: hierarchy.json
📋 Scraping main menu...
📘 Scraping major sections concurrently...
📘 Scraping Regulations and Policies section...
📘 Scraping Admission Requirements section...
[✅] Found Related Links URL: https://housing.ksu.edu.sa/en/node/83
🔗 Visiting FAQ page: https://ksu.edu.sa/en/faqs
✅ Added Regulations and Policies section
✅ Added Admission Requirements section
✅ Added FAQs section
✅ Added Research section
✅ Added Libraries section
✅ Added Academic Calendar section
✅ Added Housing section
💻 Processing IT Helpdesk data...
✅ Added IT Helpdesk section
🏫 Processing colleges data...
📚 Processing category: Science Colleges
🔁 Switching to English version: https://ccis.ksu.edu.sa/en
🔁 Special handling for College of Sciences
📁 Found department: Development of Department
👥 Found faculty/staff link: https://signboards.ksu.edu.sa/Employees/Default.aspx
📁 Found dep

# MAIN CHATBOT

### Importing necessary libraries

In [1]:
import json
import nltk
import spacy
import re
import torch
from collections import defaultdict
from sentence_transformers import SentenceTransformer
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import warnings
warnings.filterwarnings('ignore')

### Creating a university chatbot that answers questions about university data

NLTK resources used:
- punkt
- stopwords

Sentence Transformers used for semantic similarity:
- all-MiniLM-L6-v2
- all-mpnet-base-v2
- paraphrase-MiniLM-L3-v2

Models used
- spaCy for named-entity recognition (NER)
- Local LLM for text generation: distilgpt2

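There is no need to download the NLTK data or the Hugging Face models separately: running the code below downloads them on first use. The spaCy model is the exception; if it is missing, install it with `python -m spacy download en_core_web_sm`.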


In [None]:
class UniversityChatbot():

    # Initializing the university chatbot with the data and intent
    def __init__(self, json_file_path):
        """Initialize the chatbot with university data.

        Loads, in order: the NLTK resources, the spaCy English model, a
        sentence-transformer model (first candidate that loads), the optional
        text-generation pipeline, the university data and its semantic
        corpus, and finally the intent patterns with their embeddings.

        Every model is optional: when one cannot be loaded a warning is
        printed and the matching attribute (``nlp``, ``sentence_model``,
        ``generator``) is set to ``None``.

        Args:
            json_file_path: Path of the JSON file holding the scraped data.
        """

        # Download required NLTK data only when it is not installed yet
        for resource_path, package in (
            ('tokenizers/punkt', 'punkt'),
            ('corpora/stopwords', 'stopwords'),
        ):
            try:
                nltk.data.find(resource_path)
            except LookupError:
                nltk.download(package)

        # Load spaCy model
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError:
            print("⚠️  spaCy English model not found. Install with: python -m spacy download en_core_web_sm")
            self.nlp = None

        # Initialize Sentence Transformer for semantic similarity, trying the
        # candidates in order of preference. `except Exception` (rather than
        # a bare except) lets Ctrl-C / SystemExit propagate, and every
        # failure is reported together with the model it belongs to.
        self.sentence_model = None
        for model_name in ('all-MiniLM-L6-v2', 'all-mpnet-base-v2', 'paraphrase-MiniLM-L3-v2'):
            try:
                self.sentence_model = SentenceTransformer(model_name)
                break
            except Exception as e:
                print(f"⚠️  Error loading sentence transformer {model_name!r}: {e}")
        else:
            # Loop finished without break: no candidate could be loaded
            print("❌ Could not load any sentence transformer model")

        # Initialize local LLM for better response generation (optional)
        try:
            self.generator = pipeline("text-generation",
                                    model="distilgpt2",
                                    tokenizer="distilgpt2",
                                    max_length=100,
                                    temperature=0.7,
                                    do_sample=True,
                                    pad_token_id=50256)
        except Exception as e:
            print(f"⚠️  Could not load language model: {e}")
            self.generator = None

        # Load university data
        self.data = self.load_data(json_file_path)

        # Prepare semantic corpus
        self.prepare_semantic_corpus()

        # Enhanced intent patterns with semantic variations
        self.intent_patterns = {
            'admission_requirements': [
                'admission requirements', 'how to apply', 'application process', 'entrance requirements',
                'eligibility criteria', 'prerequisites for admission', 'admission criteria',
                'what do I need to apply', 'application requirements', 'how can I get admitted',
                'admission guidelines', 'entry requirements', 'application procedure'
            ],
            'academic_calendar': [
                'academic calendar', 'semester schedule', 'academic year dates', 'term dates',
                'when does semester start', 'exam schedule', 'registration dates',
                'academic timeline', 'semester timeline', 'course schedule', 'class schedule',
                'important dates', 'academic deadlines', 'holiday schedule'
            ],
            'degree_programs': [
                'degree programs', 'available majors', 'courses offered', 'study programs',
                'what can I study', 'academic programs', 'curriculum information',
                'undergraduate programs', 'graduate programs', 'fields of study',
                'departments and colleges', 'course catalog', 'program information', "majors", 
                'bachelors program', 'masters program', 'phd programs'
            ],
            'faculty': [
                'faculty directory', 'professor information', 'faculty members', 'teaching staff',
                'instructor details', 'faculty contacts', 'who teaches what',
                'professor contacts', 'faculty profiles', 'academic staff',
                'department faculty', 'faculty list'
            ],
            'contact_info': [
                'contact information', 'phone numbers', 'email addresses', 'office locations',
                'how to reach', 'contact details', 'department contacts',
                'office hours', 'where to find', 'contact directory'
            ],
            'housing': [
                'housing information', 'dormitory details', 'campus accommodation', 'residence halls',
                'where to live', 'student housing', 'residential facilities',
                'accommodation options', 'campus living', 'housing services'
            ],
            'library': [
                'library information', 'library services', 'study resources', 'library hours',
                'book collection', 'research materials', 'library facilities',
                'study spaces', 'library locations', 'library catalog'
            ],
            'grading': [
                'grading system', 'grade scale', 'how grades work', 'GPA calculation',
                'academic evaluation', 'marking scheme', 'grade distribution',
                'transcript information', 'academic performance'
            ],
            'plagiarism': [
                'plagiarism policy', 'academic integrity', 'cheating policy', 'academic dishonesty',
                'citation requirements', 'academic misconduct', 'integrity guidelines'
            ],
            'attendance': [
                'attendance policy', 'class attendance', 'attendance requirements',
                'absence policy', 'attendance rules', 'missing classes'
            ],
            'research': [
                'research opportunities', 'research labs', 'research facilities', 'research centers',
                'laboratory information', 'research programs', 'research projects'
            ],
            'it_support': [
                'IT support', 'technical help', 'computer problems', 'system issues',
                'help desk', 'technology support', 'login problems', 'password issues',
                'wifi problems', 'software help', 'hardware issues'
            ],
            'fees_tuition': [
                'tuition fees', 'cost of education', 'fee structure', 'payment information',
                'how much does it cost', 'financial information', 'fee payment'
            ],
            'scholarships': [
                'scholarship information', 'financial aid', 'funding opportunities',
                'scholarship applications', 'financial assistance', 'grants available'
            ]
        }

        # Create intent embeddings for semantic matching (needs a loaded
        # sentence model, so it is skipped when none is available)
        if self.sentence_model:
            self.create_intent_embeddings()

        # User context to track conversation state
        self.user_context = {'last_intent': None, 'entities': {}}
        self.conversation_state = None
        self.user_type = None
        
    # Loading the data    
    def load_data(self, json_file_path):
        """Read the university data from a UTF-8 JSON file.

        Returns the parsed content on success. A missing file falls back to
        ``self.create_sample_data()``; a file that is not valid JSON yields
        an empty dict. Both problems are reported on stdout.
        """
        try:
            with open(json_file_path, 'r', encoding='utf-8') as handle:
                loaded = json.load(handle)
        except FileNotFoundError:
            print(f"❌ Error: Could not find {json_file_path}")
            print("Creating sample data structure...")
            return self.create_sample_data()
        except json.JSONDecodeError:
            print(f"❌ Error: Invalid JSON format in {json_file_path}")
            return {}

        print(f"✅ Loaded university data from {json_file_path}")
        return loaded
    
    # Add the semantic embeddings onto the text corpus
    def prepare_semantic_corpus(self):
        """Flatten ``self.data`` into a text corpus and embed it.

        Every string longer than 10 characters (after stripping) found
        anywhere in the nested dict/list structure becomes one corpus entry.
        A parallel metadata record stores where the string was found and
        copies selected sibling fields from the dict that directly contains it.

        Sets ``self.corpus``, ``self.corpus_metadata`` and
        ``self.corpus_embeddings``; the latter stays ``None`` when there is no
        text or no sentence model.
        """
        sibling_fields = ('info', 'url', 'link', 'source', 'reference')

        def walk(node, path, parent_key, owner):
            """Yield (text, path, parent_key, owner) for each qualifying string, depth-first."""
            if isinstance(node, dict):
                for key, value in node.items():
                    child_path = f"{path}/{key}" if path else key
                    yield from walk(value, child_path, key, node)
            elif isinstance(node, list):
                # List items keep the key under which the list itself was stored.
                for position, element in enumerate(node):
                    yield from walk(element, f"{path}[{position}]", parent_key, node)
            elif isinstance(node, str) and len(node.strip()) > 10:
                yield node.strip(), path, parent_key, owner

        self.corpus = []
        self.corpus_metadata = []
        self.corpus_embeddings = None

        for text, location, key_name, owner in walk(self.data, "", "", None):
            record = {'path': location, 'parent_key': key_name, 'content': text}
            # Only a dict owner can carry sibling fields such as a URL.
            if owner and isinstance(owner, dict):
                for field in sibling_fields:
                    if field in owner and owner[field]:
                        record[field] = owner[field]
            self.corpus.append(text)
            self.corpus_metadata.append(record)

        if self.corpus and self.sentence_model:
            print(f"🔍 Creating embeddings for {len(self.corpus)} text segments...")
            self.corpus_embeddings = self.sentence_model.encode(self.corpus, convert_to_tensor=True)
            print("✅ Semantic corpus ready")
    
    # Creating and adding embeddings for intent patterns and for better semantic matching
    def create_intent_embeddings(self):
        """Compute one representative embedding per intent.

        Each intent's example phrases from ``self.intent_patterns`` are
        encoded with ``self.sentence_model`` and averaged along the first
        dimension; the results are stored in ``self.intent_embeddings`` keyed
        by intent name.
        """
        centroids = {}
        self.intent_embeddings = centroids

        for intent_name, phrases in self.intent_patterns.items():
            phrase_vectors = self.sentence_model.encode(phrases, convert_to_tensor=True)
            # The mean over all phrase vectors stands in for the whole intent.
            centroids[intent_name] = torch.mean(phrase_vectors, dim=0)
    
    # Identifying user intent using semantic similarity
    def identify_intent_semantic(self, user_input, threshold=0.3):
        """Identify the user's intent by semantic similarity.

        The input is encoded with ``self.sentence_model`` and compared, via
        cosine similarity, with every vector in ``self.intent_embeddings``.

        Args:
            user_input: Raw user text.
            threshold: Minimum cosine similarity an intent must exceed to be
                accepted. Defaults to 0.3, the value that used to be
                hard-coded.

        Returns:
            A ``(intent, score)`` tuple. When no intent exceeds ``threshold``
            the result is ``('general', 0.0)``. When no sentence model or no
            intent embeddings are available, the result of
            ``self.identify_intent_fallback(user_input)`` is returned.
        """
        if not self.sentence_model or not hasattr(self, 'intent_embeddings'):
            return self.identify_intent_fallback(user_input)

        user_embedding = self.sentence_model.encode([user_input], convert_to_tensor=True)

        best_intent = 'general'
        best_score = 0.0

        for intent, intent_embedding in self.intent_embeddings.items():
            # unsqueeze(0) adds a leading dimension so the intent vector lines
            # up with the encoded one-item batch.
            similarity = torch.cosine_similarity(user_embedding, intent_embedding.unsqueeze(0))
            score = similarity.item()

            if score > threshold and score > best_score:
                best_score = score
                best_intent = intent

        return best_intent, best_score
    
    # Falling back using keyword matching 
    def identify_intent_fallback(self, user_input):
        """Identify the user's intent by plain keyword matching.

        Counts, per intent, how many of its phrases in
        ``self.intent_patterns`` occur as a case-insensitive substring of the
        input. The intent with the most hits wins; on a tie the intent listed
        first wins.

        Returns:
            ``(intent, score)`` where ``score`` is the share of the winning
            intent's phrases that matched, or ``('general', 0.0)`` when
            nothing matched.
        """
        lowered_input = user_input.lower()

        hit_counts = {}
        for intent_name, phrases in self.intent_patterns.items():
            hits = sum(1 for phrase in phrases if phrase.lower() in lowered_input)
            if hits:
                hit_counts[intent_name] = hits

        if not hit_counts:
            return 'general', 0.0

        winner = max(hit_counts, key=hit_counts.get)
        return winner, hit_counts[winner] / len(self.intent_patterns[winner])
    
    # Extracting entities and adding similar names to recognize them
    def extract_entities_enhanced(self, user_input):
        """Extract education level, user type and named entities from text.

        Keywords are matched as whole words or phrases (an optional trailing
        "s" or "'s" is tolerated), not as raw substrings. Substring matching
        made short keywords fire inside unrelated words: 'ma' matched
        "information" and 'ba' matched "database", so such queries were
        tagged with the wrong education level.

        Args:
            user_input: Raw user text.

        Returns:
            A dict with keys ``college``, ``department``, ``level``,
            ``user_type`` (each a string or ``None``) and ``specific_info``
            (a list). ``department`` is never filled in by this method.
        """
        entities = {
            'college': None,
            'department': None,
            'level': None,
            'user_type': None,
            'specific_info': []
        }

        user_input_lower = user_input.lower()

        def first_matching_category(category_keywords):
            """Return the first category having a keyword present as a whole word."""
            for category, keywords in category_keywords.items():
                alternatives = "|".join(re.escape(keyword) for keyword in keywords)
                # Lookarounds instead of \b so keywords containing '.' or
                # spaces ("ph.d", "first degree") are still delimited properly.
                pattern = r"(?<!\w)(?:" + alternatives + r")(?:['’]?s)?(?!\w)"
                if re.search(pattern, user_input_lower):
                    return category
            return None

        # Education level detection; dict order decides which level wins.
        level_patterns = {
            'undergraduate': ['undergraduate', 'undergrad', 'bachelor', 'bachelors', 'bsc', 'ba', 'first degree'],
            'masters': ['master', 'masters', 'graduate', 'msc', 'ma', 'postgraduate', 'masters degree'],
            'phd': ['phd', 'doctorate', 'doctoral', 'ph.d', 'doctor of philosophy']
        }
        entities['level'] = first_matching_category(level_patterns)

        # User type detection.
        # NOTE(review): 'staff' is checked before 'faculty', so the phrase
        # "academic staff" resolves to 'staff' — confirm this is intended.
        user_type_patterns = {
            'student': ['student', 'students', 'pupil', 'learner', 'studying'],
            'staff': ['staff', 'employee', 'worker', 'administration'],
            'faculty': ['faculty', 'professor', 'teacher', 'instructor', 'lecturer', 'academic staff']
        }
        entities['user_type'] = first_matching_category(user_type_patterns)

        # Use spaCy for named entity recognition when a pipeline is loaded.
        if self.nlp:
            doc = self.nlp(user_input)
            for ent in doc.ents:
                if ent.label_ == "ORG":
                    # The last ORG entity in the text wins.
                    entities['college'] = ent.text
                elif ent.label_ in ["PERSON", "GPE"]:
                    entities['specific_info'].append(ent.text)

        return entities
    
    # Find most relevant content using semantic similarity which includes fallback check, encoding query, comparing similarities, best matching and filtering based on the similarity threshold
    def find_relevant_content_semantic(self, query, top_k=5, threshold=0.25):
        """Return the corpus entries most similar in meaning to ``query``.

        Args:
            query: Text to search for.
            top_k: Maximum number of candidates to consider.
            threshold: Candidates must score strictly above this cosine
                similarity to be returned.

        Returns:
            A list of dicts with keys ``content``, ``metadata`` and
            ``similarity``, ordered from most to least similar. Delegates to
            ``self.find_relevant_content_fallback`` when no sentence model,
            no corpus embeddings or no corpus text is available.
        """
        if not self.sentence_model or self.corpus_embeddings is None or len(self.corpus) == 0:
            return self.find_relevant_content_fallback(query, top_k)

        encoded_query = self.sentence_model.encode([query], convert_to_tensor=True)
        scores = torch.cosine_similarity(encoded_query, self.corpus_embeddings)

        # topk cannot return more entries than there are scores.
        best = torch.topk(scores, min(top_k, len(scores)))

        return [
            {
                'content': self.corpus[position],
                'metadata': self.corpus_metadata[position],
                'similarity': value,
            }
            for value, position in zip(best.values.tolist(), best.indices.tolist())
            if value > threshold
        ]
    
    # Fallback method for the previous one 
    def find_relevant_content_fallback(self, query, top_k=5):
        """Search the corpus by simple word overlap.

        Both the query and each corpus entry are lower-cased and split on
        whitespace. An entry's score is the number of distinct query words it
        contains divided by the number of distinct query words.

        Returns:
            Up to ``top_k`` dicts with keys ``content``, ``metadata`` and
            ``similarity``, best score first; entries sharing no word with
            the query are left out.
        """
        query_terms = set(query.lower().split())

        matches = []
        for position, text in enumerate(self.corpus):
            shared = len(query_terms & set(text.lower().split()))
            if shared > 0:
                matches.append({
                    'content': text,
                    'metadata': self.corpus_metadata[position],
                    'similarity': shared / len(query_terms),
                })

        # The sort is stable, so equally scored entries keep corpus order.
        ranked = sorted(matches, key=lambda match: match['similarity'], reverse=True)
        return ranked[:top_k]
    
    # Extracting links from content
    def extract_links_from_content(self, content):
        """Extract URLs from content"""
        url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
        links = re.findall(url_pattern, str(content))
        return links
    
    def format_table_from_html(self, html_content):
        """Label content containing an HTML table; return other content as text.

        The content is converted with ``str()``. If it contains ``<table``
        (case-insensitive) it is wrapped in a "Table Data" block, otherwise
        the plain string is returned. The HTML itself is not parsed.
        """
        text = str(html_content)
        if '<table' not in text.lower():
            return text
        return f"📊 **Table Data:**\n{text}\n"
    
    # Help getter function to search for departments and colleges
    def get_colleges_and_departments(self):
        """Collect colleges and their departments from ``self.data``.

        Dict keys containing 'college', 'school' or 'faculty' are treated as
        colleges; their direct sub-keys containing 'department', 'dept' or
        'program' are recorded as that college's departments.

        The search descends through lists as well as dicts. It used to stop
        at any list, so list-rooted data or colleges nested under a list
        value were never found.

        Returns:
            A dict mapping college key -> list of department keys. When the
            structural search finds nothing and a sentence model is
            available, college names are taken from the paths of
            semantically relevant corpus entries instead (with empty
            department lists).
        """
        college_terms = ('college', 'school', 'faculty')
        department_terms = ('department', 'dept', 'program')
        colleges = {}

        def search_recursive(node):
            if isinstance(node, dict):
                for key, value in node.items():
                    key_lower = str(key).lower()

                    if any(term in key_lower for term in college_terms):
                        if isinstance(value, dict):
                            colleges[key] = [
                                sub_key for sub_key in value
                                if any(term in str(sub_key).lower() for term in department_terms)
                            ]
                        else:
                            colleges[key] = []

                    search_recursive(value)
            elif isinstance(node, (list, tuple)):
                for item in node:
                    search_recursive(item)

        search_recursive(self.data)

        # If no colleges found, try semantic search
        if not colleges and self.sentence_model:
            college_query = "college school department faculty academic division"
            relevant_content = self.find_relevant_content_semantic(college_query, top_k=10)

            for content in relevant_content:
                path = content['metadata']['path']
                if any(term in path.lower() for term in ('college', 'school', 'department')):
                    for part in path.split('/'):
                        if any(term in part.lower() for term in ('college', 'school')):
                            colleges.setdefault(part, [])

        return colleges

    # This function helps get admission requirements of undergraduate (found in FAQs), masters and PhD from links found in Admission Requirements section from the json file
    def handle_admission_requirements(self, entities, user_input):
        """Answer an admission-requirements question for one study level.

        Undergraduate answers come from the top-level "FAQs" section of
        ``self.data``; Masters and PhD answers come from the matching child
        of the top-level "Admission Requirements" section. When no level is
        known the user is asked for one and ``self.conversation_state`` is
        set to ``"awaiting_admission_level"`` so the next message is read as
        the reply.

        Args:
            entities: Entity dict for the current message; its ``level``
                entry is read and, on a follow-up reply, updated in place.
            user_input: Raw user text, re-parsed only on a follow-up reply.

        Returns:
            A Markdown-formatted response string.
        """

        def text_of(node, key, default=""):
            """Return ``node[key]`` as text; ``default`` for missing or null values."""
            value = node.get(key)
            return default if value is None else str(value)

        def find_section(title):
            """Return the first top-level section with this lower-case title, if any."""
            for section in self.data:
                if isinstance(section, dict) and text_of(section, "title").lower() == title:
                    return section
            return None

        def undergraduate_response():
            """Build the undergraduate answer from the FAQs section, or None."""
            section = find_section("faqs")
            if section is None:
                return None
            url = section.get("url", "")

            # Prefer the specific FAQ about high school certificates.
            for faq in section.get("children") or []:
                if not isinstance(faq, dict):
                    continue
                if "are students who obtained their high school certificate" in text_of(faq, "title").lower():
                    answer = faq.get("answer", "No answer provided.")
                    return (
                        "🎓 **Undergraduate Admission Requirements**\n\n"
                        f"**Answer:** {answer}\n\n"
                        f"🔗 For more detailed information, visit: {url}"
                    )

            # Specific FAQ not found: give general undergraduate information.
            return (
                "🎓 **Undergraduate Admission Requirements**\n\n"
                "Based on the available information, undergraduate admission requirements include:\n\n"
                "• High school certificate (from within or outside the Kingdom)\n"
                "• General Aptitude Test scores\n"
                "• Academic Achievement Test scores\n"
                "• Meeting specific program requirements\n\n"
                f"🔗 For complete details and FAQs, visit: {url}"
            )

        def graduate_response(program_titles, heading, program_label):
            """Build a Masters/PhD answer from the Admission Requirements section, or None."""
            section = find_section("admission requirements")
            if section is None:
                return None

            for child in section.get("children") or []:
                if not isinstance(child, dict):
                    continue
                if text_of(child, "title").lower() not in program_titles:
                    continue

                url = child.get("url", section.get("url", ""))
                content = text_of(child, "content")

                # Drill down one more level into nested children.
                details = [
                    f"\n• **{text_of(item, 'title')}**: "
                    f"{text_of(item, 'content', 'No details available.')}"
                    for item in child.get("children") or []
                    if isinstance(item, dict)
                ]
                if details:
                    content += "\n\nAdditional Details:\n" + "".join(details)

                return (
                    f"{heading}\n\n"
                    f"{content}\n\n"
                    f"🔗 For complete information, visit: {url}"
                )

            # Program entry not found among the children: give general info.
            return (
                f"{heading}\n\n"
                f"{text_of(section, 'content')}\n\n"
                f"Please check the admission requirements section for specific {program_label} program criteria.\n\n"
                f"🔗 For complete information, visit: {section.get('url', '')}"
            )

        if not hasattr(self, 'conversation_state'):
            self.conversation_state = None

        # A previous turn asked for the level: read it from this reply.
        if self.conversation_state == "awaiting_admission_level":
            follow_up_entities = self.extract_entities_enhanced(user_input)
            if follow_up_entities['level']:
                entities['level'] = follow_up_entities['level']

        # Every path below either re-arms the prompt or leaves the state cleared.
        self.conversation_state = None

        level = entities.get('level')

        # If still no level detected, prompt user
        if not level:
            self.conversation_state = "awaiting_admission_level"
            return (
                "Hi! I'd be happy to help you with admission requirements. 😊\n\n"
                "Could you please specify which level you're interested in?\n"
                "🎓 **Undergraduate** (Bachelor's degree)\n"
                "📚 **Masters** (Graduate degree)\n"
                "🔬 **PhD** (Doctoral degree)\n\n"
                "Just reply with one of those!"
            )

        response = None
        if level == 'undergraduate':
            response = undergraduate_response()
        elif level == 'masters':
            response = graduate_response(
                ("master’s programs", "master's programs"),
                "📚 **Master's Programs Admission Requirements**",
                "master's",
            )
        elif level == 'phd':
            # Same heading whether or not the "PhD Programs" entry is found.
            response = graduate_response(
                ("phd programs",),
                "🔬 **PhD Programs Admission Requirements**",
                "PhD",
            )

        if response is not None:
            return response

        # Fallback if level is recognized but no specific info found
        return (
            f"🤔 I couldn't find specific {level} admission requirements in our database. "
            "Please try contacting the admissions office directly or visit the university website for the most accurate and up-to-date information."
        )

    # This function helps get academic calendar from the json file and format it into a table by putting each row on top of another (since it is not in HTML format)
    def handle_academic_calendar(self):
        """Return the academic calendar as a Markdown table.

        Looks for the top-level item of ``self.data`` titled "Academic
        Calendar" and renders its ``table`` (``headers`` and ``rows``) one
        row per line. Short rows are padded with empty cells and long rows
        are cut to the column count. The column count is the number of
        headers or, when there are no headers, the length of the longest row,
        so header-less tables still show their data. Cells are converted to
        text; ``None`` becomes an empty cell.

        Returns:
            A Markdown-formatted response string; a fixed message pointing to
            the registrar's site when no calendar item exists.
        """
        calendar_item = next(
            (
                item for item in self.data
                if isinstance(item, dict)
                and str(item.get("title") or "").lower() == "academic calendar"
            ),
            None,
        )

        if not calendar_item:
            return (
                "📅 **Academic Calendar**\n\n"
                "I couldn't find the academic calendar in our current data. "
                "Please visit the registrar’s official site for updated info.\n\n"
                "🔗 https://dar.ksu.edu.sa/en/CurrentCalendar"
            )

        def as_cells(values):
            """Convert one header/row sequence into a list of cell strings."""
            if not isinstance(values, (list, tuple)):
                values = [values]
            return ["" if value is None else str(value) for value in values]

        url = calendar_item.get("url", "")
        table = calendar_item.get("table")
        if not isinstance(table, dict):
            table = {}

        headers = as_cells(table.get("headers") or [])
        rows = [as_cells(row) for row in (table.get("rows") or [])]

        # Drop a header row that was also scraped as the first data row.
        if rows and headers and rows[0] == headers:
            rows = rows[1:]

        column_count = len(headers) or max((len(row) for row in rows), default=0)

        parts = ["📅 **Academic Calendar**\n\n"]
        if url:
            # Without a URL the Markdown link would be empty, so skip the line.
            parts.append(f"🔗 [View Full Calendar]({url})\n\n")

        if headers:
            parts.append("| " + " | ".join(headers) + " |\n")
            parts.append("|" + " --- |" * len(headers) + "\n")

        for row in rows:
            # Pad or trim so every row has exactly column_count cells.
            padded_row = row + [""] * (column_count - len(row))
            parts.append("| " + " | ".join(padded_row[:column_count]) + " |\n")

        parts.append(
            "\n🗓️ **Need specific dates?** You can ask about registration deadlines, exam schedules, or semester start dates!"
        )
        return "".join(parts)
      
    # This function gets the college categories, colleges, departments, their about, contact info and faculty directories
    def handle_degree_programs(self, entities, user_input):
        """
        Navigate the college hierarchy and describe the node matching ``user_input``.

        Handles: Colleges -> College Categories -> Individual Colleges -> Academic Departments

        The node whose title best matches the input is located with a fuzzy
        title match, classified, and rendered according to its type.

        Args:
            entities: Entity dict for the current message; not used by this
                handler.
            user_input: Raw user text, matched against node titles.

        Returns:
            A Markdown-formatted response string; a list of example queries
            when nothing matches well enough (including empty input).
        """

        def score_title(title_lower, target_lower):
            """Score a title against the query: 100 exact, 90/85 containment, else word overlap."""
            if title_lower == target_lower:
                return 100
            if target_lower in title_lower:
                return 90
            # An empty title is a substring of everything; it must not match.
            if title_lower and title_lower in target_lower:
                return 85
            target_words = set(target_lower.split())
            title_words = set(title_lower.split())
            if target_words and title_words:
                overlap = len(target_words & title_words)
                union = len(target_words | title_words)
                return (overlap / union) * 80
            return 0

        def find_node(target_title, json_data):
            """Find node by title with fuzzy matching"""
            target_lower = target_title.lower().strip()
            if not target_lower:
                # An empty query is contained in every title; treat as no match.
                return None

            best_match = None
            best_score = 0

            # Check if this is a department search
            is_department_search = any(word in target_lower for word in ('department', 'dept'))

            def consider(candidate):
                """Keep ``candidate`` if it beats the best score so far (first wins on ties)."""
                nonlocal best_match, best_score
                candidate_title = candidate.get('title')
                if not isinstance(candidate_title, str):
                    return
                score = score_title(candidate_title.lower().strip(), target_lower)
                if score > best_score:
                    best_score = score
                    best_match = candidate

            def search_recursive(node):
                if isinstance(node, dict) and 'title' in node:
                    consider(node)
                    children = node.get('children') or []

                    # Departments sit under an untitled 'Academic Departments'
                    # section, which the normal descent below does not enter
                    # (it only follows titled nodes), so look there directly.
                    if is_department_search:
                        for child in children:
                            if isinstance(child, dict) and child.get('section') == 'Academic Departments':
                                for dept in child.get('children') or []:
                                    if isinstance(dept, dict) and 'title' in dept:
                                        consider(dept)

                    # Search children normally
                    for child in children:
                        search_recursive(child)

                elif isinstance(node, list):
                    for item in node:
                        search_recursive(item)

            search_recursive(json_data)

            return best_match if best_score > 30 else None

        def get_node_type(node):
            """Determine what type of node this is"""
            if not isinstance(node, dict) or 'title' not in node:
                return 'unknown'

            title = node['title'].lower()
            has_children = 'children' in node

            # Main "Colleges" node
            if title == 'colleges' and has_children:
                return 'colleges_root'

            # College categories (contains "colleges" in title)
            if 'colleges' in title and has_children:
                return 'category'

            # Individual college (has "About College" section)
            if has_children:
                for child in node.get('children') or []:
                    if isinstance(child, dict) and child.get('section') == 'About College':
                        return 'college'

            # Academic department (has contact_info, faculty_links, faculty_staff_links, or detailed content)
            has_department_keys = any(
                key in node for key in ('contact_info', 'faculty_links', 'faculty_staff_links')
            )
            has_long_content = bool(node.get('content')) and len(node.get('content', '')) > 100
            if has_department_keys or has_long_content:
                return 'department'

            return 'unknown'

        # Find the target node
        target_node = find_node(user_input.strip(), self.data)

        if not target_node:
            return ("🤖 I couldn't find that. Try:\n"
                    "• 'Colleges' - see all categories\n"
                    "• 'Science Colleges' - see colleges in category\n"
                    "• 'College of Engineering' - specific college\n"
                    "• 'Computer Science Department' - department info")

        node_type = get_node_type(target_node)
        title = target_node.get('title', '')

        # Handle based on node type
        if node_type == 'colleges_root':
            # Show college categories only
            response = "🏛️ **King Saud University - Colleges**\n\n"
            response += "📚 **College Categories:**\n\n"

            total_colleges = 0
            for category in target_node.get('children') or []:
                if isinstance(category, dict) and 'title' in category:
                    college_count = len(category.get('children') or [])
                    total_colleges += college_count
                    response += f"• **{category['title']}** ({college_count} colleges)\n"

            response += f"\n🎓 **Total: {total_colleges} colleges**\n\n"
            response += "💡 Ask about any category above for details!"
            return response

        elif node_type == 'category':
            # Show colleges in this category
            response = f"🏛️ **{title}**\n\n"
            colleges = target_node.get('children') or []

            if colleges:
                response += f"**Colleges ({len(colleges)} total):**\n\n"
                for i, college in enumerate(colleges, 1):
                    if isinstance(college, dict) and 'title' in college:
                        response += f"{i}. **{college['title']}**\n"
                        if college.get('url'):
                            response += f"   🔗 {college['url']}\n"
                        response += "\n"

            response += "💡 Ask about any college above for details!"
            return response

        elif node_type == 'college':
            # Show college info + list departments (titles only)
            response = f"🎓 **{title}**\n\n"

            if target_node.get('url'):
                response += f"🔗 **Website:** {target_node['url']}\n\n"

            # Find About College section
            about_content = None
            departments = []

            for child in target_node.get('children') or []:
                if isinstance(child, dict):
                    if child.get('section') == 'About College':
                        about_content = child.get('content', '')
                    elif child.get('section') == 'Academic Departments':
                        departments = child.get('children') or []

            if about_content:
                response += f"**About the College:**\n{about_content}\n\n"

            if departments:
                response += f"**Academic Departments ({len(departments)} total):**\n\n"
                for i, dept in enumerate(departments, 1):
                    if isinstance(dept, dict) and 'title' in dept:
                        response += f"{i}. **{dept['title']}**\n"

                response += "\n💡 Ask about any department for detailed info!"

            return response

        elif node_type == 'department':
            # Show full department details
            response = f"🎓 **{title}**\n\n"

            if target_node.get('url'):
                response += f"🔗 **Website:** {target_node['url']}\n\n"

            if target_node.get('content'):
                content = target_node['content'].strip()
                if content:
                    response += f"**About the Department:**\n{content}\n\n"

            if target_node.get('contact_info'):
                contact = target_node['contact_info'].strip()
                if contact:
                    response += f"📞 **Contact Information:**\n{contact}\n\n"

            # Faculty links (either key may be missing or null)
            faculty_links = (target_node.get('faculty_links') or []) + \
                (target_node.get('faculty_staff_links') or [])
            if faculty_links:
                response += "👨‍🏫 **Faculty & Staff:**\n"
                for link in faculty_links:
                    if isinstance(link, dict):
                        title_text = link.get('title', 'Faculty Link')
                        url = link.get('url', '#')
                        response += f"• [{title_text}]({url})\n"
                response += "\n"

            return response

        else:
            return f"ℹ️ Found '{title}' but couldn't determine its type. Please be more specific."
       
    # This function finds the housing information (things that I thought were important)
    def handle_housing(self, entities, user_input):
        """Answer a housing question, asking for the audience when unknown.

        Args:
            entities: Entity dict for the current message; its ``user_type``
                entry selects faculty or student housing.
            user_input: Raw user text; not used by this handler.

        Returns:
            The faculty or student housing answer, a clarification question
            listing the available housing types, or a short notice when no
            housing section (or no known audience) is available.
        """
        # Use the Housing section that actually has data (not an empty one).
        housing_section = self.find_housing_with_data()
        if not housing_section:
            return "🏠 Housing information not found."

        requested_for = entities.get('user_type')
        if not requested_for:
            # Audience unknown: list the available housing types and ask.
            option_lines = []
            for option in self.get_housing_types(housing_section):
                icon = "🎓" if "student" in option.lower() else "👨‍🏫"
                option_lines.append(f"{icon} **{option}**\n")
            return (
                "🏠 I'd be happy to help with housing information! Please specify:\n\n"
                + "".join(option_lines)
                + "\nWhich type of housing are you interested in?"
            )

        audience = requested_for.lower()
        if audience in ('faculty', 'staff', 'employee'):
            return self.handle_faculty_housing(housing_section)
        if audience == 'student':
            return self.handle_student_housing(housing_section)

        return "🏠 Please specify faculty or student housing."

    # This additional function finds the Housing section that actually contains data
    def find_housing_with_data(self):
        """Find the Housing section that contains actual data (not empty)"""
        def search_recursive(data):
            if isinstance(data, dict):
                title = data.get('title', '')
                if title.lower() == 'housing' and data.get('children'):
                    # Found Housing section with children - this is the one we want
                    return data
                
                # Search in children
                for child in data.get('children', []):
                    result = search_recursive(child)
                    if result:
                        return result
            elif isinstance(data, list):
                for item in data:
                    result = search_recursive(item)
                    if result:
                        return result
            return None
        
        return search_recursive(self.data)

    # This additional function gets housing type whether it is student or faculty housing
    def get_housing_types(self, housing_section):
        """List the titles of direct children whose title mentions 'housing'.

        The check is case-insensitive; children with an empty or missing title
        are left out. Titles are returned in their original order and casing.
        """
        child_titles = (entry.get('title', '') for entry in housing_section.get('children', []))
        return [name for name in child_titles if name and 'housing' in name.lower()]

    # This additional function helps navigating Faculty housing sections that I thought were important
    def handle_faculty_housing(self, housing_section):
        """Handle faculty housing navigation"""
        # Find Faculty Housing section
        faculty_housing = None
        for child in housing_section.get('children', []):
            if child.get('title', '').lower() == 'faculty housing':
                faculty_housing = child
                break
        
        if not faculty_housing:
            available_children = [child.get('title', 'No title') for child in housing_section.get('children', [])]
            return f"🏠 Faculty housing not found. Available options: {', '.join(available_children)}"
        
        response = "🏠 **Faculty Housing**\n\n"
        
        # Display direct children of Faculty Housing
        for child in faculty_housing.get('children', []):
            title = child.get('title', '')
            url = child.get('url', '')
            
            response += f"📋 **{title}**\n"
            
            if url:
                response += f"🔗 [Access {title}]({url})\n"
            
            # If this child has its own children (like "Related Links"), display them
            if child.get('children'):
                response += self.display_child_links(child)
            
            response += "\n"
        
        return response

    # This additional function helps navigating Student housing sections that I thought were important
    def handle_student_housing(self, housing_section):
        """Handle student housing navigation"""
        student_housing = None
        for child in housing_section.get('children', []):
            if child.get('title', '').lower() == 'student housing':
                student_housing = child
                break
        
        if not student_housing:
            available_children = [child.get('title', 'No title') for child in housing_section.get('children', [])]
            return f"🏠 Student housing not found. Available options: {', '.join(available_children)}"
        
        response = "🏠 **Student Housing**\n\n"
        
        for child in student_housing.get('children', []):
            title = child.get('title', '')
            url = child.get('url', '')
            content = child.get('content', '')
            
            response += f"📋 **{title}**\n"
            
            # Show content if available (like the procedural guide)
            if content:
                # Format the content nicely
                formatted_content = self.format_housing_content(content)
                response += f"{formatted_content}\n"
            
            if url:
                response += f"🔗 [Access {title}]({url})\n"
            
            # Show child links if any (like for Female Student Housing)
            if child.get('children'):
                response += self.display_child_links(child)
            
            response += "\n"
        
        return response

    # Gets child links 
    def display_child_links(self, parent_section):
        """Format the titled children of a section as a markdown bullet list.

        Returns an empty string when the section has no children. Otherwise the
        result starts with an "Available options" heading followed by one bullet
        per titled child; a child with a URL also gets an "Access here" link.
        Children without a title are skipped.
        """
        entries = parent_section.get('children')
        if not entries:
            return ""

        lines = ["\n**Available options:**\n"]
        for entry in entries:
            label = entry.get('title', '')
            if not label:
                continue
            link = entry.get('url', '')
            suffix = f" - [Access here]({link})" if link else ""
            lines.append(f"• **{label}**{suffix}\n")

        return "".join(lines)

    # Editing the housing information content for better readability
    def format_housing_content(self, content):
        """Tidy housing text: trim lines, drop blank ones, and bold the headings.

        A line is treated as a heading and wrapped in ``**`` when it contains
        'step ', 'condition' or 'how to apply' (case-insensitive). Falsy input
        yields an empty string.
        """
        if not content:
            return ""

        heading_markers = ('step ', 'condition', 'how to apply')
        rendered = []
        for raw_line in content.split('\n'):
            text = raw_line.strip()
            if not text:
                continue
            lowered = text.lower()
            is_heading = any(marker in lowered for marker in heading_markers)
            rendered.append(f"**{text}**" if is_heading else text)

        return '\n'.join(rendered)
    
    # This is to handle library sections, their libraries and content of the libraries along with Location links
    def handle_library(self, entities, user_input):
        user_input_lower = user_input.lower()

        def contains_title(item, text):
            return item.get("title", "").lower() in text

        def find_matching_node(data, text):
            for node in data:
                if contains_title(node, text):
                    return node
            return None

        def find_matching_child(parent, text):
            for child in parent.get("children", []):
                if contains_title(child, text):
                    return child
            return None

        def format_contact_info(section):
            lines = []
            for table in section.get("tables", []):
                for row in table.get("rows", []):
                    lines.append(" | ".join(row))
            return "\n".join(lines)

        # STEP 1: Start from top-level "Libraries"
        libraries_root = find_matching_node(self.data, "libraries")
        if not libraries_root:
            return "❌ Could not find 'Libraries' section in data."

        # STEP 2: If user specifies a category (e.g., Shared libraries)
        selected_category = find_matching_child(libraries_root, user_input_lower)
        if selected_category:
            # STEP 2a: If user already specified a library
            matched_library = find_matching_child(selected_category, user_input_lower)
            if matched_library:
                info, contact, location = "", "", ""

                for section in matched_library.get("children", []):
                    title = section.get("title", "").lower()
                    if "information" in title:
                        info = section.get("content", "").strip()
                    elif "contact" in title:
                        contact = format_contact_info(section)
                    elif "location" in title:
                        location = section.get("url", "")

                response = f"📚 **{matched_library['title']}**\n\n"
                if info:
                    response += f"**Information:**\n{info}\n\n"
                if contact:
                    response += f"**Contact Info:**\n{contact}\n\n"
                if location:
                    response += f"**Location:** {location}"
                return response.strip()

            # STEP 2b: User just selected the category, list children
            library_titles = [child["title"] for child in selected_category.get("children", [])]
            return f"Here are the libraries under **{selected_category['title']}**:\n" + "\n".join(f"- {title}" for title in library_titles)

        # STEP 3: User mentioned a library directly without saying category
        for category in libraries_root.get("children", []):
            matched_library = find_matching_child(category, user_input_lower)
            if matched_library:
                info, contact, location = "", "", ""
                for section in matched_library.get("children", []):
                    title = section.get("title", "").lower()
                    if "information" in title:
                        info = section.get("content", "").strip()
                    elif "contact" in title:
                        contact = format_contact_info(section)
                    elif "location" in title:
                        location = section.get("url", "")

                response = f"📚 **{matched_library['title']}**\n\n"
                if info:
                    response += f"**Information:**\n{info}\n\n"
                if contact:
                    response += f"**Contact Info:**\n{contact}\n\n"
                if location:
                    response += f"**Location:** {location}"
                return response.strip()

        # STEP 4: User only said "libraries" → ask to choose category
        category_titles = [cat["title"] for cat in libraries_root.get("children", [])]
        return "🤖 Would you like to know about one of the following library categories?\n" + "\n".join(f"- {title}" for title in category_titles)
    
    # This function is to scrape grading system scale and add the PDF
    def handle_grading_system(self):
        """Handle grading system with semantic search"""
        query = "grading system grade scale GPA evaluation assessment"
        relevant_content = self.find_relevant_content_semantic(query, top_k=5)
        response = "📊 **Grading System**\n\n"
        
        for content in relevant_content[:3]:
            content_text = content['content']
            response += content_text + "\n\n"
            
            # Extract links from content text
            links = self.extract_links_from_content(content_text)
            if links:
                response += f"🔗 [Grading policy PDF]({links[0]})\n\n"
            
            # Check metadata for additional fields like 'info'
            metadata = content.get('metadata', {})
            if 'info' in metadata and metadata['info']:
                response += f"🔗 [Study and Examinations Regulations PDF]({metadata['info']})\n\n"
            elif 'url' in metadata and metadata['url']:
                response += f"🔗 [Additional information]({metadata['url']})\n\n"
        
        if not relevant_content:
            response += "Grading system information is not available in our current database. Please check the student handbook or contact academic affairs.\n\n"
        
        return response
    
    # This function is to handle the plagiarism with the PDF
    def handle_plagiarism(self):
        """Handle plagiarism queries with semantic search"""
        query = "plagiarism academic integrity policy cheating misconduct"
        relevant_content = self.find_relevant_content_semantic(query, top_k=5)
        
        response = "⚠️ **Academic Integrity & Plagiarism Policy**\n\n"
        
        for content in relevant_content[:3]:
            response += content['content'] + "\n\n"
            links = self.extract_links_from_content(content['content'])
            if links:
                response += f"🔗 [Full policy]({links[0]})\n\n"
        
        if not relevant_content:
            response += "Plagiarism policy information is not available in our current database. Please check the student handbook or contact academic affairs.\n\n"
        
        return response
    
    # This function is to handle the attendance rules with the PDF
    def handle_attendance(self):
        """Handle attendance queries"""
        query = "attendance policy class attendance requirements"
        relevant_content = self.find_relevant_content_semantic(query, top_k=3)
        
        response = "📋 **Attendance Policy**\n\n"
        
        if relevant_content:
            for content in relevant_content:
                response += content['content'] + "\n\n"
                links = self.extract_links_from_content(content['content'])
                if links:
                    response += f"🔗 [Attendance policy]({links[0]})\n\n"
        else:
            response += "Please refer to the grading system document for detailed attendance requirements.\n\n"
        
        # Always check for grading system info field as it contains attendance details
        grading_query = "grading system grade scale GPA evaluation assessment"
        grading_content = self.find_relevant_content_semantic(grading_query, top_k=5)
        
        for content in grading_content:
            metadata = content.get('metadata', {})
            # Check if this is grading system content and has info field
            if 'info' in metadata and metadata['info']:
                response += f"📖 Please refer to the Study and Examinations Regulations for detailed attendance policy:\n"
                response += f"🔗 [Attendance found here: ]({metadata['info']})\n\n"
                break
        
        return response
    
    # This is to scrape the research labs links
    def handle_research_labs(self):
        """Handle research labs and facilities - find Research node with Labs child"""
        
        response = "🔬 **Research Labs & Facilities**\n\n"
        
        def find_research_with_labs(data):
            """Recursively find Research node that has Labs as a child"""
            if isinstance(data, dict):
                # Check if this is a Research node
                if data.get('title') == 'Research':
                    # Check if it has children
                    if 'children' in data:
                        for child in data['children']:
                            if isinstance(child, dict) and child.get('title') == 'Labs':
                                # Found Research->Labs! Return the Labs children
                                return child.get('children', [])
                
                # Recursively search in children
                if 'children' in data:
                    for child in data['children']:
                        result = find_research_with_labs(child)
                        if result:
                            return result
            
            elif isinstance(data, list):
                # If data is a list, search each item
                for item in data:
                    result = find_research_with_labs(item)
                    if result:
                        return result
            
            return None
        
        try:
            labs_children = find_research_with_labs(self.data)  # You'll need to adjust this
            
            if labs_children:
                response += "Here are the research labs and facilities:\n\n"
                for i, lab in enumerate(labs_children, 1):
                    title = lab.get('title', 'Unknown Lab')
                    url = lab.get('url', '')
                    
                    # Normalize URL
                    if url and url.startswith('/'):
                        url = f"https://ksu.edu.sa{url}"
                    
                    response += f"{i}. **{title}**\n"
                    if url:
                        response += f"   🔗 {url}\n"
                    response += "\n"
            else:
                response += "Could not find Research node with Labs child.\n\n"
                
        except AttributeError:
            # Fallback if we don't have direct access to JSON data
            response += "Unable to access JSON data directly. Please ensure the JSON data is available.\n\n"
        
        return response

    # This function adds all the IT helpdesk for student and staff and paths inside. 
    def handle_it_support(self, entities, user_input):
        from difflib import get_close_matches

        # Find IT Helpdesk node
        it_helpdesk = next((item for item in self.data if item.get("title", "").lower() == "it helpdesk"), None)
        if not it_helpdesk:
            return "⚠️ Sorry, I couldn't find the IT Helpdesk information."

        user_input_lower = user_input.lower()
        user_type = None

        if "student" in user_input_lower:
            user_type = "Student"
        elif "staff" in user_input_lower or "faculty" in user_input_lower or "professor" in user_input_lower:
            user_type = "Staff"

        if not user_type:
            return "❓ Are you a student or staff/faculty? Please specify so I can assist you."

        # Navigate to the relevant section
        section = next((child for child in it_helpdesk.get("children", []) if child.get("title", "").lower() == user_type.lower()), None)
        if not section:
            return f"⚠️ Sorry, I couldn’t find IT support info for {user_type}."

        # Extract all issues and sub-issues
        def extract_issues_with_hierarchy(node, parent_title=None):
            results = []
            title = node.get("title")
            if title:
                full_title = f"{parent_title} → {title}" if parent_title else title
                results.append((full_title, title))
            for child in node.get("children", []):
                results.extend(extract_issues_with_hierarchy(child, title))
            return results

        all_issues = extract_issues_with_hierarchy(section)
        plain_titles = [t[1].lower() for t in all_issues]

        # Try to match the user input to a known issue
        matched = get_close_matches(user_input.lower(), plain_titles, n=1, cutoff=0.4)

        if matched:
            matched_title = next(full for full, plain in all_issues if plain.lower() == matched[0])
            ksu_code = "KSU1" if user_type == "Staff" else "KSU2"
            return (
                f"🛠️ It looks like you're facing: **{matched_title}**.\n"
                f"Please visit the [IT Helpdesk]({it_helpdesk['url']}), select **{ksu_code}**, and click **'Report an Issue'**."
            )

        # If no match found, show all options (parents and their children)
        options_text = f"📋 I couldn't find an exact match. Here are support topics for {user_type}:\n\n"
        grouped = {}
        for full_title, child_title in all_issues:
            parent = full_title.split("→")[0].strip()
            grouped.setdefault(parent, []).append(child_title)

        for parent, children in grouped.items():
            options_text += f"🔹 **{parent}**\n"
            for c in children:
                options_text += f"  • {c}\n"

        options_text += "\n💬 Please choose one of the topics above or rephrase your issue."

        return options_text
    

    #---------------- THESE THINGS HAVE NOT BEEN SCRAPED YET ---------------------------------------#
    def handle_fees_tuition(self, user_input):
        """Handle tuition and fees queries"""
        query = "tuition fees cost payment financial charges"
        relevant_content = self.find_relevant_content_semantic(query, top_k=5)
        
        response = "💰 **Tuition & Fees Information**\n\n"
        
        for content in relevant_content[:3]:
            response += content['content'] + "\n\n"
            links = self.extract_links_from_content(content['content'])
            if links:
                response += f"🔗 [Fee structure]({links[0]})\n\n"
        
        if not relevant_content:
            response += ("Tuition and fee information is not available in our current database. "
                        "Please contact the finance office or check the student portal for current rates.\n\n")
        
        return response
    
    def handle_scholarships(self, user_input):
        """Handle scholarship and financial aid queries"""
        query = "scholarship financial aid funding grants assistance"
        relevant_content = self.find_relevant_content_semantic(query, top_k=5)
        
        response = "🎓 **Scholarships & Financial Aid**\n\n"
        
        for content in relevant_content[:3]:
            response += content['content'] + "\n\n"
            links = self.extract_links_from_content(content['content'])
            if links:
                response += f"🔗 [Scholarship portal]({links[0]})\n\n"
        
        if not relevant_content:
            response += ("Scholarship information is not available in our current database. "
                        "Please contact the financial aid office for information about available scholarships and grants.\n\n")
        
        return response
    
    #-----------------------------------------------------------------------------------------#
    
    # General queries
    def handle_general_query_enhanced(self, user_input):
        """Enhanced general query handling with better semantic understanding"""
        # First, try to find relevant content
        relevant_content = self.find_relevant_content_semantic(user_input, top_k=5, threshold=0.2)
        
        if relevant_content:
            response = "💡 **Here's what I found for you:**\n\n"
            
            for i, content in enumerate(relevant_content[:3], 1):
                text = content['content']
                
                # Intelligent summarization
                if len(text) > 250:
                    sentences = text.split('.')
                    summary = sentences[0]
                    if len(summary) < 200 and len(sentences) > 1:
                        summary += '. ' + sentences[1]
                    response += f"**{i}.** {summary}...\n\n"
                else:
                    response += f"**{i}.** {text}\n\n"
                
                # Add links
                links = self.extract_links_from_content(content['content'])
                if links:
                    response += f"   🔗 [More information]({links[0]})\n\n"
            
            # Add contextual follow-up suggestions
            response += "❓ **Want to know more?** You can ask me about:\n"
            response += "• Admission requirements and application process\n"
            response += "• Academic programs and course information\n"
            response += "• Campus facilities and student services\n"
            response += "• Contact information for departments\n"
            
        else:
            response = ("🤔 I couldn't find specific information about that in our database. "
                       "However, I can help you with:\n\n"
                       "🎓 **Academics:** Admission requirements, degree programs, academic calendar\n"
                       "🏠 **Campus Life:** Housing, libraries, dining, recreation\n"
                       "💼 **Services:** IT support, financial aid, career services\n"
                       "📞 **Contact:** Department information, faculty directories\n"
                       "📋 **Policies:** Grading, attendance, academic integrity\n"
                       "🔬 **Research:** Labs, facilities, opportunities\n\n"
                       "Could you try rephrasing your question or ask about one of these topics?")
        
        return response
    

    # This is all the chats handler functions inside
    def chat(self, user_input):
        """Enhanced main chat function with better semantic understanding"""
        # Identify intent using semantic similarity
        intent, confidence = self.identify_intent_semantic(user_input)
        entities = self.extract_entities_enhanced(user_input)
        
        # Update user context
        self.user_context['last_intent'] = intent
        self.user_context['entities'].update(entities)
        
        # Route to appropriate handler based on intent
        if intent == 'admission_requirements':
            return self.handle_admission_requirements(entities, user_input)
        elif intent == 'academic_calendar':
            return self.handle_academic_calendar()
        elif intent == 'degree_programs':
            return self.handle_degree_programs(entities, user_input)
        elif intent == 'faculty':
            return self.handle_faculty_directory(entities, user_input)
        elif intent == 'housing':
            return self.handle_housing(entities, user_input)
        elif intent == 'library':
            return self.handle_library(entities, user_input)
        elif intent == 'grading':
            return self.handle_grading_system()
        elif intent == 'plagiarism':
            return self.handle_plagiarism()
        elif intent == 'attendance':
            return self.handle_attendance()
        elif intent == 'research':
            return self.handle_research_labs()
        elif intent == 'it_support':
            return self.handle_it_support(entities, user_input)
        elif intent == 'contact_info':
            return self.handle_contact_info(entities, user_input)
        elif intent == 'fees_tuition':
            return self.handle_fees_tuition(user_input)
        elif intent == 'scholarships':
            return self.handle_scholarships(user_input)
        else:
            return self.handle_general_query_enhanced(user_input)
    
    # Interactive chats
    def run_interactive_chat(self):
        """Run interactive chat session with enhanced experience"""
        print("🎓 Welcome! I can help with admissions, academics, libraries, housing, faculty, fees, research, and more. Type your question or 'help' for examples.")
        print("💡 Tip: Ask natural questions like 'How do I apply for undergraduate admission?' or 'What are the library hours?'")
        print("Type 'help' for examples, or 'exit' to quit.\n")
        
        conversation_count = 0
        
        while True:
            try:
                user_input = input("🙋 You: ").strip()
                
                if user_input.lower() in ['exit', 'quit', 'bye', 'goodbye']:
                    print("🤖 Assistant: Thank you for using the University AI Assistant! Have a wonderful day! 👋")
                    break
                
                if user_input.lower() == 'help':
                    print("🤖 Assistant: Here are some example questions you can ask:")
                    print("• 'What are the admission requirements for undergraduate programs?'")
                    print("• 'Show me the academic calendar'")
                    print("• 'I need information about computer science department'")
                    print("• 'Where can I find student housing?'")
                    print("• 'What's the grading system?'")
                    print("• 'I'm having trouble with my login'")
                    print("• 'Tell me about research opportunities'")
                    print("• 'How much does tuition cost?'")
                    print("• 'What scholarships are available?'\n")
                    continue
                
                if not user_input:
                    print("🤖 Assistant: I'm here to help! Please ask me something about the university. 😊\n")
                    continue
                
                print("🤖 Assistant: ", end="")
                response = self.chat(user_input)
                print(f"{response}\n")
                
                conversation_count += 1
                
                # Provide helpful suggestions every few interactions
                if conversation_count % 5 == 0:
                    print("💡 **Quick tip:** You can ask follow-up questions or request more specific information anytime!\n")
                
            except KeyboardInterrupt:
                print("\n🤖 Assistant: Goodbye! Thanks for using the University AI Assistant! 👋")
                break
            except Exception as e:
                print(f"🤖 Assistant: I encountered an error processing your request. Please try again! 🔧")
                print(f"(Technical details: {str(e)})\n")


# Main function to run all the functions
def main(json_file_path=None):
    """Start the interactive university chatbot.

    Args:
        json_file_path: Path to the scraped menu-hierarchy JSON file. When
            omitted, the original development path is used, so existing
            ``main()`` calls behave as before.

    Any exception raised while constructing or running the chatbot is reported
    on the console together with setup instructions instead of propagating.
    """
    if json_file_path is None:
        # Development default; pass a path explicitly on any other machine.
        json_file_path = "C:\\Nawal\\IT Department\\Practical Training\\Final Chatbot\\data_backups\\menu_hierarchy.json"

    print("🚀 Starting University AI Assistant...")

    try:
        chatbot = UniversityChatbot(json_file_path)
        chatbot.run_interactive_chat()
    except KeyboardInterrupt:
        print("\n👋 Goodbye!")
    except Exception as e:
        print(f"\n❌ Error initializing chatbot: {str(e)}")
        print("\n🔧 **Setup Requirements:**")
        print("1. Install required packages:")
        print("   pip install sentence-transformers transformers torch nltk spacy scikit-learn")
        print("2. Download spaCy model:")
        print("   python -m spacy download en_core_web_sm")
        print("3. Ensure your JSON file path is correct")
        print("4. Make sure you have sufficient disk space for model downloads")
        print("\n💡 **Note:** The chatbot will work with fallback methods if some models fail to load.")

# Start the chatbot only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

🚀 Starting University AI Assistant...
✅ Loaded university data from C:\Nawal\IT Department\Practical Training\Final Chatbot\data_backups\menu_hierarchy.json
🔍 Creating embeddings for 1330 text segments...
✅ Semantic corpus ready
🎓 Welcome! I can help with admissions, academics, libraries, housing, faculty, fees, research, and more. Type your question or 'help' for examples.
💡 Tip: Ask natural questions like 'How do I apply for undergraduate admission?' or 'What are the library hours?'
Type 'help' for examples, or 'exit' to quit.

🤖 Assistant: DEBUG: Searching for: 'Colleges'
DEBUG: Is department search: False
DEBUG: Best match found: Colleges
DEBUG: Best score: 100
DEBUG: Total nodes checked: 350
DEBUG: First 50 nodes checked:
  1. About (at root[0])
  2. President's Message (at root[0] -> children[0])
  3. Leadership (at root[0] -> children[1])
  4. King Saud University Board of Directors (at root[0] -> children[1] -> children[0])
  5. University Administration (at root[0] -> children

### Enhancements needed

- Add chat-bubble functionality to improve performance and transparency for users
- Add AI agents to scrape PDFs and several other items
- Scrape posts from the KSU account on X

### Fixes needed

- Some degree programs need extra handling because their queries are mistaken for other intents; for example, Finance departments are not recognized as Colleges
- IT Helpdesk has some issues as well
- Campus locations still need to be added to the web scraper