## 阶段一
	•	设置好环境与依赖（LangChain + Torch）
	•	规范路径与目录结构
	•	引入通用工具函数

In [6]:
# 1. 导入必要的库
import os
import torch
import re 
from langchain.document_loaders import DirectoryLoader, TextLoader, PyPDFLoader, UnstructuredHTMLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA # RAG 问答链

# 确保sys.path包含项目根目录
import sys
sys.path.append('..') 

# 导入通用工具函数 (set_seed，从train_eval_utils.py)(filter_documents, 从mism_doc_loader.py)
from utils.model_training.train_eval_utils import set_seed 
from utils.text_processing.mism_doc_loader import filter_documents


# 2. Device selection
# Probe order: Apple MPS first (unchanged behaviour on Apple hardware), then
# CUDA so that NVIDIA machines no longer fall back to the CPU, then the CPU.
# `device` is later turned into a string and handed to the embedding model.
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS device for embedding model and LLM.")
elif torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA device for embedding model and LLM.")
else:
    device = torch.device("cpu")
    print("MPS device not available, using CPU.")

# 3. Seed the random number generators
# set_seed comes from utils.model_training.train_eval_utils; which libraries it
# seeds is not visible from this notebook.
SEED = 66
set_seed(SEED)

# 4. Paths (relative to the notebook's working directory)
KNOWLEDGE_BASE_DIR = '../data/cmu_mism_docs' # root directory holding the knowledge-base documents
CHROMA_PERSIST_DIR = '../vectorstore/cmu_mism_chroma' # directory where ChromaDB keeps its data

# Make sure the ChromaDB directory exists. Because it is created here, the
# vector-store cell decides between "load" and "create" by checking whether
# the directory is non-empty, not merely whether it exists.
os.makedirs(CHROMA_PERSIST_DIR, exist_ok=True)


  from .autonotebook import tqdm as notebook_tqdm
W0626 23:26:10.596000 97437 site-packages/torch/distributed/elastic/multiprocessing/redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


Using MPS device for embedding model and LLM.
Random seed set to 66


## 阶段二
    •	路径设定：定义并检查各类文档目录（HTML、PDF、TXT）
    •	文档加载：使用 LangChain 提供的 DirectoryLoader + 相应 loader_cls
    •	类型统计：记录各类文件加载数目
    •	内容预览：打印每类前 3 个文档的元信息与文本预览
    •	清洗检查：基于正则表达式进行页面质量检测（如残留HTML标签、解析失败等）

In [7]:

# --- 知识库构建流程 ---

# 5. Document loading
print(f"\n--- Starting document loading from {KNOWLEDGE_BASE_DIR} ---")

documents = []
loaded_counts_by_type = {'html': 0, 'pdf': 0, 'txt_forums': 0, 'txt_info': 0}
# Loaded documents grouped by type, kept so later cells can inspect them.
all_loaded_docs_by_type = {'html': [], 'pdf': [], 'txt_forums': [], 'txt_info': []}


def _load_document_group(doc_type, subdir, pattern, loader_cls, label):
    """Load one knowledge-base subdirectory and record what was loaded.

    Args:
        doc_type: Key into ``loaded_counts_by_type`` / ``all_loaded_docs_by_type``.
        subdir: Directory name below ``KNOWLEDGE_BASE_DIR``.
        pattern: Glob pattern handed to ``DirectoryLoader``.
        loader_cls: Loader class ``DirectoryLoader`` uses for each matching file.
        label: Human-readable type name used in the progress messages.

    Returns:
        None. On success the loaded documents are appended to ``documents``
        and stored under ``doc_type`` in both bookkeeping dicts. A missing
        directory or a loader failure is reported with ``print`` and leaves
        the bookkeeping for ``doc_type`` at its initial empty values.
    """
    docs_path = os.path.join(KNOWLEDGE_BASE_DIR, subdir)
    if not os.path.exists(docs_path):
        print(f"  WARNING: {label} documents directory not found: {docs_path}. No {label} files will be loaded.")
        return

    print(f"Attempting to load {label} from: {docs_path}")
    loader = DirectoryLoader(docs_path, glob=pattern, loader_cls=loader_cls)
    try:
        loaded = loader.load()
    except Exception as e:
        # Best effort by design: one failing source must not stop the others.
        print(f"  ERROR: Failed to load {label} documents from {docs_path}. Error: {e}")
        return

    loaded_counts_by_type[doc_type] = len(loaded)
    documents.extend(loaded)
    all_loaded_docs_by_type[doc_type] = loaded
    print(f"  Successfully loaded {len(loaded)} {label} documents.")


# One row per source: (type key, subdirectory, glob, loader class, message label).
# The order is the load order, and therefore the order inside `documents`.
_DOCUMENT_SOURCES = (
    ('html', 'websites_pages', '**/*.html', UnstructuredHTMLLoader, 'HTML'),
    ('pdf', 'handbooks', '**/*.pdf', PyPDFLoader, 'PDF'),
    ('txt_forums', 'forums', '**/*.txt', TextLoader, 'TXT (forums)'),
    ('txt_info', 'text_info', '**/*.txt', TextLoader, 'TXT (text_info)'),
)

for _source in _DOCUMENT_SOURCES:
    _load_document_group(*_source)



--- Starting document loading from ../data/cmu_mism_docs ---
Attempting to load HTML from: ../data/cmu_mism_docs/websites_pages
  Successfully loaded 8 HTML documents.
Attempting to load PDF from: ../data/cmu_mism_docs/handbooks


  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


  Successfully loaded 44 PDF documents.
Attempting to load TXT (forums) from: ../data/cmu_mism_docs/forums
  Successfully loaded 0 TXT (forums) documents.
Attempting to load TXT (text_info) from: ../data/cmu_mism_docs/text_info
  Successfully loaded 2 TXT (text_info) documents.


检查文档加载情况（数量统计和内容质量检查）

In [8]:

# Document count summary
print(f"\nTotal documents loaded: {len(documents)}")
for type_key, n_loaded in loaded_counts_by_type.items():
    type_label = type_key.replace('_', ' ').title()
    print(f"  {type_label} documents loaded: {n_loaded}")



# --- Detailed content verification ---
print("\n--- Detailed Content Verification (Preview of first 3 docs of each type) ---")

for type_key, type_docs in all_loaded_docs_by_type.items():
    type_label = type_key.replace('_', ' ').title()
    if not type_docs:
        print(f"No {type_label} documents to verify.")
        continue

    # Only the first three documents of each type are inspected to keep the output short.
    samples = type_docs[:3]
    print(f"\nVerifying {type_label} documents (first {len(samples)} samples):")
    for position, doc in enumerate(samples, start=1):
        text = doc.page_content
        head = text[:500]  # every heuristic below looks at this prefix only
        source = doc.metadata.get('source', 'N/A')
        # Some loaders attach a 'page' entry to the metadata; show it when present.
        page_info = f" Page {doc.metadata.get('page')}" if 'page' in doc.metadata else ""
        content_preview = text.strip()[:500]

        print(f"\n  {type_label} Document {position} from {source}{page_info}:")
        print(f"    Content length: {len(text)} characters")
        print(f"    Content preview:\n\"\"\"\n{content_preview}\n\"\"\"")

        # --- Quality heuristics ---
        # Very short content is treated as a sign that parsing may have failed.
        if len(text) < 100:
            print(f"    WARNING: Content length {len(text)} seems too short. Possible parsing issue for {source}{page_info}. Consider manually converting to TXT/MD.")

        # Angle brackets together with the word "html" suggest leftover markup.
        if '<' in head and '>' in head and 'html' in head.lower():
            print("    WARNING: HTML tags or remnants detected in content. This might affect embedding quality.")

        # A run of 20 or more characters that are neither alphanumeric nor whitespace.
        if re.search(r'[^a-zA-Z0-9\s]{20,}', head):
            print("    WARNING: Suspicious non-alphanumeric character sequences detected. Content might contain code or parsing artifacts.")


Total documents loaded: 54
  Html documents loaded: 8
  Pdf documents loaded: 44
  Txt Forums documents loaded: 0
  Txt Info documents loaded: 2

--- Detailed Content Verification (Preview of first 3 docs of each type) ---

Verifying Html documents (first 3 samples):

  Html Document 1 from ../data/cmu_mism_docs/websites_pages/career_outcome.html:
    Content length: 2341 characters
    Content preview:
"""
Careers in Information Systems Management

The World's leading companies recruit Heinz College Information Systems Management graduates. Data shown reflects the last three years.

information systems job titles and salary reports

Job titles and salary information for the Information Systems Management program.

Information Systems Career Outcomes

Dedicated Career SUPPORT

Heinz College treats career development as seriously as it treats academics. The Information Systems Management program has
"""

  Html Document 2 from ../data/cmu_mism_docs/websites_pages/curriculum.html:
    C

文档清洗

In [9]:
# Run the cleaning pass once loading and previewing are done.
# filter_documents is imported from utils.text_processing.mism_doc_loader and
# returns a pair; NOTE(review): the (kept, rejected) order is inferred from the
# variable names — confirm against the helper.
clean_documents, bad_documents = filter_documents(documents)
documents = clean_documents  # overwrite so the following cells only see the kept documents


📊 文档清洗完成：
  - 原始文档数量：54
  - 保留有效文档：50
  - 剔除无效/乱码文档：4


## 阶段三
    •	文本分块
    •	文本嵌入
    •	向量存储
    •	创建检索器
    •	测试知识库

In [28]:

# 6. Text chunking
print("\n--- Splitting documents into chunks ---")
# Splitter settings, measured in characters because length_function is len:
#   chunk_size    - maximum size of a chunk (the original note suggests
#                   200-1000 depending on LLM context window and density)
#   chunk_overlap - characters shared by neighbouring chunks (the original
#                   note suggests 10%-20% of chunk_size)
splitter_settings = {
    'chunk_size': 500,
    'chunk_overlap': 50,
    'length_function': len,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)
chunks = text_splitter.split_documents(documents)
print(f"Split into {len(chunks)} chunks.")
# Print the first chunk's source and the start of its text as a sanity check.
if chunks:
    first_chunk = chunks[0]
    print("\n--- Example Chunk ---")
    print(f"Source: {first_chunk.metadata.get('source')}")
    print(f"Page content (first 200 chars):\n{first_chunk.page_content[:200]}...")


--- Splitting documents into chunks ---
Split into 463 chunks.

--- Example Chunk ---
Source: ../data/cmu_mism_docs/websites_pages/career_outcome.html
Page content (first 200 chars):
Careers in Information Systems Management

The World's leading companies recruit Heinz College Information Systems Management graduates. Data shown reflects the last three years.

information systems ...


手动删除旧的向量库文件夹
适用于：
	•	修改了文本清洗逻辑；
	•	或修改了分块策略；
	•	或使用了不同的 Embedding 模型；
	•	或希望完全重建索引（干净状态）。

In [None]:
# import shutil
# import os

# CHROMA_PERSIST_DIR = "../vectorstore/cmu_mism_chroma"

# if os.path.exists(CHROMA_PERSIST_DIR):
#     print("Deleting existing vectorstore...")
#     shutil.rmtree(CHROMA_PERSIST_DIR)

In [11]:
# 7. Text embedding
print("\n--- Creating embeddings ---")
# Sentence-Transformers model used for the embeddings; the original note
# describes "all-MiniLM-L6-v2" as lightweight but high-performing.
embeddings_model_name = "all-MiniLM-L6-v2"
# Run the embedding model on the device chosen during setup.
embedding_model_kwargs = {'device': str(device)}
embeddings = HuggingFaceEmbeddings(
    model_name=embeddings_model_name,
    model_kwargs=embedding_model_kwargs,
)
print(f"Using embedding model: {embeddings_model_name} on device: {device}")


--- Creating embeddings ---


  embeddings = HuggingFaceEmbeddings(


Using embedding model: all-MiniLM-L6-v2 on device: mps


In [13]:

# 8. Vector store
print(f"\n--- Storing embeddings in ChromaDB at {CHROMA_PERSIST_DIR} ---")
# A non-empty persist directory is taken to mean a store already exists.
# NOTE: in that case the store is loaded as-is and the current `chunks` are
# NOT added to it; delete the directory first to rebuild from scratch.
store_has_files = os.path.exists(CHROMA_PERSIST_DIR) and len(os.listdir(CHROMA_PERSIST_DIR)) > 0
if not store_has_files:
    print("ChromaDB not found or empty. Creating new vectorstore.")
    vectordb = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=CHROMA_PERSIST_DIR,
    )
    # Write the store to disk so the next run can load it directly.
    vectordb.persist()
    print("Vectorstore created and persisted.")
else:
    print("ChromaDB already exists. Loading existing vectorstore.")
    vectordb = Chroma(persist_directory=CHROMA_PERSIST_DIR, embedding_function=embeddings)

print("\nKnowledge base construction complete.")



--- Storing embeddings in ChromaDB at ../vectorstore/cmu_mism_chroma ---
ChromaDB not found or empty. Creating new vectorstore.
Vectorstore created and persisted.

Knowledge base construction complete.


  vectordb.persist()


In [None]:

# 9. Build the retriever
# The retriever pulls the most relevant chunks out of the vector store;
# "k" is the number of chunks requested per query.
retriever_search_kwargs = {"k": 10}
retriever = vectordb.as_retriever(search_kwargs=retriever_search_kwargs)
print(f"Retriever created, set to retrieve top {retriever.search_kwargs['k']} chunks.")

Retriever created, set to retrieve top 10 chunks.


测试知识库搭建成果

In [15]:
# Sample query 1: general information retrieval
query1 = "What are the admission requirements for the MISM program?"
print(f"\n--- Testing Retriever with Query: '{query1}' ---")
docs_with_scores1 = vectordb.similarity_search_with_score(query1, k=5)
print(f"Retrieved {len(docs_with_scores1)} documents.")
for rank, (hit, hit_score) in enumerate(docs_with_scores1, start=1):
    hit_source = hit.metadata.get('source', 'N/A')
    hit_preview = hit.page_content[:50]
    print(f"\n--- Document {rank} ---")
    # Per the original note, a smaller score usually means a closer match.
    print(f"Score: {hit_score:.4f}")
    print(f"Source: {hit_source}")
    print(f"Content (first 50 chars):\n{hit_preview}...")


--- Testing Retriever with Query: 'What are the admission requirements for the MISM program?' ---
Retrieved 5 documents.

--- Document 1 ---
Score: 0.5168
Source: ../data/cmu_mism_docs/handbooks/mism-student-handbook.pdf
Content (first 50 chars):
the
MISM
program
in
three
semesters.
Note,
the
MIS...

--- Document 2 ---
Score: 0.6145
Source: ../data/cmu_mism_docs/handbooks/mism-student-handbook.pdf
Content (first 50 chars):
the
MISM
degree
requirements.
If
a
course
is
cross...

--- Document 3 ---
Score: 0.6224
Source: ../data/cmu_mism_docs/websites_pages/apply.html
Content (first 50 chars):
MISM-BIDA requires that you have a background in t...

--- Document 4 ---
Score: 0.6224
Source: ../data/cmu_mism_docs/text_info/admission.txt
Content (first 50 chars):
MISM-BIDA requires that you have a background in t...

--- Document 5 ---
Score: 0.6988
Source: ../data/cmu_mism_docs/text_info/admission.txt
Content (first 50 chars):
Home    Admissions   Information Systems Managemen...


In [16]:
# Sample query 2: a more specific information need
query2 = "What kind of careers do MISM graduates pursue?"
print(f"\n--- Testing Retriever with Query: '{query2}' ---")
docs_with_scores2 = vectordb.similarity_search_with_score(query2, k=10)
print(f"Retrieved {len(docs_with_scores2)} documents.")
for rank, (hit, hit_score) in enumerate(docs_with_scores2, start=1):
    hit_source = hit.metadata.get('source', 'N/A')
    hit_preview = hit.page_content[:50]
    print(f"\n--- Document {rank} ---")
    # Per the original note, a smaller score means a closer match.
    print(f"Score: {hit_score:.4f}")
    print(f"Source: {hit_source}")
    print(f"Content (first 50 chars):\n{hit_preview}...")



--- Testing Retriever with Query: 'What kind of careers do MISM graduates pursue?' ---
Retrieved 10 documents.

--- Document 1 ---
Score: 0.6626
Source: ../data/cmu_mism_docs/websites_pages/EL.html
Content (first 50 chars):
Apply your newly earned knowledge from class to im...

--- Document 2 ---
Score: 0.7286
Source: ../data/cmu_mism_docs/handbooks/mism-student-handbook.pdf
Content (first 50 chars):
Advisor,
Career
Advisor
for
MISM
16
and
MISM
BIDA
...

--- Document 3 ---
Score: 0.7683
Source: ../data/cmu_mism_docs/handbooks/mism-student-handbook.pdf
Content (first 50 chars):
the
MISM
degree
requirements.
If
a
course
is
cross...

--- Document 4 ---
Score: 0.7871
Source: ../data/cmu_mism_docs/websites_pages/businessAndTechnology.html
Content (first 50 chars):
Master's Degree in Information Systems Management
...

--- Document 5 ---
Score: 0.8396
Source: ../data/cmu_mism_docs/text_info/mism-16.txt
Content (first 50 chars):
Home    Programs    Master of Information Systems ...

--- Docu

In [17]:
# Sample query 3: information that is not in the knowledge base
# (the expectation is irrelevant results or none at all)
query3 = "What is the capital of France?"
print(f"\n--- Testing Retriever with Query: '{query3}' ---")
docs_with_scores3 = vectordb.similarity_search_with_score(query3, k=10)
print(f"Retrieved {len(docs_with_scores3)} documents.")
if not docs_with_scores3:
    print("No relevant documents found for this query (expected).")
# With an empty result list the loop body simply never runs.
for rank, (hit, hit_score) in enumerate(docs_with_scores3, start=1):
    hit_source = hit.metadata.get('source', 'N/A')
    hit_preview = hit.page_content[:50]
    print(f"\n--- Document {rank} ---")
    print(f"Score: {hit_score:.4f}")
    print(f"Source: {hit_source}")
    print(f"Content (first 50 chars):\n{hit_preview}...")


--- Testing Retriever with Query: 'What is the capital of France?' ---
Retrieved 10 documents.

--- Document 1 ---
Score: 1.7469
Source: ../data/cmu_mism_docs/text_info/admission.txt
Content (first 50 chars):
students who have worked or studied for five or mo...

--- Document 2 ---
Score: 1.7651
Source: ../data/cmu_mism_docs/websites_pages/apply.html
Content (first 50 chars):
Allegheny College Undergraduates...

--- Document 3 ---
Score: 1.7699
Source: ../data/cmu_mism_docs/websites_pages/curriculum.html
Content (first 50 chars):
(95-723)Accounting and Finance Analytics (95-719)D...

--- Document 4 ---
Score: 1.7931
Source: ../data/cmu_mism_docs/websites_pages/apply.html
Content (first 50 chars):
Required Essay...

--- Document 5 ---
Score: 1.7938
Source: ../data/cmu_mism_docs/websites_pages/apply.html
Content (first 50 chars):
If your native language (mother tongue) is not Eng...

--- Document 6 ---
Score: 1.7982
Source: ../data/cmu_mism_docs/websites_pages/16m_pathway.html
Content (

## 阶段四
    •	获取 API 密钥  
    •	实例化语言模型
    •	创建RAG问答链

In [None]:
# 导入 LLM 模块
from langchain_openai import ChatOpenAI # 如果使用OpenAI
# from langchain_google_genai import ChatGoogleGenerativeAI # 如果使用Google Gemini
# from langchain_community.llms import Ollama # 如果使用Ollama

# 导入问答链
from langchain.chains import RetrievalQA # 导入RetrievalQA

# 导入PromptTemplate (用于自定义Prompt)
from langchain_core.prompts import ChatPromptTemplate

# --- 导入：load_key 函数 ---
import sys
if '..' not in sys.path:
    sys.path.append('..')
from utils.load_key import load_key 
# -----------------------------


# --- 问答系统核心逻辑 ---

# 1. Configure and instantiate the LLM
print("\n--- Configuring and Instantiating LLM ---")

# An OPENAI_API_KEY already present in the environment takes precedence;
# load_key is consulted only when that variable is missing or empty.
# NOTE(review): per the original note load_key reads keys.json or prompts the
# user — confirm in utils.load_key.
openai_api_key_val = os.environ.get("OPENAI_API_KEY")
if not openai_api_key_val:
    print("OPENAI_API_KEY not found in environment variables.")
    openai_api_key_val = load_key("OPENAI_API_KEY")

# OpenAI chat model; temperature=0 is chosen to favour deterministic answers.
llm_settings = {
    "model_name": "gpt-3.5-turbo",
    "temperature": 0,
    "api_key": openai_api_key_val,
}
llm = ChatOpenAI(**llm_settings)

# --- 如果你选择使用 Google Gemini API，请使用以下代码块并注释掉上面的 OpenAI 部分 ---
# google_api_key_val = os.environ.get("GOOGLE_API_KEY")
# if not google_api_key_val:
#     print("GOOGLE_API_KEY not found in environment variables.")
#     google_api_key_val = load_key("GOOGLE_API_KEY")
# from langchain_google_genai import ChatGoogleGenerativeAI # 在这里导入以避免不必要的import
# llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0, google_api_key=google_api_key_val)
# --------------------------------------------------------------------------------------

# --- 如果你选择使用 Ollama，请使用以下代码块并注释掉上面的 API 部分 ---
# from langchain_community.llms import Ollama # 在这里导入以避免不必要的import
# try:
#     # 检查Ollama服务是否运行
#     ollama_llm_test = Ollama(model="llama2")
#     ollama_llm_test.invoke("Hi") # 尝试一个简单的调用
#     llm = ollama_llm_test
#     print("Ollama (llama2) LLM connected.")
# except Exception as e:
#     print(f"Error connecting to Ollama: {e}. Please ensure Ollama service is running and model 'llama2' is downloaded.")
#     print("Falling back to a different LLM if available or re-prompting for API key.")
#     # 如果Ollama连接失败，可以考虑在这里fallback到API LLM或者退出
#     raise # 或者处理为fallback逻辑
# --------------------------------------------------------------------------------------


# Report the LLM class and model in use; objects without a `model_name`
# attribute are expected to expose `model` instead.
llm_model_label = llm.model_name if hasattr(llm, 'model_name') else llm.model
print(f"LLM instantiated: {llm.__class__.__name__} with model: {llm_model_label}")


--- Configuring and Instantiating LLM ---
OPENAI_API_KEY not found in environment variables.
Successfully loaded OPENAI_API_KEY from keys.json.
LLM instantiated: ChatOpenAI with model: gpt-3.5-turbo


创建问答链

In [29]:

# 2. Build the RAG question-answering chain
print("\n--- Building Retrieval-Augmented Generation (RAG) Chain ---")

# Custom prompt: {context} receives the retrieved chunks and {question} the user query.
template = """You are an assistant that answers questions only based on provided context about the CMU MISM program.
When the question is about application requirements, look specifically for information in admissions pages or files.
When the question is about career paths, refer to student handbooks or career services sections.
Do not infer from general knowledge if the answer is not in the documents.

{context}

Question: {question}
Helpful Answer:"""

QA_CHAIN_PROMPT = ChatPromptTemplate.from_template(template)


# RetrievalQA settings:
#   chain_type="stuff"       - all retrieved documents go into a single prompt
#   retriever                - the retriever built in the knowledge-base stage
#   return_source_documents  - also return the documents the answer draws on
#   chain_type_kwargs        - carries the custom prompt defined above
qa_chain_settings = dict(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
)
qa_chain = RetrievalQA.from_chain_type(**qa_chain_settings)
print("RAG QA Chain built successfully.")



--- Building Retrieval-Augmented Generation (RAG) Chain ---
RAG QA Chain built successfully.


## 阶段五
    调试问答系统

In [30]:

# --- Exercise the question-answering system ---

# 3. Run the Q&A tests
print("\n--- Testing RAG Assistant ---")

# Question 1: admission requirements of the MISM program
question1 = "What are the admission requirements for the CMU MISM program?"
print(f"\nQuestion: {question1}")
# The chain is called through invoke (rather than run) with the text under "query".
response1 = qa_chain.invoke({"query": question1})
print(f"Answer: {response1['result']}")
print("\nSource Documents:")
for rank, src_doc in enumerate(response1['source_documents'], start=1):
    src_path = src_doc.metadata.get('source', 'N/A')
    src_page = src_doc.metadata.get('page', 'N/A')
    print(f"  Document {rank}: Source='{src_path}' Page={src_page}")
    print(f"    Content (first 50 chars):\n    {src_doc.page_content[:50]}...")




--- Testing RAG Assistant ---

Question: What are the admission requirements for the CMU MISM program?
Answer: To be admitted to the CMU MISM program, students must complete all required courses, one approved Analytic elective course, and meet the minimum cumulative grade point average of 3.0. Additionally, students must complete a total of 162 units over three semesters in either the 12 or 16-month track.

Source Documents:
  Document 1: Source='../data/cmu_mism_docs/handbooks/mism-student-handbook.pdf' Page=3
    Content (first 50 chars):
    the
MISM
program
in
three
semesters.
Note,
the
MIS...
  Document 2: Source='../data/cmu_mism_docs/handbooks/mism-student-handbook.pdf' Page=3
    Content (first 50 chars):
    3 .
Master
of
Information
Systems
Management
(MISM...
  Document 3: Source='../data/cmu_mism_docs/websites_pages/apply.html' Page=N/A
    Content (first 50 chars):
    Email Senior Associate Director of Admissions, Dav...
  Document 4: Source='../data/cmu_mism_docs/websit

In [31]:

# Question 2: career paths of MISM graduates
question2 = "What kind of careers do MISM graduates pursue?"
print(f"\nQuestion: {question2}")
response2 = qa_chain.invoke({"query": question2})
print(f"Answer: {response2['result']}")
print("\nSource Documents:")
for rank, src_doc in enumerate(response2['source_documents'], start=1):
    src_path = src_doc.metadata.get('source', 'N/A')
    src_page = src_doc.metadata.get('page', 'N/A')
    print(f"  Document {rank}: Source='{src_path}' Page={src_page}")
    print(f"    Content (first 50 chars):\n    {src_doc.page_content[:50]}...")




Question: What kind of careers do MISM graduates pursue?
Answer: MISM graduates pursue careers in a variety of fields, including data analytics, management, strategy, and IT. They are equipped with skills in business process analysis, predictive modeling, GIS mapping, analytical reporting, segmentation analysis, and data visualization. Additionally, many graduates secure full-time job offers within three months of graduating, with a significant portion receiving offers prior to graduation.

Source Documents:
  Document 1: Source='../data/cmu_mism_docs/websites_pages/EL.html' Page=N/A
    Content (first 50 chars):
    Apply your newly earned knowledge from class to im...
  Document 2: Source='../data/cmu_mism_docs/handbooks/mism-student-handbook.pdf' Page=2
    Content (first 50 chars):
    Advisor,
Career
Advisor
for
MISM
16
and
MISM
BIDA
...
  Document 3: Source='../data/cmu_mism_docs/handbooks/mism-student-handbook.pdf' Page=5
    Content (first 50 chars):
    the
MISM
degree
requir

In [32]:

# Question 3: a question outside the knowledge base
question3 = "What is the capital of France?"
print(f"\nQuestion: {question3}")
response3 = qa_chain.invoke({"query": question3})
print(f"Answer: {response3['result']}")
print("\nSource Documents:")
for rank, src_doc in enumerate(response3['source_documents'], start=1):
    src_path = src_doc.metadata.get('source', 'N/A')
    src_page = src_doc.metadata.get('page', 'N/A')
    print(f"  Document {rank}: Source='{src_path}' Page={src_page}")
    print(f"    Content (first 50 chars):\n    {src_doc.page_content[:50]}...")


Question: What is the capital of France?
Answer: I'm sorry, I can only provide information about the CMU MISM program based on the context provided.

Source Documents:
  Document 1: Source='../data/cmu_mism_docs/text_info/admission.txt' Page=N/A
    Content (first 50 chars):
    students who have worked or studied for five or mo...
  Document 2: Source='../data/cmu_mism_docs/websites_pages/apply.html' Page=N/A
    Content (first 50 chars):
    Allegheny College Undergraduates...
  Document 3: Source='../data/cmu_mism_docs/websites_pages/curriculum.html' Page=N/A
    Content (first 50 chars):
    (95-723)Accounting and Finance Analytics (95-719)D...
  Document 4: Source='../data/cmu_mism_docs/websites_pages/apply.html' Page=N/A
    Content (first 50 chars):
    Required Essay...
  Document 5: Source='../data/cmu_mism_docs/websites_pages/apply.html' Page=N/A
    Content (first 50 chars):
    If your native language (mother tongue) is not Eng...
  Document 6: Source='../data/cmu_mism_doc