In [1]:
# ========================================================================
# PART 1: LIBRARIES AND SETUP
# ========================================================================

import os
import re

from langchain.chains import RetrievalQA
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import Ollama
from langchain_community.vectorstores import Chroma

print("✅ Libraries imported successfully")


✅ Libraries imported successfully


In [2]:
# ========================================================================
# PART 2: LOAD CAREER DOCUMENTS
# ========================================================================
# Domain: Career Guidance & Professional Development
# PDFs are read from the folder named by DATA_DIR below.
# ------------------------------------------------------------------------

print("\n" + "="*60)
print("PART 2: LOADING CAREER DOCUMENTS")
print("="*60)

DATA_DIR = "./data(1)"

# Fail fast: a wrong path should be reported here, not surface later as an
# obscure error when the vector store is built from an empty document list.
if not os.path.isdir(DATA_DIR):
    raise FileNotFoundError(f"Data directory not found: {DATA_DIR!r}")

loader = PyPDFDirectoryLoader(DATA_DIR)
documents = loader.load()

if not documents:
    raise ValueError(f"No PDF content could be loaded from {DATA_DIR!r}")

# The loader returns one Document per PDF *page*, so len(documents) is a page
# count. Report the number of distinct source files separately so the message
# is not mistaken for the number of PDFs.
pdf_files = sorted({str(doc.metadata.get("source", "Unknown")) for doc in documents})
print(f"✅ Loaded {len(documents)} pages from {len(pdf_files)} career-related PDF file(s) successfully!")

# Example PDFs you can include:
# 1. Career_Guide_PSU.pdf
# 2. Resume_Writing_Guide_ISU.pdf
# 3. Interview_Preparation_Handbook.pdf
# 4. Professional_Skills_for_Success.pdf
# 5. Career_Development_Toolkit.pdf

topic_explanation = "I chose the topic 'Career Guidance & Professional Development' because it helps students and professionals improve job readiness skills like resume writing, interviewing, and workplace success."
print(f"📘 Topic explanation: {topic_explanation}")



PART 2: LOADING CAREER DOCUMENTS
✅ Loaded 210 career-related documents successfully!
📘 Topic explanation: I chose the topic 'Career Guidance & Professional Development' because it helps students and professionals improve job readiness skills like resume writing, interviewing, and workplace success.


In [3]:
# ========================================================================
# PART 3: CHUNKING CONFIGURATION
# ========================================================================
# Splits the loaded documents under two size/overlap settings and reports,
# for each, how many chunks result and their mean length in characters.

print("\n" + "=" * 60, "PART 3: DOCUMENT CHUNKING TEST", "=" * 60, sep="\n")

configs = [
    dict(name="Small chunks", chunk_size=400, chunk_overlap=200),
    dict(name="Large chunks", chunk_size=1200, chunk_overlap=50),
]

for cfg in configs:
    print("\n" + "-" * 60, f"Testing config: {cfg['name']}", "-" * 60, sep="\n")

    # Separators are tried in order: paragraph, line, sentence end, word, character.
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", "! ", "? ", " ", ""],
        chunk_overlap=cfg["chunk_overlap"],
        chunk_size=cfg["chunk_size"],
    )

    chunks = splitter.split_documents(documents)
    total_chunks = len(chunks)

    # Guard the division so an empty split reports 0 instead of raising.
    if total_chunks:
        avg_len = sum(map(len, (c.page_content for c in chunks))) / total_chunks
    else:
        avg_len = 0

    print(f"Total chunks: {total_chunks}")
    print(f"Average length: {avg_len:.0f} characters")

# Recommendation for career PDFs (structured text); this line is a fixed
# message, not something computed from the statistics above.
print("\n✅ Recommended: Large chunks (1200/50) — balances readability and context retention.")



PART 3: DOCUMENT CHUNKING TEST

------------------------------------------------------------
Testing config: Small chunks
------------------------------------------------------------
Total chunks: 2530
Average length: 349 characters

------------------------------------------------------------
Testing config: Large chunks
------------------------------------------------------------
Total chunks: 562
Average length: 928 characters

✅ Recommended: Large chunks (1200/50) — balances readability and context retention.


In [4]:
# ========================================================================
# PART 4: EMBEDDINGS AND VECTOR DATABASE
# ========================================================================
# Embeds the chunked documents and stores them in a Chroma collection that is
# persisted on disk under ./career_chroma_db.

print("\n" + "="*60)
print("PART 4: VECTOR EMBEDDINGS & KNOWLEDGE BASE")
print("="*60)

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Use the same separators as the Part 3 chunking test so the chunks that get
# indexed are the ones whose statistics were evaluated there.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1200,
    chunk_overlap=50,
    separators=["\n\n", "\n", ". ", "! ", "? ", " ", ""],
)
chunks = splitter.split_documents(documents)

# Deterministic IDs make this cell safe to re-run: without explicit IDs every
# run adds a fresh copy of each chunk to the persisted collection, and the
# retriever then returns duplicates. With stable IDs a re-run overwrites the
# existing entries instead.
# NOTE: if the source PDFs shrink or change, entries from an earlier run with
# IDs that no longer occur are not removed; delete ./career_chroma_db to
# rebuild from scratch in that case.
chunk_ids = [
    f"{chunk.metadata.get('source', 'unknown')}:p{chunk.metadata.get('page', 'na')}:c{position}"
    for position, chunk in enumerate(chunks)
]

vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory="./career_chroma_db"
)

print("✅ Vector database created and saved successfully.")



PART 4: VECTOR EMBEDDINGS & KNOWLEDGE BASE


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


✅ Vector database created and saved successfully.


In [5]:
# ========================================================================
# PART 5: RETRIEVAL CONFIGURATION
# ========================================================================
# Wraps the vector store in a retriever that uses the "mmr" search type with
# the k / fetch_k / lambda_mult values echoed in the status message below.

print("\n" + "=" * 60, "PART 5: RETRIEVAL OPTIMIZATION", "=" * 60, sep="\n")

retriever = vectorstore.as_retriever(
    search_kwargs=dict(k=3, fetch_k=10, lambda_mult=0.5),
    search_type="mmr",
)

print("✅ Retriever configured with MMR search (k=3, fetch_k=10, λ=0.5).")



PART 5: RETRIEVAL OPTIMIZATION
✅ Retriever configured with MMR search (k=3, fetch_k=10, λ=0.5).


In [6]:
# ========================================================================
# PART 6: LLM SETUP
# ========================================================================
# Creates the Ollama-backed LLM and verifies that it actually responds.

print("\n" + "="*60)
print("PART 6: LLM CONNECTION VIA OLLAMA")
print("="*60)

try:
    llm = Ollama(model="phi3:mini", temperature=0.2, num_thread=2)
    # Building the wrapper object does not contact the Ollama server, so send
    # one tiny request; only then is "connected" a verified statement. This
    # also loads the model, so the first real question is not the one that
    # pays the start-up cost.
    llm.invoke("Reply with the single word: OK")
    print("✅ LLM (phi3:mini) connected successfully!")
except Exception as e:
    print(f"❌ Failed to connect to Ollama: {e}")
    print("💡 Ensure Ollama is running and model 'phi3:mini' is installed.")
    # Re-raise so "Run All" stops here with the real cause instead of failing
    # in a later cell on a missing or unusable `llm`.
    raise



PART 6: LLM CONNECTION VIA OLLAMA
✅ LLM (phi3:mini) connected successfully!


  llm = Ollama(model="phi3:mini", temperature=0.2, num_thread=2)


In [8]:
# ========================================================================
# PART 7: PROMPT ENGINEERING
# ========================================================================
# Defines the instruction text sent to the LLM. The template has two
# placeholders: {context} and {question}.

print("\n" + "=" * 60, "PART 7: PROMPT ENGINEERING FOR CAREER GUIDANCE", "=" * 60, sep="\n")

# The wording below is part of the program's behaviour; edit with care.
prompt_template = """
You are a professional career guidance assistant.
Answer the question ONLY using information from the provided context.

Rules:
1. Do not guess — use only the career documents below.
2. If you cannot find the answer, say:
   "The provided documents do not contain information to answer this question."
3. Always mention which document or section your answer comes from.

Context:
{context}

Question: {question}

Answer:
"""

PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)
print("✅ Prompt template created successfully.")



PART 7: PROMPT ENGINEERING FOR CAREER GUIDANCE
✅ Prompt template created successfully.


In [12]:
# ========================================================================
# PART 8: RAG CHAIN ASSEMBLY
# ========================================================================
# LEARNING OBJECTIVE: Combine retriever, prompt, and LLM
# (RetrievalQA and PromptTemplate are imported in the first cell.)

print("\n" + "="*60)
print("PART 8: RAG CHAIN ASSEMBLY")
print("="*60)

# Template used to render each retrieved chunk before it is placed into the
# prompt's {context}. By default only the chunk text is passed on, so the
# model never sees which file a passage came from and cannot follow the
# prompt's instruction to cite its source. Prefixing each chunk with its
# `source` metadata makes that instruction satisfiable.
# Every retrieved document must carry a "source" metadata key for this
# template to format.
DOCUMENT_PROMPT = PromptTemplate(
    template="[Source: {source}]\n{page_content}",
    input_variables=["page_content", "source"],
)

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",  # Combine all retrieved chunks into a single prompt
    chain_type_kwargs={"prompt": PROMPT, "document_prompt": DOCUMENT_PROMPT},
    return_source_documents=True,
    verbose=False
)

print("✅ RAG chain assembled successfully!")
print("   Components connected: Retriever → Prompt → LLM → Response")



PART 8: RAG CHAIN ASSEMBLY
✅ RAG chain assembled successfully!
   Components connected: Retriever → Prompt → LLM → Response


In [13]:
# ========================================================================
# PART 9: ANSWER VALIDATION SYSTEM
# ========================================================================

def validate_answer(answer, source_docs):
    """Heuristically score how trustworthy a generated answer looks.

    Starts from a confidence of 1.0 and subtracts:
      * 0.2 for each hedging phrase found in the answer (matched as whole
        words, case-insensitively), and
      * 0.3 if the answer names none of the retrieved source files.

    Args:
        answer: The text produced by the LLM. ``None`` is treated as empty.
        source_docs: Iterable of retrieved documents; each must expose a
            ``metadata`` mapping, whose ``"source"`` entry (if any) is the
            path of the originating file.

    Returns:
        A ``(confidence, warnings)`` tuple: ``confidence`` is a float in
        ``[0.0, 1.0]`` rounded to two decimals, ``warnings`` is a list of
        human-readable strings, one per deduction.
    """
    hallucination_phrases = ["i think", "probably", "seems", "perhaps", "usually", "generally"]
    answer_lc = (answer or "").lower()
    confidence = 1.0
    warnings = []

    for phrase in hallucination_phrases:
        # Whole-word match: a plain substring test would flag "unusually"
        # as containing "usually".
        if re.search(r"\b" + re.escape(phrase) + r"\b", answer_lc):
            confidence -= 0.2
            warnings.append(f"Uncertain phrase detected: '{phrase}'")

    cited = False
    for doc in source_docs:
        source = str(doc.metadata.get("source") or "").lower()
        if not source:
            # An empty string is a substring of every answer; skipping it
            # keeps documents without a source from counting as a citation.
            continue
        # Answers seldom repeat the full loader path, so also accept the bare
        # file name and (when long enough to be meaningful) its stem.
        filename = os.path.basename(source.replace("\\", "/"))
        stem = os.path.splitext(filename)[0]
        candidates = [source, filename]
        if len(stem) >= 4:
            candidates.append(stem)
        if any(name and name in answer_lc for name in candidates):
            cited = True
            break

    if not cited:
        confidence -= 0.3
        warnings.append("No source citation detected.")

    # Round away float drift from the repeated subtraction (e.g. 0.6000000000000001).
    return round(max(confidence, 0.0), 2), warnings


def ask_question_with_validation(question):
    """Answer one question with the RAG chain and print a validated report.

    Sends ``question`` to ``qa_chain``, scores the answer with
    ``validate_answer``, then prints the answer, its confidence score, any
    warnings, and the ``source`` metadata of each retrieved document.

    Args:
        question: The question text to send to the chain.

    Returns:
        A ``(result, confidence, warnings)`` tuple, where ``result`` is the
        raw mapping returned by ``qa_chain.invoke``.
    """
    print(f"\n🤔 Question: {question}")

    response = qa_chain.invoke({"query": question})
    answer_text, retrieved = response["result"], response["source_documents"]

    score, issues = validate_answer(answer_text, retrieved)

    print("\n📝 Answer:")
    print(answer_text)
    print(f"\n📊 Confidence Score: {score:.2f}")
    # Iterating an empty list prints nothing, so no separate emptiness check is needed.
    for issue in issues:
        print("⚠️", issue)

    print("\n📚 Sources:")
    for doc in retrieved:
        print("-", doc.metadata.get("source", "Unknown"))

    return response, score, issues


In [14]:
# ========================================================================
# PART 10: HANDS-ON TESTING
# ========================================================================
# LEARNING OBJECTIVE: Test the complete RAG system (Career Guidance Domain)
# Each question is run through ask_question_with_validation, framed by
# banner lines so the output for each question is easy to find.

print(
    "\n" + "=" * 80,
    "WORKSHOP DEMONSTRATION: TESTING THE RAG SYSTEM — CAREER GUIDANCE",
    "=" * 80,
    sep="\n",
)

# Two realistic questions for the career-focused RAG
questions = [
    "What are the key sections that should be included in a professional resume?",
    "How can I prepare effectively for a job interview?",
]

for q in questions:
    print("\n" + "=" * 80, f"🤔 Testing Question: {q}", "=" * 80, sep="\n")

    print("🧪 Running RAG pipeline...\n")
    result, confidence, warnings = ask_question_with_validation(q)

    print("\n" + "=" * 80, f"✅ Finished testing question: {q}", "=" * 80, sep="\n")



WORKSHOP DEMONSTRATION: TESTING THE RAG SYSTEM — CAREER GUIDANCE

🤔 Testing Question: What are the key sections that should be included in a professional resume?
🧪 Running RAG pipeline...


🤔 Question: What are the key sections that should be included in a professional resume?

📝 Answer:
According to the provided documents, particularly under "ADDITIONAL SECTIONS," some of the most commonly listed additional sections after Education include Study Abroad experiences and Certifications or Licensure. Experience-related sections that stand out are Leadership roles due to their impressiveness to employers. Furthermore, involvement in Activities or Extracurricular Involvement/Student Organizations is also recommended for inclusion if applicable. Honors or Awards should be highlighted as well when relevant. Community Service and Volunteer work can add value by showcasing one's commitment beyond academics and employment. Professional Affiliations are another section that could enhance a resum