# Objective: Automate Tier-1 customer support using GenAI.

### URL

In [1]:
# Tuples of (URL, desired_filename_without_extension)
url_filename_pairs = [
    ('https://www.csudh.edu/csc/programs/computer-science-programs/ms-computer-science/', 'MSCS_program'),
    ('https://www.csudh.edu/csc/programs/cyber-security-programs/ms-cyber-security/', 'MSCY_program'),
    ('https://www.csudh.edu/financial-aid/', 'financial_aid'),
    ('https://www.csudh.edu/csc/faculty-staff/', 'csc_faculty_staff'),
    ('https://csudh.scholarships.ngwebsolutions.com/Scholarships/Search', 'scholarships_search')
]

In [2]:
local_html_file_paths = []

In [3]:
import requests
from bs4 import BeautifulSoup
import os
from langchain_community.document_loaders import UnstructuredHTMLLoader, PyPDFLoader

In [4]:
# html_output_dir = "html_files"
# os.makedirs(html_output_dir, exist_ok=True)

# for url, base_filename in url_filename_pairs:
#     try:
#         print(f"Fetching URL: {url}")
#         response = requests.get(url)
#         html_code = response.text
#         soup = BeautifulSoup(html_code, "html.parser")

#         file_path = os.path.join(html_output_dir, f"{base_filename}.html")

#         with open(file_path, "w", encoding="utf-8") as f:
#             f.write(str(soup)) # Using str(soup) as per your example

#         local_html_file_paths.append(file_path)
#         print(f"Saved HTML from {url} to {file_path}")

#     except Exception as e: # Basic error catching
#         print(f"An error occurred while processing {url}: {e}")

In [5]:
from langchain_community.document_loaders import UnstructuredHTMLLoader
import os # To list files in the directory

html_files_directory = "html_files"  # The directory where your HTML files are stored
all_html_documents = []          # A list to hold all the loaded HTML content

print(f"Looking for HTML files in: {html_files_directory}")

# Get a list of all files in the directory
try:
    filenames_in_directory = os.listdir(html_files_directory)
except FileNotFoundError:
    print(f"Error: Directory '{html_files_directory}' not found. Please ensure it exists and contains your HTML files.")
    filenames_in_directory = [] # Avoid further errors

# Loop through each filename
for filename in filenames_in_directory:
    # Check if the file is an HTML file
    if filename.endswith(".html"):
        full_file_path = os.path.join(html_files_directory, filename)
        print(f"Processing HTML file: {full_file_path}")
        
        # Your simple loading code, applied to the current file
        loader = UnstructuredHTMLLoader(full_file_path)
        data = loader.load()  # This returns a list of Document objects
        
        all_html_documents.extend(data) # Add the loaded document(s) to our main list
        print(f"  -> Loaded {len(data)} document(s) from this file.")

print(f"\nTotal documents loaded from all HTML files: {len(all_html_documents)}")

Looking for HTML files in: html_files
Processing HTML file: html_files/financial_aid.html
  -> Loaded 1 document(s) from this file.
Processing HTML file: html_files/scholarships_search.html
  -> Loaded 1 document(s) from this file.
Processing HTML file: html_files/MSCY_program.html
  -> Loaded 1 document(s) from this file.
Processing HTML file: html_files/csc_faculty_staff.html
  -> Loaded 1 document(s) from this file.
Processing HTML file: html_files/MSCS_program.html
  -> Loaded 1 document(s) from this file.

Total documents loaded from all HTML files: 5


### PDF's

In [6]:
# Your list of PDF file paths
pdf_files_to_load = [
    "data/BACT-Brochure.pdf",
    "data/BSCS-Brochure.pdf",
    "data/BSIT-Brochure.pdf",
    "data/MSCS-Brochure.pdf",
    "data/MSCY-Brochure.pdf",
    "data/Office Hours.pdf"
]

In [7]:
from langchain_community.document_loaders import PyPDFLoader


all_pdf_pages = [] # This will store all the loaded pages

# Loop through each PDF file path
for pdf_path in pdf_files_to_load:
    print(f"Processing PDF: {pdf_path}")
    loader = PyPDFLoader(pdf_path)
    pages = loader.load_and_split() # Loads and splits by page
    all_pdf_pages.extend(pages)
    print(f"  -> Loaded {len(pages)} page(s).")

print(f"\nTotal pages loaded from all PDF files: {len(all_pdf_pages)}")

Processing PDF: data/BACT-Brochure.pdf
  -> Loaded 4 page(s).
Processing PDF: data/BSCS-Brochure.pdf


Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 11 0 (offset 0)
Ignoring wrong pointing object 13 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 30 0 (offset 0)
Ignoring wrong pointing object 11 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 22 0 (offset 0)
Ignoring wrong pointing object 35 0 (offset 0)


  -> Loaded 3 page(s).
Processing PDF: data/BSIT-Brochure.pdf
  -> Loaded 2 page(s).
Processing PDF: data/MSCS-Brochure.pdf
  -> Loaded 4 page(s).
Processing PDF: data/MSCY-Brochure.pdf
  -> Loaded 4 page(s).
Processing PDF: data/Office Hours.pdf
  -> Loaded 3 page(s).

Total pages loaded from all PDF files: 20


In [8]:
# Combine the lists of documents
all_documents = all_html_documents + all_pdf_pages

# Verify the total number of documents
print(f"Total HTML documents loaded: {len(all_html_documents)}")
print(f"Total PDF pages loaded: {len(all_pdf_pages)}")
print(f"Combined total documents/pages: {len(all_documents)}")

Total HTML documents loaded: 5
Total PDF pages loaded: 20
Combined total documents/pages: 25


In [9]:
all_documents_combined = all_html_documents + all_pdf_pages

content_list = [doc.page_content for doc in all_documents_combined]
final_combined_text = "\n\n".join(content_list)

print(f"Length of the final combined string: {len(final_combined_text)} characters.")

Length of the final combined string: 195029 characters.


## chunking

In [10]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " ", ""],
    chunk_size=1000,
    chunk_overlap=200
)

chunks = text_splitter.create_documents([final_combined_text])

In [11]:
# Check the output

print("Type of variable:", type(chunks))  
print()
print("Type of each object inside the list:", type(chunks[0]))  
print()
print("Total number of documents inside list:", len(chunks))
print()
print("* Content of first chunk:", chunks[0].page_content[:50])  
print()
print("* Content of second chunk:", chunks[1].page_content[:50]) 

Type of variable: <class 'list'>

Type of each object inside the list: <class 'langchain_core.documents.base.Document'>

Total number of documents inside list: 261

* Content of first chunk: The CSUDH

Skip to content

Checkmark Icon

APPLY


* Content of second chunk: The Financial Aid Application is Now Open:

Applic


In [12]:
# print(chunks[1].page_content)

## 5. Creating Vector Store

### 5.1 Initialize an Embedding Model

In [13]:
# pip install langchain_huggingface

In [1]:
f = open('keys/openai_key.txt')

OPENAI_API_KEY = f.read()

In [2]:
# Step 1 - Initialize an embedding_model

from langchain_openai import OpenAIEmbeddings

embedding_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

### 5.2 Setting a Connection with the ChromaDB

In [3]:
# Step 2 - Initialize a ChromaDB Connection
from langchain_chroma import Chroma

# Initialize the database connection
# If database exist, it will connect with the collection_name and persist_directory
# Otherwise a new collection will be created

db = Chroma(collection_name="vector_database", 
            embedding_function=embedding_model, 
            persist_directory="./chroma_db_")

In [5]:
# Initially the database is empty

# db.get()

### 5.3 Add Chunks to Vector DB

In [6]:
# db.add_documents(chunks)

In [7]:
# db.get()

## 6. Create Retriever

In [8]:
# Converting CHROMA db connection to Retriever Object
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 10})

print(type(retriever))

<class 'langchain_core.vectorstores.base.VectorStoreRetriever'>


In [9]:
query = "What is the minimum gpa required?"

# retriever.invoke(query)

## 7. Prompt Template

In [10]:
# Initialize a Chat Prompt Template

from langchain_core.prompts import ChatPromptTemplate

PROMPT_TEMPLATE = """
Answer the question based only on the following context for the MS in Computer Science 
program at CSUDH: {context} Answer the question based on the above context: {question}. 
Provide a detailed answer. 
Do not provide any information beyond what is included in the context.
"""

prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

## 8. LLM

In [11]:
# Open the file containing your OpenAI API key
with open('keys/openai_key.txt', 'r') as f:
    OPENAI_API_KEY = f.read().strip()

In [12]:
# Import OpenAI ChatModel
from langchain_openai import ChatOpenAI

# Set the OpenAI Key and initialize a ChatModel
chat_model = ChatOpenAI(api_key=OPENAI_API_KEY, model="gpt-4o-mini", temperature=1)

## 9. Output Parser

In [13]:
# Initialize a Output Parser

from langchain_core.output_parsers import StrOutputParser

parser = StrOutputParser()

## 10. Chaining

In [14]:
# Step 6: Define a RAG Chain

from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)
    
rag_chain = {"context": retriever | format_docs, "question": RunnablePassthrough()} | prompt_template | chat_model | parser

## 11. invoking the RAG Pipeline

In [17]:
query = "what is the office hours of mohsen beheshti"

output = rag_chain.invoke(query)

print(output)

Dr. Mohsen Beheshti, the Department Chair, holds his office hours on the following days and times:

- **Tuesday:** 1:00 PM – 3:00 PM (available in person or via ZOOM by appointment)
- **Friday:** 5:30 PM - 7:30 PM (available via ZOOM by appointment)

His office is located in **NSM A-132** and his phone number is **310-243-3398**. Additionally, you can reach him via email at **mbeheshti@csudh.edu**.
