In [26]:
# imports
import os
import glob
from dotenv import load_dotenv
import gradio as gr

# imports for langchain, plotly and Chroma
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain_core.callbacks import StdOutCallbackHandler
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate


In [27]:
# Name of the chat model handed to ChatOpenAI when the LLM is created.
MODEL = "gpt-4o-mini"
# Directory used as the Chroma persist_directory for the vector store.
db_name = "vector_db"

In [3]:
# Load settings from a .env file, if one is present.
load_dotenv()
# Leave an already-configured OPENAI_API_KEY untouched; if it is missing,
# fall back to the placeholder text below (not a real key).
os.environ.setdefault('OPENAI_API_KEY', 'your-key-if-not-using-env')

In [4]:
# Each sub-directory of knowledge-base/ is one document category. Keep only
# real directories (a stray file at this level would make DirectoryLoader
# raise) and sort them so documents are loaded in the same order on every run.
folders = sorted(path for path in glob.glob("knowledge-base/*") if os.path.isdir(path))

def add_metadata(doc, doc_type):
    """Tag ``doc`` with its document type and return the same object.

    ``doc_type`` is the type of document, e.g. "test", "interview",
    "project", "course" -- basically the folder name under the
    knowledge-base folder. It is stored in ``doc.metadata`` under the
    ``"doc_type"`` key; the document is modified in place.
    """
    doc.metadata.update(doc_type=doc_type)
    return doc

# Read every markdown file as UTF-8 text.
text_loader_kwargs = {'encoding': 'utf-8'}

# Collect the documents of every folder, tagging each one with the name of
# the folder it came from.
documents = []
for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(
        folder,
        glob="**/*.md",
        loader_cls=TextLoader,
        loader_kwargs=text_loader_kwargs,
    )
    folder_docs = loader.load()
    documents.extend(add_metadata(loaded, doc_type) for loaded in folder_docs)

In [None]:
# Split the loaded documents into chunks (chunk_size=1000, chunk_overlap=200).
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)

print(f"Total number of chunks: {len(chunks)}")

# Distinct doc_type tags across the source documents (not the chunks).
found_doc_types = {doc.metadata['doc_type'] for doc in documents}
print(f"Document types found: {found_doc_types}")

In [None]:
embeddings = OpenAIEmbeddings()

# If the persist directory already exists, drop its collection before the
# store is created again from the current chunks.
if os.path.exists(db_name):
    Chroma(
        persist_directory=db_name,
        embedding_function=embeddings,
    ).delete_collection()

# Create the store from the chunks, persisted under db_name.
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name,
)
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

In [None]:
# Inspect the stored vectors: how many there are and how long each one is.

collection = vectorstore._collection
count = collection.count()

# Fetch a single stored record with its embedding to measure the vector length.
sample_record = collection.get(limit=1, include=["embeddings"])
sample_embedding = sample_record["embeddings"][0]
dimensions = len(sample_embedding)
print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

In [50]:
# Message classes; only referenced by the optional (commented-out) greeting below.
from langchain.schema import SystemMessage, HumanMessage, AIMessage

# Conversation memory handed to the chain. It starts out empty because the
# preload below is disabled.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
)

# Optional: uncomment to preload the history with a greeting.
#
# initial_messages = [
#     AIMessage(content="Hi! I'm here to help you understand if Yu-An Lin is a good match for your job role. Please ask me anything about Yu-An's experience."),
# ]
#
# memory.chat_memory.messages = initial_messages  # Preload history

In [51]:
# Chat model used by the chain.
llm = ChatOpenAI(model_name=MODEL, temperature=0.7)

# Retriever over the vector store, configured with k=2 results per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 2})

name = "Yu-An Lin"

# System prompt text. `name` is substituted right here by the f-strings, while
# {context} sits in a plain string and therefore stays a template variable.
# NOTE(review): the final sentence ("or if {name} has the experience") reads
# ambiguously -- confirm the intended wording with the author.
system_template = (
    f"You are a helpful assistant that helps {name} answer questions from recruiters about his experience.\n"
    f"Your goal is to help {name} secure a job offer while being honest and concise.\n\n"
    "Use the following retrieved job experience to answer the recruiter's question:\n"
    "------\n"
    "{context}\n\n"
    f"If you don’t know the answer or if {name} has the experience, say so, but suggest that {name} can clarify if needed."
)
system_message = SystemMessagePromptTemplate.from_template(system_template)

# The recruiter's question becomes the human turn.
human_message = HumanMessagePromptTemplate.from_template("{question}")

# System + human messages form the prompt used to combine retrieved documents.
chat_prompt = ChatPromptTemplate.from_messages([system_message, human_message])

# Conversational retrieval chain wired to the LLM, retriever, shared memory,
# the custom prompt, and a stdout callback handler.
chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    combine_docs_chain_kwargs={"prompt": chat_prompt},
    callbacks=[StdOutCallbackHandler()],
)

def generate_response(input_text, history):
    """Ask the chain a question and return the answer plus the extended history.

    Args:
        input_text: The question to send to the chain.
        history: Sequence of ``(question, answer)`` pairs so far; ``None`` is
            treated as an empty history.

    Returns:
        A tuple ``(answer, new_history)`` where ``new_history`` is a new list
        made of ``history`` followed by ``(input_text, answer)``. The input
        ``history`` is not modified.
    """
    # Use invoke() like the other cells; calling the chain object directly is
    # the deprecated form.
    response = chain.invoke({"question": input_text})
    answer = response["answer"]
    return answer, list(history or []) + [(input_text, answer)]



In [None]:
# testing the chain
query = "Please let me know if he has experience in data science?"
result = chain.invoke({"question": query})
print(result["answer"])

# The chain was built with the shared `memory`, so this test exchange is now
# part of the chat history. Clear it so the test turn does not leak into the
# Gradio conversation started later.
memory.clear()

In [None]:
def chat(question, history):
    """Gradio chat callback: run the chain on ``question`` and return its answer.

    ``history`` is accepted because Gradio passes it, but it is not used here;
    the chain was created with its own ``memory`` object.
    """
    return chain.invoke({"question": question})["answer"]

# Serve the chat function through a Gradio chat UI (messages format), without
# opening a browser tab automatically.

chat_interface = gr.ChatInterface(chat, type="messages")
view = chat_interface.launch(inbrowser=False)