In [4]:
import os
import shutil
import numpy as np
import plotly.graph_objects as go
from collections import Counter
from dotenv import load_dotenv
from sklearn.manifold import TSNE
import gradio as gr
import tqdm as notebook_tqdm
# Modern LangChain 0.3 Imports
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader, TextLoader, CSVLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain_core.callbacks import StdOutCallbackHandler

# Price is a factor for our company, so we use a low-cost model.
MODEL = "gpt-4o-mini"
db_name = "vector_Kaiserslautern_db"

# Load environment variables from a file called .env
load_dotenv(override=True)

# Fail fast when the key is missing. Writing a placeholder string into the
# environment would hide the problem until the first API call is rejected.
if not os.getenv('OPENAI_API_KEY'):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

llm = ChatOpenAI(temperature=0.7, model_name=MODEL)


def add_metadata(doc, doc_type):
    """Record ``doc_type`` under the ``"doc_type"`` key of ``doc.metadata``.

    The document is modified in place and the same object is returned, so the
    function can be used inside a comprehension.
    """
    doc.metadata.update(doc_type=doc_type)
    return doc

documents = []

# Folders to process; the last path component of each becomes the doc_type tag.
folders_to_process = [
    "Data/contacts",
    "Data/extra",
    "Data/klausur",
    "Data/professoren",
    "Data/Termin",
    "Data/Vorpraktikum"
]

print(f"Current working directory: {os.getcwd()}")

for folder in folders_to_process:
    if not os.path.exists(folder):
        print(f"Skipping (doesn't exist): {folder}")
        continue

    doc_type = os.path.basename(folder)
    print(f"\n{'='*50}\nProcessing folder: {folder} (type: {doc_type})\n{'='*50}")

    for root, dirs, files in os.walk(folder):
        # Skip Jupyter checkpoint directories.
        if '.ipynb_checkpoints' in root:
            continue
        for file in files:
            file_path = os.path.join(root, file)
            # Match extensions case-insensitively so files such as "Plan.PDF"
            # are loaded instead of being silently skipped.
            file_lower = file.lower()
            try:
                if file_lower.endswith('.pdf'):
                    loader = PyPDFLoader(file_path)
                elif file_lower.endswith(('.md', '.txt', '.json')):
                    loader = TextLoader(file_path, encoding='utf-8')
                elif file_lower.endswith('.csv'):
                    loader = CSVLoader(file_path=file_path, encoding='utf-8')
                else:
                    # Unsupported file type: ignore it.
                    continue

                file_docs = loader.load()
                documents.extend(add_metadata(doc, doc_type) for doc in file_docs)
                print(f"  ✓ Loaded {len(file_docs)} units from {file}")
            except Exception as e:
                # Best effort: report the failing file and keep loading the rest.
                print(f"  ✗ Error loading {file_path}: {e}")

print(f"\n{'='*50}\nTotal documents loaded: {len(documents)}\n{'='*50}")

# Split the loaded documents into overlapping chunks.
# `chunks` is always (re)bound here, so the cell that builds the vector store
# never hits an undefined name or a stale value left in the kernel by an
# earlier run when no documents were loaded.
chunks = []
if documents:
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Total number of chunks: {len(chunks)}")

    # Per-type summary: how many loaded units and resulting chunks each folder produced.
    doc_type_counts = Counter(doc.metadata['doc_type'] for doc in documents)
    chunk_type_counts = Counter(chunk.metadata['doc_type'] for chunk in chunks)
    print("\nDocuments per type:")
    for d_type, count in doc_type_counts.items():
        print(f"  - {d_type}: {count} docs → {chunk_type_counts[d_type]} chunks")
else:
    print("No documents were loaded; nothing to chunk.")


# Remove a previously persisted vector store directory, if there is one.
if os.path.exists(db_name):
    shutil.rmtree(db_name, ignore_errors=True)
    # ignore_errors=True suppresses deletion failures, so confirm the
    # directory is really gone before reporting success.
    if os.path.exists(db_name):
        print(f"Could not fully delete: {db_name}")
    else:
        print(f"Deleted: {db_name}")
else:
    print("Directory does not exist.")

Current working directory: D:\projects\llm_engineering\week5\Project_hochschule_kaiserslautern

Processing folder: Data/contacts (type: contacts)
  ✓ Loaded 1 units from pruefungsamt_rag.json

Processing folder: Data/extra (type: extra)
  ✓ Loaded 1 units from Deutschland-Semesterticket.md

Processing folder: Data/klausur (type: klausur)
  ✓ Loaded 3 units from Pruefungsplan_PO19.pdf

Processing folder: Data/professoren (type: professoren)
  ✓ Loaded 1 units from Albert Meij.md
  ✓ Loaded 1 units from Andreas Steil.md
  ✓ Loaded 1 units from arl-Heinz Helmstädter.md
  ✓ Loaded 1 units from Christian M. Thurnes.md
  ✓ Loaded 1 units from Christian Schumann.md
  ✓ Loaded 1 units from Constantin Bauer.md
  ✓ Loaded 1 units from Dirk Enk.md
  ✓ Loaded 1 units from Eva Maria Kiss.md
  ✓ Loaded 1 units from Felix Möhrle.md
  ✓ Loaded 1 units from Felix Wick.md
  ✓ Loaded 1 units from Frank Bomarius.md
  ✓ Loaded 1 units from Gerd Bitsch.md
  ✓ Loaded 1 units from habil. Peter Starke.md
  ✓ L

In [5]:
# Store the chunks in a vector store that associates a vector embedding with each chunk.
# Chroma is a popular open-source vector database based on SQLite.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# To use the free vector embeddings from HuggingFace sentence-transformers instead,
# replace the `embeddings = OpenAIEmbeddings(...)` line above with:
# from langchain.embeddings import HuggingFaceEmbeddings
# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# If a store directory is already present, drop its collection before rebuilding.
if os.path.exists(db_name):
    Chroma(
        persist_directory=db_name,
        embedding_function=embeddings,
    ).delete_collection()

# Create the vector store from the chunks and persist it under `db_name`.
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name,
)
collection = vectorstore._collection
stored_count = collection.count()
print(f"Vectorstore created with {stored_count} documents")

Vectorstore created with 173 documents


In [None]:
# Retriever over the vector store; each query fetches k=8 chunks.
retriever = vectorstore.as_retriever(search_kwargs={"k": 8})

# Memory for context-aware chat, exposed to the chain as 'chat_history'.
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# Assemble the chain from the LLM, retriever and memory.
chain_settings = dict(
    llm=llm,
    retriever=retriever,
    memory=memory,
    callbacks=[StdOutCallbackHandler()],  # Shows the chain's logic on stdout
)
conversation_chain = ConversationalRetrievalChain.from_llm(**chain_settings)

def chat_function(message, history):
    """Answer one chat message using the module-level ``conversation_chain``.

    Args:
        message: The user's input, either a plain string or a dict carrying the
            text under the ``"text"`` key (Gradio sends one or the other
            depending on version).
        history: Chat history supplied by Gradio. It is not used here; the
            conversation context is kept by the chain's own memory.

    Returns:
        The chain's ``"answer"`` string, or a short prompt asking for input
        when the message contains no text.
    """
    if isinstance(message, dict):
        # .get avoids a KeyError when the dict carries no "text" entry.
        query = message.get("text")
    else:
        query = message

    # Do not send empty or whitespace-only input to the chain.
    if not isinstance(query, str) or not query.strip():
        return "Please enter a question."

    result = conversation_chain.invoke({"question": query})
    return result["answer"]

# Build the chat UI around chat_function and start it.
demo = gr.ChatInterface(
    title="Kaiserslautern University Assistant",
    fn=chat_function,
)
# inbrowser opens a browser tab on launch; share requests a shareable link from Gradio.
launch_options = {"inbrowser": True, "share": True}
demo.launch(**launch_options)