# Creating a RAG model with LangChain and OpenRouter

Kelompok 1 Algoritma Deep Learning
Anggota Kelompok :
1. A IAS Falah Surya Gemilang
2. Aldo Rizky Ramadhan
3. Faiz Rizki Azmi
4. Muhamad Ibnu Khaidar Hafiz
5. Siti Asma Tomu

In [1]:
## Install LangChain and the other required packages (uncomment to run)
# %pip install langchain langchain-openai langchain-chroma beautifulsoup4
# %pip install chromadb 
# %pip install langchain-nvidia-ai-endpoints
# %pip install "unstructured[md]" nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting unstructured[md]
  Downloading unstructured-0.16.5-py3-none-any.whl.metadata (24 kB)
Collecting chardet (from unstructured[md])
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Collecting filetype (from unstructured[md])
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting python-magic (from unstructured[md])
  Downloading python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting lxml (from unstructured[md])
  Downloading lxml-5.3.0-cp311-cp311-win_amd64.whl.metadata (3.9 kB)
Collecting emoji (from unstructured[md])
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Collecting python-iso639 (from unstructured[md])
  Downloading python_iso639-2024.10.22-py3-none-any.whl.metadata (13 kB)
Collecting langdetect (from unstructured[md])
  Downloading langdetect-1.0.9.tar.gz (981 kB)
     ---------------------------------------- 0.0/981.5 kB ? e

In [3]:
# First Cell - Imports and Setup
import getpass
import os
import bs4
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader, DirectoryLoader, UnstructuredMarkdownLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter
from langchain_nvidia_ai_endpoints import ChatNVIDIA
from langchain.embeddings import HuggingFaceEmbeddings
from typing import List, Dict
from dotenv import load_dotenv

# Load environment variables (e.g. NVIDIA_API_KEY) from a local .env file, if present
load_dotenv()

# Prompt for the NVIDIA API key only when it is not already configured, so a
# key supplied through .env or the shell is not overwritten and the notebook
# can also run without an interactive prompt
if not os.environ.get("NVIDIA_API_KEY"):
    os.environ["NVIDIA_API_KEY"] = getpass.getpass("Enter your NVIDIA API key: ")

# Initialize the NVIDIA-hosted chat model
llm = ChatNVIDIA(model="meta/llama3-70b-instruct")


USER_AGENT environment variable not set, consider setting it to identify your requests.


In [4]:
# Second Cell - Document Loading Functions
def load_markdown_documents(directory_path: str) -> List:
    """
    Read the markdown files under ``directory_path`` and return them as documents.

    Files are selected with the glob pattern ``**/*.md`` and each one is
    parsed with ``UnstructuredMarkdownLoader``.
    """
    return DirectoryLoader(
        directory_path,
        glob="**/*.md",
        loader_cls=UnstructuredMarkdownLoader,
    ).load()

def load_web_content(urls: List[str]) -> List:
    """
    Fetch the given web pages and return them as documents.

    Parsing is restricted to HTML elements whose class is ``post-content``,
    ``post-title`` or ``post-header``.
    """
    wanted_classes = ("post-content", "post-title", "post-header")
    only_post_elements = bs4.SoupStrainer(class_=wanted_classes)
    web_loader = WebBaseLoader(
        web_paths=urls,
        bs_kwargs={"parse_only": only_post_elements},
    )
    return web_loader.load()

In [5]:
# Third Cell - Text Splitting
def split_documents(documents: List) -> List:
    """
    Split documents into overlapping chunks.

    Each document's text is first split on markdown headers (``#``, ``##``,
    ``###``); the resulting sections are then cut into chunks of at most
    1000 characters with a 200-character overlap. If the header-based split
    raises, the document is chunked directly with the character splitter.

    The metadata of the original document (for example its source) is copied
    onto every header section so it is not lost on the markdown path; header
    metadata set by the markdown splitter takes precedence on key clashes.

    Args:
        documents: Documents exposing ``page_content`` and ``metadata``.

    Returns:
        A flat list with the chunks of all input documents, in input order.
    """
    # Header levels recognised by the markdown splitter and the metadata key
    # under which each matched header is stored
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

    # General text splitter applied after (or instead of) the header split
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )

    splits = []
    for doc in documents:
        try:
            # Try markdown splitting first
            header_splits = markdown_splitter.split_text(doc.page_content)
            # split_text() only sees the raw text, so carry the original
            # document's metadata over without overriding header entries
            for section in header_splits:
                for key, value in doc.metadata.items():
                    section.metadata.setdefault(key, value)
            text_splits = text_splitter.split_documents(header_splits)
        except Exception:
            # Best-effort fallback to regular splitting; unlike a bare
            # ``except``, this lets KeyboardInterrupt/SystemExit propagate
            text_splits = text_splitter.split_documents([doc])
        splits.extend(text_splits)

    return splits


In [6]:
# Fourth Cell - Vector Store Creation
def create_vector_store(documents: List):
    """
    Create and return a Chroma vector store built from the documents.

    The documents are embedded with the ``all-MiniLM-L6-v2`` model and the
    store is persisted under ``./chroma_db``. Embeddings run on the GPU when
    CUDA is available and fall back to the CPU otherwise, so the function
    also works on machines without a CUDA device.

    Args:
        documents: Document chunks to embed and index.

    Returns:
        The populated ``Chroma`` vector store.
    """
    # Imported locally: torch is only needed here to probe for a usable GPU
    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"

    embeddings = HuggingFaceEmbeddings(
        model_name="all-MiniLM-L6-v2",
        model_kwargs={'device': device},
        encode_kwargs={'device': device, 'batch_size': 32}
    )

    vector_store = Chroma.from_documents(
        documents=documents,
        embedding=embeddings,
        persist_directory="./chroma_db"
    )

    return vector_store


In [7]:
# Fifth Cell - RAG Chain Creation
def create_rag_chain(vector_store):
    """
    Build the retrieval-augmented generation chain on top of ``vector_store``.

    The chain retrieves documents with an MMR search (k=5, fetch_k=10),
    joins their text into a single context string, fills the
    ``rlm/rag-prompt`` prompt pulled from the LangChain hub, sends it to the
    module-level ``llm`` and parses the reply into a plain string.
    """
    mmr_settings = {"k": 5, "fetch_k": 10}
    retriever = vector_store.as_retriever(
        search_type="mmr",
        search_kwargs=mmr_settings,
    )

    def format_docs(docs):
        # Concatenate the retrieved page contents, separated by blank lines
        contents = [doc.page_content for doc in docs]
        return "\n\n".join(contents)

    # Get the RAG prompt from LangChain hub
    prompt = hub.pull("rlm/rag-prompt")

    # The incoming question is passed through unchanged and also drives retrieval
    chain_inputs = {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }

    return chain_inputs | prompt | llm | StrOutputParser()
