In [None]:
%pip install langchain langchain-openai langchainhub langchain-community beautifulsoup4 rich langchain-chroma==0.1.4 chromadb==0.4.21

In [None]:
# Pin ONNX and ONNX Runtime to specific versions.
# NOTE(review): no visible cell imports these directly; presumably they are
# pinned for compatibility with chromadb's ONNX-based defaults — confirm.
%pip install onnx==1.16.1 onnxruntime==1.16.3

In [3]:
import os

from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()

# Read the credential from the environment; never paste the token into the notebook.
token = os.getenv("GITHUB_TOKEN")
if not token:
    # Fail here with a clear message instead of passing None to the client,
    # which would fall back to other credentials or fail with an unrelated error.
    raise RuntimeError(
        "GITHUB_TOKEN is not set. Export it in your shell or add it to a .env file "
        "before running this notebook."
    )

endpoint = "https://models.github.ai/inference"
model = "openai/gpt-4.1-nano"

# Chat model client pointed at the endpoint configured above.
llm = ChatOpenAI(model=model, base_url=endpoint, api_key=token)


In [4]:
import os
# USER_AGENT is assigned ahead of the langchain_community import below; keep
# this ordering when editing the cell.
# NOTE(review): presumably this avoids langchain_community's "USER_AGENT
# environment variable not set" warning and identifies the WebBaseLoader
# requests — confirm.
os.environ["USER_AGENT"] = "LangChain-Notebook/1.0"

import bs4
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from rich import print  # shadows the built-in print for every later cell

In [None]:
# Load the blog post and split it into chunks for indexing.
# WebBaseLoader, bs4 and RecursiveCharacterTextSplitter come from the imports cell.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2017-06-21-overview/",),
    bs_kwargs=dict(
        # Restrict HTML parsing to elements carrying these CSS classes.
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
splits = text_splitter.split_documents(docs)

# Guard against an empty result (unreachable URL, or a class filter that matches
# nothing) so the failure is explained here rather than as a bare IndexError below.
if not splits:
    raise ValueError(
        "No text chunks were produced from the page; check the URL and the "
        "SoupStrainer class filter."
    )

print(f"Number of splits: {len(splits)}")
print(f"Sample split length (chars): {len(splits[0].page_content)}")

In [None]:
# Embedding client used to vectorise each chunk.
# NOTE(review): this host differs from the chat endpoint configured in the
# setup cell — confirm that both are intended.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    base_url="https://models.inference.ai.azure.com",
    api_key=token,
)

# Embed the chunks and index them in a Chroma vector store.
vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)

In [None]:
# Fetch the shared RAG prompt template from the LangChain hub.
prompt = hub.pull("rlm/rag-prompt")

# Expose the vector store as a retriever that returns the relevant blog snippets.
retriever = vectorstore.as_retriever()

print(prompt)

In [None]:
def format_docs(docs):
    """Combine the retrieved documents into a single context string.

    The incoming documents are echoed with ``print`` first; the return value
    is every document's ``page_content`` joined by a blank line.
    """
    print(docs)
    contents = [item.page_content for item in docs]
    separator = "\n\n"
    return separator.join(contents)

# Inputs for the prompt: the question is routed through the retriever and
# format_docs to build the context, and is also passed along unchanged.
rag_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}

# Pipeline: build inputs -> fill the prompt -> call the model -> plain string.
rag_chain = rag_inputs | prompt | llm | StrOutputParser()

In [None]:
# Run the full RAG pipeline on a sample question; the result is the cell output.
question = "What is Convolutional Neural Networks?"
rag_chain.invoke(question)