In [1]:
# %pip install --quiet --upgrade langchain-text-splitters langchain-community langgraph
# %pip install -qU "langchain[openai]"
# %pip install -qU langchain-openai
# %pip install -qU langchain-core

In [2]:
import os
from dotenv import load_dotenv

load_dotenv()

# Enable LangSmith tracing only when an API key is actually available.
# The key is read from the process environment (load_dotenv() above merges the
# .env file into it), so it never needs to be assigned back. Re-assigning
# os.getenv(...) raises TypeError when the key is missing, because os.environ
# values must be strings.
if os.getenv("LANGSMITH_API_KEY"):
    os.environ["LANGSMITH_TRACING"] = "true"
else:
    # Without a key tracing cannot work; switch it off explicitly and say so,
    # rather than crashing the notebook on an optional feature.
    os.environ["LANGSMITH_TRACING"] = "false"
    print("LANGSMITH_API_KEY is not set; LangSmith tracing is disabled.")

In [3]:
import os
from dotenv import load_dotenv

# Merge values from a local .env file into the process environment.
load_dotenv()

# No api_key argument is passed to AzureChatOpenAI below, so the client depends
# on this environment variable. Fail fast with an actionable message instead of
# assigning os.getenv(...) (None when unset) into os.environ, which raises an
# opaque TypeError.
if not os.environ.get("AZURE_OPENAI_API_KEY"):
    raise RuntimeError(
        "AZURE_OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this cell."
    )

from langchain_openai import AzureChatOpenAI

# Chat model used to generate answers. Endpoint, deployment name and API
# version are required settings: a missing one raises KeyError naming the
# variable.
llm = AzureChatOpenAI(
    azure_endpoint=os.environ["AZURE_OPENAI_CHAT_ENDPOINT"],
    azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"],
    openai_api_version=os.environ["AZURE_OPENAI_CHAT_API_VERSION"],
)

In [4]:
import os
from dotenv import load_dotenv

# Merge values from a local .env file into the process environment.
load_dotenv()

# No api_key argument is passed to AzureOpenAIEmbeddings below, so the client
# depends on this environment variable. Fail fast with an actionable message
# instead of assigning os.getenv(...) (None when unset) into os.environ, which
# raises an opaque TypeError.
if not os.environ.get("AZURE_OPENAI_API_KEY"):
    raise RuntimeError(
        "AZURE_OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this cell."
    )

from langchain_openai import AzureOpenAIEmbeddings

# Embedding model used to vectorise document chunks and queries. Endpoint,
# deployment name and API version are required settings: a missing one raises
# KeyError naming the variable.
embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=os.environ["AZURE_OPENAI_EMBEDDINGS_ENDPOINT"],
    azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"],
    openai_api_version=os.environ["AZURE_OPENAI_EMBEDDINGS_API_VERSION"],
)

In [5]:
from langchain_core.vectorstores import InMemoryVectorStore

# Vector store shared by the indexing and retrieval cells below. It is built
# on the `embeddings` client created in the previous cell.
# NOTE(review): as the class name suggests, contents presumably live only in
# this kernel's memory, so the index must be rebuilt after a restart - confirm.
vector_store = InMemoryVectorStore(embeddings)

In [None]:
import hashlib
import warnings

import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict

# Legal texts to index. Each URL maps to a small config dict:
#   "label"   - human-readable name, stored as each document's "source" metadata
#   "classes" - CSS class handed to bs4.SoupStrainer to select the page content
url_class_map = {
    # EU Digital Services Act: CELEX 32022R2065 is Regulation (EU) 2022/2065
    "https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A32022R2065": {"label": "EU Digital Services Act", "classes": "MainContent"},
    # California State Law
    "https://leginfo.legislature.ca.gov/faces/billTextClient.xhtml?bill_id=202320240SB976": {"label": "California State Law", "classes": "tab_content"},
    # Florida State Law
    # NOTE(review): this URL ends in /PDF. The loader parses HTML, so the
    # "textLayer" class filter may match nothing - confirm this source loads.
    "https://www.flsenate.gov/Session/Bill/2024/3/BillText/Filed/PDF": {"label": "Florida State law", "classes": "textLayer"},
    # US law on reporting child sexual abuse content to NCMEC
    "https://www.law.cornell.edu/uscode/text/18/2258A": {"label": "US law on reporting child sexual abuse content to NCMEC", "classes": "text"}
}

def load_with_per_url_classes(url_class_map: dict) -> list:
    """Load each configured web page, keeping only its configured CSS class.

    Args:
        url_class_map: Mapping of URL -> {"label": str, "classes": ...}.
            "classes" is passed to ``bs4.SoupStrainer(class_=...)`` so only
            matching elements are parsed. "label" is written to each loaded
            document's ``metadata["source"]``.

    Returns:
        A list of the loaded documents, in mapping order, each tagged with
        ``metadata["source"]`` (the label) and ``metadata["url"]``. Documents
        whose extracted text is empty or whitespace-only are skipped, with a
        warning.

    Raises:
        KeyError: If a config entry lacks "label" or "classes".
    """
    docs = []
    for url, cfg in url_class_map.items():
        label = cfg["label"]
        classes = cfg["classes"]

        # Restrict parsing to elements carrying the configured class.
        loader = WebBaseLoader(
            web_paths=[url],
            bs_kwargs={"parse_only": bs4.SoupStrainer(class_=classes)},
        )

        for doc in loader.load():
            if not doc.page_content.strip():
                # A class filter that matches nothing yields an empty document.
                # Surface that here instead of silently passing on a source
                # that has no text to index.
                warnings.warn(
                    f"No text extracted from {url!r} ({label}) with class filter "
                    f"{classes!r}; skipping it.",
                    stacklevel=2,
                )
                continue
            doc.metadata["source"] = label
            doc.metadata["url"] = url
            docs.append(doc)
    return docs

# Load and chunk the contents of the legal texts listed in url_class_map
# loader = WebBaseLoader(
#     web_paths=("https://leginfo.legislature.ca.gov/faces/billTextClient.xhtml?bill_id=202320240SB976",),
#     bs_kwargs=dict(
#         parse_only=bs4.SoupStrainer(
#             class_=("tab_content")
#         )
#     ),
# )
# docs = loader.load()
# Fetch every configured legal text (network I/O).
docs = load_with_per_url_classes(url_class_map)

# Split into 1000-character chunks with 200 characters of overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
all_splits = text_splitter.split_documents(docs)
if not all_splits:
    # An empty index would make every later answer "I don't know"; stop here
    # so the loading problem is visible where it happened.
    raise ValueError(
        "No text chunks were produced; check the URLs and class filters in url_class_map."
    )

# Index chunks under content-derived ids. The vector store is created in an
# earlier cell and outlives this one, so without stable ids every re-run of
# this cell would append a second copy of each chunk and crowd the search
# results with duplicates. With stable ids a re-run overwrites the same
# entries instead.
chunk_ids = [
    hashlib.sha256(
        f"{chunk.metadata.get('url', '')}\n{chunk.page_content}".encode("utf-8")
    ).hexdigest()
    for chunk in all_splits
]
_ = vector_store.add_documents(documents=all_splits, ids=chunk_ids)

# Define prompt for question-answering.
# The inline template below is the prompt actually used. The commented-out
# hub.pull("rlm/rag-prompt") line at the end of this block is an alternative
# that pulls a prompt via `hub` instead.
# N.B. this only applies to that hub.pull alternative: for non-US LangSmith
# endpoints, you may need to specify
# api_url="https://api.smith.langchain.com" in hub.pull.
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
Always say "thanks for asking!" at the end of the answer.

{context}

Question: {question}

Helpful Answer:"""
# Build the prompt object from the template above. It is invoked with the
# {context} and {question} values.
prompt = PromptTemplate.from_template(template)

# prompt = hub.pull("rlm/rag-prompt")


# Define state for application
class State(TypedDict):
    """State passed between the steps of the question-answering graph.

    `retrieve` reads `question` and fills in `context`; `generate` reads
    `question` and `context` and fills in `answer`.
    """

    # The user's question, supplied by the caller of graph.invoke.
    question: str
    # Documents returned by the vector-store similarity search.
    context: List[Document]
    # Content of the LLM's response.
    answer: str


# Define application steps
def retrieve(state: State):
    """Look up the documents most similar to the question.

    Reads ``state["question"]`` and returns a partial state update of the form
    ``{"context": <documents>}``. No result count is passed, so the vector
    store's default applies.
    """
    query = state["question"]
    return {"context": vector_store.similarity_search(query)}


def generate(state: State):
    """Answer the question from the retrieved context.

    Joins the page content of every document in ``state["context"]`` with
    blank lines, fills the prompt with that text and ``state["question"]``,
    sends it to the LLM, and returns ``{"answer": <response content>}``.
    """
    passages = [chunk.page_content for chunk in state["context"]]
    prompt_inputs = {
        "question": state["question"],
        "context": "\n\n".join(passages),
    }
    reply = llm.invoke(prompt.invoke(prompt_inputs))
    return {"answer": reply.content}


# Compile application and test
# Wire the two steps into a linear pipeline: START -> retrieve -> generate.
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [24]:
# Smoke-test the compiled graph with one hard-coded question and print the
# generated answer.
# NOTE(review): the recorded output for this query is "I don't know". The EU
# text indexed above is CELEX 32022R2065; confirm that regulation is the one
# this question is meant to ask about.
response = graph.invoke({"question": "What is the Data Security Act"})
print(f"Answer: {response['answer']}")
# print(response["answer"])

Answer: I don't know. Thanks for asking!


In [8]:
# Interactive question-answering loop over the compiled graph.
# Type 'exit' (any case, surrounding whitespace ignored) to stop.
while True:
    try:
        question = input("\nEnter your question (or 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the kernel was interrupted: leave the loop
        # cleanly instead of surfacing a traceback.
        break
    if question.lower() == 'exit':
        break
    if not question:
        # Blank line: ask again rather than sending an empty query through
        # retrieval and the LLM.
        continue
    response = graph.invoke({"question": question})
    print(f"Answer: {response['answer']}")

Answer: The bill, Senate Bill 976, mandates the Attorney General of California to enforce regulations aimed at protecting minors from addictive features on social media platforms by ensuring parental consent is obtained for users under 18. It requires annual disclosures from operators about minor users and parental consent status. Furthermore, the bill emphasizes public feedback on regulations while allowing exceptions that align with the protection of minors.
Answer: The provided context discusses regulations regarding internet-based services and applications, particularly concerning users who are minors. Operators must not send notifications to minors during specified hours unless they have verifiable parental consent. Additionally, any information collected for age verification must be used solely for compliance and deleted after use.
Answer: The retrieved context does not provide specific details about the feature that reads user location to enforce France's copyright rules. Theref

In [9]:
# from IPython.display import Image, display

# display(Image(graph.get_graph().draw_mermaid_png()))