In [1]:
import os 
from dotenv import load_dotenv

from langchain_ollama import OllamaEmbeddings
from langchain_ollama import ChatOllama
from langchain_chroma import Chroma

import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict

load_dotenv()

USER_AGENT environment variable not set, consider setting it to identify your requests.


True

## Defining Components

Chat Model

In [2]:
llm = ChatOllama(
    model="llama3.2",
    temperature=0)

Embedding Model

In [3]:
embeddings = OllamaEmbeddings(model="llama3.2")

Vector Store

In [4]:
vector_store = Chroma(embedding_function=embeddings)

## Data Indexing

In [9]:
from langchain.document_loaders import WebBaseLoader
from bs4 import BeautifulSoup
import requests
from langchain.document_loaders import WebBaseLoader
from langchain.schema import Document
from bs4 import BeautifulSoup
import requests

class CustomWebBaseLoader(WebBaseLoader):
    def __init__(self, url: str):
        super().__init__(url)
        self.url = url

    def load(self):
        # Fetch the webpage content
        response = requests.get(self.url)
        soup = BeautifulSoup(response.content, "html.parser")

        # Remove unwanted sections: header, footer, etc.
        for header in soup.find_all(['header', 'footer']):
            header.decompose()  # Remove these sections from the soup

        # Now parse the remaining content
        paragraphs = soup.find_all('p')  # Or any other tag you're interested in
        documents = []

        for p in paragraphs:
            # Creating a Document instance for each paragraph or relevant content
            doc = Document(
                page_content=str(p),  # The content of the paragraph
                metadata={"source": self.url}  # Optional metadata (e.g., the source URL)
            )
            documents.append(doc)

        return documents

# Example usage:
loader = CustomWebBaseLoader("https://example.com")
documents = loader.load()

# Print out the documents
for doc in documents:
    print(f"Source: {doc.metadata['source']}")
    print(f"Content: {doc.page_content[:200]}...")  # Preview first 200 characters


# Example usage:
loader = CustomWebBaseLoader("https://rse.shef.ac.uk/blog/2024-09-24-funded-proposals/")
docs = loader.load()
docs


Source: https://example.com
Content: <p>This domain is for use in illustrative examples in documents. You may use this
    domain in literature without prior coordination or asking for permission.</p>...
Source: https://example.com
Content: <p><a href="https://www.iana.org/domains/example">More information...</a></p>...


[Document(metadata={'source': 'https://rse.shef.ac.uk/blog/2024-09-24-funded-proposals/'}, page_content='<p>Earlier this year the RSE team in Sheffield put a call out for proposals for researchers in the University of Sheffield to\n<a href="https://rse.shef.ac.uk/collaboration/RSEtime_call2024/">collaborate with the RSE</a> team. The successful applicants would receive dedicated support from an RSE\nTeam member at 50% FTE for a period of six months.</p>'),
 Document(metadata={'source': 'https://rse.shef.ac.uk/blog/2024-09-24-funded-proposals/'}, page_content='<p>A total of 26 applications were received from across the faculties of the University and the review panel which\nconsisted of nine RSEs had robust discussions about which to fund as the quality and proposed work was of a high\nstandard.</p>'),
 Document(metadata={'source': 'https://rse.shef.ac.uk/blog/2024-09-24-funded-proposals/'}, page_content='<p><img alt="Distribution of applications by faculty" src="/assets/images/2024-09-

In [6]:
# # Load and chunk contents of the blog
# loader = WebBaseLoader(
#     web_paths=("https://rse.shef.ac.uk/blog/2024-09-24-funded-proposals/",
#                )
# )

# loader = WebBaseLoader(
#     web_paths=("https://rse.shef.ac.uk/blog/2024-09-24-funded-proposals/",),
#     bs_kwargs=dict(
#         parse_only=bs4.SoupStrainer(
#             class_=("post-content", "post-title", "post-header")
#         )
#     ),
# )
# docs = loader.load()

# docs

In [10]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250, separators=["."])
all_splits = text_splitter.split_documents(docs)

# Index chunks
_ = vector_store.add_documents(documents=all_splits)

## RAG

In [11]:
prompt = hub.pull('rlm/rag-prompt')

In [12]:
# Define state for application
class State(TypedDict):
    question: str
    context: List[Document]
    answer: str
    #scores: List[float]


# Define application steps
def retrieve(state: State) -> List[Document]:
    # docs, scores = vector_store.similarity_search_with_score(query)
    # for doc, score in zip(docs, scores):
    #     doc.metadata["score"] = score

    docs_and_scores = vector_store.similarity_search_with_score(state['question'])

    # add score to doc metadata
    for doc, score in docs_and_scores:
        doc.metadata['score'] = score  

    docs = [doc for doc, _ in docs_and_scores]
    return {"context": docs}

def generate(state: State):
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    messages = prompt.invoke({"question": state["question"], "context": docs_content})
    response = llm.invoke(messages)
    return {"answer": response.content}


# Compile application and test
graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()


In [13]:
response = graph.invoke({"question": "What are the names of the projects talked about in this blog?"})
print(response["answer"])

I don't know the names of specific projects mentioned in the blog. The text mentions software developed by SubLab, but it does not provide a list of project names. It also mentions JADE II and Bede, which appear to be HPC resources, but these are not projects.
