# Step 1 : Loading

In [1]:
from langchain_community.document_loaders.web_base import WebBaseLoader
import bs4

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
import os

# Identifier sent with this notebook's HTTP requests: application name and
# version, followed by a contact address in parentheses.
my_user_agent = "MyPythonApp/1.0 (contact@example.com)"

# Publish the identifier through the USER_AGENT environment variable.
# NOTE(review): the "USER_AGENT environment variable not set" warning was
# printed while the first cell imported WebBaseLoader, i.e. before this
# assignment ran - presumably this cell has to execute before that import for
# the value to be picked up; confirm.
os.environ.update(USER_AGENT=my_user_agent)

In [3]:
import os

from langchain_community.document_loaders import WebBaseLoader

# Only the target URL is customised; every other WebBaseLoader option
# (header_template, verify_ssl, proxies, continue_on_failure, encoding,
# requests_per_second, default_parser, requests_kwargs, bs_kwargs, session,
# show_progress, trust_env, ...) is left at its library default.
loader = WebBaseLoader(web_path="https://www.espn.com/")

# The "USER_AGENT environment variable not set" warning was emitted when
# langchain_community was first imported, which happened before USER_AGENT was
# exported. Apply the value to the loader's HTTP session explicitly so the
# request is actually identified by it; skip silently when it is not set.
user_agent = os.environ.get("USER_AGENT")
if user_agent:
    loader.session.headers["User-Agent"] = user_agent

# Fetch the page and parse it into a list of Document objects.
docs = loader.load()

In [4]:
# Preview the first document's text with every newline swapped for a space.
" ".join(docs[0].page_content.split("\n"))

'         ESPN - Serving Sports Fans. Anytime. Anywhere.                                                                                                      Skip to main content               Skip to navigation                                       <  >          MenuESPN      scores    NFLNBANHLMLBSoccerGolfWNBAMore SportsBoxingNCAACricketF1GamingHorseLLWSMMANASCARNLLNBA G LeagueNBA Summer LeagueNCAAFNCAAMNCAAWNWSLOlympicsPLLProfessional WrestlingRacingRN BBRN FBRugbySports BettingTennisTGLUFLX GamesEditionsFantasyWatchESPN BETESPN+                    Subscribe Now      LALIGA        NCAA Baseball        NCAA Softball   Quick Links     NFL Free Agency        NFL Draft        NBA Playoffs        Stanley Cup Playoffs        NBA Draft        Where To Watch        Today\'s Top Odds        ESPN Radio: Listen Live       Favorites             Manage Favorites           Customize ESPNCreate AccountLog InFantasy     Football        Baseball        Hockey        Men\'s Basketball        Women\'

# Step 2 : Chunking (transform)

In [5]:
from langchain_text_splitters.character import RecursiveCharacterTextSplitter

In [6]:
# Recursive splitter whose chunk sizes are measured with a tiktoken encoder:
# chunk_size 500 with an overlap of 100, trying the separators in the order
# blank line, newline, space, and finally the empty string.
splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=500,
    chunk_overlap=100,
    separators=["\n\n", "\n", " ", ""],
)

In [7]:
# Break the loaded page into overlapping chunks for embedding.
documents = splitter.split_documents(docs)

In [8]:
# Number of chunks the splitter produced from the loaded page.
len(documents)

7

In [9]:
# Inspect the first chunk: its metadata and its raw page_content.
documents[0]

Document(metadata={'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}, page_content="ESPN - Serving Sports Fans. Anytime. Anywhere.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n        Skip to main content\n    \n\n        Skip to navigation\n    \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n<\n\n>\n\n\n\n\n\n\n\n\n\nMenuESPN\n\n\n\n\n\nscores\n\n\n\nNFLNBANHLMLBSoccerGolfWNBAMore SportsBoxingNCAACricketF1GamingHorseLLWSMMANASCARNLLNBA G LeagueNBA Summer LeagueNCAAFNCAAMNCAAWNWSLOlympicsPLLProfessional WrestlingRacingRN BBRN FBRugbySports BettingTennisTGLUFLX GamesEditionsFantasyWatchESPN BETESPN+\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n  \n\nSu

# Step 3 : Embedding

In [10]:
from langchain_ollama.embeddings import OllamaEmbeddings
# Embedding function backed by an Ollama model; the same model tag is reused
# later in the notebook for answer generation.
# NOTE(review): this tag looks like a general chat model rather than a
# dedicated embedding model - confirm it gives adequate retrieval quality.
embedding = OllamaEmbeddings(model="qwen2.5:1.5b")

# Step 4 : Storing 

In [13]:
# Show the kernel's current working directory; the relative persist_directory
# passed to Chroma in the next cell resolves against it.
%pwd

'c:\\Users\\spurusho\\Downloads\\GenAI\\Personal\\langchainTutorials\\RAG_Pipeline'

In [14]:
import hashlib

from langchain_chroma.vectorstores import Chroma

# Derive a stable id for every chunk from its position and its text.
# Without explicit ids, re-running this cell against the same persist
# directory would add the same chunks again under fresh ids. With
# deterministic ids a re-run over unchanged content overwrites the existing
# entries instead of duplicating them.
# NOTE(review): entries already persisted by earlier runs under other ids are
# not removed by this - clear the store once if it holds duplicates.
chunk_ids = [
    hashlib.sha256(f"{position}:{chunk.page_content}".encode("utf-8")).hexdigest()
    for position, chunk in enumerate(documents)
]

# Embed the chunks and persist them in a Chroma store rooted at the current
# working directory.
db = Chroma.from_documents(
    documents=documents,
    embedding=embedding,
    ids=chunk_ids,
    persist_directory='./',
)

# Step 5 : Retrieving using LLM

In [16]:
from langchain_ollama import OllamaLLM

# Ollama-served LLM that generates the final answer; it uses the same model
# tag that was passed to OllamaEmbeddings earlier in the notebook.
model = OllamaLLM(model="qwen2.5:1.5b")

In [17]:
from langchain_core.prompts.chat import ChatPromptTemplate
# Prompt text: instructions first, then the retrieved context wrapped in
# <context> tags, then the user's question. {context} and {input} are the
# template's two placeholders.
_qa_template = """
Answer the following question based only on the provided context.
Think step by step before providing a detailed answer.
I will tip you $1000 if the user finds the answer helpful.
<context>
{context}
</context>
Question : {input}
"""

prompt = ChatPromptTemplate.from_template(_qa_template)

In [19]:
from langchain.chains.combine_documents.stuff import create_stuff_documents_chain
# Chain built from the prompt and the LLM; the prompt's {context} placeholder
# is where the retrieved documents go.
document_chain = create_stuff_documents_chain(
    prompt=prompt,
    llm=model,
)

In [20]:
# Expose the Chroma store as a retriever with no search options overridden.
# NOTE(review): with empty search_kwargs the store's default number of results
# applies - confirm that is enough for whole-page questions.
retriever = db.as_retriever()
# Display the retriever's repr to confirm its configuration.
retriever

VectorStoreRetriever(tags=['Chroma', 'OllamaEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x000001ED9EFE5AF0>, search_kwargs={})

In [21]:
from langchain.chains.retrieval import create_retrieval_chain
# Full RAG pipeline: the retriever supplies the documents, and the document
# chain turns them plus the question into an answer.
retrieval_chain = create_retrieval_chain(
    combine_docs_chain=document_chain,
    retriever=retriever,
)

In [23]:
# Run the pipeline once; the question is passed under the "input" key that
# the prompt template expects.
question = "Give the summary of the whole webpage"
result = retrieval_chain.invoke({"input": question})

In [24]:
# Show only the generated answer text from the chain's result mapping.
print(result['answer'])

The webpage provides information on various sports including NFL, MLB, NBA, WNBA, and women's basketball. It discusses recent news and rankings in these fields, such as Russell Westbrook's clutch defensive play, the 2025 NFL draft, the 2025 MLB ace rankings, and the 2025 WNBA draft. The page is designed to inform readers about sports events and provide tips on how to follow them.
