In [None]:
!pip install langchain_community langchainhub chromadb langchain langchain-openai rank_bm25

Collecting langchain_community
  Using cached langchain_community-0.3.29-py3-none-any.whl.metadata (2.9 kB)
Collecting langchainhub
  Using cached langchainhub-0.1.21-py3-none-any.whl.metadata (659 bytes)
Collecting chromadb
  Using cached chromadb-1.0.21-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.3 kB)
Collecting langchain-openai
  Using cached langchain_openai-0.3.33-py3-none-any.whl.metadata (2.4 kB)
Collecting rank_bm25
  Using cached rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting requests<3,>=2.32.5 (from langchain_community)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7,>=0.6.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting packaging<25,>=23.2 (from langchainhub)
  Downloading packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Collecting types-requests<3.0.0.0,>=2.31.0.2 (from langchainhub)
  Downloading types_requests-2.32.4.202

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain_community.llms import HuggingFacePipeline
from transformers import pipeline
from langchain import hub
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import PromptTemplate
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import CountVectorizer



In [None]:
# 1. Load the web page that serves as the knowledge source (Kaggle's Wikipedia article)
loader = WebBaseLoader(
    web_paths=["https://en.wikipedia.org/wiki/Kaggle"],
)
documents = loader.load()

In [None]:
# 2. Split the loaded documents into chunks.
# A recursive character splitter is used, with chunk_size=500 and
# chunk_overlap=50 so that neighbouring chunks share some context.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=500,
)
text_chunks = text_splitter.split_documents(documents)

In [None]:
# 3. Embed the chunks with a HuggingFace sentence-transformers model and
#    index them in a Chroma vector store for dense retrieval.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)
vectorstore = Chroma.from_documents(
    text_chunks,
    embedding=embeddings,
)

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# 4. Dense retriever backed by the vector store; returns the top 3 chunks per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
# Alternative kept for experimentation (MMR search instead of the default):
# retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 3})

# 5. Prompt text: instructs the model to answer only from the supplied context
#    (rather than from its pretrained knowledge) and to say "I don't know" otherwise.
template = """
You are a QA assistant.
Answer only using the provided context.
if context is irrelevant to question, reply "I don't know".
If multiple numbers or facts are present, prefer the most recent one.



Context:
{context}

Question: {question}

Answer:
"""

# The template is filled from two variables: the retrieved context and the user question.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)

# 6. LLM: a local HuggingFace text2text pipeline (flan-t5-base) wrapped for LangChain.
generator = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    max_length=512,
    truncation=True,
)
llm = HuggingFacePipeline(pipeline=generator)


def format_docs(docs):
    """Concatenate the ``page_content`` of every document, one per line.

    Args:
        docs: iterable of objects exposing a ``page_content`` string.

    Returns:
        A single string with the contents joined by newline characters
        (an empty string when ``docs`` is empty).
    """
    contents = [chunk.page_content for chunk in docs]
    return "\n".join(contents)

Device set to use cuda:0


In [None]:
# Plain-text view of every chunk for BM25 (same order as text_chunks, so a
# BM25 score index maps directly back to text_chunks[index]).
corpus = [doc.page_content for doc in text_chunks]

# Tokenize with CountVectorizer's *analyzer* rather than its bare tokenizer:
# the analyzer applies the default preprocessing (lowercasing) before
# tokenizing, so lexical matching is case-insensitive ("kaggle" matches
# "Kaggle"). The same callable is reused on queries, keeping corpus and query
# tokenization consistent.
vectorizer = CountVectorizer().build_analyzer()
tokenized_corpus = [vectorizer(doc) for doc in corpus]

# Sparse (lexical) index over the tokenized chunks.
bm25 = BM25Okapi(tokenized_corpus)

# Function to get top k sparse results
def bm25_retriever(query, k=3):
    """Return up to ``k`` chunks ranked by BM25 score for ``query``.

    The query is tokenized with the module-level ``vectorizer`` callable and
    scored against the module-level ``bm25`` index; score positions map back
    to ``text_chunks``.

    Args:
        query: the question text.
        k: maximum number of chunks to return. Values <= 0 yield an empty list.

    Returns:
        A list of chunks from ``text_chunks`` in descending score order. Chunks
        whose score is not positive are omitted, so the list may be shorter
        than ``k`` (or empty when nothing matches).
    """
    # Guard: slicing with -0 would otherwise select every chunk.
    if k <= 0:
        return []

    tokenized_query = vectorizer(query)
    doc_scores = bm25.get_scores(tokenized_query)

    # Stable descending sort: ties keep the chunks' original document order.
    ranked_indices = sorted(
        range(len(doc_scores)),
        key=lambda index: doc_scores[index],
        reverse=True,
    )

    # A non-positive score means no query token contributed to the match;
    # such chunks are not real hits, so they are not returned as results.
    return [text_chunks[index] for index in ranked_indices[:k] if doc_scores[index] > 0]


def hybrid_retriever(query, retriever, k=3, alpha=0.5):
    """
    Retrieve up to ``k`` chunks by fusing a dense and a sparse (BM25) ranking.

    The two ranked lists are merged with weighted reciprocal-rank fusion: a
    chunk at 1-based rank ``r`` in a list adds ``weight / (60 + r)`` to its
    fused score, where ``weight`` is ``alpha`` for the dense list and
    ``1 - alpha`` for the sparse list. Chunks are identified by their
    ``page_content``, so a chunk found by both retrievers is returned once and
    ranks higher than one found by only a single retriever at the same rank.

    Args:
        query: the question text.
        retriever: dense retriever; ``retriever.invoke(query)`` must return
            documents in ranked order.
        k: maximum number of chunks to return. Values <= 0 yield an empty list.
        alpha: weight for dense vs sparse, expected in [0, 1]
            (1.0 favours dense results only, 0.0 sparse results only).

    Returns:
        A list of at most ``k`` documents, best fused score first. Ties keep
        dense results ahead of sparse ones.
    """
    if k <= 0:
        return []

    # Get dense results (invoke() supersedes the deprecated get_relevant_documents()).
    dense_docs = retriever.invoke(query)
    # Get sparse results
    sparse_docs = bm25_retriever(query, k=k)

    rrf_offset = 60  # conventional damping constant for reciprocal-rank fusion
    fused_scores = {}  # page_content -> accumulated fused score
    docs_by_text = {}  # page_content -> first document object seen with that text

    for weight, ranked_docs in ((alpha, dense_docs), (1 - alpha, sparse_docs)):
        seen_in_list = set()
        rank = 0
        for doc in ranked_docs:
            text = doc.page_content
            # A duplicate inside one list only counts at its best (first) rank.
            if text in seen_in_list:
                continue
            seen_in_list.add(text)
            rank += 1
            docs_by_text.setdefault(text, doc)
            fused_scores[text] = fused_scores.get(text, 0.0) + weight / (rrf_offset + rank)

    # sorted() is stable, so equal scores keep insertion order (dense before sparse).
    ranked_texts = sorted(fused_scores, key=fused_scores.get, reverse=True)
    return [docs_by_text[text] for text in ranked_texts[:k]]

In [None]:
# Map the incoming question string onto the two prompt variables:
#   context  -> hybrid (dense + BM25) retrieval, formatted as one text block
#   question -> the question itself, passed through unchanged
prompt_inputs = {
    "context": lambda question: format_docs(hybrid_retriever(question, retriever)),
    "question": RunnablePassthrough(),
}

# Full chain: build prompt inputs -> fill the prompt -> run the LLM -> plain string.
rag_pipeline = prompt_inputs | prompt | llm | StrOutputParser()

In [None]:
# In-domain question: what Kaggle is used for.
question = "What is the use of Kaggle"
print(rag_pipeline.invoke(question))

Kaggle enables users to find and publish datasets, explore and build models in a web-based data science environment, work with other data scientists and machine learning engineers, and enter competitions to solve data science challenges.


In [None]:
# Out-of-domain question: the prompt tells the model to reply "I don't know"
# when the retrieved context is irrelevant.
question = "what is python"
print(rag_pipeline.invoke(question))

I don't know


In [None]:
# Out-of-domain question unrelated to the indexed page.
question = "how to transfer money"
print(rag_pipeline.invoke(question))

I don't know


In [None]:
# In-domain factual question: Kaggle's launch date.
question = "When was Kaggle launched?"
print(rag_pipeline.invoke(question))

April 2010


In [None]:
# Same question shape as the Kaggle launch-date query, but about a topic
# outside the indexed page.
question = "When was Java launched?"
print(rag_pipeline.invoke(question))

I don't know


In [None]:
# Out-of-domain question about a programming concept.
question = "What is oops concept?"
print(rag_pipeline.invoke(question))

I don't know


In [None]:
# Open-ended in-domain question (asks "why" rather than for a single fact).
question = "why Kaggle was so famous?"
print(rag_pipeline.invoke(question))

Kaggle was founded by Anthony Goldbloom in April 2010.[2] Jeremy Howard, one of the first Kaggle users, joined in November 2010 and served as the President and Chief Scientist.[3] Also on the team was Nicholas Gruen serving as the founding chair.[4] In 2011, the company raised $12.5 million and Max Levchin became the chairman.[5] On March 8, 2017, Fei-Fei Li, Chief Scientist at Google, announced that Google was acquiring Kaggle.[6] Kaggle is a data science competition platform and online community for data scientists and machine learning practitioners under Google LLC.


In [None]:
# In-domain definition question.
question = "What is a Kaggle Grandmaster?"
print(rag_pipeline.invoke(question))

The highest tier, Kaggle Grandmaster, is awarded to users who have ranked at the top of multiple competitions including high ranking in a solo team.


In [None]:
# In-domain numeric question; the prompt asks the model to prefer the most
# recent figure when several are present.
question = "How many Kaggle users are there?"
print(rag_pipeline.invoke(question))

15 million


In [None]:
# Lower-case, informal rephrasing of the user-count question.
question = "how many people using kaggle"
print(rag_pipeline.invoke(question))

1 million registered users, and as of October 2023, it has over 15 million users in 194 countries.
