In [1]:
!pip install langchain_community langchainhub chromadb langchain langchain-openai rank_bm25



In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain_community.llms import HuggingFacePipeline
from transformers import pipeline
from langchain import hub
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import PromptTemplate
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import CrossEncoder

In [9]:
# 1. Using weburl as input data
loader = WebBaseLoader(web_paths=["https://en.wikipedia.org/wiki/Kaggle"])
documents = loader.load()

In [10]:
# 2. Splitting into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50) #Recurssive splitter for dense context splitting
text_chunks = text_splitter.split_documents(documents)

In [5]:
# 3. Embedding with HuggingFace model
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = Chroma.from_documents(text_chunks, embedding=embeddings)

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [11]:
#4. Using vectorstore as retriever
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
#retriever = vectorstore.as_retriever(search_type="mmr",search_kwargs={"k": 3})

template = """
You are a QA assistant.
Answer only using the provided context.
if context is irrelevant to question, reply "I don't know".
If multiple numbers or facts are present, prefer the most recent one.



Context:
{context}

Question: {question}

Answer:
"""

#5. Prompt template created for model to not utitlize its pretrained data
prompt = PromptTemplate(
    template=template,
    input_variables=["context", "question"],
)

# 5. LLM (local HuggingFace model)
generator = pipeline("text2text-generation", model="google/flan-t5-base", max_length=512, truncation=True)
llm = HuggingFacePipeline(pipeline=generator)


def format_docs(docs):
    return "\n".join(doc.page_content for doc in docs)

Device set to use cuda:0


In [12]:
# Convert your text chunks into plain text for BM25
corpus = [doc.page_content for doc in text_chunks]

# Tokenize
vectorizer = CountVectorizer().build_tokenizer()
tokenized_corpus = [vectorizer(doc) for doc in corpus]

bm25 = BM25Okapi(tokenized_corpus)

# Function to get top k sparse results
def bm25_retriever(query, k=3):
    tokenized_query = vectorizer(query)
    doc_scores = bm25.get_scores(tokenized_query)
    top_indices = doc_scores.argsort()[-k:][::-1]
    top_docs = [text_chunks[i] for i in top_indices]
    return top_docs

reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def rerank(query, docs):
    pairs = [(query, doc.page_content) for doc in docs]
    scores = reranker.predict(pairs)
    ranked = sorted(zip(docs, scores), key=lambda x: x[1], reverse=True)
    return [doc for doc, _ in ranked]

def hybrid_retriever(query,retriever,k=3, alpha=0.5):
    """
    alpha: weight for dense vs sparse
    """
    # Get dense results
    dense_docs = retriever.get_relevant_documents(query)
    # Get sparse results
    sparse_docs = bm25_retriever(query, k=k)

    # Merge & deduplicate (you can tune merging strategy)
    combined_docs = {doc.page_content: doc for doc in dense_docs + sparse_docs}
    merged_docs = list(combined_docs.values())[:k]  # keep top k
    reranked_docs = rerank(query, merged_docs)
    return reranked_docs[:k]

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

In [13]:
rag_pipeline = (
    {"context": lambda q: format_docs(hybrid_retriever(q,retriever)), "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [14]:
print(rag_pipeline.invoke("What is the use of Kaggle"))

  dense_docs = retriever.get_relevant_documents(query)


Kaggle enables users to find and publish datasets, explore and build models in a web-based data science environment, work with other data scientists and machine learning engineers, and enter competitions to solve data science challenges.[citation needed]


In [15]:
print(rag_pipeline.invoke("what is python"))

I don't know


In [16]:
print(rag_pipeline.invoke("how to transfer money"))

I don't know


In [17]:
print(rag_pipeline.invoke("When was Kaggle launched?"))

April 2010


In [18]:
print(rag_pipeline.invoke("When was Java launched?"))

I don't know


In [19]:
print(rag_pipeline.invoke("What is oops concept?"))

I don't know


In [20]:
print(rag_pipeline.invoke("why Kaggle was so famous?"))

Kaggle is a data science competition platform and online community for data scientists and machine learning practitioners under Google LLC.


In [21]:
print(rag_pipeline.invoke("What is a Kaggle Grandmaster?"))

The highest tier, Kaggle Grandmaster, is awarded to users who have ranked at the top of multiple competitions including high ranking in a solo team.


In [22]:
print(rag_pipeline.invoke("How many Kaggle users are there?"))

15 million


In [23]:
print(rag_pipeline.invoke("how many people using kaggle"))

1 million registered users, and as of October 2023, it has over 15 million users in 194 countries.
