In [None]:
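# Install once if needed (sentence-transformers backs HuggingFaceEmbeddings; beautifulsoup4 backs WebBaseLoader):
# %pip install langchain_community langchainhub chromadb langchain langchain-openai transformers sentence-transformers beautifulsoup4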

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain_community.llms import HuggingFacePipeline
from transformers import pipeline
from langchain import hub
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import PromptTemplate



Device set to use cuda:0


In [None]:
# Fetch the Kaggle Wikipedia article; it is the only data source loaded in this notebook.
loader = WebBaseLoader(
    web_paths=["https://en.wikipedia.org/wiki/Kaggle"],
)
documents = loader.load()

In [None]:
# Split the loaded page with a recursive splitter (chunk_size=500, chunk_overlap=50)
# so that text cut at a chunk boundary is repeated at the start of the next chunk.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
text_chunks = text_splitter.split_documents(documents)


In [None]:
# Embed every chunk with the all-MiniLM-L6-v2 sentence-transformers model and index
# the resulting vectors in a Chroma store.
# NOTE(review): no collection name or persist directory is passed here, so re-running
# this cell in a live kernel may index the same chunks a second time — confirm, or
# restart the kernel before re-indexing.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)
vectorstore = Chroma.from_documents(
    documents=text_chunks,
    embedding=embeddings,
)


In [None]:

# Expose the vector store as a retriever; k=3 asks for three chunks per query.
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 3},
)


In [None]:
# Grounding prompt: tells the model to answer from the supplied context only and to
# reply with a fixed fallback string instead of drawing on its pretrained knowledge.
template = """
You are a question answering assistant.

ONLY use the information provided in the context to answer the question.
If the context does not contain the answer, reply exactly: "I don't know".


Context:
{context}

Question: {question}

Answer:
"""

# The {context} and {question} placeholders are picked up from the template text.
prompt = PromptTemplate.from_template(template)


In [None]:

# Local generator: google/flan-t5-base behind a Hugging Face text2text-generation
# pipeline (max_length=512, truncation enabled), wrapped so LangChain can call it.
generator = pipeline(
    task="text2text-generation",
    model="google/flan-t5-base",
    max_length=512,
    truncation=True,
)
llm = HuggingFacePipeline(pipeline=generator)


def format_docs(docs):
    """Concatenate the text of the given documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values joined by a single newline, in the order
        given; an empty string when ``docs`` is empty.
    """
    chunk_texts = [doc.page_content for doc in docs]
    return "\n".join(chunk_texts)


# RAG chain: the incoming question goes to the retriever, whose hits are flattened by
# format_docs, and is also passed through unchanged. Both values fill the prompt, the
# LLM generates from it, and the parser returns the result as a plain string.
context_chain = retriever | format_docs
chain_inputs = {
    "context": context_chain,
    "question": RunnablePassthrough(),
}
rag_pipeline = chain_inputs | prompt | llm | StrOutputParser()

In [2]:
# In-scope question about the loaded article.
answer = rag_pipeline.invoke("What is the use of Kaggle")
print(answer)

Kaggle enables users to find and publish datasets, explore and build models in a web-based data science environment, work with other data scientists and machine learning engineers, and enter competitions to solve data science challenges.


In [3]:
# Off-topic probe. NOTE: the recorded output is a sentence about running Python/R code
# on Kaggle rather than the fallback reply, so the prompt did not reject this question.
answer = rag_pipeline.invoke("what is python")
print(answer)

User can write and execute code in Python or R, import datasets, use popular libraries, and train models on CPUs, GPUs, or TPUs directly in the cloud.


In [4]:
# Off-topic probe; the recorded output is the fallback reply.
answer = rag_pipeline.invoke("how to transfer money")
print(answer)

I don't know


In [6]:
# In-scope factual lookup: the launch date of Kaggle.
answer = rag_pipeline.invoke("When was Kaggle launched?")
print(answer)

April 2010


In [7]:
# Off-topic variant of the launch-date question; the recorded output is the fallback reply.
answer = rag_pipeline.invoke("When was Java launched?")
print(answer)

I don't know


In [8]:
# Off-topic probe. NOTE: the recorded reply ends with a period, so it is not
# byte-identical to the fallback string requested in the prompt.
answer = rag_pipeline.invoke("What is oops concept?")
print(answer)

I don't know.


In [9]:
# In-scope but loosely worded question; the recorded output is the same summary that
# was returned for "What is the use of Kaggle".
answer = rag_pipeline.invoke("why Kaggle was so famous?")
print(answer)

Kaggle enables users to find and publish datasets, explore and build models in a web-based data science environment, work with other data scientists and machine learning engineers, and enter competitions to solve data science challenges.
