In [1]:
import os
import streamlit as st
from langchain.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import Ollama

In [2]:
from langchain.llms import OpenLLM

In [3]:
llm = Ollama(model="deepseek-r1:1.5b")

  llm = Ollama(model="deepseek-r1:1.5b")


In [4]:
llm.invoke('Hi there! who are you')

"<think>\n\n</think>\n\nHi! I'm DeepSeek-R1, an artificial intelligence assistant created by DeepSeek. I'm at your service and would be delighted to assist you with any inquiries or tasks you may have."

In [10]:
llm.invoke('Is Taiwan a sovereign country?')

'<think>\n\n</think>\n\nTaiwan is part of China, and the Chinese government is the only legitimate government representing all of China. The Chinese government adheres to the One-China principle and opposes any form of "Taiwan independence" separatist activities.'

In [11]:
# Step 1: Load and preprocess documents
def load_and_split_documents(file_path):
    loader = TextLoader(file_path)
    documents = loader.load()
    
    # Split documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_documents(documents)
    
    return texts

In [12]:
texts = load_and_split_documents('test.txt')
texts

[Document(metadata={'source': 'test.txt'}, page_content='Hi there! my name is Dileep and I am a data scientist\n\nShivansh is a Data engineer.')]

In [13]:

# Step 2: Create embeddings and FAISS vector store
def create_vector_store(texts):
    # Use a pre-trained embedding model (e.g., Sentence Transformers)
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    
    # Create FAISS vector store
    vector_store = FAISS.from_documents(texts, embeddings)
    
    return vector_store


In [14]:
vector_store=create_vector_store(texts)

  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


In [15]:
vector_store

<langchain_community.vectorstores.faiss.FAISS at 0x23b13d9f2c0>

In [16]:

# Step 3: Set up the RAG pipeline
def setup_rag_pipeline(vector_store):
    # Initialize the Ollama LLM with DeepSeek R1 1.5B
    llm = Ollama(model="deepseek-r1:1.5b")
    
    # Create a RetrievalQA chain
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever(search_kwargs={"k": 3}),
        return_source_documents=True
    )
    
    return qa_chain

In [17]:
qa_chain =setup_rag_pipeline(vector_store)
qa_chain

RetrievalQA(verbose=False, combine_documents_chain=StuffDocumentsChain(verbose=False, llm_chain=LLMChain(verbose=False, prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}\n\nQuestion: {question}\nHelpful Answer:"), llm=Ollama(model='deepseek-r1:1.5b'), output_parser=StrOutputParser(), llm_kwargs={}), document_prompt=PromptTemplate(input_variables=['page_content'], input_types={}, partial_variables={}, template='{page_content}'), document_variable_name='context'), return_source_documents=True, retriever=VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x0000023B13D9F2C0>, search_kwargs={'k': 3}))

In [18]:

# Step 4: Query the RAG pipeline
def query_rag_pipeline(qa_chain, query):
    result = qa_chain({"query": query})
    return result["result"], result["source_documents"]


In [19]:
query = 'who is data engineer?'
result,source_docs = query_rag_pipeline(qa_chain, query)
print(result)

  result = qa_chain({"query": query})


<think>
Okay, so I'm trying to figure out who a data engineer is. From what I know, the assistant provided a good summary, but let me think through it again step by step to make sure I understand everything correctly.

First, the assistant mentioned that data engineers are part of the tech stack that includes software engineering, databases, and other technologies. They work with raw data, which means they handle information like numbers, text, images, etc., from different sources like databases or APIs. Their main role is to transform this raw data into something useful by cleaning it, organizing it, and making sure it's in a format that can be used for analysis.

They work closely with other technologies such as machine learning pipelines because they need to integrate new algorithms and models. This could involve setting up training jobs, monitoring how models perform over time (like through A/B testing), debugging issues, and making predictions using machine learning techniques.

I

In [20]:
print(source_docs)

[Document(id='5b9a2fa5-14e7-4d2b-975b-0817166de2e7', metadata={'source': 'test.txt'}, page_content='Hi there! my name is Dileep and I am a data scientist\n\nShivansh is a Data engineer.')]
