In [1]:
%%capture --no-stderr
%pip install numpy
%pip install torch
%pip install transformers
%pip install scipy
%pip install pandas langchain langchain-community langchain-chroma langchain-huggingface
%pip install accelerate

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Hangging Face Token for downloading meta Llama 2 7b model
# HF_TOKEN = input("Enter your Hugging Face token: ")

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_huggingface import HuggingFacePipeline
import torch
from langchain_chroma import Chroma
from langchain.text_splitter import TokenTextSplitter
from langchain.schema import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from pathlib import Path
import re
import spacy

In [5]:
%pwd

'c:\\Users\\HP\\Desktop\\Rag_Based_Chatbot_Project\\research'

In [6]:
import os
os.chdir('../')
%pwd

'c:\\Users\\HP\\Desktop\\Rag_Based_Chatbot_Project'

In [7]:
import json

with open('.\data\project_1_publications.json', 'r') as file:
    data = json.load(file)

In [None]:
len(data)

35

In [None]:
print(data[0].keys())

dict_keys(['id', 'username', 'license', 'title', 'publication_description'])


In [None]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     - ------------------------------------- 0.5/12.8 MB 541.6 kB/s eta 0:00:23
     - ------------------------------------- 0.5/12.8 MB 541.6 kB/s eta 0:00:23
     - ------------------------------------- 0.5/12.8 MB 541.6 kB/s eta 0:00:23
     -- ------------------------------------ 0.8/12.8 MB 453.5 kB/s eta 0:00:27
     -- ------------------------------------ 0.8/12.8 MB 453.5 kB/s eta 0:00:27
     -- ------------------------------------ 0.8/12.8 MB 453.5 k

In [8]:
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    if not isinstance(text, str):
        return ""

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove Markdown images and HTML images
    text = re.sub(r'!\[.*?\]\(.*?\)', '', text)  # Markdown-style ![alt](url)
    text = re.sub(r'<img[^>]*>', '', text)       # HTML-style <img ... >

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)

    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)

    # Remove hashtags
    text = re.sub(r'#\w+', '', text)

    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # NLP: lowercase, remove stopwords, lemmatize
    doc = nlp(text.lower())
    cleaned_tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]

    return ' '.join(cleaned_tokens)


In [9]:
cleaned_data = []

for item in data:
    cleaned_entry = {
        "id": item.get("id"),
        "title": preprocess_text(item.get("title", "")),
        "content": preprocess_text(item.get("publication_description", "")),
        "username": preprocess_text(item.get("username", ""))
    }
    cleaned_data.append(cleaned_entry)

In [10]:
cleaned_data[0].keys()

dict_keys(['id', 'title', 'content', 'username'])

In [11]:
documents_with_metadata = [
    Document(page_content=item["content"], metadata={
        "id": item["id"],
        "title": item["title"],
        "username": item["username"]
    })
    for item in cleaned_data
]

In [12]:
len(documents_with_metadata)

35

In [13]:
token_splitter = TokenTextSplitter(
    encoding_name="cl100k_base",
    chunk_size=200,
    chunk_overlap=30
)

chunked_documents = []

for doc in documents_with_metadata:
    chunks = token_splitter.split_text(doc.page_content)
    for i, chunk in enumerate(chunks, 1):
        chunked_doc = {
            "id": doc.metadata.get("id", None),
            "title": doc.metadata.get("title", None),
            "username": doc.metadata.get("username", None),
            "chunk_number": i,
            "content": chunk
        }
        chunked_documents.append(chunked_doc)

# Output first two chunks
for item in chunked_documents[:2]:
    print(item)

{'id': '0CBAR8U8FakE', 'title': 'add memory rag application ai agent', 'username': '3rdson', 'chunk_number': 1, 'content': '5 month build rag application build rag application realise need add memory move production go youtube search video not find meaningful see video video not add persistent memory production ready rag application talk add memory storage rag application unsuitable scale application realize need figure thing write good article guide reader thought process step need add memory rag application ai agent quick note build streamlit follow tutorial find easy way add memory streamlit app pre requisite 1 jump discussion want believe know rag need unfamiliar concept read 2 want believe know build rag application want learn build rag application follow previous article 3 tutorial mongodb traditional database langchain llm framework openai gpt 3.5turbo llm use technology choice understand workflow 4 follow ` pip install ` library ` ` ` openai python dotenv langchain openai pymon

In [14]:
type(chunked_documents)

list

In [15]:
langchain_chunks = [
    Document(page_content=doc["content"], metadata={
        "id": doc["id"],
        "title": doc["title"],
        "username": doc["username"],
        "chunk_number": doc["chunk_number"]
    })
    for doc in chunked_documents
]

In [16]:
len(langchain_chunks)

558

In [17]:
device = "cuda" if torch.cuda.is_available() else "cpu"

embedding_model = HuggingFaceEmbeddings(
    model_name="nomic-ai/nomic-embed-text-v1",
    model_kwargs={"trust_remote_code": True,
                 "device": device}
)

persist_directory = "vectordb"

vectorstore = Chroma.from_documents(
    documents=langchain_chunks,
    embedding=embedding_model,
    persist_directory=persist_directory
)

<All keys matched successfully>


In [19]:
from langchain_groq import ChatGroq

api_key = "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"

llm = ChatGroq(
            model="llama3-8b-8192",
            temperature=0.8,
            groq_api_key=api_key
        )

In [20]:
retrieve = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 2})

In [21]:
retrieve.invoke("rag application")

[Document(id='2e1ddf70-5b70-4fc3-94c3-140273fc6972', metadata={'title': 'add memory rag application ai agent', 'chunk_number': 1, 'id': '0CBAR8U8FakE', 'username': '3rdson'}, page_content='5 month build rag application build rag application realise need add memory move production go youtube search video not find meaningful see video video not add persistent memory production ready rag application talk add memory storage rag application unsuitable scale application realize need figure thing write good article guide reader thought process step need add memory rag application ai agent quick note build streamlit follow tutorial find easy way add memory streamlit app pre requisite 1 jump discussion want believe know rag need unfamiliar concept read 2 want believe know build rag application want learn build rag application follow previous article 3 tutorial mongodb traditional database langchain llm framework openai gpt 3.5turbo llm use technology choice understand workflow 4 follow ` pip in

In [22]:
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retrieve,
    memory=memory,
    return_source_documents=False
)

  memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)


In [23]:
def new_session():
    new_memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    qa_chain.memory = new_memory
    print("\n🔄 New context started!")

# Chat loop
while True:
    query = input("You: ")
    if query.lower() == "reset":
        new_session()
        continue
    elif query.lower() in ["exit", "quit"]:
        break
    response = qa_chain.invoke({"question": query})
    print("Bot:", response["answer"])

Bot: I apologize, but it seems like there is no question in your prompt. Please rephrase or provide a question related to the given context, and I'll do my best to assist you.
Bot: I don't know the answer to this question. The provided context appears to be a personal note or journal entry about building a RAG application, but it doesn't explicitly define what "RAG application memory" refers to. Without more context or information, I'm unable to provide a meaningful answer.
Bot: Based on the provided context, here is the step-by-step guide to build a RAG (Read-Answer-Generate) application:

**Step 1: Reading process**

* Store datum (text) in a vector database
* Read files such as PDFs, TXT, CSV, DOC, etc. in text format
* Help work or divide text into chunks
* Feed these chunks into an embed model

**Step 2: Answer query information vector database**

* Store the embedded chunks in a vector database
* Choose an embed model for storing vector database
* Upsert (insert or update) the em