In [None]:
# Install necessary libraries
!pip install langchain langchain-community qdrant-client
!pip install unstructured
!pip install torch torchvision torchaudio
!pip install transformers
!pip install pypdf2
!pip install sentence_transformers
!pip install PyPDF2
!pip install groq

Collecting langchain
  Downloading langchain-0.3.2-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.1-py3-none-any.whl.metadata (2.8 kB)
Collecting qdrant-client
  Downloading qdrant_client-1.11.3-py3-none-any.whl.metadata (10 kB)
Collecting langchain-core<0.4.0,>=0.3.8 (from langchain)
  Downloading langchain_core-0.3.9-py3-none-any.whl.metadata (6.3 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain)
  Downloading langchain_text_splitters-0.3.0-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.131-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-c

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams

class QdrantManager:
    """Thin wrapper around a ``QdrantClient`` bound to a single collection."""

    def __init__(self, url: str, api_key: str, collection_name: str):
        """Connect to the Qdrant instance at ``url`` and remember the target collection.

        Args:
            url: Base URL of the Qdrant instance.
            api_key: API key passed to ``QdrantClient``.
            collection_name: Name of the collection this manager operates on.
        """
        self.client = QdrantClient(url=url, api_key=api_key)
        self.collection_name = collection_name

    @staticmethod
    def _resolve_distance(distance):
        """Map a metric name such as ``"Cosine"`` (any casing) to a ``Distance`` member."""
        if isinstance(distance, Distance):
            return distance
        wanted = str(distance).strip().lower()
        for member in Distance:
            if wanted in (str(member.value).lower(), member.name.lower()):
                return member
        valid = ", ".join(str(member.value) for member in Distance)
        raise ValueError(f"Unknown distance metric '{distance}'. Expected one of: {valid}.")

    def _collection_exists(self) -> bool:
        """Return True when the managed collection is listed by the server."""
        listed = self.client.get_collections().collections
        return any(entry.name == self.collection_name for entry in listed)

    def create_collection(self, vector_size: int = 384, distance: str = "Cosine", force_recreate: bool = False):
        """Create the collection if it is missing, optionally dropping it first.

        Args:
            vector_size: Dimensionality of the vectors the collection will hold.
            distance: Name of the distance metric (e.g. ``"Cosine"``); matched
                case-insensitively against the ``Distance`` enum.
            force_recreate: When True, delete an existing collection before
                creating it again.

        Returns:
            ``None`` when a new collection was created; otherwise the vector size
            of the collection that already exists.

        Raises:
            ValueError: If ``distance`` does not name a known metric.
        """
        metric = self._resolve_distance(distance)

        if force_recreate and self._collection_exists():
            try:
                self.client.delete_collection(self.collection_name)
                print(f"🗑️ Deleted existing collection '{self.collection_name}'")
            except Exception as exc:
                # Deletion stays best-effort, but the failure is reported instead of
                # being swallowed; the existence check below decides what happens next.
                print(f"⚠️ Could not delete collection '{self.collection_name}': {exc}")

        if not self._collection_exists():
            self.client.create_collection(
                collection_name=self.collection_name,
                vectors_config=VectorParams(size=vector_size, distance=metric),
            )
            print(f"✅ Collection '{self.collection_name}' created with {vector_size} dimensions.")
            return None

        collection_info = self.client.get_collection(self.collection_name)
        # NOTE(review): assumes a single unnamed vector config; a collection with
        # named vectors would not expose `.size` here — confirm if that case matters.
        existing_vector_size = collection_info.config.params.vectors.size
        print(f"ℹ️ Collection '{self.collection_name}' already exists with {existing_vector_size} dimensions.")
        return existing_vector_size

# Initialize QdrantManager with your credentials.
# The API key is never stored in the notebook: it is read from the QDRANT_API_KEY
# environment variable, or prompted for (without echo) when the variable is unset.
# NOTE: the key that was previously committed in this cell should be treated as
# compromised and rotated in the Qdrant Cloud console.
import os
from getpass import getpass

qdrant_url = os.environ.get(
    "QDRANT_URL",
    "https://b7c77350-c945-4fbd-b00c-f2c243b8aa79.europe-west3-0.gcp.cloud.qdrant.io:6333",
)
qdrant_api_key = os.environ.get("QDRANT_API_KEY") or getpass("Qdrant API key: ")
collection_name = "document_db"

qdrant_manager = QdrantManager(url=qdrant_url, api_key=qdrant_api_key, collection_name=collection_name)
qdrant_manager.create_collection()

ℹ️ Collection 'document_db' already exists with 384 dimensions.


384

In [None]:
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import Qdrant
import os
class EmbeddingsManager:
    """Extract text from a PDF, embed its chunks and store the vectors in Qdrant."""

    def __init__(
        self,
        model_name: str = "BAAI/bge-small-en",
        device: str = "cpu",
        encode_kwargs: dict = None,
        qdrant_url: str = None,
        qdrant_api_key: str = None,
        collection_name: str = "document_db",
    ):
        """Load the embedding model and prepare the Qdrant collection manager.

        Args:
            model_name: Model identifier passed to ``HuggingFaceBgeEmbeddings``.
            device: Device string forwarded as ``model_kwargs["device"]``.
            encode_kwargs: Keyword arguments forwarded to the embedding model's
                encoder. Defaults to ``{"normalize_embeddings": True}``.
            qdrant_url: URL of the Qdrant instance.
            qdrant_api_key: API key for the Qdrant instance.
            collection_name: Collection the embeddings are written to.
        """
        self.model_name = model_name
        self.device = device
        # A fresh dict per instance: a dict literal in the signature would be
        # shared (and mutable) across every instance that relies on the default.
        self.encode_kwargs = (
            {"normalize_embeddings": True} if encode_kwargs is None else dict(encode_kwargs)
        )
        self.qdrant_url = qdrant_url
        self.qdrant_api_key = qdrant_api_key
        self.collection_name = collection_name

        self.embeddings = HuggingFaceBgeEmbeddings(
            model_name=self.model_name,
            model_kwargs={"device": self.device},
            encode_kwargs=self.encode_kwargs,
        )

        # Initialize QdrantManager
        self.qdrant_manager = QdrantManager(url=qdrant_url, api_key=qdrant_api_key, collection_name=collection_name)

    def _load_chunks(self, pdf_path: str) -> list:
        """Read every page of the PDF and split the combined text into overlapping chunks."""
        page_texts = [page.extract_text() for page in PdfReader(pdf_path).pages]
        # Pages without extractable text yield an empty/None value and are skipped.
        text = "".join(page_text + "\n" for page_text in page_texts if page_text)
        if not text:
            raise ValueError("❌ No text found in the PDF.")

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
        splits = text_splitter.split_text(text)
        if not splits:
            raise ValueError("❌ No text chunks were created from the document.")
        return splits

    def create_embeddings(self, pdf_path: str, force_recreate: bool = True):
        """Embed the PDF at ``pdf_path`` and store the chunks in the Qdrant collection.

        Args:
            pdf_path: Path of the PDF file to index.
            force_recreate: When True, the existing collection is deleted and
                recreated before the new vectors are written.

        Returns:
            A human-readable success message including the embedding dimension.

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
            ValueError: If the PDF yields no text or no chunks, or if an existing
                collection's dimension differs from the model's and
                ``force_recreate`` is False.
            ConnectionError: If writing the vectors to Qdrant fails.
        """
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"The file {pdf_path} does not exist.")

        # Extract and chunk first so that an unreadable or empty PDF fails
        # before the (possibly destructive) collection reset below.
        splits = self._load_chunks(pdf_path)

        # Probe the model once to learn the vector size it produces.
        embedding_dim = len(self.embeddings.embed_query("test"))

        # Create or verify collection with correct dimensions
        existing_dim = self.qdrant_manager.create_collection(vector_size=embedding_dim, force_recreate=force_recreate)

        if existing_dim and existing_dim != embedding_dim and not force_recreate:
            raise ValueError(f"Existing collection has {existing_dim} dimensions, but model produces {embedding_dim} dimensions. Set force_recreate=True to recreate the collection.")

        # Create and store embeddings in Qdrant
        try:
            Qdrant.from_texts(
                splits,
                self.embeddings,
                url=self.qdrant_url,
                api_key=self.qdrant_api_key,
                prefer_grpc=False,
                collection_name=self.collection_name,
            )
        except Exception as e:
            # Keep the exception type callers already see, but preserve the cause.
            raise ConnectionError(f"❌ Failed to connect to Qdrant: {e}") from e

        return f"✅ Vector DB Successfully Created and Stored in Qdrant with {embedding_dim} dimensions!"

In [None]:
# Build the embeddings manager from the Qdrant settings defined in the earlier cell
# (model, device and encode options are left at their defaults).
embeddings_manager = EmbeddingsManager(
    collection_name=collection_name,
    qdrant_api_key=qdrant_api_key,
    qdrant_url=qdrant_url,
)

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
from google.colab import files
import tempfile

# Upload PDF
uploaded = files.upload()
if not uploaded:
    # Without this guard the cell would fail later with an unrelated NameError.
    raise ValueError("No file was uploaded.")

# Only the first uploaded file is used
pdf_filename = next(iter(uploaded))

# Save the uploaded PDF to a temporary file. Writing through the handle returned by
# NamedTemporaryFile (and closing it via `with`) avoids leaving an open descriptor
# behind; delete=False keeps the file on disk so later cells can read temp_pdf.name.
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
    temp_pdf.write(uploaded[pdf_filename])

print(f"📄 Uploaded and saved to {temp_pdf.name}")

Saving Law File.PDF to Law File.PDF
📄 Uploaded and saved to /tmp/tmpzcdwvaq4.pdf


In [None]:
# Index the uploaded PDF: the collection is dropped and rebuilt (force_recreate=True).
# Any failure is reported as a one-line message rather than a traceback.
try:
    result = embeddings_manager.create_embeddings(temp_pdf.name, force_recreate=True)
except Exception as e:
    print(e)
else:
    print(result)

🗑️ Deleted existing collection 'document_db'
✅ Collection 'document_db' created with 384 dimensions.
✅ Vector DB Successfully Created and Stored in Qdrant with 384 dimensions!


In [None]:
from groq import Groq

# Initialize Groq client with the API key from the GROQ_API_KEY environment variable,
# prompting for it (without echo) when the variable is not set.
# NOTE: the key that was previously committed in this cell should be treated as
# compromised and revoked in the Groq console.
import os
from getpass import getpass

client = Groq(
    api_key=os.environ.get("GROQ_API_KEY") or getpass("Groq API key: "),
)

def query_llama_via_groq(context: str, question: str, model: str = "llama-3.2-90b-text-preview") -> str:
    """Ask the Groq-hosted chat model to answer ``question`` using ``context``.

    Args:
        context: Retrieved document text the answer should be based on.
        question: The user's question.
        model: Groq model identifier to query. Defaults to the model this
            notebook was written against; pass another id to switch models
            without editing the function.

    Returns:
        The model's answer with surrounding whitespace removed, or a message
        starting with "⚠️" describing the error if the request fails.
    """
    prompt = f"""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer. Always be creative and elaborate to answer based on context.

Context: {context}
Question: {question}

Only return the helpful answer. Answer must be detailed and well explained.
Helpful answer:
"""

    try:
        # `client` is the module-level Groq client created in the cell above.
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            model=model,
        )
        return chat_completion.choices[0].message.content.strip()
    except Exception as e:
        # Errors are returned as text so the chat loop can display them.
        return f"⚠️ An error occurred while querying Llama 3.2: {e}"


In [None]:
from langchain.chains import RetrievalQA
from langchain import PromptTemplate
from langchain_community.vectorstores import Qdrant as LangChainQdrant
from groq import Groq

class ChatbotManager:
    """Answer questions by retrieving chunks from Qdrant and passing them to the LLM."""

    def __init__(
        self,
        qdrant_url: str,
        qdrant_api_key: str,
        collection_name: str,
        embeddings,  # Add embeddings parameter
        top_k: int = 3,
    ):
        """Connect to Qdrant and build a retriever over ``collection_name``.

        Args:
            qdrant_url: URL of the Qdrant instance.
            qdrant_api_key: API key for the Qdrant instance.
            collection_name: Collection to search.
            embeddings: Embedding model used to embed queries (the notebook passes
                the same model that was used to index the document).
            top_k: Number of chunks to retrieve per query.
        """
        self.qdrant_url = qdrant_url
        self.qdrant_api_key = qdrant_api_key
        self.collection_name = collection_name
        self.top_k = top_k

        # Initialize Qdrant client
        self.client = QdrantClient(
            url=self.qdrant_url,
            api_key=self.qdrant_api_key,
            prefer_grpc=False
        )

        # Initialize the vector store with embeddings
        self.db = LangChainQdrant(
            client=self.client,
            collection_name=self.collection_name,
            embeddings=embeddings  # Pass embeddings here
        )

        # Initialize the retriever
        self.retriever = self.db.as_retriever(search_kwargs={"k": self.top_k})

    def _retrieve_context(self, query: str) -> str:
        """Return the retrieved chunks joined by newlines; retrieval errors propagate."""
        # `invoke` is the supported retriever entry point; `get_relevant_documents`
        # is deprecated in current LangChain releases.
        documents = self.retriever.invoke(query)
        return "\n".join(doc.page_content for doc in documents)

    def get_relevant_context(self, query: str) -> str:
        """Return the text of the top-k chunks for ``query``.

        Returns:
            The chunk texts joined by newlines, or a message starting with "⚠️"
            if retrieval raised an exception.
        """
        try:
            return self._retrieve_context(query)
        except Exception as e:
            return f"⚠️ An error occurred during retrieval: {e}"

    def get_response(self, query: str) -> str:
        """Answer ``query`` from the indexed document.

        Returns:
            The LLM's answer, or a "⚠️" message when retrieval fails or no
            context is found.
        """
        # Failure is detected from the exception itself rather than by inspecting
        # the text, so a chunk that happens to start with "⚠️" is still answered.
        try:
            context = self._retrieve_context(query)
        except Exception as e:
            return f"⚠️ An error occurred during retrieval: {e}"

        if not context.strip():
            return "⚠️ No relevant context found for your query."

        return query_llama_via_groq(context, query)


In [None]:
import os

# Wire the chatbot to the collection that was just indexed, reusing the embedding
# model held by embeddings_manager; each question retrieves the 3 closest chunks.
chatbot_manager = ChatbotManager(
    collection_name=collection_name,
    embeddings=embeddings_manager.embeddings,
    qdrant_api_key=qdrant_api_key,
    qdrant_url=qdrant_url,
    top_k=3,
)

def chat_with_document():
    """Run an interactive question/answer loop against the indexed document.

    Reads questions from standard input and prints the answer produced by the
    module-level ``chatbot_manager``. The loop ends when the user types
    ``exit`` or ``quit`` (any casing, surrounding whitespace ignored), or when
    input is closed or interrupted.
    """
    print("🤖 Welcome to DocumentBuddy! Ask me anything about your uploaded document.")
    print("Type 'exit' to end the chat.\n")

    while True:
        try:
            user_input = input("📝 You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted kernel ends the chat cleanly.
            print("\n👋 Goodbye!")
            break

        if user_input.lower() in ('exit', 'quit'):
            print("👋 Goodbye!")
            break

        if not user_input:
            # Nothing to ask; avoid a pointless retrieval and LLM call.
            continue

        response = chatbot_manager.get_response(user_input)
        print(f"🤖 Bot: {response}\n")

# Start the chat. This call blocks the cell, prompting for input until the user
# types 'exit' or 'quit'.
chat_with_document()


  self.db = LangChainQdrant(
