In [8]:
# Cell 1: Install ALL required libraries (including cryptography)
#
# Each requirement is installed with the kernel's own interpreter
# (`sys.executable -m pip`), so packages land in the environment this notebook
# is running in. A failing package does not stop the loop; failures are
# collected and reported at the end instead of being followed by an
# unconditional success message.
#
# NOTE(review): a package that was already imported in this kernel before being
# installed/upgraded here is presumably only picked up after a kernel restart.
import subprocess
import sys

packages = [
    "langchain-community",
    "faiss-cpu",
    "sentence-transformers",
    "python-dotenv",
    "tiktoken",
    "langchain-groq",
    "groq",
    "pypdf",
    "langchain",
    "langchain-text-splitters",
    "langchain-core",
    "cryptography>=3.1",
    "pikepdf",
    "unstructured[pdf]",
    "pdf2image",
    "pdfplumber",
    "pillow",  # Required for image processing
    "pytesseract",  # For OCR support
]

print("Installing required packages...")
failed_packages = []  # requirement strings whose `pip install` exited non-zero
for package in packages:
    print(f"  Installing {package}...", end=" ")
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", package])
    except subprocess.CalledProcessError as e:
        # Best effort: remember the failure and keep going with the next package.
        failed_packages.append(package)
        print(f"‚ö†Ô∏è (will continue) [pip exit code {e.returncode}]")
    else:
        print("‚úÖ")

if failed_packages:
    print(f"\n‚ö†Ô∏è Installation finished with failures: {', '.join(failed_packages)}")
else:
    print("\n‚úÖ Installation complete!")

Installing required packages...
  Installing langchain-community... ‚úÖ
  Installing faiss-cpu... ‚úÖ
  Installing sentence-transformers... ‚úÖ
  Installing python-dotenv... ‚úÖ
  Installing tiktoken... ‚úÖ
  Installing langchain-groq... ‚úÖ
  Installing groq... ‚úÖ
  Installing pypdf... ‚úÖ
  Installing langchain... ‚úÖ
  Installing langchain-text-splitters... ‚úÖ
  Installing langchain-core... ‚úÖ
  Installing cryptography>=3.1... ‚úÖ
  Installing pikepdf... ‚úÖ
  Installing unstructured[pdf]... ‚úÖ
  Installing pdf2image... ‚úÖ
  Installing pdfplumber... ‚úÖ
  Installing pillow... ‚úÖ
  Installing pytesseract... ‚úÖ

‚úÖ Installation complete!


In [9]:
# Verify installations
#
# Tries to import every module the notebook depends on and, for those that
# cannot be imported, attempts one more `pip install` with the kernel's
# interpreter. The cell imports everything it uses itself so it also works when
# run on a fresh kernel without the install cell.
import importlib
import subprocess
import sys

print("Verifying package installations...")

# Maps importable module name -> pip distribution name (they often differ).
packages_to_check = {
    "langchain_community": "langchain-community",
    "langchain_groq": "langchain-groq",
    "faiss": "faiss-cpu",
    "sentence_transformers": "sentence-transformers",
    "dotenv": "python-dotenv",
    "tiktoken": "tiktoken",
    "groq": "groq",
    "pypdf": "pypdf",
    "langchain": "langchain",
    "langchain_text_splitters": "langchain-text-splitters",
    "langchain_core": "langchain-core",
    "cryptography": "cryptography",
    "pikepdf": "pikepdf",
    "unstructured": "unstructured",
    "pdf2image": "pdf2image",
    "pdfplumber": "pdfplumber",
    "PIL": "pillow",
    "pytesseract": "pytesseract",
}

missing_packages = []
for module_name, package_name in packages_to_check.items():
    try:
        importlib.import_module(module_name)
    except ImportError:
        print(f"  ‚ùå {package_name} - MISSING")
        missing_packages.append(package_name)
    else:
        print(f"  ‚úÖ {package_name}")

if missing_packages:
    print(f"\n‚ö†Ô∏è Missing packages: {', '.join(missing_packages)}")
    print("Installing missing packages...")
    for package in missing_packages:
        try:
            subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", package])
        except (subprocess.CalledProcessError, OSError):
            # Only installation failures are handled here; KeyboardInterrupt and
            # programming errors are no longer swallowed by a bare `except`.
            print(f"  ‚ùå Failed to install {package}")
        else:
            print(f"  ‚úÖ Installed {package}")
else:
    print("\n‚úÖ All packages installed successfully!")

Verifying package installations...
  ‚úÖ langchain-groq
  ‚úÖ faiss-cpu
  ‚úÖ sentence-transformers
  ‚úÖ python-dotenv
  ‚úÖ tiktoken
  ‚úÖ groq
  ‚úÖ pypdf
  ‚úÖ langchain
  ‚úÖ langchain-text-splitters
  ‚úÖ langchain-core
  ‚úÖ cryptography
  ‚úÖ pikepdf
  ‚úÖ unstructured
  ‚úÖ pdf2image
  ‚úÖ pdfplumber
  ‚úÖ pillow

‚úÖ All packages installed successfully!


In [10]:
# Cell 2: Import available modules
# Standard-library / dotenv imports plus one-time session setup. The order of
# the statements below matters: the warnings filter and the .env loading run
# before the LangChain imports.
import os
import sys
from dotenv import load_dotenv, find_dotenv
import warnings
# Ignore every warning from here on; because this runs before the imports
# further down, warnings emitted while importing them are suppressed as well.
warnings.filterwarnings('ignore')

# Load environment variables from the .env file located by find_dotenv()
# (the notebook later reads GROQ_API_KEY through os.getenv).
load_dotenv(find_dotenv())

# Import available modules: chat model, vector store, embeddings, PDF loaders
# and the text splitter used by the following cells.
from langchain_groq import ChatGroq
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

print("‚úÖ Imported available modules")

‚úÖ Imported available modules


In [11]:
# Cell 3: Create custom implementations for missing modules
print("Creating custom implementations for missing modules...")

# Custom ConversationBufferMemory
class ConversationBufferMemory:
    """Minimal in-notebook conversation memory.

    Stores each exchange as ``{"question": ..., "answer": ...}`` in
    ``self.buffer`` and exposes the list under ``memory_key``.

    Args:
        memory_key: Key under which ``load_memory_variables`` returns the history.
        return_messages: Stored for interface compatibility; not used by this class.
        output_key: Key looked up in ``outputs`` by ``save_context`` to find the
            answer. ``"answer"`` is still accepted as a fallback.
    """

    def __init__(self, memory_key="chat_history", return_messages=True, output_key="answer"):
        self.memory_key = memory_key
        self.return_messages = return_messages
        self.output_key = output_key
        # Throwaway object that only carries a `messages` list attribute.
        self.chat_memory = type('obj', (object,), {'messages': []})()
        self.buffer = []

    def save_context(self, inputs, outputs):
        """Append one question/answer exchange to the buffer.

        The exchange is skipped (nothing is stored, nothing is raised) when
        ``inputs`` has no ``"question"`` or ``outputs`` has neither
        ``self.output_key`` nor ``"answer"``.
        """
        if "question" not in inputs:
            return

        # Honour the configured output key; previously only "answer" was read,
        # so a custom output_key silently dropped every exchange.
        if self.output_key in outputs:
            answer = outputs[self.output_key]
        elif "answer" in outputs:
            answer = outputs["answer"]
        else:
            return

        # Always stored under "answer": that is the key the chain reads back.
        self.buffer.append({
            "question": inputs["question"],
            "answer": answer
        })

    def clear(self):
        """Forget all stored exchanges and reset ``chat_memory.messages``."""
        self.buffer = []
        self.chat_memory.messages = []

    def load_memory_variables(self, inputs):
        """Return ``{memory_key: history}``; ``inputs`` is accepted but unused.

        The history is a shallow copy, so callers that slice or mutate the
        returned list cannot corrupt the stored buffer.
        """
        return {self.memory_key: list(self.buffer)}

# Custom ConversationalRetrievalChain
class ConversationalRetrievalChain:
    """Small retrieval-augmented QA chain with conversation memory.

    Calling the chain with ``{"question": ...}`` retrieves documents, builds a
    prompt from recent history plus document excerpts, asks the LLM, stores the
    exchange in memory and returns ``{"answer", "source_documents"}``.

    Instances are meant to be built with :meth:`from_llm`.
    """

    # Prompt-size limits (class attributes so they can be tuned per instance).
    HISTORY_TURNS = 3      # how many most-recent exchanges go into the prompt
    MAX_CONTEXT_DOCS = 3   # how many retrieved documents go into the prompt
    MAX_DOC_CHARS = 500    # characters kept from each document

    @classmethod
    def from_llm(cls, llm, retriever, memory, verbose=False, return_source_documents=True, get_chat_history=None):
        """Build a chain.

        Args:
            llm: Object with ``invoke(prompt)``; the result's ``.content`` is used
                when present, otherwise ``str(result)``.
            retriever: Object with ``invoke(query)`` or, failing that,
                ``get_relevant_documents(query)``; must return documents that
                have a ``page_content`` string.
            memory: Object with ``memory_key``, ``load_memory_variables(inputs)``
                and ``save_context(inputs, outputs)``.
            verbose: When true, print debug lines after each call.
            return_source_documents: When false, ``source_documents`` is ``[]``.
            get_chat_history: Stored on the instance (identity function by
                default); ``__call__`` does not apply it.
        """
        instance = cls()
        instance.llm = llm
        instance.retriever = retriever
        instance.memory = memory
        instance.verbose = verbose
        instance.return_source_documents = return_source_documents
        instance.get_chat_history = get_chat_history if get_chat_history else (lambda h: h)
        return instance

    def _retrieve(self, question):
        """Fetch documents, preferring ``invoke`` over the legacy method."""
        retriever = self.retriever
        if hasattr(retriever, "invoke"):
            return retriever.invoke(question)
        # Older retrievers only expose get_relevant_documents.
        return retriever.get_relevant_documents(question)

    def __call__(self, inputs):
        """Answer ``inputs["question"]`` (empty string if the key is missing).

        Returns:
            dict with ``"answer"`` (str) and ``"source_documents"`` (all
            retrieved documents, or ``[]`` when disabled).
        """
        question = inputs.get("question", "")

        # Get chat history from memory
        memory_vars = self.memory.load_memory_variables({})
        chat_history = memory_vars.get(self.memory.memory_key, [])

        # Retrieve relevant documents
        docs = self._retrieve(question)

        # Format chat history: only dict entries are rendered, newest last.
        recent_turns = []
        if chat_history and self.HISTORY_TURNS > 0:
            recent_turns = chat_history[-self.HISTORY_TURNS:]
        history_text = "".join(
            f"User: {item.get('question', '')}\n"
            f"Assistant: {item.get('answer', '')}\n\n"
            for item in recent_turns
            if isinstance(item, dict)
        )

        # Format context from the top documents, truncating each excerpt.
        context = "".join(
            f"[Document {i+1}]\n{doc.page_content[:self.MAX_DOC_CHARS]}\n\n"
            for i, doc in enumerate(docs[:self.MAX_CONTEXT_DOCS])
        )

        # Create prompt
        prompt = f"""Based on the following documents and conversation history, answer the question.

Previous conversation:
{history_text}

Relevant documents:
{context}

Question: {question}

Provide a clear, concise answer based only on the documents above:"""

        # Get response from LLM
        response = self.llm.invoke(prompt)
        answer = response.content if hasattr(response, 'content') else str(response)

        # Save to memory
        self.memory.save_context(
            {"question": question},
            {"answer": answer}
        )

        result = {
            "answer": answer,
            "source_documents": docs if self.return_source_documents else []
        }

        if self.verbose:
            print(f"\n[DEBUG] Question: {question}")
            print(f"[DEBUG] Retrieved {len(docs)} documents")
            print(f"[DEBUG] Answer: {answer[:100]}...")

        return result

print("‚úÖ Created custom implementations:")
print("   - ConversationBufferMemory")
print("   - ConversationalRetrievalChain")

Creating custom implementations for missing modules...
‚úÖ Created custom implementations:
   - ConversationBufferMemory
   - ConversationalRetrievalChain


In [12]:
# Cell 4: Load PDF documents with encryption handling
#
# Every *.pdf in `pdf_directory` is loaded with the first strategy that works:
#   1. PyPDFLoader on the file as-is.
#   2. UnstructuredPDFLoader (mode="elements", strategy="fast").
#   3. pikepdf writes an unencrypted copy into a temporary directory and
#      PyPDFLoader reads that copy. pikepdf itself has no text extraction, so
#      it is only used to open/decrypt the file.
# A file is recorded in `failed_files` when all three strategies fail or when
# the last one yields no text at all.
#
# Result: `documents` - list of loaded Document objects (empty if none loaded).
import os
import tempfile
from pathlib import Path

from langchain_community.document_loaders import PyPDFLoader

pdf_directory = "./pdfs/"  # Using your pdfs folder

print(f"üìÇ Loading PDFs from: {pdf_directory}")

# Check if directory exists
if not os.path.exists(pdf_directory):
    print(f"‚ùå Directory '{pdf_directory}' not found.")
    print(f"Creating directory...")
    os.makedirs(pdf_directory, exist_ok=True)
    print(f"‚úÖ Created '{pdf_directory}'")
    print("Please add your PDF files to this directory and run this cell again.")
    documents = []
else:
    # Collect PDF files; sorted so the processing order (and therefore the
    # document order) is the same on every run and platform.
    pdf_files = sorted(f for f in os.listdir(pdf_directory) if f.lower().endswith('.pdf'))
    print(f"Found {len(pdf_files)} PDF file(s)")

    if pdf_files:
        print("Files:")
        for pdf in pdf_files:
            print(f"  ‚Ä¢ {pdf}")

        print("\nüîç Checking for encrypted PDFs...")

        # Try loading each PDF individually to identify problematic ones
        successful_docs = []
        failed_files = []

        for pdf_file in pdf_files:
            pdf_path = os.path.join(pdf_directory, pdf_file)
            print(f"\nProcessing: {pdf_file}")

            try:
                # Strategy 1: PyPDFLoader
                loader = PyPDFLoader(pdf_path)
                file_docs = loader.load()
                successful_docs.extend(file_docs)
                print(f"  ‚úÖ Successfully loaded ({len(file_docs)} pages)")

            except Exception as e:
                print(f"  ‚ö†Ô∏è PyPDFLoader failed: {str(e)[:100]}...")

                # Strategy 2: alternative PDF reader
                try:
                    print(f"  üîÑ Trying alternative PDF reader...")
                    # Imported lazily: only needed when the primary loader fails,
                    # and an import failure is handled like any other loader error.
                    from langchain_community.document_loaders import UnstructuredPDFLoader
                    loader = UnstructuredPDFLoader(pdf_path, mode="elements", strategy="fast")
                    file_docs = loader.load()
                    successful_docs.extend(file_docs)
                    print(f"  ‚úÖ Successfully loaded with UnstructuredPDFLoader ({len(file_docs)} elements)")

                except Exception as e2:
                    print(f"  ‚ùå Alternative loader also failed: {str(e2)[:100]}...")

                    # Strategy 3: decrypt with pikepdf, then extract with PyPDFLoader
                    try:
                        print(f"  üîÑ Trying pikepdf...")
                        import pikepdf

                        with tempfile.TemporaryDirectory() as tmp_dir:
                            decrypted_path = os.path.join(tmp_dir, pdf_file)
                            # Open (and decrypt, if possible) and write a plain copy.
                            with pikepdf.open(pdf_path) as pdf:
                                print(f"  ‚úÖ PDF opened with pikepdf ({len(pdf.pages)} pages)")
                                pdf.save(decrypted_path)
                            # Must finish loading before the temp directory is removed.
                            file_docs = PyPDFLoader(decrypted_path).load()

                        # Keep only pages that actually produced text.
                        file_docs = [doc for doc in file_docs if doc.page_content.strip()]
                        if not file_docs:
                            raise ValueError("no extractable text found after decryption (scanned PDF?)")

                        # Point the metadata back at the real file, not the temp copy.
                        for doc in file_docs:
                            doc.metadata["source"] = pdf_path

                        successful_docs.extend(file_docs)
                        print(f"  ‚úÖ Extracted text from {len(file_docs)} pages")

                    except Exception as e3:
                        print(f"  ‚ùå All methods failed for {pdf_file}")
                        print(f"  Error details: {str(e3)[:150]}")
                        failed_files.append(pdf_file)

        documents = successful_docs

        print("\n" + "=" * 60)
        print(f"üìä Loading Summary:")
        print(f"   Total PDFs attempted: {len(pdf_files)}")
        print(f"   Successfully loaded: {len(pdf_files) - len(failed_files)}")
        print(f"   Failed to load: {len(failed_files)}")
        # Counts Document objects: pages for PyPDFLoader, elements for Unstructured.
        print(f"   Total pages loaded: {len(documents)}")

        if failed_files:
            print(f"\n‚ùå Failed to load these files:")
            for f in failed_files:
                print(f"   ‚Ä¢ {f}")
            print("\nüí° Solutions:")
            print("   1. Try decrypting the PDF with the password")
            print("   2. Convert PDF to text format")
            print("   3. Use OCR if it's a scanned PDF")

        if documents:
            print(f"\n‚úÖ Successfully loaded {len(documents)} document pages")

            # Show sample
            print(f"\nüìÑ Sample document:")
            print(f"   Source: {os.path.basename(documents[0].metadata.get('source', 'Unknown'))}")
            print(f"   Page: {documents[0].metadata.get('page', 0) + 1}")
            print(f"\nüìù Sample content (first 200 characters):")
            print("-" * 50)
            sample_text = documents[0].page_content[:200]
            if sample_text.strip():
                print(sample_text + "...")
            else:
                print("[Empty or image-based content - may need OCR]")
            print("-" * 50)
    else:
        print("‚ùå No PDF files found in the directory.")
        print(f"Please add PDF files to '{pdf_directory}' and run this cell again.")
        documents = []

üìÇ Loading PDFs from: ./pdfs/
Found 4 PDF file(s)
Files:
  ‚Ä¢ Complete_Guide_YOLO_on_Luckfox_Core1106.pdf
  ‚Ä¢ IR Camera.pdf
  ‚Ä¢ Rockchip RV1106 Datasheet V1.7-20231218.pdf
  ‚Ä¢ YOLOv5_RKNN_Luckfox_Core1106_Guide.pdf

üîç Checking for encrypted PDFs...

Processing: Complete_Guide_YOLO_on_Luckfox_Core1106.pdf
  ‚úÖ Successfully loaded (18 pages)

Processing: IR Camera.pdf
  ‚úÖ Successfully loaded (9 pages)

Processing: Rockchip RV1106 Datasheet V1.7-20231218.pdf
  ‚ö†Ô∏è PyPDFLoader failed: cryptography>=3.1 is required for AES algorithm...
  üîÑ Trying alternative PDF reader...


The PDF <_io.BufferedReader name='./pdfs/Rockchip RV1106 Datasheet V1.7-20231218.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


  ‚úÖ Successfully loaded with UnstructuredPDFLoader (1471 elements)

Processing: YOLOv5_RKNN_Luckfox_Core1106_Guide.pdf
  ‚úÖ Successfully loaded (14 pages)

üìä Loading Summary:
   Total PDFs attempted: 4
   Successfully loaded: 4
   Failed to load: 0
   Total pages loaded: 1512

‚úÖ Successfully loaded 1512 document pages

üìÑ Sample document:
   Source: Complete_Guide_YOLO_on_Luckfox_Core1106.pdf
   Page: 1

üìù Sample content (first 200 characters):
--------------------------------------------------
Complete Guide
Running YOLO Object Detection
on Luckfox Core1106
From Windows 11 Setup to Real-Time Inference
Date: January 15, 2026
Version: 1.0
Platform: Luckfox Core1106 (RV1106 SoC)
Model: YOLOv5n...
--------------------------------------------------


In [13]:
# Cell 5: Split documents into chunks
# Produces `docs`: the chunked documents, or an empty list when nothing was loaded.
if not documents:
    print("‚ö†Ô∏è No documents to split. Please load PDFs in Cell 4 first.")
    docs = []
else:
    # Chunks of up to 1000 characters with a 200-character overlap; separators
    # are tried in order: blank line, newline, space, then single characters.
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", " ", ""],
        length_function=len,
        chunk_overlap=200,
        chunk_size=1000,
    )

    docs = text_splitter.split_documents(documents)
    print(f"‚úÖ Split into {len(docs)} chunks")

    # Preview the first chunk, if the splitter produced any.
    if docs:
        print(f"üìä First chunk size: {len(docs[0].page_content)} characters")
        print(f"üìã First chunk preview:")
        print("-" * 40)
        print(docs[0].page_content[:150] + "...")
        print("-" * 40)

‚úÖ Split into 1542 chunks
üìä First chunk size: 214 characters
üìã First chunk preview:
----------------------------------------
Complete Guide
Running YOLO Object Detection
on Luckfox Core1106
From Windows 11 Setup to Real-Time Inference
Date: January 15, 2026
Version: 1.0
Plat...
----------------------------------------


In [14]:
# Cell 6: Create embeddings and FAISS vector store
# Produces `embeddings` and `vectorstore`; both are None when there are no chunks.
if not docs:
    print("‚ö†Ô∏è No documents to create embeddings. Please run Cells 4-5 first.")
    embeddings = None
    vectorstore = None
else:
    print("Creating embeddings...")

    # Sentence-transformers model used to embed every chunk.
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    print("Creating vector store...")

    # Build the FAISS index from the chunks.
    vectorstore = FAISS.from_documents(docs, embeddings)
    print("‚úÖ Vector store created successfully!")

    # Persist the index to a local folder.
    vectorstore.save_local("faiss_index_pdfs")
    print("üíæ Vector store saved to 'faiss_index_pdfs'")

    print(f"üìä Total vectors in index: {vectorstore.index.ntotal}")

Creating embeddings...


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Creating vector store...
‚úÖ Vector store created successfully!
üíæ Vector store saved to 'faiss_index_pdfs'
üìä Total vectors in index: 1542


In [None]:
# Cell 7: Initialize Groq Llama-3 model
#
# Produces `llm`: a ChatGroq client when a key is available and the test call
# succeeds, otherwise a MockLLM that echoes part of the prompt.
import os
from getpass import getpass


class MockLLM:
    """Offline stand-in for the chat model.

    ``invoke(prompt)`` returns an object whose ``.content`` is ``prefix`` followed
    by the first 50 characters of the prompt and ``...``.
    """

    def __init__(self, prefix="[Mock Response] Based on the documents: "):
        self.prefix = prefix

    def invoke(self, prompt):
        # str() so non-string prompts do not break the slice.
        return type('obj', (object,), {
            'content': f"{self.prefix}{str(prompt)[:50]}..."
        })()


groq_api_key = os.getenv("GROQ_API_KEY")

if not groq_api_key:
    print("‚ö†Ô∏è GROQ_API_KEY not found in .env file")
    print("You can:")
    print("1. Create a .env file with GROQ_API_KEY=your_key")
    print("2. Enter your key below")
    print("3. Visit: https://console.groq.com/keys to get an API key")

    # getpass instead of input(): the key is not echoed into the saved notebook output.
    groq_api_key = getpass("Enter your GROQ API key (or press Enter to skip): ").strip()

    if not groq_api_key:
        print("‚ö†Ô∏è Continuing without API key. Some features will be limited.")
        print("You can set the key later with: os.environ['GROQ_API_KEY'] = 'your_key'")

if groq_api_key:
    try:
        llm = ChatGroq(
            groq_api_key=groq_api_key,
            model_name="llama-3.1-8b-instant",
            temperature=0.1,
            max_tokens=1024
        )

        # Test the connection with a simple prompt
        test_response = llm.invoke("Say 'Hello' in one word.")
        print(f"‚úÖ Groq Llama-3 model initialized successfully!")
        print(f"ü§ñ Test response: {test_response.content}")

    except Exception as e:
        # Any failure (bad key, unavailable model, network) falls back to the mock.
        print(f"‚ùå Error initializing Groq model: {e}")
        print("Creating a mock LLM for demonstration...")

        llm = MockLLM()
        print("‚ö†Ô∏è Using mock LLM. For full functionality, add a valid GROQ API key.")
else:
    print("‚ö†Ô∏è No API key provided. Using mock LLM.")

    llm = MockLLM(prefix="[Mock Response - Add GROQ API Key] ")

‚ùå Error initializing Groq model: Error code: 400 - {'error': {'message': 'The model `llama3-70b-8192` has been decommissioned and is no longer supported. Please refer to https://console.groq.com/docs/deprecations for a recommendation on which model to use instead.', 'type': 'invalid_request_error', 'code': 'model_decommissioned'}}
Creating a mock LLM for demonstration...
‚ö†Ô∏è Using mock LLM. For full functionality, add a valid GROQ API key.
